In [88]:
import time
import os
import gc
import calendar
import glob
from pyhere import here

import PIL
PIL.Image.MAX_IMAGE_PIXELS = None

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torchvision.transforms as T
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import spearmanr
from scipy.stats import pearsonr

In [2]:
torch.cuda.is_available()

True

In [3]:
print(torch.__version__)

1.10.2


In [4]:
os.getcwd()

'C:\\Users\\Cullen\\Desktop\\GitHub\\strip_ai'

## Set Parameters

In [5]:
num_features = 4096

First we define the pytorch model that we will use to extract the features.

In [6]:
class RCF(nn.Module):
    """A model for extracting Random Convolution Features (RCF) from input imagery."""
    def __init__(self, num_features=16, kernel_size=3, num_input_channels=3):
        super(RCF, self).__init__()
        # We create `num_features / 2` filters so require `num_features` to be divisible by 2
        assert num_features % 2 == 0, "Please enter an even number of features."
        # Applies a 2D convolution over an input image composed of several input planes.
        self.conv1 = nn.Conv2d(
            num_input_channels,
            num_features // 2,
            kernel_size=kernel_size,
            stride=1,
            padding=0,
            dilation=1,
            bias=True,
        )
        # Fills the input Tensor 'conv1.weight' with values drawn from the normal distribution
        nn.init.normal_(self.conv1.weight, mean=0.0, std=1.0) 
        # Fills the input Tensor 'conv1.bias' with the value 'val = -1'.
        nn.init.constant_(self.conv1.bias, -1.0)
    def forward(self, x):
        # The rectified linear activation function or ReLU for short is a piecewise linear function 
        # that will output the input directly if it is positive, otherwise, it will output zero.
        x1a = F.relu(self.conv1(x), inplace=True)
        # The below step is where we take the inverse which is appended later
        x1b = F.relu(-self.conv1(x), inplace=True)
        # Applies a 2D adaptive average pooling over an input signal composed of several input planes.
        x1a = F.adaptive_avg_pool2d(x1a, (1, 1)).squeeze()
        x1b = F.adaptive_avg_pool2d(x1b, (1, 1)).squeeze()
        if len(x1a.shape) == 1:  # case where we passed a single input
            return torch.cat((x1a, x1b), dim=0)
        elif len(x1a.shape) == 2:  # case where we passed a batch of > 1 inputs
            return torch.cat((x1a, x1b), dim=1)

Next, we initialize the model and pytorch components

In [7]:
device = torch.device("cuda")
model = RCF(num_features).eval().to(device)

In [8]:
directory = f"{os.getcwd()}\\data\\train\\"
path = directory+"*.tif"
files = glob.glob(path)

In [9]:
directory

'C:\\Users\\Cullen\\Desktop\\GitHub\\strip_ai\\data\\train\\'

In [10]:
path

'C:\\Users\\Cullen\\Desktop\\GitHub\\strip_ai\\data\\train\\*.tif'

In [28]:
cropper = T.RandomCrop(size=(128, 128))
img_tens = T.ToTensor()

In [49]:
class CustomDataset(Dataset):
    def __init__(self, fns):
        self.fns = fns
    def __len__(self):
        return len(self.fns)
    def __getitem__(self, idx):
        fn = self.fns[idx]
        if fn is None:
            return None
        else:
            image = PIL.Image.open(fn)
            image = cropper(image)
            out_image = img_tens(image)
            return out_image

In [50]:
dataset = CustomDataset(files)

In [1]:
# img = dataset[0]

In [2]:
# img

In [53]:
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,
    num_workers=os.cpu_count()*2,
    collate_fn=lambda x: x,
    pin_memory=False,
)

In [54]:
len(files)

754

In [61]:
x_all = np.zeros((len(files), num_features), dtype=float)
tic = time.time()
i = 0
if __name__ == '__main__':
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=True,
        num_workers = 0, # os.cpu_count(), 
    )
    for images in dataloader:
        for image in images:
            if i % 50 == 0:
                print(
                    f"{i}/{len(files)} -- {i / len(files) * 100:0.2f}%"
                    + f" -- {time.time()-tic:0.2f} seconds"
                )
                tic = time.time()
            image = image.to(device)
            with torch.no_grad():
                feats = model(image.unsqueeze(0)).cpu().numpy()
            x_all[i] = feats
            i += 1
            torch.cuda.empty_cache()

0/754 -- 0.00% -- 0.44 seconds
50/754 -- 6.63% -- 2.43 seconds
100/754 -- 13.26% -- 9.58 seconds
150/754 -- 19.89% -- 11.69 seconds
200/754 -- 26.53% -- 0.77 seconds
250/754 -- 33.16% -- 1.39 seconds
300/754 -- 39.79% -- 0.81 seconds
350/754 -- 46.42% -- 8.94 seconds
400/754 -- 53.05% -- 8.50 seconds
450/754 -- 59.68% -- 22.29 seconds
500/754 -- 66.31% -- 8.24 seconds
550/754 -- 72.94% -- 2.81 seconds
600/754 -- 79.58% -- 7.25 seconds
650/754 -- 86.21% -- 6.50 seconds
700/754 -- 92.84% -- 22.61 seconds
750/754 -- 99.47% -- 4.39 seconds


In [65]:
x_all_df = pd.DataFrame(x_all)
x_all_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.0,0.0,8.239051,0.0,0.0,0.023780,0.0,0.0,3.433865,0.000000,...,0.0,1.412491,1.830873,0.0,4.947946,1.412119,0.0,0.0,0.0,13.129076
1,0.0,0.0,8.239051,0.0,0.0,0.023780,0.0,0.0,3.433865,0.000000,...,0.0,1.412491,1.830873,0.0,4.947946,1.412119,0.0,0.0,0.0,13.129076
2,0.0,0.0,8.030476,0.0,0.0,0.213077,0.0,0.0,3.572743,0.000000,...,0.0,1.356128,1.645532,0.0,4.994600,1.588138,0.0,0.0,0.0,12.729886
3,0.0,0.0,8.184902,0.0,0.0,0.000043,0.0,0.0,3.407203,0.000000,...,0.0,1.418545,1.839663,0.0,4.955635,1.421385,0.0,0.0,0.0,13.099815
4,0.0,0.0,7.828653,0.0,0.0,0.000047,0.0,0.0,3.236813,0.000000,...,0.0,1.396019,1.796988,0.0,4.779365,1.396417,0.0,0.0,0.0,12.599577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,0.0,0.0,8.030476,0.0,0.0,0.213077,0.0,0.0,3.572743,0.000000,...,0.0,1.356128,1.645532,0.0,4.994600,1.588138,0.0,0.0,0.0,12.729886
750,0.0,0.0,8.164559,0.0,0.0,0.091386,0.0,0.0,3.483464,0.000000,...,0.0,1.392361,1.764680,0.0,4.964608,1.474983,0.0,0.0,0.0,12.986511
751,0.0,0.0,8.030476,0.0,0.0,0.213077,0.0,0.0,3.572743,0.000000,...,0.0,1.356128,1.645532,0.0,4.994600,1.588138,0.0,0.0,0.0,12.729886
752,0.0,0.0,8.238100,0.0,0.0,0.023785,0.0,0.0,3.433432,0.000000,...,0.0,1.412433,1.830744,0.0,4.947489,1.412088,0.0,0.0,0.0,13.127678


In [68]:
train_df = pd.read_csv(here("data", "train.csv"))
train_df

Unnamed: 0,image_id,center_id,patient_id,image_num,label
0,006388_0,11,006388,0,CE
1,008e5c_0,11,008e5c,0,CE
2,00c058_0,11,00c058,0,LAA
3,01adc5_0,11,01adc5,0,LAA
4,026c97_0,4,026c97,0,CE
...,...,...,...,...,...
749,fe9645_0,3,fe9645,0,CE
750,fe9bec_0,4,fe9bec,0,LAA
751,ff14e0_0,6,ff14e0,0,CE
752,ffec5c_0,7,ffec5c,0,LAA


In [69]:
x_all_df["label"] = train_df['label']

In [70]:
x_all = x_all_df.drop(["label"], axis = 1)

In [71]:
y_all = x_all_df['label']

In [74]:
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size = 0.2, random_state = 42
)

In [77]:
ridge_cv_random = RidgeClassifierCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
ridge_cv_random.fit(x_train, y_train)

In [87]:
ridge_cv_random.alpha_

100.0

In [85]:
ridge_cv_random.score(x_train, y_train)

0.7396351575456053

In [86]:
ridge_cv_random.score(x_test, y_test)

0.6754966887417219

In [78]:
ridge_cv_random.best_score_

0.7413085399449035

In [82]:
y_pred = ridge_cv_random.predict(x_train)    
# r2_score(y_train, y_pred)

In [105]:
rf_class = RandomForestClassifier(max_depth=8, random_state=0, n_estimators = 3000)

In [106]:
rf_class.fit(x_train, y_train)

In [107]:
rf_class.score(x_train, y_train)

0.8225538971807629

In [108]:
rf_class.score(x_test, y_test)

0.6754966887417219