In [86]:
from pathlib import Path
from tqdm import tqdm, tqdm_notebook

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt



import tensorflow as tf
from tensorflow import keras
from keras import Sequential

import keras
from keras.models import Sequential
from keras.layers import GRU, LSTM
import numpy as np

In [87]:
# put your own path to the data root directory (see example in `Data architecture` section)
data_dir = Path("..")

# load the training and testing data sets
train_features_dir = data_dir / "train_input" / "moco_features"
test_features_dir = data_dir / "test_input" / "moco_features"
df_train = pd.read_csv(data_dir  / "supplementary_data" / "train_metadata.csv")
df_test = pd.read_csv(data_dir  / "supplementary_data" / "test_metadata.csv")

# concatenate y_train and df_train
y_train = pd.read_csv(data_dir  / "train_output_76GDcgx.csv")
df_train = df_train.merge(y_train, on="Sample ID")

print(f"Training data dimensions: {df_train.shape}")  # (344, 4)
df_train.head()

Training data dimensions: (344, 4)


Unnamed: 0,Sample ID,Patient ID,Center ID,Target
0,ID_001.npy,P_001,C_1,0
1,ID_002.npy,P_002,C_2,1
2,ID_005.npy,P_005,C_5,0
3,ID_006.npy,P_006,C_5,0
4,ID_007.npy,P_007,C_2,1


In [88]:
X_train = []
y_train = []
centers_train = []
patients_train = []

for sample, label, center, patient in tqdm(
    df_train[["Sample ID", "Target", "Center ID", "Patient ID"]].values
):
    # load the coordinates and features (1000, 3+2048)
    _features = np.load(train_features_dir / sample)
    # get coordinates (zoom level, tile x-coord on the slide, tile y-coord on the slide)
    # and the MoCo V2 features
    coordinates, features = _features[:, :3], _features[:, 3:]  # Ks
    # slide-level averaging
    #X_train.append(np.mean(features, axis=0))
    
    X_train.append(features)
    y_train.append([label]*1000)
    centers_train.append(center)
    patients_train.append(patient)

# convert to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
centers_train = np.array(centers_train)
patients_train = np.array(patients_train)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 344/344 [00:01<00:00, 185.98it/s]


In [89]:

def reduce_y(y):
    y_reduced = list()
    for y_ in y:
        y_reduced.append(np.array([y_[0]]))
    return np.array(y_reduced)


def train_val_test_split(X, y, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    """
    Split the input data into training, testing, and validation sets
    Args:
        X: input data
        y: labels
        train_size: proportion of data to be used for training
        val_size: proportion of data to be used for validation
        test_size: proportion of data to be used for testing
        random_state: seed for random number generator
    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test
    """
    np.random.seed(random_state)
    n_samples = X.shape[0]
    idx = np.random.permutation(n_samples)
    X, y = X[idx], y[idx]
    train_end = int(train_size * n_samples)
    val_end = int((train_size + val_size) * n_samples)
    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]
    return X_train, X_val, X_test, reduce_y(y_train), reduce_y(y_val), reduce_y(y_test)

In [90]:
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X_train, y_train, train_size=0.8, val_size=.2, test_size=0)


X_train = torch.from_numpy(X_train)
y_train = torch.from_numpy(y_train)
y_train = y_train.float()



X_val = torch.from_numpy(X_val)
y_val = torch.from_numpy(y_val)
y_val = y_val.float()


X_test = torch.from_numpy(X_test)
y_test = torch.from_numpy(y_test)
y_test = y_test.float()

In [91]:
X_train.shape

torch.Size([275, 1000, 2048])

In [108]:
import torch
import torch.nn as nn

# Define input shape
input_shape = (1000, 2048)

# Create the GRU model
class GRUModel(nn.Module):
    def __init__(self):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size=2048, hidden_size=16, num_layers=1)
        self.fc = nn.Linear(16, 1)
        self.dropout = nn.Dropout(p=0.4)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x, _ = self.gru(x)
        x = self.dropout(x[:,-1,:])
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

# Initialize the model
model = GRUModel()

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())



In [109]:
from sklearn.metrics import roc_auc_score

In [110]:
y_train.shape

torch.Size([275, 1])

In [112]:
# Training loop
for epoch in tqdm_notebook(range(60)):
    # Forward pass
    output = model(X_train)
    # Compute loss
    loss = criterion(output, y_train)
    # Zero gradients
    optimizer.zero_grad()
    # Backward pass and update weights
    loss.backward()
    optimizer.step()
    print(f'======================== EPOCH - {epoch+1} ===========================')
    print('\nTrain\n')
    print(f'Training Loss: {loss.item()}')
    train_score = roc_auc_score(y_train, (output > 0.5).long())
    print(f'Train ROC AUC score: {train_score}')
    
    print('\nVal\n')
    with torch.no_grad():
        output = model(X_val)
        predicted = (output > 0.5).long()
        accuracy = (predicted == y_val).float().mean()
        #print(f'Validation Accuracy: {accuracy}')
        val_loss = criterion(output, output)
        print(f'Val loss : {val_loss}')
        
        #######
    
        test_score = roc_auc_score(y_val, predicted)
        print(f'Val ROC AUC score: {test_score}')
    


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(60)):


  0%|          | 0/60 [00:00<?, ?it/s]

Training Loss: 0.7836887240409851
Train ROC AUC score: 0.4824850123427765
Val loss : 0.7470728754997253
Test ROC AUC score: 0.4865546218487395
Training Loss: 0.7761906981468201
Train ROC AUC score: 0.5279181850240978
Val loss : 0.7553071975708008
Test ROC AUC score: 0.4857142857142857
Training Loss: 0.7703194618225098
Train ROC AUC score: 0.49068414247090625
Val loss : 0.7549670934677124
Test ROC AUC score: 0.5008403361344538
Training Loss: 0.7603592872619629
Train ROC AUC score: 0.5019395791700952
Val loss : 0.7590664625167847
Test ROC AUC score: 0.5
Training Loss: 0.759238064289093
Train ROC AUC score: 0.494475138121547
Val loss : 0.7619166970252991
Test ROC AUC score: 0.5
Training Loss: 0.7468439340591431
Train ROC AUC score: 0.494475138121547
Val loss : 0.7607215046882629
Test ROC AUC score: 0.5
Training Loss: 0.7411972880363464
Train ROC AUC score: 0.5
Val loss : 0.7591248750686646
Test ROC AUC score: 0.5
Training Loss: 0.7406487464904785
Train ROC AUC score: 0.5
Val loss : 0.7575

Val loss : 0.7232863903045654
Test ROC AUC score: 0.5
Training Loss: 0.7055976390838623
Train ROC AUC score: 0.5
Val loss : 0.7253056764602661
Test ROC AUC score: 0.5
Training Loss: 0.704584538936615
Train ROC AUC score: 0.5
Val loss : 0.7237136960029602
Test ROC AUC score: 0.5
Training Loss: 0.7053648829460144
Train ROC AUC score: 0.5
Val loss : 0.722476065158844
Test ROC AUC score: 0.5
Training Loss: 0.7018441557884216
Train ROC AUC score: 0.5
Val loss : 0.7243876457214355
Test ROC AUC score: 0.5
Training Loss: 0.7021026015281677
Train ROC AUC score: 0.5
Val loss : 0.7181814312934875
Test ROC AUC score: 0.5
Training Loss: 0.7058561444282532
Train ROC AUC score: 0.5
Val loss : 0.7217242121696472
Test ROC AUC score: 0.5
Training Loss: 0.7057611346244812
Train ROC AUC score: 0.5
Val loss : 0.7214482426643372
Test ROC AUC score: 0.5
Training Loss: 0.7029200792312622
Train ROC AUC score: 0.5
Val loss : 0.7244160771369934
Test ROC AUC score: 0.5
Training Loss: 0.7022610902786255
Train ROC 

In [98]:
X_test = []

# load the data from `df_test` (~ 1 minute)
for sample in tqdm(df_test["Sample ID"].values):
    _features = np.load(test_features_dir / sample)
    coordinates, features = _features[:, :3], _features[:, 3:]
    X_test.append(features)

X_test = np.array(X_test)
X_test = torch.from_numpy(X_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 149/149 [00:00<00:00, 271.81it/s]


In [99]:
# Evaluation
with torch.no_grad():
    output = model(X_test)
    predicted = (output > 0.5).long()
    accuracy = (predicted == y_test).float().mean()
    print(f'Accuracy: {accuracy}')

Accuracy: nan


In [100]:
preds_test = np.squeeze(model(X_test).detach().numpy())


In [101]:
preds_test.shape

(149,)

In [102]:
submission = pd.DataFrame(
    {"Sample ID": df_test["Sample ID"].values, "Target": preds_test}
).sort_values(
    "Sample ID"
)  # extra step to sort the sample IDs

# sanity checks
assert all(submission["Target"].between(0, 1)), "`Target` values must be in [0, 1]"
assert submission.shape == (149, 2), "Your submission file must be of shape (149, 2)"
assert list(submission.columns) == [
    "Sample ID",
    "Target",
], "Your submission file must have columns `Sample ID` and `Target`"

# save the submission as a csv file
submission.to_csv(data_dir / "benchmark_test_output.csv", index=None)
submission.head()

Unnamed: 0,Sample ID,Target
0,ID_003.npy,0.135344
1,ID_004.npy,0.091483
2,ID_008.npy,0.137148
3,ID_009.npy,0.088691
4,ID_010.npy,0.116247
