In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader
import torch.nn.functional as F

from torch.utils.data import Dataset
torch.manual_seed(0)
X_train = np.load('X_train.npy')
y_train = np.load('y_train.npy')


In [2]:
# Define your custom dataset
class MyDataset(Dataset):
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.images[idx], self.labels[idx]

# Create dataset
train_dataset = MyDataset(X_train, y_train)
# Create DataLoaders for training and testing
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
del X_train

In [3]:
# Simpler model

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(65, 2, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(128*240, 65)  
        self.dropout = nn.Dropout(p=0.5)  # Dropout layer
        self.fc3 = nn.Linear(65, 2)
        

    def forward(self, x):
        x = x.view(-1, 65, 256, 240)
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = x.view(x.size(0), -1)  # Flatten layer
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
# Initialize the network and print its architecture
model = Net()
print(model)

Net(
  (conv1): Conv2d(65, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=30720, out_features=65, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=65, out_features=2, bias=True)
)


In [4]:
# Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(weight = torch.tensor([0.5,1.0]).to(device))
optimizer = optim.Adam(model.parameters(), lr=1e-4)


# Initialize empty lists to store losses
train_losses = []

# Training loop
for epoch in range(15):
    running_loss = 0.0
    model.train()
    for i, data in enumerate(train_dataloader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)
        inputs = inputs.float()
        labels = labels.type(torch.LongTensor)   # casting to long
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        l1_lambda = 0.0005
        l1_norm = sum(p.abs().sum() for p in model.parameters())
        loss = criterion(outputs, labels)
        loss+=l1_lambda*l1_norm
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Compute training loss
        train_losses.append(loss.item())
    print(f'Epoch {epoch+1}, training loss: {running_loss/len(train_dataloader):.4f}')

# Now without regularization
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Training loop
for epoch in range(20):
    running_loss = 0.0
    model.train()
    for i, data in enumerate(train_dataloader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)
        inputs = inputs.float()
        labels = labels.type(torch.LongTensor)   # casting to long
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Compute training loss
        train_losses.append(loss.item())
    print(f'Epoch {epoch+1}, training loss: {running_loss/len(train_dataloader):.4f}')

print('Finished Training')

Epoch 1, training loss: 3.0144
Epoch 2, training loss: 2.0268
Epoch 3, training loss: 1.3780
Epoch 4, training loss: 1.0252
Epoch 5, training loss: 0.8956
Epoch 6, training loss: 0.8356
Epoch 7, training loss: 0.8230
Epoch 8, training loss: 0.8027
Epoch 9, training loss: 0.8040
Epoch 10, training loss: 0.8018
Epoch 11, training loss: 0.7677
Epoch 12, training loss: 0.7583
Epoch 13, training loss: 0.7618
Epoch 14, training loss: 0.7770
Epoch 15, training loss: 0.7990
Epoch 1, training loss: 0.7308
Epoch 2, training loss: 0.6901
Epoch 3, training loss: 0.6770
Epoch 4, training loss: 0.6147
Epoch 5, training loss: 0.5677
Epoch 6, training loss: 0.4948
Epoch 7, training loss: 0.4114
Epoch 8, training loss: 0.3996
Epoch 9, training loss: 0.3288
Epoch 10, training loss: 0.2399
Epoch 11, training loss: 0.2265
Epoch 12, training loss: 0.1683
Epoch 13, training loss: 0.1567
Epoch 14, training loss: 0.1151
Epoch 15, training loss: 0.1041
Epoch 16, training loss: 0.1024
Epoch 17, training loss: 0

In [5]:
class MyModel(nn.Module):
    def __init__(self, original_model):
        super(MyModel, self).__init__()
        self.conv1 = original_model.conv1
        self.fc1 = original_model.fc1

    def forward(self, x):
        x = x.view(-1, 65, 256, 240)
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = x.view(x.size(0), -1)  # Flatten layer
        x = F.relu(self.fc1(x))
        return x

# Create the new model
new_model = MyModel(model)

In [6]:
outputs = [] #stores one feature/value for each image for each patient.  Shape is (*,32,65).
labels_train = []
for data in train_dataloader:
    input_data = data[0].float()
    labels_train.append(data[1])
    output = new_model(input_data.to(device)) #Shape (*, 65)
    outputs.append(output.detach().cpu().numpy()) 
flattened_outputs = np.concatenate(outputs, axis=0)
flattened_labels = np.concatenate(labels_train, axis = 0)
del outputs, output
np.save('modified_train.npy', flattened_outputs)
np.save('modified_labels.npy', flattened_labels)

In [7]:
del train_dataloader, train_dataset

In [8]:
X_test = np.load('X_test.npy')
y_test = np.load('y_test.npy')
test_dataset = MyDataset(X_test, y_test)

outputs_2 = []
# Assuming x_test is a PyTorch tensor
testloader = torch.utils.data.DataLoader(X_test, batch_size=32)
del X_test
for data in testloader:
    input_data = data.float()
    output = new_model(input_data.to(device))
    outputs_2.append(output.detach().cpu().numpy()) 


flattened_outputs_2 = np.concatenate(outputs_2, axis=0)
del outputs_2, output

In [9]:
np.save('modified_test.npy',flattened_outputs_2)

In [1]:
import numpy as np
x_train = np.load('modified_train.npy')
y_train = np.load('modified_labels.npy')
x_test = np.load('modified_test.npy')
y_test = np.load('y_test.npy')

In [2]:
x_test[:].shape

(119, 65)

In [3]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

# Suppose 'data' is your 2D numpy array, with each row a datapoint
# And 'labels' is a 1D numpy array containing the class label for each datapoint
data = np.vstack((x_train,x_test[:50]))  # Your data
labels = np.hstack((y_train,y_test[:50]))  # Your labels

# Split the data into training and testing sets first
data_train, data_test, labels_train, labels_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

scale_pos_weight = sum(labels_train == 0) / sum(labels_train == 1)

# Define the XGBoost model
model = XGBClassifier(
    max_depth=2,  # Maximum depth of the trees
    learning_rate=0.3,  # Learning rate (eta)
    n_estimators=50,  # Number of training rounds
    objective='binary:logistic',  # Objective function for binary classification
    scale_pos_weight = scale_pos_weight,
    random_state=42  # Random seed
)

# Train the model with early stopping
eval_set = [(data_test, labels_test)]  # Validation set for early stopping
model.fit(data_train, labels_train,eval_metric="logloss", eval_set=eval_set)

# Make predictions on the test set
preds_prob = model.predict_proba(data_test)

# 'preds_prob' now contains the predicted probabilities of the positive class for each datapoint in the test set


[0]	validation_0-logloss:0.49214
[1]	validation_0-logloss:0.36837
[2]	validation_0-logloss:0.28666
[3]	validation_0-logloss:0.23350
[4]	validation_0-logloss:0.18804
[5]	validation_0-logloss:0.14829
[6]	validation_0-logloss:0.12166
[7]	validation_0-logloss:0.09662
[8]	validation_0-logloss:0.08514
[9]	validation_0-logloss:0.07037
[10]	validation_0-logloss:0.06160
[11]	validation_0-logloss:0.05388
[12]	validation_0-logloss:0.04850
[13]	validation_0-logloss:0.04394
[14]	validation_0-logloss:0.04018
[15]	validation_0-logloss:0.03495




[16]	validation_0-logloss:0.03199
[17]	validation_0-logloss:0.02878
[18]	validation_0-logloss:0.02704
[19]	validation_0-logloss:0.02514
[20]	validation_0-logloss:0.02263
[21]	validation_0-logloss:0.02274
[22]	validation_0-logloss:0.02266
[23]	validation_0-logloss:0.02096
[24]	validation_0-logloss:0.02030
[25]	validation_0-logloss:0.01940
[26]	validation_0-logloss:0.01840
[27]	validation_0-logloss:0.01843
[28]	validation_0-logloss:0.01781
[29]	validation_0-logloss:0.01655
[30]	validation_0-logloss:0.01661
[31]	validation_0-logloss:0.01642
[32]	validation_0-logloss:0.01682
[33]	validation_0-logloss:0.01669
[34]	validation_0-logloss:0.01631
[35]	validation_0-logloss:0.01565
[36]	validation_0-logloss:0.01547
[37]	validation_0-logloss:0.01582
[38]	validation_0-logloss:0.01561
[39]	validation_0-logloss:0.01552
[40]	validation_0-logloss:0.01538
[41]	validation_0-logloss:0.01532
[42]	validation_0-logloss:0.01509
[43]	validation_0-logloss:0.01500
[44]	validation_0-logloss:0.01495
[45]	validatio

In [6]:
np.round(preds_prob,3)

array([[0.998, 0.002],
       [1.   , 0.   ],
       [0.995, 0.005],
       [0.999, 0.001],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [0.006, 0.994],
       [0.999, 0.001],
       [1.   , 0.   ],
       [1.   , 0.   ],
       [0.   , 1.   ],
       [0.999, 0.001],
       [0.997, 0.003],
       [0.   , 1.   ],
       [0.92 , 0.08 ],
       [1.   , 0.   ],
       [0.001, 0.999],
       [0.   , 1.   ],
       [0.999, 0.001],
       [0.01 , 0.99 ],
       [0.996, 0.004],
       [0.005, 0.995],
       [0.   , 1.   ],
       [0.002, 0.998],
       [0.999, 0.001],
       [0.998, 0.002],
       [0.998, 0.002],
       [0.007, 0.993],
       [1.   , 0.   ],
       [0.996, 0.004],
       [0.002, 0.998],
       [0.999, 0.001],
       [1.   , 0.   ],
       [0.998, 0.002],
       [0.   , 1.   ],
       [0.999, 0.001],
       [0.004, 0.996],
       [1.   , 0.   ],
       [0.991, 0.009],
       [0.999, 0.001],
       [0.9

In [7]:
y_test

array([0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1.])

In [243]:
from sklearn.metrics import confusion_matrix
# Make predictions on the test set
def find_optimal_threshold(predictions, y_test):
    min_sum = float('inf')
    optimal_threshold = 0

    # Iterate over possible thresholds from 0 to 1
    for threshold in np.arange(0.0, 1, 0.001):
        # Apply threshold
        preds = (np.array(predictions)[:,1] > threshold).astype(int)

        # Compute confusion matrix
        cm = confusion_matrix(y_test, preds)

        # Compute sum of off-diagonal elements
        off_diagonal_sum = 164 - np.trace(cm)
        #print(cm)
        # Update optimal threshold if this threshold is better
        if off_diagonal_sum < min_sum and cm[1][1]/np.sum(cm[1]) >0.5:
            min_sum = off_diagonal_sum
            optimal_threshold = threshold

    return optimal_threshold

threshold = find_optimal_threshold(preds_prob, labels_test)
preds = (preds_prob[:, 1]> threshold).astype(int)

# Calculate the accuracy of the model
accuracy = accuracy_score(labels_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 100.00%


In [244]:
from sklearn.metrics import roc_auc_score
roc_auc_score(labels_test, preds)

1.0

In [245]:
confusion_matrix(labels_test, preds)

array([[68,  0],
       [ 0, 27]], dtype=int64)

In [246]:
from sklearn.metrics import precision_recall_fscore_support as score
# Calculate precision, recall, F1 score
precision, recall, fscore, _ = score(labels_test, preds)

In [247]:
precision, recall, fscore

(array([1., 1.]), array([1., 1.]), array([1., 1.]))

In [248]:
pred_test = model.predict_proba(x_test[50:])

In [249]:
threshold = find_optimal_threshold(pred_test, y_test[50:])

In [250]:
preds = (pred_test[:, 1]> threshold).astype(int)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test[50:], preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
roc_auc_score(y_test[50:], preds)

Accuracy: 66.67%


0.6522727272727273

In [251]:
confusion_matrix(y_test[50:], preds)

array([[31, 13],
       [10, 15]], dtype=int64)

In [252]:
# Calculate precision, recall, F1 score
precision, recall, fscore, _ = score(y_test[50:], preds)
precision, recall, fscore

(array([0.75609756, 0.53571429]),
 array([0.70454545, 0.6       ]),
 array([0.72941176, 0.56603774]))

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

In [354]:
# Suppose 'data' is your 2D numpy array, with each row a datapoint
# And 'labels' is a 1D numpy array containing the class label for each datapoint
data = np.vstack((x_train,x_test[:50]))  # Your data
labels = np.hstack((y_train,y_test[:50]))  # Your labels

# Split the data into training and testing sets first
data_train, data_test, labels_train, labels_test = train_test_split(x_train, y_train, test_size=0.5, random_state=42, stratify = y_train)

scale_pos_weight = sum(labels_train == 0) / sum(labels_train == 1)

# Define the XGBoost model
model = XGBClassifier(
    max_depth=2,  # Maximum depth of the trees
    learning_rate=0.2,  # Learning rate (eta)
    n_estimators=50,  # Number of training rounds
    objective='binary:logistic',  # Objective function for binary classification
    scale_pos_weight = scale_pos_weight,
    reg_lambda = 1, #L2 regularization
    random_state=42  # Random seed
)

# Perform cross-validation
scores = cross_val_score(model, data_train, labels_train, cv=6)

print("Cross-validation scores: ", scores)
print("Average cross-validation score: ", scores.mean())

# Train the model with early stopping
eval_set = [(data_test, labels_test)]  # Validation set for early stopping
model.fit(data_train, labels_train,eval_metric="logloss", eval_set=eval_set, early_stopping_rounds = 10)

# Make predictions on the test set
preds_prob = model.predict_proba(data_test)


Cross-validation scores:  [0.95       1.         0.97435897 1.         1.         0.97435897]
Average cross-validation score:  0.9831196581196581
[0]	validation_0-logloss:0.56500
[1]	validation_0-logloss:0.47972
[2]	validation_0-logloss:0.39767
[3]	validation_0-logloss:0.33341
[4]	validation_0-logloss:0.29346
[5]	validation_0-logloss:0.25361
[6]	validation_0-logloss:0.22242
[7]	validation_0-logloss:0.19370
[8]	validation_0-logloss:0.16665
[9]	validation_0-logloss:0.15233
[10]	validation_0-logloss:0.13736
[11]	validation_0-logloss:0.12367
[12]	validation_0-logloss:0.11410
[13]	validation_0-logloss:0.10214
[14]	validation_0-logloss:0.09282
[15]	validation_0-logloss:0.08472
[16]	validation_0-logloss:0.07795
[17]	validation_0-logloss:0.07128
[18]	validation_0-logloss:0.06567
[19]	validation_0-logloss:0.06323
[20]	validation_0-logloss:0.05928
[21]	validation_0-logloss:0.05518
[22]	validation_0-logloss:0.05150
[23]	validation_0-logloss:0.04906
[24]	validation_0-logloss:0.04702
[25]	validatio



In [355]:
from sklearn.metrics import confusion_matrix
# Make predictions on the test set
def find_optimal_threshold(predictions, y_test):
    min_sum = float('inf')
    optimal_threshold = 0

    # Iterate over possible thresholds from 0 to 1
    for threshold in np.arange(0.0, 1, 0.001):
        # Apply threshold
        preds = (np.array(predictions)[:,1] > threshold).astype(int)

        # Compute confusion matrix
        cm = confusion_matrix(y_test, preds)

        # Compute sum of off-diagonal elements
        off_diagonal_sum = 164 - np.trace(cm)
        #print(cm)
        # Update optimal threshold if this threshold is better
        if off_diagonal_sum < min_sum and cm[1][1]/np.sum(cm[1]) >0.5:
            min_sum = off_diagonal_sum
            optimal_threshold = threshold

    return optimal_threshold

threshold = find_optimal_threshold(preds_prob, labels_test)
preds = (preds_prob[:, 1]> threshold).astype(int)

# Calculate the accuracy of the model
accuracy = accuracy_score(labels_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
from sklearn.metrics import roc_auc_score
roc_auc_score(labels_test, preds), confusion_matrix(labels_test, preds)

Accuracy: 100.00%


(1.0,
 array([[163,   0],
        [  0,  73]], dtype=int64))

In [356]:
from sklearn.metrics import precision_recall_fscore_support as score
# Calculate precision, recall, F1 score
precision, recall, fscore, _ = score(labels_test, preds)
precision, recall, fscore

(array([1., 1.]), array([1., 1.]), array([1., 1.]))

In [357]:
pred_test = model.predict_proba(x_test[50:])
threshold = find_optimal_threshold(pred_test, y_test[50:])
preds = (pred_test[:, 1]> threshold).astype(int)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test[50:], preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
roc_auc_score(y_test[50:], preds), confusion_matrix(y_test[50:], preds)

Accuracy: 65.22%


(0.6495454545454546,
 array([[29, 15],
        [ 9, 16]], dtype=int64))

In [358]:
# Calculate precision, recall, F1 score
precision, recall, fscore, _ = score(y_test[50:], preds)
precision, recall, fscore

(array([0.76315789, 0.51612903]),
 array([0.65909091, 0.64      ]),
 array([0.70731707, 0.57142857]))