In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import matplotlib.pyplot as plt
from PIL import Image
import os

In [2]:
## File Imports
dataset_path = './Data/'

train_labels = pd.read_csv(os.path.join(dataset_path, 'train_labels.csv'))
test_data = pd.read_csv(os.path.join(dataset_path, 'sample_submission.csv'))

train_path = dataset_path + 'train/'
test_path = dataset_path + 'test/'

In [3]:
## Data Analysis
print(train_labels['label'].value_counts())
print("Null: ",train_labels.isnull().sum())

missing_images = []
for image_name in train_labels['id']:
    image_path = os.path.join(dataset_path, 'train', f'{image_name}.tif')
    if not os.path.exists(image_path):
        missing_images.append(image_name)

print(f"Missing images: {len(missing_images)}")

label
0    130908
1     89117
Name: count, dtype: int64
Null:  id       0
label    0
dtype: int64
Missing images: 0


In [4]:
from torchvision import transforms

# Define preprocessing transformations
preprocess = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),  # Convert the image to a tensor and normalize to [0, 1]
    transforms.Normalize(mean=[0.5], std=[0.5]),  # Optionally normalize to [-1, 1]
])

In [None]:
# Example: Apply preprocessing to a single image
image_path = os.path.join(dataset_path, 'train', '000b35e7c39c6cb32224dcb3fe4c48acf34f0252.tif')
image = Image.open(image_path)
image_preprocessed = preprocess(image)

# Convert back to a displayable format and show
image_show = transforms.ToPILImage()(image_preprocessed)
plt.imshow(image_show)
plt.show()

In [5]:
from sklearn.model_selection import train_test_split

# Split the data
train_set, val_set = train_test_split(train_labels, test_size=0.2, stratify=train_labels['label'], random_state=42)

print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")

Training set size: 176020
Validation set size: 44005


In [6]:
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image

class CancerDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = self.dataframe.iloc[idx, 0]
        label = self.dataframe.iloc[idx, 1]
        img_path = os.path.join(self.img_dir, f"{img_name}.tif")
        image = Image.open(img_path)

        if self.transform:
            image = self.transform(image)

        return image, label

# Create datasets
train_dataset = CancerDataset(train_set, os.path.join(dataset_path, 'train'), transform=preprocess)
val_dataset = CancerDataset(val_set, os.path.join(dataset_path, 'train'), transform=preprocess)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [7]:
model = models.resnet18(pretrained=True)

# Modify the final fully connected layer to match the number of classes (binary classification)
model.fc = nn.Linear(model.fc.in_features, 1)  # Binary classification

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



In [8]:
criterion = nn.BCEWithLogitsLoss()  # Combines sigmoid activation with binary cross-entropy
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [9]:
num_epochs = 25
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device, dtype=torch.float)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs.squeeze(), labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    # Print statistics for each epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

    # Validate the model
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device, dtype=torch.float)
            outputs = model(images)
            loss = criterion(outputs.squeeze(), labels)
            val_loss += loss.item()

    print(f'Validation Loss: {val_loss/len(val_loader):.4f}')
    scheduler.step(val_loss)

Epoch [1/25], Loss: 0.2456
Validation Loss: 0.1829
Epoch [2/25], Loss: 0.1785
Validation Loss: 0.2050
Epoch [3/25], Loss: 0.1472
Validation Loss: 0.1445
Epoch [4/25], Loss: 0.1251
Validation Loss: 0.1185
Epoch [5/25], Loss: 0.1060
Validation Loss: 0.1218
Epoch [6/25], Loss: 0.0887
Validation Loss: 0.1405
Epoch [7/25], Loss: 0.0732
Validation Loss: 0.1300
Epoch [8/25], Loss: 0.0594
Validation Loss: 0.1322
Epoch [9/25], Loss: 0.0496
Validation Loss: 0.1440
Epoch [10/25], Loss: 0.0428
Validation Loss: 0.1723
Epoch [11/25], Loss: 0.0371
Validation Loss: 0.1444
Epoch [12/25], Loss: 0.0326
Validation Loss: 0.1942
Epoch [13/25], Loss: 0.0304
Validation Loss: 0.1806
Epoch [14/25], Loss: 0.0275
Validation Loss: 0.1809
Epoch [15/25], Loss: 0.0253
Validation Loss: 0.1814
Epoch [16/25], Loss: 0.0234
Validation Loss: 0.1793
Epoch [17/25], Loss: 0.0212
Validation Loss: 0.1747
Epoch [18/25], Loss: 0.0207
Validation Loss: 0.1746
Epoch [19/25], Loss: 0.0192
Validation Loss: 0.1789
Epoch [20/25], Loss: 

In [10]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device, dtype=torch.float)
        outputs = model(images)
        preds = torch.sigmoid(outputs.squeeze()).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Convert predictions to binary labels (0 or 1)
all_preds = [1 if p > 0.5 else 0 for p in all_preds]

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f'Validation Accuracy: {accuracy:.4f}')

# Calculate AUC-ROC
auc = roc_auc_score(all_labels, all_preds)
print(f'AUC-ROC: {auc:.4f}')

# Confusion Matrix
cm = confusion_matrix(all_labels, all_preds)
print(f'Confusion Matrix:\n{cm}')

Validation Accuracy: 0.9570
AUC-ROC: 0.9550
Confusion Matrix:
[[25275   907]
 [  987 16836]]


In [11]:
# Save the model
torch.save(model.state_dict(), 'cancer_detection_model.pth')

# Load the model (for later use)
model.load_state_dict(torch.load('cancer_detection_model.pth'))
model.eval()

  model.load_state_dict(torch.load('cancer_detection_model.pth'))


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [13]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9570


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

Precision: 0.9489
Recall: 0.9446
F1-Score: 0.9467


In [15]:
from sklearn.metrics import roc_auc_score
auc_roc = roc_auc_score(all_labels, all_preds)
print(f'AUC-ROC: {auc_roc:.4f}')

AUC-ROC: 0.9550


In [16]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(all_labels, all_preds)
print(f'Confusion Matrix:\n{cm}')

Confusion Matrix:
[[25275   907]
 [  987 16836]]


In [17]:
from sklearn.model_selection import cross_val_score

# Assume you're using a scikit-learn compatible model or can wrap a PyTorch model
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy: {scores.mean():.4f}')

NameError: name 'X_train' is not defined