# Exploratory analysis

## Libraries and importing

In [7]:

import pandas as pd
import numpy as np
from tqdm import tqdm
import cv2


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import transforms, models

from sklearn.metrics import roc_auc_score

from torchinfo import summary

from tensorboardX import SummaryWriter


In [8]:

if torch.backends.mps.is_available():
    device = torch.device('mps')
    print('MPS is available')
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print('CUDA is available')
else:
    device = torch.device('cpu')
    print('No acceleration available')


MPS is available


## Creazione del dataset di training

In [9]:
df_train = pd.read_csv("datasets/df_train.csv")

In [5]:
# Ereditando dalla classe Dataset si ottiene un oggetto iterabile dotato di automatic batching
class CustomImageDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None):
        self.data = dataframe
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx): # Questo membro deve essere sovrascritto (overwrite)
        
        img_name = self.data.loc[idx, 'id']  #Pesco il nome dalla colonna id
        img_path = f"{self.root_dir}/{img_name}.tif"  #Creo il path specifico
        
        # Utilizzo OpenCV per leggere l'immagine
        image = cv2.imread(img_path)     # Utilizza OpenCV per leggere l'immagine dal percorso specificato
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)     # Converte il formato dell'immagine da BGR (utilizzato da OpenCV) a RGB

        label = int(self.data.loc[idx, 'label'])  # Ottiene l'etichetta (target) dalla colonna label
        
        # Se passiamo una trasformazione, la applica all'immagine
        if self.transform:
            image = self.transform(image)

        return image, label

# Define the transformations to apply to the images
data_transforms = transforms.Compose([
    transforms.ToPILImage(),  # Convert numpy array (OpenCV image) to PIL Image
    #transforms.Resize(256),
    transforms.CenterCrop(32),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create an instance of your custom dataset
custom_dataset = CustomImageDataset(dataframe=df_train, root_dir='histopathologic-cancer-detection/train', transform=data_transforms)

# Create a DataLoader to iterate through your custom dataset
batch_size = 32
train_loader = torch.utils.data.DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

In [6]:


# Definizione del modello
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)  # Esempio di ResNet-18 preaddestrata

# Impostare il numero di classi in base al tuo dataset
num_classes = len(df_train['label'].unique())

# Modifica l'ultimo strato completamente connesso per adattarlo al numero di classi del tuo problema
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Definizione della funzione di perdita e dell'ottimizzatore
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [7]:
summary(model)

Layer (type:depth-idx)                   Param #
ResNet                                   --
├─Conv2d: 1-1                            9,408
├─BatchNorm2d: 1-2                       128
├─ReLU: 1-3                              --
├─MaxPool2d: 1-4                         --
├─Sequential: 1-5                        --
│    └─BasicBlock: 2-1                   --
│    │    └─Conv2d: 3-1                  36,864
│    │    └─BatchNorm2d: 3-2             128
│    │    └─ReLU: 3-3                    --
│    │    └─Conv2d: 3-4                  36,864
│    │    └─BatchNorm2d: 3-5             128
│    └─BasicBlock: 2-2                   --
│    │    └─Conv2d: 3-6                  36,864
│    │    └─BatchNorm2d: 3-7             128
│    │    └─ReLU: 3-8                    --
│    │    └─Conv2d: 3-9                  36,864
│    │    └─BatchNorm2d: 3-10            128
├─Sequential: 1-6                        --
│    └─BasicBlock: 2-3                   --
│    │    └─Conv2d: 3-11                 73,728

In [12]:
writer = SummaryWriter()

writer.add_graph(model, torch.rand(1, 3, 32, 32))

writer.close()

In [15]:
num_epochs = 10

device = torch.device('mps') #torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as t_bar:
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {epoch_loss:.4f}")


Epoch 1/10:   0%|          | 0/6174 [07:46<?, ?batch/s]


Epoch [1/10] Loss: 0.4519


Epoch 2/10:   0%|          | 0/6174 [07:39<?, ?batch/s]


Epoch [2/10] Loss: 0.4131


Epoch 3/10:   0%|          | 0/6174 [07:36<?, ?batch/s]


Epoch [3/10] Loss: 0.3799


Epoch 4/10:   0%|          | 0/6174 [07:36<?, ?batch/s]


Epoch [4/10] Loss: 0.3598


Epoch 5/10:   0%|          | 0/6174 [07:34<?, ?batch/s]


Epoch [5/10] Loss: 0.3440


Epoch 6/10:   0%|          | 0/6174 [07:30<?, ?batch/s]


Epoch [6/10] Loss: 0.3345


Epoch 7/10:   0%|          | 0/6174 [2:24:31<?, ?batch/s]


Epoch [7/10] Loss: 0.3226


Epoch 8/10:   0%|          | 0/6174 [5:36:51<?, ?batch/s]


Epoch [8/10] Loss: 0.3115


Epoch 9/10:   0%|          | 0/6174 [3:23:38<?, ?batch/s]


Epoch [9/10] Loss: 0.3019


Epoch 10/10:   0%|          | 0/6174 [09:00<?, ?batch/s]

Epoch [10/10] Loss: 0.2899





In [16]:
torch.save(model.state_dict(), 'model_saved/2')

## Testing

In [17]:
# Load the test dataset ('df_test') similarly to how you loaded the training dataset
df_test = pd.read_csv("datasets/df_test.csv")  # Load your test dataset CSV file



In [35]:


# Create an instance of your custom dataset for the test data
custom_test_dataset = CustomImageDataset(dataframe=df_test, root_dir='histopathologic-cancer-detection/train', transform=data_transforms)

# Create a DataLoader for the test dataset
test_loader = torch.utils.data.DataLoader(custom_test_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle for testing

# Assuming you have a trained model 'model'
# Set the model to evaluation mode
model.eval()

# Create empty lists to store predicted probabilities and true labels
predicted_probabilities = []
true_labels = []

# Iterate through the test dataset and get predictions
for images, labels in test_loader:
    # Move images and labels to the appropriate device
    images = images.to(device)
    labels = labels.to(device)
    
    # Forward pass to get outputs/predictions
    with torch.no_grad():
        outputs = model(images)
        #probabilities = torch.sigmoid(outputs)  # Applying sigmoid to get probabilities
        _, predicted = torch.max(outputs, 1)

        

    predicted_probabilities.append(predicted.cpu().numpy())
    true_labels.append(labels.cpu().numpy())

# Concatenate the predictions and true labels
predicted_probabilities = np.concatenate(predicted_probabilities)
true_labels = np.concatenate(true_labels)



In [37]:
# Calculate ROC AUC score
auc_score = roc_auc_score(true_labels, predicted_probabilities)

print(f"ROC AUC Score on Test Dataset: {auc_score}")

ROC AUC Score on Test Dataset: 0.8517566253849945


In [20]:
true_labels

array([0, 0, 1, ..., 0, 1, 0])

In [36]:
predicted_probabilities

array([0, 0, 1, ..., 0, 1, 0])