# Exploratory analysis

## Libraries and importing

In [1]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm
import cv2


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import datasets, transforms, models


MPS is available


In [None]:

if torch.backends.mps.is_available():
    device = torch.device('mps')
    print('MPS is available')
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print('CUDA is available')
else:
    device = torch.device('cpu')
    print('No acceleration available')

#### Relevant references:
https://towardsdatascience.com/data-preparation-guide-for-detecting-histopathologic-cancer-detection-7b96d6a12004


## Esplorazione

In [2]:
df_training = pd.read_csv("histopathologic-cancer-detection/train_labels.csv")

In [3]:
df_training.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [4]:
df_training.shape

(220025, 2)

In [5]:
df_training['label'].value_counts()

label
0    130908
1     89117
Name: count, dtype: int64

## Creazione del dataset di training

In [6]:
# Ereditando dalla classe Dataset si ottiene un oggetto iterabile dotato di automatic batching
class CustomImageDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None):
        self.data = dataframe
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx): # Questo membro deve essere sovrascritto (overwrite)
        
        img_name = self.data.loc[idx, 'id']  #Pesco il nome dalla colonna id
        img_path = f"{self.root_dir}/{img_name}.tif"  #Creo il path specifico
        
        # Utilizzo OpenCV per leggere l'immagine
        image = cv2.imread(img_path)     # Utilizza OpenCV per leggere l'immagine dal percorso specificato
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)     # Converte il formato dell'immagine da BGR (utilizzato da OpenCV) a RGB

        label = int(self.data.loc[idx, 'label'])  # Ottiene l'etichetta (target) dalla colonna label
        
        # Se passiamo una trasformazione, la applica all'immagine
        if self.transform:
            image = self.transform(image)

        return image, label

# Define the transformations to apply to the images
data_transforms = transforms.Compose([
    #transforms.ToPILImage(),  # Convert numpy array (OpenCV image) to PIL Image
    #transforms.Resize(256),
    #transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create an instance of your custom dataset
custom_dataset = CustomImageDataset(dataframe=df_training, root_dir='histopathologic-cancer-detection/train', transform=data_transforms)

# Create a DataLoader to iterate through your custom dataset
batch_size = 32
train_loader = torch.utils.data.DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

In [7]:


# Definizione del modello
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)  # Esempio di ResNet-18 preaddestrata

# Impostare il numero di classi in base al tuo dataset
num_classes = len(df_training['label'].unique())

# Modifica l'ultimo strato completamente connesso per adattarlo al numero di classi del tuo problema
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Definizione della funzione di perdita e dell'ottimizzatore
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [8]:
num_epochs = 10

device = torch.device('mps') #torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as t_bar:
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {epoch_loss:.4f}")


Epoch 1/10:   0%|          | 0/6876 [09:24<?, ?batch/s]


Epoch [1/10] Loss: 0.2502


Epoch 2/10:   0%|          | 0/6876 [09:03<?, ?batch/s]


Epoch [2/10] Loss: 0.1768


Epoch 3/10:   0%|          | 0/6876 [09:40<?, ?batch/s]


Epoch [3/10] Loss: 0.1403


Epoch 4/10:   0%|          | 0/6876 [09:37<?, ?batch/s]


Epoch [4/10] Loss: 0.1138


Epoch 5/10:   0%|          | 0/6876 [09:09<?, ?batch/s]


Epoch [5/10] Loss: 0.0927


Epoch 6/10:   0%|          | 0/6876 [09:11<?, ?batch/s]


Epoch [6/10] Loss: 0.0747


Epoch 7/10:   0%|          | 0/6876 [09:15<?, ?batch/s]


Epoch [7/10] Loss: 0.0595


Epoch 8/10:   0%|          | 0/6876 [09:44<?, ?batch/s]


Epoch [8/10] Loss: 0.0498


Epoch 9/10:   0%|          | 0/6876 [09:25<?, ?batch/s]


Epoch [9/10] Loss: 0.0429


Epoch 10/10:   0%|          | 0/6876 [09:11<?, ?batch/s]

Epoch [10/10] Loss: 0.0365





In [11]:
torch.save(model.state_dict(), 'model_saved/1')

## Testing

In [9]:
def test_model_on_images(model, images_folder):

    model.eval()  # Set the model to evaluation mode
    
    # Define the transformations for test images
    data_transforms = transforms.Compose([
        transforms.ToPILImage(),  # Convert numpy array (OpenCV image) to PIL Image
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Iterate through images in the folder
    for image_name in os.listdir(images_folder):
        image_path = os.path.join(images_folder, image_name)
        
        # Read the image using OpenCV
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB
        
        # Apply transformations
        image = data_transforms(image).unsqueeze(0)  # Add a batch dimension
        
        # Move the image to the appropriate device (CPU or GPU)
        image = image.to(device)
        
        # Get the model prediction
        with torch.no_grad():
            output = model(image)
            _, predicted = torch.max(output, 1)
            
            # Use the predicted label as needed
            print(f"Image: {image_name}, Predicted label: {predicted.item()}")


In [10]:
test_images_folder = 'histopathologic-cancer-detection/test'
#test_model_on_images(model, test_images_folder)
