In [2]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [3]:
import os
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('/Users/efepekgoz/Developer/EEEM068/Melanoma/mela/train.csv')
df.head(10)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0
5,ISIC_0074311,IP_2950485,female,40.0,lower extremity,unknown,benign,0
6,ISIC_0074542,IP_4698288,male,25.0,lower extremity,unknown,benign,0
7,ISIC_0075663,IP_6017204,female,35.0,torso,unknown,benign,0
8,ISIC_0075914,IP_7622888,male,30.0,torso,unknown,benign,0
9,ISIC_0076262,IP_5075533,female,50.0,lower extremity,unknown,benign,0


In [5]:
df = df[["image_name","target"]]
df["target"].value_counts()

0    32542
1      584
Name: target, dtype: int64

In [6]:
image_names_from_folder = os.listdir("/Users/efepekgoz/Developer/EEEM068/Melanoma/taman/train")
i1 = np.array(image_names_from_folder)
i1 = np.sort(i1)
i1

array(['ISIC_0015719.jpg', 'ISIC_0052212.jpg', 'ISIC_0068279.jpg', ...,
       'ISIC_9999515.jpg', 'ISIC_9999666.jpg', 'ISIC_9999806.jpg'],
      dtype='<U16')

In [7]:
i2 = np.array(df["image_name"])
i2 = np.sort(i2)
i2 += ".jpg"
i2

array(['ISIC_0015719.jpg', 'ISIC_0052212.jpg', 'ISIC_0068279.jpg', ...,
       'ISIC_9999515.jpg', 'ISIC_9999666.jpg', 'ISIC_9999806.jpg'],
      dtype=object)

In [8]:
i1 == i2
len(i2)

33126

In [9]:
from collections import Counter
directory_path = '/Users/efepekgoz/Developer/EEEM068/Melanoma/taman/train'

# Initialize a list to store file extensions
file_extensions = []

# Loop through the files in the directory
for filename in os.listdir(directory_path):
    # Check if the item is a file
    if os.path.isfile(os.path.join(directory_path, filename)):
        # Split the extension from the file
        ext = os.path.splitext(filename)[1]
        file_extensions.append(ext)

# Count each extension using Counter from collections
extension_counts = Counter(file_extensions)
extension_counts


Counter({'.jpg': 33126})

In [10]:
df["image_name"] = i2
df

Unnamed: 0,image_name,target
0,ISIC_0015719.jpg,0
1,ISIC_0052212.jpg,0
2,ISIC_0068279.jpg,0
3,ISIC_0074268.jpg,0
4,ISIC_0074311.jpg,0
...,...,...
33121,ISIC_9999134.jpg,0
33122,ISIC_9999320.jpg,0
33123,ISIC_9999515.jpg,0
33124,ISIC_9999666.jpg,0


In [11]:
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from torchvision import models, transforms
from torchvision.models import resnet50, ResNet50_Weights, efficientnet_b5, EfficientNet_B5_Weights
from sklearn.metrics import f1_score
import torchvision.transforms as transforms
import torch
from torch import nn, optim
from tqdm import tqdm



In [12]:
########################################
#   Prereqs.
#   test csv has no labels so I splitted train data 80/20 for testing
#   some data does not have jpeg version, only dicom and vice versa
#   so move jpegs into dicom's folder. Also adjust the paths for your own machine.
#
#   Modified the original train.csv
#   Dataset split into 50/50 malignant beign ratio
#   which makes 584/584 in each label
#   With augmentations this number can be increased
#
#   TODO:
#   used resnet50 got respectable .78 f1, can try different models
#   play with lr-bs, increase num epochs, add transforms,
#
########################################

In [13]:
#HYPERPARAMETERS

BATCH_SIZE = 64
LR = 0.001

In [15]:
class MelanomaDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform

        # Oversampling minority class
        benign = dataframe[dataframe['target'] == 0]
        malignant = dataframe[dataframe['target'] == 1]
        
        # Oversampling malignant cases to match number of benign cases
        malignant_oversampled = malignant.sample(n=len(benign), replace=True)
        combined_df = pd.concat([benign, malignant_oversampled])

        # Now drop 50% of rows from each class
        benign_reduced = combined_df[combined_df['target'] == 0].sample(frac=0.8)
        malignant_reduced = combined_df[combined_df['target'] == 1].sample(frac=0.8)
        self.balanced_df = pd.concat([benign_reduced, malignant_reduced])
    
    def __len__(self):
        return len(self.balanced_df)

    def __getitem__(self, index):
        # Ensure the image name includes the '.jpg' extension
        img_name = os.path.join(self.root_dir, self.balanced_df.iloc[index, 0])
        image = Image.open(img_name).convert('RGB')
        
        label = self.balanced_df.iloc[index, 1]

        if self.transform:
            image = self.transform(image)

        return image, label

In [16]:
normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010])
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize
])

In [17]:
print("Building Dataset...")
dataset = MelanomaDataset(
    dataframe=df,
    root_dir='/Users/efepekgoz/Developer/EEEM068/Melanoma/taman/train',
    transform=transform
)
print("Dataset built!")
print("Splitting dataset...")
train_size = int(0.8 * len(dataset))
cv_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - cv_size
train_set, cv_set, test_set = random_split(dataset, [train_size, cv_size, test_size])
print("Split complete!")
print(len(train_set), len(test_set), len(cv_set))

Building Dataset...
Dataset built!
Splitting dataset...
Split complete!
41654 5208 5206


In [18]:
print("assigning dataloaders...")
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
cv_loader = DataLoader(cv_set, batch_size=8, shuffle=True)
test_loader = DataLoader(test_set, batch_size=100, shuffle=False)
print("dataloaders ready!")

assigning dataloaders...
dataloaders ready!


In [19]:
print("loading model...")
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')  

# Initialize the model architecture
model = efficientnet_b5()

# Load pretrained weights separately
weights = EfficientNet_B5_Weights.DEFAULT
model.load_state_dict(weights.get_state_dict(progress=True))

# Transfer the model to the device
model = model.to(device)

print("efficientnet_b5 loaded!")
print(device)

loading model...
efficientnet_b5 loaded!
mps


In [20]:
print("arranging features for classification...")
num_ftrs = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_ftrs, 2)  # Binary classification
print("model modified for binary!")

arranging features for classification...
model modified for binary!


In [21]:
 #switch mps for cpu if not macOS
model=model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
print("optimiser and loss fn ready!")

optimiser and loss fn ready!


In [22]:
#making custom logging function

class AverageMeter:
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count != 0 else 0

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)

In [23]:
print("defining train and test funcs...")
def train_model(model, train_loader, criterion, optimizer, device, epoch):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    # Initialize accumulators for averages
    losses = AverageMeter('Loss', ':.4f')
    top1 = AverageMeter('Acc', ':.2f')

    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc='Training', leave=False)
    
    for batch_idx, (images, labels) in progress_bar:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backprop and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compute the accuracy
        _, predicted = torch.max(outputs.data, 1)
        correct_count = (predicted == labels).sum().item()

        # Update average meters
        losses.update(loss.item(), images.size(0))
        top1.update(100.0 * correct_count / labels.size(0), labels.size(0))

        total_loss += loss.item()
        total += labels.size(0)
        correct += correct_count

        # Logging for each iteration
        if batch_idx % 10 == 0:  # You can adjust the frequency of logging
            progress_description = (
                f'Epoch: [{epoch}][{batch_idx}/{len(train_loader)}]\t'
                f'Loss {losses.val:.4f} ({losses.avg:.4f})\t'
                f'Acc {top1.val:.2f} ({top1.avg:.2f})')
            progress_bar.set_description(progress_description)

    avg_loss = total_loss / len(train_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy


defining train and test funcs...


In [24]:
def validate_model(model, cv_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation
        for images, labels in cv_loader:
            images, labels = images.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(cv_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy


In [25]:
def train_and_validate(model, train_loader, cv_loader, criterion, optimizer, device, num_epochs):
    for epoch in range(num_epochs):
        # Train the model for one epoch
        train_loss, train_accuracy = train_model(model, train_loader, criterion, optimizer, device, epoch)
        
        # Validate the model on the cross-validation set
        cv_loss, cv_accuracy = validate_model(model, cv_loader, criterion, device)
        
        # Print epoch summary
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print(f'Training Loss: {train_loss:.4f} | Training Accuracy: {train_accuracy:.2f}%')
        print(f'CV Loss: {cv_loss:.4f} | CV Accuracy: {cv_accuracy:.2f}%')


In [26]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [27]:
# Assuming model, optimizer, criterion, train_loader, cv_loader, and device are already defined
num_epochs = 10
train_and_validate(model, train_loader, cv_loader, criterion, optimizer, device='mps', num_epochs=10)


                                                                                                             

KeyboardInterrupt: 

In [None]:
def test_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    all_labels = []
    all_predictions = []

    progress_bar = tqdm(test_loader, desc='Testing', leave=False)
    with torch.no_grad():
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

            # Update the progress bar with the current loss
            progress_bar.set_description(f"Test Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(test_loader)
    accuracy = 100 * correct / total
    f1 = f1_score(all_labels, all_predictions, average='binary')  # 'binary' for binary classification

    return avg_loss, accuracy, f1
print("test and training created !")

In [None]:
num_epochs = 10
print("training begins...")
for epoch in range(num_epochs):
    train_loss, train_accuracy = train_model(model, train_loader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')


test_loss, test_accuracy, test_f1 = test_model(model, test_loader, criterion, device)
print(f'Final Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%, Test F1 Score: {test_f1:.2f}')

#print("saving model...")
# torch.save(model.state_dict(), '/models/melanoma_classification_model.pth')