In [53]:
!pip install torchextractor



In [54]:
import torch
import sys
import numpy as np
import pickle as pkl
from os.path import join as oj
from datetime import datetime
import torch.optim as optim
import os
from torch.utils.data import TensorDataset, ConcatDataset
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, f1_score
import argparse
from PIL import Image
from tqdm import tqdm
from torch import nn
from numpy.random import randint
import torchvision.models as models
import time
import copy
import gc
import json
import pandas as pd
import torch
import torchvision.models as models
from torchvision import transforms
from PIL import Image

## Mount Google Drive and create paths for directories


In [55]:
from google.colab import drive
drive.mount("/content/drive")
dir_path = "/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224"
#dir_path = "/content/drive/MyDrive/Projects/ISIC_224"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
model_path = oj(dir_path, "models", "initial_classifier")
model_training_path = oj(model_path, "training_224")
data_path = oj(dir_path, "data")

not_cancer_path = oj(data_path, "processed", "no_cancer_224")
cancer_path = oj(data_path, "processed", "cancer_224")

### Arguments for training

In [57]:
import numpy as np
import argparse
import torch
import torchvision.models as models
import torch.nn as nn

mean = np.asarray([0.485, 0.456, 0.406])
std = np.asarray([0.229, 0.224, 0.225])

# Define arguments
class Args:
    def __init__(self):
        self.batch_size = 16
        self.epochs = 10
        self.lr = 0.00001
        self.momentum = 0.9
        self.seed = 42
        self.regularizer_rate = 0.0

args = Args()

regularizer_rate = args.regularizer_rate
num_epochs = args.epochs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(args.seed)
model = models.vgg16(pretrained=True)
model.classifier[-1] = nn.Linear(4096, 2)
model = model.to(device)
params_to_update = model.classifier.parameters()

###Clean up the image directories



*   Remove empty images
*   Remove duplicates which appear in a new folder but not the original.
*   Ensure image sizes are all 224x224



In [58]:
def clean_up_empty_files(path):
    list_files= os.listdir(path)
    num_files = len(list_files)
    for i in tqdm(range(num_files)):
        if os.path.getsize(oj(path, list_files[i])) < 100:
            os.remove(oj(path, list_files[i]))
            print("File " + str(i) + "deleted!")
'''
def clean_up_duplicates(path1, path2):
    newfiles = os.listdir(path1)
    oldfiles = os.listdir(path2)
    diff = [f for f in newfiles if f not in oldfiles]
    for i in tqdm(diff):
        os.remove(oj(path1, i))
        print("File " + str(i) + "deleted!")

def check_img_sizes(path):
    list_files= os.listdir(path)
    num_files = len(list_files)
    for i in tqdm(range(num_files)):
        im = Image.open(oj(path, list_files[i]))
        if im.width != 224 or im.height != 224:
            print(list_files[i])
'''
# clean_up_empty_files(cancer_path)
# clean_up_empty_files(not_cancer_path)

# newpath = oj(data_path, "no_cancer_224_inpainted")
# oldpath = oj(data_path, "processed", "no_cancer_224")
# clean_up_duplicates(newpath, oldpath)

# check_img_sizes(not_cancer_path)


'\ndef clean_up_duplicates(path1, path2):\n    newfiles = os.listdir(path1)\n    oldfiles = os.listdir(path2)\n    diff = [f for f in newfiles if f not in oldfiles]\n    for i in tqdm(diff):\n        os.remove(oj(path1, i))\n        print("File " + str(i) + "deleted!")\n\ndef check_img_sizes(path):\n    list_files= os.listdir(path)\n    num_files = len(list_files)\n    for i in tqdm(range(num_files)):\n        im = Image.open(oj(path, list_files[i]))\n        if im.width != 224 or im.height != 224:\n            print(list_files[i])\n'

Torch dataset class

In [59]:
class CancerDataset(torch.utils.data.Dataset):
    def __init__(self, path: str = None, is_cancer: int = None, data_files = None, labels = None):
        """
        Expects path and is_cancer both to be supplied if the relevant images all lie in the same directory and have the same class
        or a list of full filepaths and list of all labels are both supplied using data_files and labels otherwise.
        """
        if path:
            self.path = path
            self.data_files = os.listdir(self.path)
            self.is_cancer = is_cancer

        else:
            self.path = ''
            self.data_files = data_files
            self.labels = labels
            self.is_cancer = None

    def __getitem__(self, i):
        # Read in the image, convert to float between [0,1] and standardise.
        img = Image.open(oj(self.path, self.data_files[i]))
        img_array = np.asarray(img)/255.0
        img_array -= mean[None, None, :]
        img_array /= std[None, None, :]
        img.close()
        torch_img = torch.from_numpy(img_array.swapaxes(0,2).swapaxes(1,2)).float()
        # Take the global class if supplied, otherwise extract the relevant label from the list of labels.
        is_cancer = self.is_cancer if self.is_cancer is not None else self.labels[i]
        return (torch_img, is_cancer)

    def __len__(self):
        return len(self.data_files)

Functions for training

In [60]:
def gradient_sum(im, target, model, crit, device='cuda'):
    '''assume that eveything is already on cuda'''
    im.requires_grad = True
    grad_params = torch.abs(torch.autograd.grad(crit(model(im), target), im,create_graph = True)[0].sum(dim=1)).sum()
    return grad_params

def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, resume_training=False):
    since = time.time()
    # train_loss_history = []
    # train_acc_history = []
    # train_cd_history= []

    best_loss = 10.0
    patience = 3
    cur_patience = 0
    if len(os.listdir(model_training_path)) > 0 and resume_training:
        model_list = [(f, os.path.getmtime(oj(model_training_path,f))) for f in os.listdir(model_training_path) if f.endswith('.pt')]
        model_list.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place from most to least recent
        model_name = model_list[0][0]
        model.classifier.load_state_dict(torch.load(oj(model_training_path, model_name)))
        print("Model loaded!")
    for epoch in range(1, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        optimizer.step()
        model.train()  # Set model to training mode
        phase = 'train'
        running_loss = 0.0
        running_loss_cd = 0.0
        running_corrects = 0

        # Iterate over data.
        for i, (inputs, labels) in tqdm(enumerate(dataloaders[phase])):

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            # track history if only in train
            with torch.set_grad_enabled(phase == 'train'):
                # need to do calc beforehand because we do need the gradients
                if phase == 'train' and regularizer_rate !=0:
                    inputs.requires_grad = True
                    add_loss = gradient_sum(inputs, labels, model, criterion)
                    if add_loss!=0:
                        (regularizer_rate*add_loss).backward()
                        optimizer.step()
                    #print(torch.cuda.memory_allocated()/(np.power(10,9)))
                    optimizer.zero_grad()
                    running_loss_cd += add_loss.item() * inputs.size(0)

                    #inputs.require_grad = False

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                if phase == 'train':
                    (loss).backward()
                    optimizer.step()

            # statistics
            running_loss += loss.item() * inputs.size(0)

            running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss / dataset_sizes[phase]
        epoch_cd_loss = running_loss_cd / dataset_sizes[phase]

        epoch_acc = running_corrects.double() / dataset_sizes[phase]

        print('{} Loss: {:.4f} Acc: {:.4f} CD Loss : {:.4f}'.format(
            phase, epoch_loss, epoch_acc, epoch_cd_loss))

        # train_loss_history.append(epoch_loss)
        # train_cd_history.append(epoch_cd_loss)
        # train_acc_history.append(epoch_acc.item())
        torch.save(model.classifier.state_dict(), oj(model_training_path, datetime.now().strftime("%Y%m%d%H%M%S") + ".pt"))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60)
    )
    print('Best val loss: {:4f}'.format(best_loss))

    # load best model weights
    return model

Functions for evaluation

In [61]:
from sklearn.metrics import auc,average_precision_score, roc_curve,roc_auc_score,precision_recall_curve, f1_score

def get_output(model, dataset):
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=16,
                                             shuffle=False, num_workers=2)
    model = model.eval()
    y = []
    y_hat = []
    softmax= torch.nn.Softmax()
    with torch.no_grad() :
        for inputs, labels in data_loader:
            y_hat.append((labels).cpu().numpy())
            y.append(torch.nn.Softmax(dim=1)( model(inputs.cuda()))[:,1].detach().cpu().numpy()) # take the probability for cancer
    y_hat = np.concatenate( y_hat, axis=0 )
    y = np.concatenate( y, axis=0 )
    return y, y_hat # in the training set the values were switched

def get_auc_f1(model, dataset,fname = None, ):
    if fname !=None:
        with open(fname, 'rb') as f:
            weights = torch.load(f)
        if "classifier.0.weight" in weights.keys(): #for the gradient models we unfortunately saved all of the weights
            model.load_state_dict(weights)
        else:
            model.classifier.load_state_dict(weights)
        y, y_hat = get_output(model.classifier, dataset)
    else:
        y, y_hat = get_output(model, dataset)
    auc =roc_auc_score(y_hat, y)
    f1 = np.asarray([f1_score(y_hat, y > x) for x in np.linspace(0.1,1, num = 10) if (y >x).any() and (y<x).any()]).max()
    return auc, f1

### Initial Classifier Training

Combine datasets and split to train-test

In [62]:
cancer_dataset = CancerDataset(path=cancer_path, is_cancer=1)
not_cancer_dataset = CancerDataset(path=not_cancer_path, is_cancer=0)
complete_dataset = ConcatDataset((cancer_dataset, not_cancer_dataset))

num_total = len(complete_dataset)
num_train = int(0.8 * num_total)
num_test = num_total - num_train
torch.manual_seed(0);
print("num_train:", num_train)
print("num_test:", num_test)

train_dataset, test_dataset = torch.utils.data.random_split(complete_dataset, [num_train, num_test])
datasets = {'train' : train_dataset, 'test':test_dataset}
dataset_sizes = {'train' : len(train_dataset), 'test':len(test_dataset)}




dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=args.batch_size,
                                             shuffle=True, num_workers=2)
              for x in ['train', 'test']}

num_train: 9340
num_test: 2336


Record the specific files in the training/test sets.


In [63]:
def list_to_file(li, filename):
  with open(filename, 'w') as f:
    for item in li:
      f.write("%s\n" % item)

def extract_filenames(train_subset, test_subset):
  # Extract the relevant indices of the concat dataset
  train_idx, test_idx = train_subset.indices, test_subset.indices

  # Extract the filenames for the cancer_dataset and not_cancer_dataset and concatenate with their directory path.
  # Each original dataset is stored by the ConcatDataset class. So even though train_subset is a subset, the info for the whole cancer dataset is stored in train_subset.dataset.datasets[0]
  cancer_filepaths      = [oj(train_subset.dataset.datasets[0].path, file) for file in train_subset.dataset.datasets[0].data_files]
  not_cancer_filepaths  = [oj(train_subset.dataset.datasets[1].path, file) for file in train_subset.dataset.datasets[1].data_files]

  filepaths = cancer_filepaths + not_cancer_filepaths    # Append the lists together, this combined list is what the indices are based on.

  train_files = [filepaths[i] for i in train_idx]
  test_files  = [filepaths[i] for i in test_idx]

  return train_files, test_files

In [64]:
# # Call the function and get the full file paths.
train_files, test_files = extract_filenames(train_dataset, test_dataset)
list_to_file(train_files, oj(dir_path, 'models', 'train_files.txt'))   # Write the training filepaths to a text file.
list_to_file(test_files,  oj(dir_path, 'models', 'test_files.txt'))    # Write the testing filepaths to a text file.

### Weights for training

Since the classes are unbalanced, we need to account for this in the loss function while training

In [65]:
cancer_ratio = len(cancer_dataset)/len(complete_dataset)

not_cancer_ratio = 1 - cancer_ratio
cancer_weight = 1/cancer_ratio
not_cancer_weight = 1/ not_cancer_ratio
weights = np.asarray([not_cancer_weight, cancer_weight])
weights /= weights.sum()
weights = torch.tensor(weights).to(device)

criterion = nn.CrossEntropyLoss(weight = weights.double().float())

optimizer_ft = optim.SGD(params_to_update, lr=args.lr, momentum=args.momentum)

Train and save the model

In [66]:
model = train_model(model, dataloaders, criterion, optimizer_ft, num_epochs=num_epochs, resume_training=False)
pid = datetime.now().strftime('%Y%m%d%H%M%S')
torch.save(model.classifier.state_dict(),oj(dir_path, model_path, pid + ".pt"))

Epoch 1/10
----------


584it [02:24,  4.04it/s]


train Loss: 0.3922 Acc: 0.8481 CD Loss : 0.0000
Epoch 2/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2763 Acc: 0.8686 CD Loss : 0.0000
Epoch 3/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2540 Acc: 0.8703 CD Loss : 0.0000
Epoch 4/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2453 Acc: 0.8732 CD Loss : 0.0000
Epoch 5/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2409 Acc: 0.8733 CD Loss : 0.0000
Epoch 6/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2375 Acc: 0.8790 CD Loss : 0.0000
Epoch 7/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2319 Acc: 0.8816 CD Loss : 0.0000
Epoch 8/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2293 Acc: 0.8804 CD Loss : 0.0000
Epoch 9/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2278 Acc: 0.8831 CD Loss : 0.0000
Epoch 10/10
----------


584it [02:22,  4.11it/s]


train Loss: 0.2274 Acc: 0.8851 CD Loss : 0.0000
Training complete in 23m 59s
Best val loss: 10.000000


In [68]:
auc, f1 = get_auc_f1(model, test_dataset)
print("AUC: ", auc)
print("F1: ", f1)


AUC:  0.9525435191270193
F1:  0.49044585987261147


In [69]:

results_file_path = oj(dir_path, "auc_f1_224_no_malig_patch.txt")
print(results_file_path)
with open(results_file_path, 'w') as f:
    f.write('AUC: ' + str(auc) + "\n")
    f.write('F1: ' + str(f1) + "\n")

/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/auc_f1_224_no_malig_patch.txt


## Reload the model to skip retraining and test with patched images

In [70]:
cancer_patch_path = '/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/malignant_patched_224'
not_cancer_path = '/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/data/processed/no_cancer_224'

In [71]:
cancer_patch_dataset = CancerDataset(path=cancer_patch_path, is_cancer=1)
not_cancer_dataset = CancerDataset(path=not_cancer_path, is_cancer=0)
complete_patch_dataset = ConcatDataset((cancer_patch_dataset, not_cancer_dataset))

num_total = len(complete_patch_dataset)
num_train = int(0.8 * num_total)
num_test = num_total - num_train
torch.manual_seed(0);
print("num_train:", num_train)
print("num_test:", num_test)

train_patch_dataset, test_patch_dataset = torch.utils.data.random_split(complete_patch_dataset, [num_train, num_test])
datasets = {'train' : train_patch_dataset, 'test':test_patch_dataset}
dataset_sizes = {'train' : len(train_patch_dataset), 'test':len(test_patch_dataset)}

# only with malig patches
#cancer_patch_dataset = test_patch_dataset

dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=args.batch_size,
                                             shuffle=True, num_workers=2)
              for x in ['train', 'test']}

num_train: 9340
num_test: 2336


In [72]:
def list_to_file(li, filename):
  with open(filename, 'w') as f:
    for item in li:
      f.write("%s\n" % item)

def extract_filenames(train_subset, test_subset):
  # Extract the relevant indices of the concat dataset
  train_idx, test_idx = train_subset.indices, test_subset.indices

  # Extract the filenames for the cancer_dataset and not_cancer_dataset and concatenate with their directory path.
  # Each original dataset is stored by the ConcatDataset class. So even though train_subset is a subset, the info for the whole cancer dataset is stored in train_subset.dataset.datasets[0]
  cancer_filepaths      = [oj(train_subset.dataset.datasets[0].path, file) for file in train_subset.dataset.datasets[0].data_files]
  not_cancer_filepaths  = [oj(train_subset.dataset.datasets[1].path, file) for file in train_subset.dataset.datasets[1].data_files]

  filepaths = cancer_filepaths + not_cancer_filepaths    # Append the lists together, this combined list is what the indices are based on.

  train_files = [filepaths[i] for i in train_idx]
  test_files  = [filepaths[i] for i in test_idx]

  return train_files, test_files

In [73]:
# # Call the function and get the full file paths.
train_patch_files, test_patch_files = extract_filenames(train_patch_dataset, test_patch_dataset)
list_to_file(train_files, oj(dir_path, 'models', 'train_files.txt'))   # Write the training filepaths to a text file.
list_to_file(test_files,  oj(dir_path, 'models', 'test_files.txt'))    # Write the testing filepaths to a text file.

In [74]:
def get_auc_f1(model, dataset,fname = None, ):
    if fname !=None:
        with open(fname, 'rb') as f:
            weights = torch.load(f)
        if "classifier.0.weight" in weights.keys(): #for the gradient models they saved all of the weights
            model.load_state_dict(weights)
        else:
            model.classifier.load_state_dict(weights)
        y, y_hat = get_output(model.classifier, dataset)
    else:
        y, y_hat = get_output(model, dataset)
    auc =roc_auc_score(y_hat, y)
    f1 = np.asarray([f1_score(y_hat, y > x) for x in np.linspace(0.1,1, num = 10) if (y >x).any() and (y<x).any()]).max()
    return auc, f1

In [75]:
def save_predictions(model, dataset, filename):
    predictions = []
    true_labels = []

    # Iterate over the dataset
    for inputs, labels in dataset:
        inputs = inputs.unsqueeze(0)  # Add batch dimension
        inputs = inputs.to(device)  # Move data to appropriate device
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)

        # Append predictions to the list
        predictions.append(predicted.item())

        # Check if labels are integers or tensors
        if isinstance(labels, torch.Tensor):
            true_labels.append(labels.item())
        else:
            true_labels.append(labels)  # Assume labels are integers

    # Create a DataFrame to store predictions and true labels
    df = pd.DataFrame({
        'Prediction': predictions,
        'True Label': true_labels
    })

    # Save DataFrame to CSV file
    df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

In [76]:
model = models.vgg16(pretrained=True)
model.classifier[-1] = nn.Linear(4096, 2)  # Modify classifier

# Load the saved parameters into the model
saved_model_path = "/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/models/initial_classifier/20240601213813.pt"
model.classifier.load_state_dict(torch.load(saved_model_path))

# Move model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# cancer_dataset and complete_dataset are already defined
cancer_ratio = len(cancer_dataset) / len(complete_dataset)
not_cancer_ratio = 1 - cancer_ratio
cancer_weight = 1 / cancer_ratio
not_cancer_weight = 1 / not_cancer_ratio
weights = torch.tensor([not_cancer_weight, cancer_weight], device=device, dtype=torch.float)

criterion = nn.CrossEntropyLoss(weight=weights)

# Define arguments
class Args:
    def __init__(self):
        self.batch_size = 16
        self.epochs = 10
        self.lr = 0.00001
        self.momentum = 0.9
        self.seed = 42
        self.regularizer_rate = 0.0

args = Args()

regularizer_rate = args.regularizer_rate
num_epochs = args.epochs

torch.manual_seed(args.seed)

params_to_update = model.classifier.parameters()

optimizer_ft = optim.SGD(params_to_update, lr=args.lr, momentum=args.momentum)




In [77]:
auc, f1 = get_auc_f1(model, test_patch_dataset)
print("AUC: ", auc)
print("F1: ", f1)

AUC:  0.9566642722868366
F1:  0.5233644859813084


In [78]:
save_predictions(model, test_patch_dataset, '/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/models/initial_classifier/patch_predictions.csv')

Predictions saved to /content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/models/initial_classifier/patch_predictions.csv


## Load Model and extract activations from last layer

In [79]:
# Load pre-trained VGG16 model
model = models.vgg16(pretrained=True)

# Modify the classifier
model.classifier[-1] = torch.nn.Linear(4096, 2)

# Load the saved parameters into the model
saved_model_path = "/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/models/initial_classifier/20240601213813.pt"
model.classifier.load_state_dict(torch.load(saved_model_path))

# Move model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define preprocessing transforms
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def preprocess_and_extract_activations(image_path):
    # Load and preprocess the image
    image = Image.open(image_path)
    image_tensor = preprocess(image).unsqueeze(0).to(device)

    # Flatten the tensor before passing it to the linear layers
    image_tensor = model.features(image_tensor)
    image_tensor = model.avgpool(image_tensor)
    image_tensor = torch.flatten(image_tensor, 1)  # Flatten the tensor

    # Pass the tensor through the first 5 layers
    for layer_idx, layer in enumerate(model.classifier[:5], start=1):
        image_tensor = layer(image_tensor)

    # Extract activations from the sixth linear layer
    activations = model.classifier[5](image_tensor)
    activations = activations.squeeze().cpu().detach().numpy()
    #print("Activation size:", activations.shape)

    return activations

# Function to recursively traverse folders and process images
def process_images_in_folder(folder_path):
    all_activations = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(('.jpg')):
                image_path = os.path.join(root, file)
                activations = preprocess_and_extract_activations(image_path)
                if activations is not None:
                    all_activations.append(activations)
    return all_activations

# Folder path containing  images
patch_cancer_folder_path = "/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/malignant_patched_224"
no_patch_cancer_folder_path = "/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/malignant_nopatch_224"

# Extract activations for all images in the folder
wp_cancer_all_activations = process_images_in_folder(patch_cancer_folder_path)
wo_cancer_all_activations = process_images_in_folder(no_patch_cancer_folder_path)

if wp_cancer_all_activations:
    print("wp_cancer_all_activations shape:", np.vstack(wp_cancer_all_activations).shape)
else:
    print("No activations found in wp_cancer_all_activations")

if wo_cancer_all_activations:
    print("wo_cancer_all_activations shape:", np.vstack(wo_cancer_all_activations).shape)
else:
    print("No activations found in wo_cancer_all_activations")

#folder_path = "/content/drive/MyDrive/Projects/ISIC_224/malignant_nopatch_224"

# Extract activations for all images in the folder
#all_activations = process_images_in_folder(folder_path)



UnidentifiedImageError: cannot identify image file '/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/malignant_nopatch_224/ISIC_0010792.jpg'

In [None]:
#np.vstack(all_activations).shape

In [None]:
# Save activations for malignant wit patches as npy array (Sari likes npy, I like csv)
np.save('/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/cancer_test_wp_activations.npy',np.vstack(wp_cancer_all_activations))
np.save('/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/cancer_test_wo_activations.npy',np.vstack(wo_cancer_all_activations))

In [None]:
'''
import shutil
folder_path = "/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/malignant_patched_224"
files= os.listdir(folder_path)
for imgname in files:
    shutil.copyfile('/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/data/processed/cancer_224/'+imgname,'/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/malignant_nopatch_224/'+imgname)
'''

In [None]:
folder_path = "/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/malignant_patched_224"
files= os.listdir(folder_path)
num_mal_test = len(files)
patch_no_patch = np.hstack([np.zeros((1,num_mal_test)),np.ones((1,num_mal_test))])
wp = np.load('/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/cancer_test_wp_activations.npy')
wo = np.load('/content/drive/MyDrive/Masterthesis/Datasets/ISIC_224/cancer_test_wo_activations.npy')
wop_activations = np.vstack([wo,wp])

In [None]:
wop_activations.shape

In [None]:
patch_no_patch.shape

In [None]:
two_arrays = np.concatenate((patch_no_patch.T, wop_activations), axis=1) # 900x568
corr = np.corrcoef(two_arrays.T)

In [None]:
#activations_corr.shape

In [None]:
import matplotlib.pyplot as plt
activations_corr = np.abs(corr[0][1:])
_ = plt.hist(activations_corr, bins='auto')

In [None]:
print(model.classifier)


In [None]:
def calculate_correlations(original_activations, patched_activations):
    # Convert activation lists to arrays
    original_activations_array = np.array([act[0] for act in original_activations])
    patched_activations_array = np.array([act[0] for act in patched_activations])

    # Ensure activations arrays have the correct shape
    original_activations_array = original_activations_array.reshape(len(original_activations_array), -1)
    patched_activations_array = patched_activations_array.reshape(len(patched_activations_array), -1)

    # Calculate correlations
    correlations = np.corrcoef(original_activations_array.T, patched_activations_array.T)

    # Extract correlations for each neuron
    neuron_correlations = correlations[:4096, 4096:]

    return neuron_correlations


# Extract activations for original and patched MAL images
original_activations = preprocess_and_extract_activations(model, test_dataset)
patched_activations = preprocess_and_extract_activations(model, test_patch_dataset)

# Calculate correlations for each neuron
neuron_correlations = calculate_correlations(original_activations, patched_activations)

# Print correlations for the first neuron as an example
print("Correlations for the first neuron:", neuron_correlations[0])