In [1]:
#!pip3 install torch torchvision torchaudio
#!pip3 install -U scikit-learn scipy matplotlib

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision
from torchvision import datasets, models, transforms
import torch.nn.functional as nnF
from sklearn import metrics as skl_metrics
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split
import pandas as pd
import numpy as np
import numpy.typing as npt
import csv
import os
import time
import copy
import glob
import json
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

#!pip install grad-cam
#from pytorch_grad_cam import GradCAM, EigenCAM, FullGrad
#from pytorch_grad_cam.utils.image import show_cam_on_image

## This allows inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware
import torch.backends.cudnn as cudnn
cudnn.benchmark = True

In [24]:
# Mount Google Drive so that paths under /content/drive are reachable.
# `google.colab` only exists inside Google Colab.
# NOTE(review): this cell shows execution count 24 while its neighbours show 1 and 3,
# i.e. it was last run out of order; confirm the notebook survives Restart & Run All.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
## General parameters
# Module-level flags; neither is read by the code visible in this part of the notebook.
verbose = False
check = False

# Seed-like value for the data splits.
# NOTE(review): the ToxoDataContainer call visible further down does not pass it on.
random_state = 42

# Root folder of the image dataset (absolute Colab/Drive path; the previous location is kept commented out)
#toxoFolderPath = '/data/raw/Toxoplasmosis/classification'
toxoFolderPath = '/content/drive/MyDrive/REU/mini_eye_data'

## Execution parameters
num_epochs = 100     # Epochs to be trained
#lr=0.001            # Learning rate for the training
lr=1e-3
#momentum=0.9        # Momentum for the training
returned='best'     # Model to be returned in the training. ['last' (default), 'best']


In [4]:
class MyDataset(Dataset):
    """In-memory dataset that pairs every data entry with its target.

    Attributes
    ----------
    data : numpy.ndarray
        Array containing the data of the dataset
    targets : numpy.ndarray
        Array containing the targets associated with the data
    """
    def __init__(self, data:npt.NDArray, targets:npt.NDArray):
        self.data, self.targets = data, targets

    def __len__(self):
        """Number of data points, i.e. the size of the first axis of ``data``
        """
        n_points = self.data.shape[0]
        return n_points

    def __getitem__(self, idx:int):
        """Returns the ``(data, target)`` pair stored at position ``idx``
        """
        return self.data[idx], self.targets[idx]

class MyLazyImageDataset(Dataset):
    """Image dataset that opens each image only when it is requested.

    Only the image paths are kept; the file is opened and transformed on every
    ``__getitem__`` call, saving system memory at the cost of time.

    Attributes
    ----------
    data : numpy.ndarray
        Array containing the paths of the images
    targets : numpy.ndarray
        Array containing the targets associated with the data
    transformations : torchvision.transforms
        Transformations to be applied to the images.

    """
    def __init__(self, data:npt.NDArray, targets:npt.NDArray, transformations:torchvision.transforms):
        self.data, self.targets = data, targets
        self.transformations = transformations

    def __len__(self):
        """Number of data points, i.e. the size of the first axis of ``data``
        """
        n_points = self.data.shape[0]
        return n_points

    def __getitem__(self, idx):
        """Opens the image at position ``idx``, transforms it and returns it with its target
        """
        image = Image.open(self.data[idx])
        return self.transformations(image), self.targets[idx]


class DataContainer:
    """Holds a dataset together with its train / final-validation / test / k-fold partition.

    The partition is stored as index arrays over ``data``. The accessors below return
    the matching raw arrays, ``Dataset`` objects or ``DataLoader`` objects.

    Attributes
    ----------
    data : numpy.ndarray
        Array containing the data of the dataset
    targets : numpy.ndarray
        Array containing the targets associated with the data
    train_index, final_val_index, test_index : numpy.ndarray or list
        Positions (in ``data``) of each subset. A disabled split is stored as ``[]``.
    kfold_indexes : list[tuple[numpy.ndarray, numpy.ndarray]]
        One ``(train, validation)`` pair per fold; positions are relative to ``train_index``.
    """
    # Class-level defaults; every instance replaces them in __init__.
    data = []
    targets = []

    train_index = []
    final_val_index = []
    test_index = []
    kfold_indexes = []

    def __init__(self, 
                 data:npt.NDArray, 
                 targets:npt.NDArray, 
                 split_test:bool=True, 
                 test_size:float=0.1, 
                 split_final_val:bool=True, 
                 final_val_size:float=0.1, 
                 split_kfold:bool=True, 
                 k_splits:int=5, 
                 random_state:int=None):
        """Stores the data and computes the requested stratified splits.

        Args
        ----------
        data, targets : numpy.ndarray
            Examples and their labels (same length on the first axis).
        split_test, test_size
            Whether to hold out a test subset, and its size as a fraction of the full data.
        split_final_val, final_val_size
            Whether to hold out a final-validation subset, and its size as a fraction of the FULL data.
        split_kfold, k_splits
            Whether to build stratified k-fold indexes over the remaining training subset.
        random_state : int
            Seed passed to every scikit-learn splitter.
        """
        self.data = data
        self.targets = targets
        self.split_test=split_test
        self.test_size=test_size
        self.split_final_val=split_final_val
        self.final_val_size=final_val_size
        self.split_kfold=split_kfold
        self.k_splits = k_splits
        self.random_state = random_state

        n_examples = self.data.shape[0]

        # Generate test indexes over the dataset
        if self.split_test:
            test_len=round(n_examples * self.test_size)
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_len, random_state=self.random_state)
            self.train_index, self.test_index = next(sss.split(self.data, self.targets))
        else:
            self.train_index, self.test_index = np.array(range(0, n_examples)), []
        
        # Generate final-validation indexes over the dataset.
        # Intersection between training and final-validation must be empty
        if self.split_final_val:
            # Number of examples to be separated, referred to the original data size
            final_val_len=round(n_examples * self.final_val_size)
            sss = StratifiedShuffleSplit(n_splits=1, test_size=final_val_len, random_state=self.random_state)
            # The final_val is separated from the data minus the test subset.
            train_pos, final_val_pos = next(sss.split(self.data[self.train_index], self.targets[self.train_index]))
            # The positions obtained refer to the training subset; map them back to the original dataset
            self.final_val_index = self.train_index[final_val_pos]
            self.train_index = self.train_index[train_pos]
        else:
            self.final_val_index = []

        # Generate train-validation k-fold indexes over the training subset
        if self.split_kfold:
            strtfdKFold = StratifiedKFold(n_splits=self.k_splits, shuffle=True, random_state=self.random_state)
            self.kfold_indexes = list(strtfdKFold.split(self.data[self.train_index], self.targets[self.train_index]))
        else:
            self.kfold_indexes = []

    @staticmethod
    def _is_empty(indexes) -> bool:
        """True when ``indexes`` is None or has no elements (works for lists and arrays alike)."""
        return indexes is None or len(indexes) == 0

    @staticmethod
    def _make_dataloader(dataset:Dataset, batch_size:int, shuffle:bool, num_workers:int) -> DataLoader:
        """Single place where every DataLoader of this container is built."""
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
    
    # This function is needed for the possible situation in which a lazy kind of dataset may be needed 
    # (for instance for huge image datasets that need lazy load due to memory constraints)
    def getDataset_(self, data:npt.NDArray, targets:npt.NDArray):
        """Wraps ``data``/``targets`` in a Dataset. Subclasses override it to change the Dataset class."""
        return MyDataset(data, targets)
    

    def save_partition (self, file_path:str):
        """Saves the partition of the dataset in the given file path
        """
        toSave = (self.train_index, self.final_val_index, self.test_index, self.kfold_indexes)
        with open(file_path, 'wb') as file:
            pd.to_pickle(toSave, file)
            print(f'Object successfully saved to "{file_path}"')

    def load_partition (self, file_path:str):
        """Loads the partition of the dataset from the given file path

        NOTE(review): this unpickles the file; only load partition files you created
        yourself, since unpickling untrusted data can execute arbitrary code.
        """
        self.train_index, self.final_val_index, self.test_index, self.kfold_indexes = pd.read_pickle(file_path)


    ############### Checks ###############
    
    def check_intersection (self, verbose:bool=False):
        """Method that returns if there is any element of one subset (train, final_val, test) in the others.
        If verbose is True, prints tuple (test_in_train, val_in_train, val_in_test)
        
        Returns
        ----------
            True if the intersections are 0, False otherwise
        """
        # intersect1d gives the common elements directly, avoiding one linear scan per element.
        test_in_train = int(np.intersect1d(self.test_index, self.train_index).size)
        val_in_train = int(np.intersect1d(self.final_val_index, self.train_index).size)
        val_in_test = int(np.intersect1d(self.final_val_index, self.test_index).size)
        if verbose: print (test_in_train, val_in_train, val_in_test)
        return test_in_train + val_in_train + val_in_test == 0
    
    def check_folds (self, verbose:bool=False):
        """Method that returns if the folds were correctly formed: right size AND validation not intersecting AND sum(validation) == training
        If verbose is True, prints tuple (size_ok, val_intersection_ok, all_val_eq_training_ok)

        Returns
        ----------    
            True if the three conditions hold, False otherwise

        Raises
        ----------
        FileNotFoundError
            If the container has no k-fold indexes.
        """
        if self._is_empty(self.kfold_indexes):
            raise FileNotFoundError('The controller was created without kfold splitting')
        n_train = len(self.train_index)
        size_ok = True
        seen_val = set()
        val_intersection = 0
        for (train, val) in self.kfold_indexes:
            if len(train) + len(val) != n_train:
                size_ok = False
            for v in val:
                position = int(v)
                if position in seen_val:
                    val_intersection += 1
                else:
                    seen_val.add(position)
        val_intersection_ok = val_intersection==0
        val_acum_ok = len(seen_val)==n_train
        if verbose: print (size_ok, val_intersection_ok, val_acum_ok)
        return size_ok and val_intersection_ok and val_acum_ok
        
    ############### Data ###############
    
    def get_data(self):
        """Returns the full ``(data, targets)`` pair."""
        return (self.data, self.targets)

    def get_full_train(self):
        """Returns ``(data, targets)`` restricted to the training subset."""
        return (self.data[self.train_index], self.targets[self.train_index])

    def get_final_val(self):
        """Returns ``(data, targets)`` of the final-validation subset.

        Raises FileNotFoundError when that subset is empty.
        """
        # len() (not .shape) so that the [] stored for a disabled split is handled too
        if self._is_empty(self.final_val_index):
            raise FileNotFoundError('The controller was created without final_val splitting')
        return (self.data[self.final_val_index], self.targets[self.final_val_index])
    
    def get_test(self):
        """Returns ``(data, targets)`` of the test subset.

        Raises FileNotFoundError when that subset is empty.
        """
        if self._is_empty(self.test_index):
            raise FileNotFoundError('The controller was created without test splitting')
        return (self.data[self.test_index], self.targets[self.test_index])
    
    ############### Indexes ###############
    
    def get_fold_indexes (self, fold:int) -> tuple[list[int], list[int]]:
        """Returns the ``(train, validation)`` indexes of ``fold`` referred to the original dataset.

        Raises FileNotFoundError when the container has no k-fold indexes.
        """
        if self._is_empty(self.kfold_indexes):
            raise FileNotFoundError('The controller was created without kfold splitting')
        fold_train_indexes, fold_val_indexes = self.kfold_indexes[fold]
        # Fold positions are relative to the training subset; translate them to dataset positions
        train_indexes = self.train_index[fold_train_indexes]
        val_indexes = self.train_index[fold_val_indexes]
        return (train_indexes, val_indexes)

    ############### Dataset ###############
    
    def get_data_dataset(self) -> Dataset:
        """Dataset over the full data."""
        data, targets = self.get_data()
        return self.getDataset_(data, targets)

    def get_full_train_dataset(self) -> Dataset:
        """Dataset over the training subset."""
        train_data, train_targets = self.get_full_train()
        return self.getDataset_(train_data, train_targets)

    def get_final_val_dataset(self) -> Dataset:
        """Dataset over the final-validation subset."""
        final_val_data, final_val_targets = self.get_final_val()
        return self.getDataset_(final_val_data, final_val_targets)
    
    def get_test_dataset(self) -> Dataset:
        """Dataset over the test subset."""
        test_data, test_targets = self.get_test()
        return self.getDataset_(test_data, test_targets)
    
    def get_fold_datasets (self, fold:int) -> tuple[Dataset, Dataset]:
        """``(train, validation)`` datasets of the given fold."""
        train_indexes, val_indexes = self.get_fold_indexes(fold)
        train_dataset = self.getDataset_(self.data[train_indexes], self.targets[train_indexes])
        val_dataset = self.getDataset_(self.data[val_indexes], self.targets[val_indexes])
        return (train_dataset, val_dataset)
    
    ############### Dataloader ###############
    
    def get_data_dataloader(self, batch_size:int=32, shuffle:bool=False, num_workers:int=2) -> DataLoader:
        """Returns a dataloader with the full data.

        Args
        ----------
        batch_size : int
            Size of the batch. Optional. Default 32
        shuffle : bool
            If True, the order of the examples will be random. Optional. Default False
        num_workers: int
            Number of process to use. Optional. Default 2
        """
        return self._make_dataloader(self.get_data_dataset(), batch_size, shuffle, num_workers)

    def get_full_train_dataloader(self, batch_size:int=32, shuffle:bool=False, num_workers:int=2) -> DataLoader:
        """Returns a dataloader with the full training subset.

        Args
        ----------
        batch_size : int
            Size of the batch. Optional. Default 32
        shuffle : bool
            If True, the order of the examples will be random. Optional. Default False
        num_workers: int
            Number of process to use. Optional. Default 2
        """
        return self._make_dataloader(self.get_full_train_dataset(), batch_size, shuffle, num_workers)

    def get_final_val_dataloader(self, batch_size:int=32, shuffle:bool=False, num_workers:int=2) -> DataLoader:
        """Returns a dataloader with the final validation subset.

        Args
        ----------
        batch_size : int
            Size of the batch. Optional. Default 32
        shuffle : bool
            If True, the order of the examples will be random. Optional. Default False
        num_workers: int
            Number of process to use. Optional. Default 2
        """
        return self._make_dataloader(self.get_final_val_dataset(), batch_size, shuffle, num_workers)
    
    def get_test_dataloader(self, batch_size:int=32, shuffle:bool=True, num_workers:int=2) -> DataLoader:
        """Returns a dataloader with the test subset.

        Args
        ----------
        batch_size : int
            Size of the batch. Optional. Default 32
        shuffle : bool
            If True, the order of the examples will be random. Optional. Default True
            (unlike the other single-subset dataloaders of this class)
        num_workers: int
            Number of process to use. Optional. Default 2
        """
        return self._make_dataloader(self.get_test_dataset(), batch_size, shuffle, num_workers)

    def get_fold_dataloaders(self, fold:int, batch_size:int=32, shuffle:bool=True, num_workers:int=2) -> tuple[DataLoader, DataLoader]:
        """Returns the ``(train, validation)`` dataloaders of the given fold.

        The same ``batch_size``, ``shuffle`` and ``num_workers`` are applied to both
        loaders, so with the default the validation loader is shuffled as well.
        """
        train_dataset, val_dataset = self.get_fold_datasets(fold)
        train_dataloader = self._make_dataloader(train_dataset, batch_size, shuffle, num_workers)
        val_dataloader = self._make_dataloader(val_dataset, batch_size, shuffle, num_workers)
        return (train_dataloader, val_dataloader)


class ImageFolderDataContainer(DataContainer):
    """Class to process the extraction of an image dataset from a folder path.
    The label of an image is the index of the first entry of pathLabelsList that
    appears as a substring of the image path.
    Extends DataContainer.
    Allows data splitting plus dataset and dataloader creation.

    Attributes
    ----------
    data : numpy.ndarray
        Array containing the images, or the image paths when the container is lazy
    targets : numpy.ndarray
        Array containing the targets associated with the data
    filepaths: numpy.ndarray[str]
        Array with the paths of the images
    lazy : bool
        If True only the paths are stored and images are opened on demand
    transformations : torchvision.transforms
        Transformations applied to every opened image (``ToTensor()`` when None is given)
    """
    # Class-level default; replaced by extract_data.
    filepaths = []

    def __init__(self, 
                 folderPath:str,
                 extension:str,
                 pathLabelsList: list[str],
                 lazy:bool=False, 
                 split_test:bool=True, 
                 test_size:float=0.1, 
                 split_final_val:bool=True, 
                 final_val_size:float=0.1, 
                 split_kfold:bool=True, 
                 k_splits:int=5, 
                 random_state:int=None, 
                 transformations:torchvision.transforms=None):
        self.lazy=lazy
        self.transformations = transformations
        if self.transformations is None:
            self.transformations = transforms.ToTensor()
        # lazy/transformations must be set before extract_data, which reads both
        data, labels = self.extract_data(folderPath, extension, pathLabelsList)
        super().__init__(data=data, 
                         targets=labels, 
                         split_test=split_test, 
                         test_size=test_size, 
                         split_final_val=split_final_val, 
                         final_val_size=final_val_size, 
                         split_kfold=split_kfold, 
                         k_splits=k_splits, 
                         random_state=random_state)

    def _load_transformed(self, fpath:str) -> npt.NDArray:
        """Opens one image, applies the transformations and returns the result as an array.

        The file is opened in a ``with`` block so its handle is released right away.
        """
        with Image.open(fpath) as img:
            return np.asarray(self.transformations(img))

    def extract_data (self, folderPath:str, extension:str, pathLabelsList: list[str]) -> tuple[npt.NDArray, npt.NDArray]:
        """Function that extracts the data from the given path in a recursive way.

        Files ending in the lower-case or the upper-case form of ``extension`` are taken
        and sorted by file name.

        Returns
        ----------
        (data, labels) : tuple of numpy.ndarray
            ``data`` holds the transformed images, or the paths when lazy.

        Raises
        ----------
        ValueError
            If a file path contains none of the entries of ``pathLabelsList``.
        """
        # Take the paths of all images in the given folder and subfolders
        found = glob.glob(folderPath + "/**/*" + extension.lower(), recursive=True)
        found.extend(glob.glob(folderPath + "/**/*" + extension.upper(), recursive=True))
        # Both patterns can hit the same file (caseless extension, case-insensitive file system):
        # drop repetitions keeping first-seen order, then sort by file name
        filepaths = sorted(dict.fromkeys(found), key=lambda s: os.path.split(s)[-1])

        # Extract the labels from the paths. Done before reading any image so that an
        # unlabelled file fails fast and can never shift the labels of the files after it.
        labels=[]
        for fpath in filepaths:
            label = next((i for i, name in enumerate(pathLabelsList) if name in fpath), None)
            if label is None:
                raise ValueError(f'No label of {pathLabelsList} found in path "{fpath}"')
            labels.append(label)

        # Read all images unless the container is lazy
        images = filepaths
        if not self.lazy:
            images = [self._load_transformed(fpath) for fpath in filepaths]

        # Store filepaths, and return images and labels
        self.filepaths = np.asarray(filepaths)
        data = np.asarray(images)
        labels = np.asarray(labels)

        return data, labels
    
    def getDataset_(self, data:npt.NDArray, targets:npt.NDArray) -> Dataset:
        """Function intended for huge image datasets that need lazy load due to memory constraints, instantiates a different Dataset class

        Args
        ----------
        data: : numpy.ndarray
            Array of images or paths of images depending of if it has been created as lazy or not
        targets : numpy.ndarray
            Array of target values of the data
            
        Returns
        ----------
        Dataset:
            Implementation of the class torch.utils.data.Dataset. It will have lazy loading if the controller has been instantiated as such.
        """
        if self.lazy:
            return MyLazyImageDataset(data, targets, self.transformations)
        return MyDataset(data, targets)
    
    
    ############### Image paths ###############

    def get_data_paths(self):
        """Paths of every image, in the same order as the data."""
        return self.filepaths

    def get_full_train_paths(self):
        """Paths of the images of the training subset."""
        return self.filepaths[self.train_index]

    def get_final_val_paths(self):
        """Paths of the final-validation images. Raises FileNotFoundError when that subset is empty."""
        # len() (not .shape) so that the [] stored for a disabled split is handled too
        if self.final_val_index is None or len(self.final_val_index) == 0:
            raise FileNotFoundError('The controller was created without final_val splitting')
        return self.filepaths[self.final_val_index]
    
    def get_test_paths(self):
        """Paths of the test images. Raises FileNotFoundError when that subset is empty."""
        if self.test_index is None or len(self.test_index) == 0:
            raise FileNotFoundError('The controller was created without test splitting')
        return self.filepaths[self.test_index]
    
    def get_fold_paths (self, fold:int):
        """``(train, validation)`` image paths of the given fold."""
        train_indexes, val_indexes = self.get_fold_indexes(fold)
        train_paths = self.filepaths[train_indexes]
        val_paths = self.filepaths[val_indexes]
        return (train_paths, val_paths)

    
    ############### Save images ###############

    def save_transformed_images (self, indexes=[0], folder_path="", base_name="", extension=".png", verbose:bool=False):
        """Saves a plot of the transformed image of every given index.

        Args
        ----------
        indexes : iterable of int
            Positions (in the full data) of the images to save. It is only read, never modified.
        folder_path : str
            Destination folder, created if missing. "" saves in the current directory.
        base_name : str
            Prefix added to the original file name.
        extension : str
            Extension (and thus format) of the saved plot.
        verbose : bool
            If True, prints the destination before and after saving.
        """
        # os.makedirs("") raises, so an empty folder_path means "current directory"
        if folder_path:
            os.makedirs(folder_path, exist_ok=True)

        for index in indexes:
            filename, _ = os.path.splitext(os.path.split(self.filepaths[index])[-1])
            new_file_path = os.path.join(folder_path, base_name + filename + extension)
            
            if not self.lazy:
                img = self.data[index]
            else:
                img = self._load_transformed(self.data[index])

            # assumes img is channel-first (C, H, W), as produced by ToTensor — TODO confirm for custom transformations
            plt.imshow(img.transpose((1, 2, 0)))
            try:
                if verbose:
                    print ("saving image to to " + new_file_path)
                plt.savefig(new_file_path, bbox_inches='tight')
                if verbose:
                    print ("saved plot to " + new_file_path)
            finally:
                # Always release the figure, even when saving fails
                plt.close()


class BreakHisDataContainer(ImageFolderDataContainer):
    """Class for loading the BreakHis dataset from a Path. Extends ImageFolderDataContainer.
    The labels are extracted from the image path with 'benign' = 0  and 'malignant' = 1

    Attributes
    ----------
    magnification : str
        Substring that a path must contain to be loaded; "ALL" disables the filter.
    """
    pathLabelsList = ['benign', 'malignant']
    extension = ".png"

    def __init__(self, 
                 folderPath:str,
                 lazy:bool=False, 
                 magnification:str="400X", 
                 split_test:bool=True, 
                 test_size:float=0.1, 
                 split_final_val:bool=True, 
                 final_val_size:float=0.1, 
                 split_kfold:bool=True, 
                 k_splits:int=5, 
                 random_state:int=None, 
                 transformations:torchvision.transforms=None):
        # Must be set before super().__init__, which calls extract_data
        self.magnification=magnification
        super().__init__(folderPath=folderPath,
                         extension=self.extension,
                         pathLabelsList=self.pathLabelsList,
                         lazy=lazy,
                         split_test=split_test, 
                         test_size=test_size, 
                         split_final_val=split_final_val, 
                         final_val_size=final_val_size, 
                         split_kfold=split_kfold, 
                         k_splits=k_splits, 
                         random_state=random_state, 
                         transformations=transformations)
    
    def extract_data (self, folderPath:str, extension:str, pathLabelsList: list[str]) -> tuple[npt.NDArray, npt.NDArray]:  
        """Collects the images of the selected magnification and their labels.

        Returns
        ----------
        (data, labels) : tuple of numpy.ndarray
            ``data`` holds the transformed images, or the paths when lazy.

        Raises
        ----------
        ValueError
            If a file path contains none of the entries of ``pathLabelsList``.
        """
        # Take the paths of all images for that magnification in the given folder and subfolders and sort them by file name
        filepaths = glob.glob(folderPath + "/**/*" + extension, recursive=True)
        if self.magnification != "ALL":
            filepaths = [fpath for fpath in filepaths if self.magnification in fpath]
        filepaths = sorted(filepaths, key=lambda s: os.path.split(s)[-1])

        # Extract the labels from the paths. Done before reading any image so that an
        # unlabelled file fails fast and can never shift the labels of the files after it.
        labels=[]
        for fpath in filepaths:
            label = next((i for i, name in enumerate(pathLabelsList) if name in fpath), None)
            if label is None:
                raise ValueError(f'No label of {pathLabelsList} found in path "{fpath}"')
            labels.append(label)

        # Read all images unless the container is lazy
        images = filepaths
        if not self.lazy:
            images = []
            for fpath in filepaths:
                # "with" releases the file handle as soon as the image is converted
                with Image.open(fpath) as img:
                    images.append(np.asarray(self.transformations(img)))

        # Store filepaths, and return images and labels
        self.filepaths = np.asarray(filepaths)
        data = np.asarray(images)
        labels = np.asarray(labels)

        return data, labels


class BreakHisDataContainerPatientMulticlass(BreakHisDataContainer):
    """Class for loading the BreakHis dataset from a Path. Extends BreakHisDataContainer.
    The labels are extracted from the extraction ID: the text after the last "-" in the
    name of the folder two levels above each image.
    """
    def extract_data (self, folderPath:str, extension:str, pathLabelsList: list[str]) -> tuple[npt.NDArray, npt.NDArray]:     
        """Collects the images of the selected magnification, labelled by extraction ID.

        ``pathLabelsList`` is accepted for signature compatibility but not used here.

        Returns
        ----------
        (data, labels) : tuple of numpy.ndarray
            ``data`` holds the transformed images, or the paths when lazy. ``labels`` holds
            the position of each image's ID in the alphabetically sorted list of distinct IDs.
        """
        # Take the paths of all images for that magnification in the given folder and subfolders and sort them by file name
        filepaths = glob.glob(folderPath + "/**/*" + extension, recursive=True)
        if self.magnification != "ALL":
            filepaths = [fpath for fpath in filepaths if self.magnification in fpath]
        filepaths = sorted(filepaths, key=lambda s: os.path.split(s)[-1])

        # Read all images unless the container is lazy
        images = filepaths
        if not self.lazy:
            images = []
            for fpath in filepaths:
                # "with" releases the file handle as soon as the image is converted
                with Image.open(fpath) as img:
                    images.append(np.asarray(self.transformations(img)))

        # The IDs must come from the paths collected above: self.filepaths is only assigned
        # at the end of this method, so reading it here would see an empty or stale list.
        # Separators are normalised so that the split also works with "\\" paths.
        slide_ids = [fpath.replace(os.sep, "/").split("/")[-3].split("-")[-1] for fpath in filepaths]
        # Sorted so that an ID always gets the same label; plain set order changes between runs
        label_of_id = {slide_id: position for position, slide_id in enumerate(sorted(set(slide_ids)))}
        labels = [label_of_id[slide_id] for slide_id in slide_ids]

        # Store filepaths, and return images and labels
        self.filepaths = np.asarray(filepaths)
        data = np.asarray(images)
        labels = np.asarray(labels)

        return data, labels

class AntsAndBeesDataContainer(ImageFolderDataContainer):
    """Image Folder Data Container for Hymenoptera dataset. Extends ImageFolderDataContainer.
    The labels are extracted from the image path with 'ants' = 0  and 'bees' = 1.
    Only files with the ".jpg" extension are requested from the parent class.
    """
    pathLabelsList = ['ants', 'bees']
    extension=".jpg"
    def __init__(self, 
                 folderPath:str, 
                 lazy:bool=False, 
                 split_test:bool=True, 
                 test_size=0.1, 
                 split_final_val:bool=True, 
                 final_val_size=0.1, 
                 split_kfold:bool=True, 
                 k_splits:int=5, 
                 random_state:int=None, 
                 transformations:torchvision.transforms=None):
        # Everything the caller can tune is forwarded untouched to the parent class
        forwarded = dict(
            lazy=lazy,
            split_test=split_test,
            test_size=test_size,
            split_final_val=split_final_val,
            final_val_size=final_val_size,
            split_kfold=split_kfold,
            k_splits=k_splits,
            random_state=random_state,
            transformations=transformations,
        )
        # Extension and label names are fixed by this class
        super().__init__(folderPath=folderPath,
                         extension=self.extension,
                         pathLabelsList=self.pathLabelsList,
                         **forwarded)


class ToxoDataContainer(ImageFolderDataContainer):
    """Image Folder Data Container for the eye-image dataset. Extends ImageFolderDataContainer.
    The labels are extracted from the image path with 'unhealthy' = 0  and 'healthy' = 1.
    Only files with the ".jpg" extension are requested from the parent class.
    """
    # 'unhealthy' must stay first: labels are matched as substrings of the path and
    # 'healthy' is itself a substring of 'unhealthy'.
    pathLabelsList = ['unhealthy', 'healthy']
    extension=".jpg"
    def __init__(self, 
                 folderPath:str, 
                 lazy:bool=False, 
                 split_test:bool=True, 
                 test_size=0.1, 
                 split_final_val:bool=True, 
                 final_val_size=0.1, 
                 split_kfold:bool=True, 
                 k_splits:int=5, 
                 random_state:int=None, 
                 transformations:torchvision.transforms=None):
        # Everything the caller can tune is forwarded untouched to the parent class
        forwarded = dict(
            lazy=lazy,
            split_test=split_test,
            test_size=test_size,
            split_final_val=split_final_val,
            final_val_size=final_val_size,
            split_kfold=split_kfold,
            k_splits=k_splits,
            random_state=random_state,
            transformations=transformations,
        )
        # Extension and label names are fixed by this class
        super().__init__(folderPath=folderPath,
                         extension=self.extension,
                         pathLabelsList=self.pathLabelsList,
                         **forwarded)
    

In [8]:
#################################################################################
#   The data has to have the folder structure of
#
#       unhealthy
#           |
#           |--- toxo
#           |--- retinoplasmosys
#           |--- ....
#           |--- ....
#           |--- ....
#       healthy
#           |--- normal
#
# The resulting csv will have the structure: imageName, binaryLabel, MulticlassLabel, features*
#################################################################################

def get_activation(observers, observer_name):
    """Builds a forward hook that records a layer's output.

    Every time the returned hook runs, the detached output it receives is stored in
    ``observers[observer_name]``, replacing the previous value. The hook returns
    nothing.
    """
    def _record_output(_module, _inputs, output):
        detached = output.detach()
        observers[observer_name] = detached
    return _record_output

def transform_images_to_features_csv (model, observers, observer_name, dataContainer, output_file='features.csv', sublabels=False, separator=','):
    """Runs every image of ``dataContainer`` through ``model`` and writes the observed features to a csv file.

    Each output line is: imageName, label name, sublabel name, features...
    No header is written and fields are not quoted.

    Args
    ----------
    model : callable
        Called once per image. It is expected to fill ``observers[observer_name]``
        (e.g. through a forward hook) on every call.
    observers : dict
        Dictionary where the observed activation is read from.
    observer_name : str
        Key of ``observers`` to read.
    dataContainer
        Object exposing ``get_data_dataloader``, ``get_data_paths`` and ``pathLabelsList``.
    output_file : str
        Path of the csv file to write (overwritten if it exists).
    sublabels : bool
        If True the name of the folder containing the image is written as sublabel,
        otherwise the sublabel field is left empty.
    separator : str
        Field separator; may have more than one character.
    """
    # Get Input. batch_size=1 and no shuffling keep the i-th batch aligned with the i-th path
    data_dataloader = dataContainer.get_data_dataloader(batch_size=1,shuffle=False)
    data_paths = dataContainer.get_data_paths()

    # "with" closes the file even if the model fails half way.
    # no_grad: only activations are needed, so no autograd graph has to be built.
    with open(output_file, "w") as f, torch.no_grad():
        for i, (batch, label) in enumerate(data_dataloader):

            # Full forward pass; its return value is not needed, only what it leaves in observers
            model(batch)
            features = (torch.flatten(observers[observer_name])).cpu().numpy()

            imagePath = data_paths[i]
            imageName = os.path.basename(imagePath)
            sublabel_name=''
            if sublabels:
                sublabel_name = os.path.basename(os.path.dirname(imagePath))
            # label arrives as a one-element batch; int() makes the list lookup explicit
            label_name = dataContainer.pathLabelsList[int(label)]

            # join puts the separator only between values, whatever its length
            features_string = separator.join(str(feature) for feature in features)

            f.write(separator.join([imageName, label_name, sublabel_name, features_string]) + "\n")


def features_csv_merge (input1_csv, input2_csv, output_csv, delimiter=','):
    """Merges two feature csv files that describe the same images.

    Both inputs have the line layout: id, label, sublabel, features...
    The output keeps the ids in the order of the first file, takes label and
    sublabel from the first file, and writes the features of the first file
    followed by those of the second. Blank lines in the inputs are ignored.

    Args
    ----------
    input1_csv, input2_csv : str
        Paths of the csv files to merge.
    output_csv : str
        Path of the csv file to write (overwritten if it exists).
    delimiter : str
        Field delimiter of the three files.

    Raises
    ----------
    KeyError
        If the second file has an id that the first file does not have.
    ValueError
        If an id of the first file is missing from the second file, or repeated in it.
    """
    ids=[]
    first = {}
    with open(input1_csv, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=delimiter):
            if not row:
                continue
            ids.append(row[0])
            first[row[0]] = (row[1], row[2], row[3:])

    second = {}
    with open(input2_csv, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=delimiter):
            if not row:
                continue
            if row[0] not in first:
                raise KeyError(row[0])
            if row[0] in second:
                raise ValueError(f'id "{row[0]}" appears more than once in "{input2_csv}"')
            second[row[0]] = row[3:]

    # Build every line before touching the output, so a failed merge leaves no partial file
    csv_lines = []
    for image_id in ids:
        if image_id not in second:
            raise ValueError(f'id "{image_id}" of "{input1_csv}" is missing in "{input2_csv}"')
        label, sublabel, f1 = first[image_id]
        # join puts the delimiter only between values, whatever its length
        features_string = delimiter.join(f1 + second[image_id])
        csv_lines.append(delimiter.join([image_id, label, sublabel, features_string]) + "\n")

    with open(output_csv, "w") as f:
        f.writelines(csv_lines)


def features_csv_to_tensor (input_csv, delimiter=','):
    """Reads a feature csv file into memory.

    Blank lines are ignored.

    NOTE(review): despite the name, the original body never built a tensor nor
    returned anything; it only accumulated the rows (and failed on the second row
    because of an undefined name). This version returns what was being accumulated;
    confirm whether a numeric tensor of the feature columns is what callers need.

    Args
    ----------
    input_csv : str
        Path of the csv file to read.
    delimiter : str
        Field delimiter of the file.

    Returns
    ----------
    ids : list[str]
        First field of every row, in file order.
    data : numpy.ndarray
        Array of strings with one row per csv row (all fields, first one included).
    """
    ids=[]
    rows=[]
    with open(input_csv, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=delimiter):
            if not row:
                continue
            ids.append(row[0])
            rows.append(row)
    # Built once at the end: appending to an array row by row copies it every time
    data = np.array(rows)
    return ids, data



In [9]:
# Build the image container used for feature extraction.
# crop_size is only referenced by the commented-out CenterCrop below.
crop_size = 460
input_shape = 224
# Every image is resized to input_shape x input_shape and converted to a tensor
data_transforms =  transforms.Compose([
        #transforms.CenterCrop(crop_size),
        transforms.Resize((input_shape, input_shape)),
        transforms.ToTensor()
    ])
# lazy=True: the container keeps the image paths and opens each image on demand.
# NOTE(review): random_state is not passed, so the splits differ between runs — confirm this is intended.
dataContainer = ToxoDataContainer(toxoFolderPath, transformations=data_transforms, lazy=True)
# Sanity checks of the partition (both are expected to print True)
print(dataContainer.check_intersection())
print(dataContainer.check_folds())
# With a lazy container x holds the image paths, so its shape is (number of images,)
x, y = dataContainer.get_data()
print(x.shape)

True
True
(199,)


In [10]:
# NOTE(review): this rebinds crop_size, input_shape and data_transforms, but the
# dataContainer built in the previous cell keeps the transformations it was created
# with; nothing visible in this part of the notebook uses the new data_transforms.
crop_size = 460
input_shape = 64

# Center crop to crop_size, resize to input_shape x input_shape, convert to tensor
data_transforms =  transforms.Compose([
        transforms.CenterCrop(crop_size),
        transforms.Resize((input_shape, input_shape)),
        transforms.ToTensor()
    ])

def get_densenet121_model_fc_linear (freeze=True, pretrained=True, output_features=1, num_init_features: int = 64, bn_size: int = 4, drop_rate: float = 0):
    """Build a torchvision DenseNet-121 whose classifier is a single linear layer.

    Parameters
    ----------
    freeze : bool
        When true, every parameter of the loaded network gets
        ``requires_grad = False``. The replacement head is created afterwards,
        so this loop does not touch it.
    pretrained : bool
        When true, ``DenseNet121_Weights.IMAGENET1K_V1`` is loaded; otherwise
        the network is built with ``weights=None``.
    output_features : int
        Output size of the new linear head.
    num_init_features, bn_size, drop_rate
        Accepted but not read anywhere in this function.

    Returns
    -------
    The DenseNet-121 model with its ``classifier`` replaced.
    """
    chosen_weights = models.DenseNet121_Weights.IMAGENET1K_V1 if pretrained else None
    model = models.densenet121(weights=chosen_weights)

    # Freeze the backbone before the new head exists
    if freeze:
        for backbone_param in model.parameters():
            backbone_param.requires_grad = False

    # Swap the classifier for a one-layer head of the requested width
    head_inputs = model.classifier.in_features
    model.classifier = nn.Sequential(nn.Linear(head_inputs, output_features))

    return model

In [11]:
fileName = "DenseNet121.csv"

# Model, switched to inference mode (eval() returns the module itself)
model = get_densenet121_model_fc_linear().eval()

# Forward hook on the first pooling layer. get_activation receives a dict and a key;
# presumably the hook stores the layer output under that key so it can be read back
# after each forward pass (get_activation is defined elsewhere; confirm).
observer_name = 'features'
observers = dict()
model.features.pool0.register_forward_hook(
    get_activation(observers, observer_name)
)

# Run the extraction over the data container, targeting fileName
transform_images_to_features_csv(
    model,
    observers,
    observer_name,
    dataContainer,
    output_file=fileName,
    sublabels=True,
)


Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth
100%|██████████| 30.8M/30.8M [00:00<00:00, 63.9MB/s]


In [12]:
# Pipeline for the DenseNet-161 cell: centre crop, resize to a square, convert to a tensor.
# NOTE(review): dataContainer was built in an earlier cell from the data_transforms defined
# there; rebinding the name here presumably does not change what that container applies.
# Confirm whether the container should be rebuilt with this pipeline.
crop_size = 460
input_shape = 64

data_transforms = transforms.Compose(
    [
        transforms.CenterCrop(crop_size),
        transforms.Resize((input_shape, input_shape)),
        transforms.ToTensor(),
    ]
)

def get_densenet161_model_fc_linear (freeze=True, pretrained=True, output_features=1, num_init_features: int = 64, bn_size: int = 4, drop_rate: float = 0):
    """Build a torchvision DenseNet-161 whose classifier is a single linear layer.

    Parameters
    ----------
    freeze : bool
        When true, every parameter of the loaded network gets
        ``requires_grad = False``. The replacement head is created afterwards,
        so this loop does not touch it.
    pretrained : bool
        When true, ``DenseNet161_Weights.IMAGENET1K_V1`` is loaded; otherwise
        the network is built with ``weights=None``.
    output_features : int
        Output size of the new linear head.
    num_init_features, bn_size, drop_rate
        Accepted but not read anywhere in this function.

    Returns
    -------
    The DenseNet-161 model with its ``classifier`` replaced.
    """
    chosen_weights = models.DenseNet161_Weights.IMAGENET1K_V1 if pretrained else None
    model = models.densenet161(weights=chosen_weights)

    # Freeze the backbone before the new head exists
    if freeze:
        for backbone_param in model.parameters():
            backbone_param.requires_grad = False

    # Swap the classifier for a one-layer head of the requested width
    head_inputs = model.classifier.in_features
    model.classifier = nn.Sequential(nn.Linear(head_inputs, output_features))

    return model

In [13]:
fileName = "DenseNet161.csv"

# Model, switched to inference mode (eval() returns the module itself)
model = get_densenet161_model_fc_linear().eval()

# Forward hook on the first pooling layer. get_activation receives a dict and a key;
# presumably the hook stores the layer output under that key so it can be read back
# after each forward pass (get_activation is defined elsewhere; confirm).
observer_name = 'features'
observers = dict()
model.features.pool0.register_forward_hook(
    get_activation(observers, observer_name)
)

# Run the extraction over the data container, targeting fileName
transform_images_to_features_csv(
    model,
    observers,
    observer_name,
    dataContainer,
    output_file=fileName,
    sublabels=True,
)

Downloading: "https://download.pytorch.org/models/densenet161-8d451a50.pth" to /root/.cache/torch/hub/checkpoints/densenet161-8d451a50.pth
100%|██████████| 110M/110M [00:01<00:00, 78.0MB/s]


In [14]:
# Pipeline for the DenseNet-169 cell: centre crop, resize to a square, convert to a tensor.
# NOTE(review): dataContainer was built in an earlier cell from the data_transforms defined
# there; rebinding the name here presumably does not change what that container applies.
# Confirm whether the container should be rebuilt with this pipeline.
crop_size = 460
input_shape = 64

data_transforms = transforms.Compose(
    [
        transforms.CenterCrop(crop_size),
        transforms.Resize((input_shape, input_shape)),
        transforms.ToTensor(),
    ]
)

def get_densenet169_model_fc_linear (freeze=True, pretrained=True, output_features=1, num_init_features: int = 64, bn_size: int = 4, drop_rate: float = 0):
    """Build a torchvision DenseNet-169 whose classifier is a single linear layer.

    Parameters
    ----------
    freeze : bool
        When true, every parameter of the loaded network gets
        ``requires_grad = False``. The replacement head is created afterwards,
        so this loop does not touch it.
    pretrained : bool
        When true, ``DenseNet169_Weights.IMAGENET1K_V1`` is loaded; otherwise
        the network is built with ``weights=None``.
    output_features : int
        Output size of the new linear head.
    num_init_features, bn_size, drop_rate
        Accepted but not read anywhere in this function.

    Returns
    -------
    The DenseNet-169 model with its ``classifier`` replaced.
    """
    chosen_weights = models.DenseNet169_Weights.IMAGENET1K_V1 if pretrained else None
    model = models.densenet169(weights=chosen_weights)

    # Freeze the backbone before the new head exists
    if freeze:
        for backbone_param in model.parameters():
            backbone_param.requires_grad = False

    # Swap the classifier for a one-layer head of the requested width
    head_inputs = model.classifier.in_features
    model.classifier = nn.Sequential(nn.Linear(head_inputs, output_features))

    return model

In [15]:
fileName = "DenseNet169.csv"

# Model, switched to inference mode (eval() returns the module itself)
model = get_densenet169_model_fc_linear().eval()

# Forward hook on the first pooling layer. get_activation receives a dict and a key;
# presumably the hook stores the layer output under that key so it can be read back
# after each forward pass (get_activation is defined elsewhere; confirm).
observer_name = 'features'
observers = dict()
model.features.pool0.register_forward_hook(
    get_activation(observers, observer_name)
)

# Run the extraction over the data container, targeting fileName
transform_images_to_features_csv(
    model,
    observers,
    observer_name,
    dataContainer,
    output_file=fileName,
    sublabels=True,
)

Downloading: "https://download.pytorch.org/models/densenet169-b2777c0a.pth" to /root/.cache/torch/hub/checkpoints/densenet169-b2777c0a.pth
100%|██████████| 54.7M/54.7M [00:00<00:00, 75.6MB/s]


In [16]:
# Pipeline for the DenseNet-201 cell: centre crop, resize to a square, convert to a tensor.
# NOTE(review): dataContainer was built in an earlier cell from the data_transforms defined
# there; rebinding the name here presumably does not change what that container applies.
# Confirm whether the container should be rebuilt with this pipeline.
crop_size = 460
input_shape = 64

data_transforms = transforms.Compose(
    [
        transforms.CenterCrop(crop_size),
        transforms.Resize((input_shape, input_shape)),
        transforms.ToTensor(),
    ]
)

def get_densenet201_model_fc_linear (freeze=True, pretrained=True, output_features=1, num_init_features: int = 64, bn_size: int = 4, drop_rate: float = 0):
    """Build a torchvision DenseNet-201 whose classifier is a single linear layer.

    Parameters
    ----------
    freeze : bool
        When true, every parameter of the loaded network gets
        ``requires_grad = False``. The replacement head is created afterwards,
        so this loop does not touch it.
    pretrained : bool
        When true, ``DenseNet201_Weights.IMAGENET1K_V1`` is loaded; otherwise
        the network is built with ``weights=None``.
    output_features : int
        Output size of the new linear head.
    num_init_features, bn_size, drop_rate
        Accepted but not read anywhere in this function.

    Returns
    -------
    The DenseNet-201 model with its ``classifier`` replaced.
    """
    chosen_weights = models.DenseNet201_Weights.IMAGENET1K_V1 if pretrained else None
    model = models.densenet201(weights=chosen_weights)

    # Freeze the backbone before the new head exists
    if freeze:
        for backbone_param in model.parameters():
            backbone_param.requires_grad = False

    # Swap the classifier for a one-layer head of the requested width
    head_inputs = model.classifier.in_features
    model.classifier = nn.Sequential(nn.Linear(head_inputs, output_features))

    return model

In [17]:
fileName = "DenseNet201.csv"

# Model, switched to inference mode (eval() returns the module itself)
model = get_densenet201_model_fc_linear().eval()

# Forward hook on the first pooling layer. get_activation receives a dict and a key;
# presumably the hook stores the layer output under that key so it can be read back
# after each forward pass (get_activation is defined elsewhere; confirm).
observer_name = 'features'
observers = dict()
model.features.pool0.register_forward_hook(
    get_activation(observers, observer_name)
)

# Run the extraction over the data container, targeting fileName
transform_images_to_features_csv(
    model,
    observers,
    observer_name,
    dataContainer,
    output_file=fileName,
    sublabels=True,
)

Downloading: "https://download.pytorch.org/models/densenet201-c1103571.pth" to /root/.cache/torch/hub/checkpoints/densenet201-c1103571.pth
100%|██████████| 77.4M/77.4M [00:01<00:00, 75.4MB/s]


In [None]:
# Column-wise concatenation of the per-network feature files named in the cells above.
# The last entry must match the "DenseNet201.csv" written by the DenseNet-201 cell
# (it previously read 'Densenet201', with no extension, which cannot be opened).
# NOTE(review): pd.read_csv treats the first line of each file as a header by default;
# if these files carry no header row, header=None is probably needed. Confirm against
# the function that writes them.
files = ['DenseNet121.csv', 'DenseNet161.csv', 'DenseNet169.csv', 'DenseNet201.csv']

# Read every file first and concatenate once, rather than growing the frame per file
df = pd.concat([pd.read_csv(file) for file in files], axis=1)

# Write the result a single time, after all files have been joined
df.to_csv('ConcatenatedFeatures.csv', index=False)

In [22]:
# Two-file merge: combine the DenseNet-121 and DenseNet-201 feature files into one CSV
filename1 = "DenseNet121.csv"
filename2 = "DenseNet201.csv"

output_filename = "DenseNet121+201.csv"

# The second input must be filename2; passing filename1 twice would merge the
# DenseNet-121 file with itself and leave filename2 unused.
features_csv_merge(filename1, filename2, output_filename)