In [15]:
import tensorflow as tf
from tensorflow.keras.applications import resnet50
from tensorflow.keras.applications import inception_resnet_v2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

import xgboost  as xgb
from sklearn import tree
from sklearn import neural_network
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import pandas as pd
import os


import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
def set_random_seeds(seed_value=42):
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

In [17]:
TASKS_NUMBERS = [1, 2, 3, 4, 9, 10]
# base_path = "/content/drive/MyDrive/S2/machine_learning/project/alzheimer/"
base_path = "/content/drive/MyDrive/MAIA/S2/machine_learning/project/alzheimer/"

def str2int_padded(img_nb, nb_zeros = 2):
    min_not_pad = 10**(nb_zeros-1)
    if img_nb<min_not_pad:
        img_nb = str(img_nb).zfill(nb_zeros)
    else:
        img_nb = str(img_nb)
    return img_nb

def get_path(task=1, folder = "images"):
    if task not in TASKS_NUMBERS:
        raise ValueError('Task not well selected, choose one among:', TASKS_NUMBERS)

    prefix = base_path + folder + '/TASK'
    path = prefix + str2int_padded(task)

    return path


In [18]:
def get_dataset_flow_train_eval(task, batch_size):
  set_random_seeds()  
  data_dir = get_path(task=task, folder="images_sep")
  input_shape = (299, 299, 3)

  data_augmentation = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,)
  # Load and preprocess the data
  train_data = data_augmentation.flow_from_directory(
    directory=data_dir + "/train",
    target_size=input_shape[:2],
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False)

  val_data = data_augmentation.flow_from_directory(
    directory=data_dir + "/eval",
    target_size=input_shape[:2],
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False)
  
  return train_data, val_data


In [19]:
def normalize_data(features):
    scaler = preprocessing.MinMaxScaler().fit(features) 

    features_scaled = scaler.transform(features)
    features_normalized = preprocessing.normalize(features_scaled, norm='l2')
    
    return features_normalized


In [20]:
def get_best_parameters_rf(features, labels):
    set_random_seeds()
    # Apply a stratified 10-fold cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    pipe_RF = Pipeline([('classifier', RandomForestClassifier())])

    parameters_RF={'classifier__n_estimators': [100, 200],
                   'classifier__bootstrap': [True, False], 
                   'classifier__max_depth': [10, 20, 50], 
                   'classifier__min_samples_leaf': [1, 2, 4], 
                   'classifier__min_samples_split': [2, 5, 10],
                   'classifier__max_features': ['auto', 'sqrt']}

    grid_search_RF = GridSearchCV(pipe_RF, parameters_RF, cv=cv)
    grid_search_RF.fit(features, labels)

    rf_params = grid_search_RF.best_params_
    rf_params = {key.replace('classifier__', ''): value for key, value in rf_params.items()}

    print('Best cross-validation accuracy for RandomForest:', grid_search_RF.best_score_)

    return rf_params


def get_best_parameters_xgb(features, labels):
    set_random_seeds()
    # Apply a stratified 10-fold cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    pipe_XGB = Pipeline([('classifier', xgb.XGBClassifier())])

    parameters_XGB={'classifier__min_child_weight' : [1, 5, 10],
                    'classifier__gamma': [0.5, 1, 1.5, 2],
                    'classifier__subsample': [0.6, 0.8, 1],
                    'classifier__colsample_bytree': [0.6, 0.8, 1],
                    'classifier__max_depth': [3, 4]}

    grid_search_XGB = GridSearchCV(pipe_XGB, parameters_XGB, cv=cv)
    grid_search_XGB.fit(features, labels)

    XGB_params = grid_search_XGB.best_params_
    XGB_params = {key.replace('classifier__', ''): value for key, value in XGB_params.items()}
    
    print('Best cross-validation accuracy for XGB:', grid_search_XGB.best_score_)

    return XGB_params

def get_best_parameters_tree(features, labels):
    set_random_seeds()
    # Apply a stratified 10-fold cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    pipe_tree = Pipeline([('classifier', tree.DecisionTreeClassifier())])

    parameters_tree={'classifier__criterion' : ['gini', 'entropy'],
                    'classifier__min_samples_split': [2, 10],
                    'classifier__max_depth': [2, 5, 10],
                    'classifier__min_samples_leaf': [1, 5, 10],
                    'classifier__max_leaf_nodes': [2, 5, 10]}

    grid_search_tree = GridSearchCV(pipe_tree, parameters_tree, cv=cv)
    grid_search_tree.fit(features, labels)

    tree_params = grid_search_tree.best_params_
    tree_params = {key.replace('classifier__', ''): value for key, value in tree_params.items()}
    
    print('Best cross-validation accuracy for DecisionTree:', grid_search_tree.best_score_)

    return tree_params

def get_best_parameters_svm(features, labels):
    set_random_seeds()
    # Apply a stratified 10-fold cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    pipe_svm = Pipeline([('classifier', SVC())])

    parameters_svm={'classifier__C' : [0.1, 1, 10, 100],
                    'classifier__gamma': [1, 0.1, 0.01, 0.001],
                    'classifier__kernel': ['rbf'],
                    'classifier__class_weight': ['balanced', None]}

    grid_search_svm = GridSearchCV(pipe_svm, parameters_svm, cv=cv)
    grid_search_svm.fit(features, labels)

    svm_params = grid_search_svm.best_params_
    svm_params = {key.replace('classifier__', ''): value for key, value in svm_params.items()}
    
    print('Best cross-validation accuracy for SVM:', grid_search_svm.best_score_)

    return svm_params

def get_best_parameters_mlp(features, labels):
    set_random_seeds()
    # Apply a stratified 10-fold cross validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    pipe_mlp = Pipeline([('classifier', neural_network.MLPClassifier())])

    parameters_mlp={'classifier__hidden_layer_sizes' : [50, 100, 200],
                    'classifier__activation': ['tanh', 'relu'],
                    'classifier__solver': ['ibfgs', 'sgd'],
                    'classifier__alpha': [0.0001, 0.05],
                    'classifier__learning_rate': ['constant', 'adaptative']}

    grid_search_mlp = GridSearchCV(pipe_mlp, parameters_mlp, cv=cv)
    grid_search_mlp.fit(features, labels)

    mlp_params = grid_search_mlp.best_params_
    mlp_params = {key.replace('classifier__', ''): value for key, value in mlp_params.items()}
    
    print('Best cross-validation accuracy for MLP:', grid_search_mlp.best_score_)

    return mlp_params

In [21]:
def RF_classifier(df, num, train, y_train, val, y_val, **rf_params):
    set_random_seeds()
    rf = RandomForestClassifier(random_state=42, **rf_params)
    rf.fit(train, y_train)
    y_pred = rf.predict(val)
    
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    df.loc['Accuracy', (num, 'RandomForest')] = accuracy.max()
    df.loc['F1-Score', (num, 'RandomForest')] = f1.max()
    df.loc['Precision', (num, 'RandomForest')] = precision.max()
    df.loc['Recall', (num, 'RandomForest')] = recall.max()

    return df


In [22]:
def XGB_classifier(df, num, train, y_train, val, y_val, **xgb_params): 
    set_random_seeds()
    XGB = xgb.XGBClassifier(**xgb_params)
    XGB.fit(train, y_train)
    y_pred = XGB.predict(val)
    
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    df.loc['Accuracy', (num, 'XGBoost')] = accuracy.max()
    df.loc['F1-Score', (num, 'XGBoost')] = f1.max()
    df.loc['Precision', (num, 'XGBoost')] = precision.max()
    df.loc['Recall', (num, 'XGBoost')] = recall.max()

    return df
    

In [23]:
def TREE_classifier(df, num, train, y_train, val, y_val, **tree_params): 
    set_random_seeds()
    dt = tree.DecisionTreeClassifier(**tree_params)
    dt.fit(train, y_train)
    y_pred = dt.predict(val)
    
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    df.loc['Accuracy', (num, 'DecisionTree')] = accuracy.max()
    df.loc['F1-Score', (num, 'DecisionTree')] = f1.max()
    df.loc['Precision', (num, 'DecisionTree')] = precision.max()
    df.loc['Recall', (num, 'DecisionTree')] = recall.max()

    return df

In [24]:
def SVM_classifier(df, num, train, y_train, val, y_val, **svm_params): 
    set_random_seeds()
    SVM = SVC(**svm_params)
    SVM.fit(train, y_train)
    y_pred = SVM.predict(val)
    
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    df.loc['Accuracy', (num, 'SVM')] = accuracy.max()
    df.loc['F1-Score', (num, 'SVM')] = f1.max()
    df.loc['Precision', (num, 'SVM')] = precision.max()
    df.loc['Recall', (num, 'SVM')] = recall.max()

    return df

In [25]:
def MLP_classifier(df, num, train, y_train, val, y_val, **mlp_params): 
    set_random_seeds()
    MLP = neural_network.MLPClassifier(**mlp_params)
    MLP.fit(train, y_train)
    y_pred = MLP.predict(val)
    
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    df.loc['Accuracy', (num, 'MLP')] = accuracy.max()
    df.loc['F1-Score', (num, 'MLP')] = f1.max()
    df.loc['Precision', (num, 'MLP')] = precision.max()
    df.loc['Recall', (num, 'MLP')] = recall.max()

    return df

In [26]:
def get_features_from_layer(model, imgs_train, imgs_val, layer=-1):
    set_random_seeds()
    layer_names = [layer.name for layer in model.layers]
    if isinstance(layer, int):
        if layer < 0 or layer > len(layer_names):
            layer = layer_names[-1]
        else:
            layer = layer_names[layer]
    elif isinstance(layer, str):
        if layer not in layer_names:
            layer = layer_names[-1]
    model_cuted = Model(inputs=model.input, outputs=model.get_layer(layer).output)
    return model_cuted.predict(imgs_train), model_cuted.predict(imgs_val)


In [27]:
def feature_extraction(features, labels, features_val):
  select = SelectKBest(score_func=f_classif, k=100)
  fit = select.fit(features, labels)

  return select.fit_transform(features , labels), select.transform(features_val)

In [28]:
set_random_seeds()

tasks = TASKS_NUMBERS
# classifiers = ['RandomForest', 'DecisionTree', 'SVM', 'XGBoost', 'MLP']
classifiers = ['DecisionTree', 'SVM', 'XGBoost', 'MLP']
columns = pd.MultiIndex.from_product([tasks, classifiers], names=['Task', 'Classifier'])
rows = ['Accuracy', 'F1-Score', 'Precision', 'Recall']
tabla = pd.DataFrame(index=rows, columns=columns)

for i in tasks:
    task = i
    print('\n\n---- TASK NUMBER ' + str(task), '----')
    folder = base_path+'mica/models/'
    extension = '.h5'

    if task == 1:
        name_model = 'task_01_vgg19_imagenet_epochs=10_nfl=2_batch=4_optimizer=adam'
    elif task == 2:
        name_model = 'task_02_inceptionV3_imagenet_epochs=10_nfl=0_batch=4_optimizer=adam'
    elif task == 3:
        name_model = 'task_03_vgg19_imagenet_epochs=8_nfl=2_batch=4_optimizer=adam'
    elif task == 4:
        name_model = 'task_04_resnet50_imagenet_epochs=10_nfl=0_batch=4_optimizer=paper'
    elif task == 9:
        name_model = 'task_09_inceptionResnetV2_imagenet_epochs=40_nfl=3_batch=4_optimizer=paper'
    elif task == 10:
        name_model = 'task_10_inceptionResnetV2_imagenet_epochs=10_nfl=1_batch=4_optimizer=adam'
    
    path = folder+name_model+extension
    model = load_model(path)

    # get the dataset
    train_data, val_data = get_dataset_flow_train_eval(task=task, batch_size=32)
    train_label = train_data.classes
    val_label = val_data.classes

    # get the features from a layer of the model
    layer_number = len(model.layers) - 4 
    features_train, features_val = get_features_from_layer(model, train_data, val_data, layer_number)

    features_train = np.array(features_train)
    num_samples_train = features_train.shape[0]
    features_train = features_train.reshape(num_samples_train, -1)
    features_train = normalize_data(features_train)

    features_val = np.array(features_val)
    num_samples_val = features_val.shape[0]
    features_val = features_val.reshape(num_samples_val, -1)
    features_val = normalize_data(features_val)

    features_train, features_val = feature_extraction(features_train, train_label, features_val)

    df = pd.DataFrame(features_train)
    df.to_csv(base_path+'mica/results/task_'+str(task)+'_features_train.csv', index=False)

    # get the best parameters for each classifier
    rf_params = get_best_parameters_rf(features_train, train_label)
    df_rf = pd.DataFrame.from_dict(rf_params, orient='index').transpose()
    df_rf.to_csv(base_path+'mica/results/task_'+str(task)+'rf_params.csv', index=False)
    print('rf')

    xgb_params = get_best_parameters_xgb(features_train, train_label)
    df_xgb = pd.DataFrame.from_dict(xgb_params, orient='index').transpose()
    df_xgb.to_csv(base_path+'mica/results/task_'+str(task)+'xgb_params.csv', index=False)
    print('xgb')

    tree_params = get_best_parameters_tree(features_train, train_label)
    df_dt = pd.DataFrame.from_dict(tree_params, orient='index').transpose()
    df_dt.to_csv(base_path+'mica/results/task_'+str(task)+'tree_params.csv', index=False)
    print('dt')

    svm_params = get_best_parameters_svm(features_train, train_label)
    df_svm = pd.DataFrame.from_dict(svm_params, orient='index').transpose()
    df_svm.to_csv(base_path+'mica/results/task_'+str(task)+'svm_params.csv', index=False)
    print('svm')

    mlp_params = get_best_parameters_mlp(features_train, train_label)
    print('mlp')
    df_mlp = pd.DataFrame.from_dict(mlp_params, orient='index').transpose()
    df_mlp.to_csv(base_path+'mica/results/task_'+str(task)+'mlp_params.csv', index=False)

    # classify the data
    tabla = RF_classifier(tabla, i, features_train, train_label, features_val, val_label, **rf_params)
    tabla = XGB_classifier(tabla, i, features_train, train_label, features_val, val_label, **xgb_params)
    tabla = TREE_classifier(tabla, i, features_train, train_label, features_val, val_label, **tree_params)
    tabla = SVM_classifier(tabla, i, features_train, train_label, features_val, val_label, **svm_params)
    tabla = MLP_classifier(tabla, i, features_train, train_label, features_val, val_label, **mlp_params)

tabla.to_csv(base_path+'mica/results/results_table.csv')

print(tabla)



---- TASK NUMBER 1 ----
Found 115 images belonging to 2 classes.
Found 50 images belonging to 2 classes.
Best cross-validation accuracy for RandomForest: 0.8522727272727272
rf
Best cross-validation accuracy for XGB: 0.8522727272727272
xgb
Best cross-validation accuracy for DecisionTree: 0.818939393939394
dt
Best cross-validation accuracy for SVM: 0.834090909090909
svm
Best cross-validation accuracy for MLP: 0.5681818181818182
mlp


---- TASK NUMBER 2 ----
Found 116 images belonging to 2 classes.
Found 50 images belonging to 2 classes.
Best cross-validation accuracy for RandomForest: 0.7583333333333334
rf
Best cross-validation accuracy for XGB: 0.7931818181818182
xgb
Best cross-validation accuracy for DecisionTree: 0.7560606060606061
dt
Best cross-validation accuracy for SVM: 0.7583333333333334
svm
Best cross-validation accuracy for MLP: 0.5257575757575758
mlp


---- TASK NUMBER 3 ----
Found 116 images belonging to 2 classes.
Found 50 images belonging to 2 classes.
Best cross-validati

In [29]:
# set_random_seeds()

# tareas = [1, 2, 9]
# clasificadores = ['RandomForest', 'DecisionTree', 'SVM', 'XGBoost', 'MLP']
# columnas = pd.MultiIndex.from_product([tareas, clasificadores], names=['Tarea', 'Clasificador'])
# filas = ['Accuracy', 'F1-Score', 'Precision', 'Recall']
# tabla = pd.DataFrame(index=filas, columns=columnas)


# for i in tareas:
#     task = i

#     data_dir = get_path(task=task)

#     resnet = resnet50.ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

#     preprocess = image.ImageDataGenerator(
#         preprocessing_function=tf.keras.applications.resnet50.preprocess_input,
#         validation_split=0.2)

#     dataset = preprocess.flow_from_directory(data_dir, target_size=(224, 224), batch_size=32)

#     layer_number = -1
#     features = get_features_from_layer(resnet, dataset, layer_number)

#     labels = dataset.classes

#     features = np.array(features)
#     num_samples = features.shape[0]
#     features = features.reshape(num_samples, -1)
#     features = normalize_data(feature_extraction(features, labels, k=20))

#     rf_params = get_best_parameters_rf(features, labels)
#     xgb_params = get_best_parameters_xgb(features, labels)
#     tree_params = get_best_parameters_tree(features, labels)
#     svm_params = get_best_parameters_svm(features, labels)
#     mlp_params = get_best_parameters_mlp(features, labels)

#     print('\n\n---- TASK NUMBER ' + str(i), '----')
#     tabla = RF_classifier(tabla, i, features, labels, **rf_params)
#     tabla = XGB_classifier(tabla, i, features, labels, **xgb_params)
#     tabla = TREE_classifier(tabla, i, features, labels, **tree_params)
#     tabla = SVM_classifier(tabla, i, features, labels, **svm_params)
#     tabla = MLP_classifier(tabla, i, features, labels, **mlp_params)


# print(tabla)



In [30]:
# set_random_seeds()

# tareas = [1, 2, 9]
# clasificadores = ['RandomForest', 'DecisionTree', 'SVM', 'XGBoost', 'MLP']
# columnas = pd.MultiIndex.from_product([tareas, clasificadores], names=['Tarea', 'Clasificador'])
# filas = ['Accuracy', 'F1-Score', 'Precision', 'Recall']
# tabla = pd.DataFrame(index=filas, columns=columnas)

# for i in [1, 2, 9]:
#     task = i

#     data_dir = get_path(task=task)

#     inception_resnet = inception_resnet_v2.InceptionResNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

#     preprocess = image.ImageDataGenerator(
#         preprocessing_function=tf.keras.applications.inception_resnet_v2.preprocess_input,
#         validation_split=0.2)

#     dataset = preprocess.flow_from_directory(data_dir, target_size=(224, 224), batch_size=32)

#     layer_number = -1
#     features = get_features_from_layer(inception_resnet, dataset, layer_number)

#     labels = dataset.classes

#     features = np.array(features)
#     num_samples = features.shape[0]
#     features = features.reshape(num_samples, -1)
#     features = normalize_data(feature_extraction(features, labels, k=20))

#     rf_params = get_best_parameters_rf(features, labels)
#     xgb_params = get_best_parameters_xgb(features, labels)
#     tree_params = get_best_parameters_tree(features, labels)
#     svm_params = get_best_parameters_svm(features, labels)
#     mlp_params = get_best_parameters_mlp(features, labels)

#     print('\n\n---- TASK NUMBER ' + str(i), '----')
#     tabla = RF_classifier(tabla, i, features, labels, **rf_params)
#     tabla = XGB_classifier(tabla, i, features, labels, **xgb_params)
#     tabla = TREE_classifier(tabla, i, features, labels, **tree_params)
#     tabla = SVM_classifier(tabla, i, features, labels, **svm_params)
#     tabla = MLP_classifier(tabla, i, features, labels, **mlp_params)


# print(tabla)