# Import libraries


In [47]:
import torch
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from miceforest import ImputationKernel
import os 
from os import path
import pickle
from clinica.pipelines.machine_learning import algorithm, validation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier



from pytorch_tabnet.tab_model import TabNetClassifier

# Upload Data

In [None]:
# upload df_train csv file
file='AD_dataset.csv'

all_features=['T1_score', 'fdg_score','sex', 'education_level', 'apoe4', 'apoe_gen1',
       'apoe_gen2', 'age', 'MMSE', 'cdr_sb', 'cdr_global', 'adas11', 'adas13',
       'adas_memory', 'adas_language', 'adas_concentration', 'adas_praxis',
       'ravlt_immediate', 'moca', 'TMT_A', 'TMT_B', 'dsst', 'logmem_delay',
       'logmem_imm', 'adni_ventricles_vol', 'adni_hippocampus_vol',
       'adni_brain_vol', 'adni_entorhinal_vol', 'adni_fusiform_vol',
       'adni_midtemp_vol', 'adni_icv', 'adni_fdg', 'adni_pib', 'adni_av45',
       'adni_abeta', 'adni_tau', 'adni_ptau']
models = {
          # Models using only demographic and clinical data

          "base": ["sex", "education_level", "MMSE", "cdr_sb"],

          "base_logmem": ["sex", "education_level", "MMSE", "cdr_sb", "logmem_delay", "logmem_imm"],

          "base_ravlt": ["sex", "education_level", "MMSE", "cdr_sb", "ravlt_immediate"],

          "base_logmem_ravlt": ["sex", "education_level", "MMSE", "cdr_sb", "ravlt_immediate", "logmem_delay",
                                "logmem_imm"],

          "base_adas": ["sex", "education_level", "MMSE", "cdr_sb", "adas_memory", "adas_language",
                              "adas_concentration", "adas_praxis"],

          "base_ravlt_adas": ["sex", "education_level", "MMSE", "cdr_sb", "adas_memory", "adas_language",
                              "adas_concentration", "adas_praxis", "ravlt_immediate"],

          # Models including APOE

          "base_ravlt_apoe": ["sex", "education_level", "apoe4", "MMSE", "cdr_sb", "ravlt_immediate"],

          "base_adas_apoe": ["sex", "education_level", "apoe4", "MMSE", "cdr_sb", "adas_memory", "adas_language",
                             "adas_concentration", "adas_praxis"],

          "base_ravlt_adas_apoe": ["sex", "education_level", "apoe4", "MMSE", "cdr_sb", "adas_memory", "adas_language",
                                   "adas_concentration", "adas_praxis", "ravlt_immediate"],

          # Models including imaging scores

          "base_T1score": ["sex", "education_level", "MMSE", "cdr_sb", "T1_score"],

          "base_fdgscore": ["sex", "education_level", "MMSE", "cdr_sb", "fdg_score"],

          "base_scores": ["sex", "education_level", "MMSE", "cdr_sb", "T1_score", "fdg_score"],

          "base_ravlt_scores": ["sex", "education_level", "MMSE", "cdr_sb", "ravlt_immediate", "T1_score", "fdg_score"],

          "base_adas_scores": ["sex", "education_level", "MMSE", "cdr_sb", "adas_memory", "adas_language",
                               "adas_concentration", "adas_praxis", "T1_score", "fdg_score"],

          "base_adas_memtest_scores": ["sex", "education_level", "MMSE", "cdr_sb", "adas_memory", "adas_language",
                                       "adas_concentration", "adas_praxis", "ravlt_immediate", "T1_score", "fdg_score"],
       
          "all_features": all_features}


original_data = pd.read_csv(file)
original_data['adni_abeta']  = original_data['adni_abeta'].replace('>1700', '1700').astype(float)
original_data = original_data.drop(columns=['participant_id', 'session_id', 'marital_status'])

# encode the target variable, sMCI = 1, pMCI = 0
original_data['diagnosis'] = LabelEncoder().fit_transform(original_data['diagnosis'])

# Initialize Impuation model to impute missing values
imputer = ImputationKernel(original_data,random_state=42,save_all_iterations_data=True)
imputer.mice(5)
data = imputer.complete_data()


print("Original dataset shape {}".format(data.shape))
#apply SMOTE over data df  to balance the dataset
sm = SMOTE(sampling_strategy="not majority") 
df, df['diagnosis'] = sm.fit_resample(data[all_features], data['diagnosis'])
print("Resampled dataset shape {}".format(df.shape))


In [None]:
# Plot SMOTE
fig, ax = plt.subplots(figsize=(12, 8))
scatter =ax.scatter(original_data['ravlt_immediate'], original_data['adas_memory'], c=original_data['diagnosis'],  s=40,  edgecolors='none', label='All', cmap='viridis')
legend_plt = ax.legend(*scatter.legend_elements(),
                       loc="lower left", title="Digits")
ax.add_artist(legend_plt)
plt.title('Before SMOTE')
plt.show()


fig, ax = plt.subplots(figsize=(12, 8))
scatter =ax.scatter(df['ravlt_immediate'], df['adas_memory'], c=df['diagnosis'], cmap='viridis', s=40,  edgecolors='none', label='All')
legend_plt = ax.legend(*scatter.legend_elements(),
                       loc="lower left", title="Digits")
ax.add_artist(legend_plt)
plt.title('After SMOTE')
plt.show()

In [None]:
def get_classifiers():
    classifiers = {
        "Random Forest": RandomForestClassifier(max_depth=2, random_state=0),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=0),
        "SVM": SVC(),
        "KNN": KNeighborsClassifier(n_neighbors=3),
        "Naive Bayes": GaussianNB(),
        "Decision Tree": DecisionTreeClassifier(),
        "XGBoost": XGBClassifier(),
        "Neural Network": MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000),
        "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=0),
        "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
        "Bagging": BaggingClassifier(n_estimators=100, random_state=0),
        "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=0)
    }
    return classifiers



for model_name, features in models.items():
    classifiers = get_classifiers()
    print(f"Model: {model_name}, Features: {features}")
    X = df[features]
    y = df['diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    results = {}
    for classifier_name, classifier in classifiers.items():
        #print(f"Training {classifier_name}")
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        results[classifier_name] = [accuracy_score(y_test, y_pred), mean_squared_error(y_test, y_pred)]

    print(f"{'Classifier':<20}{'Accuracy':<10}{'MSE':<10}")
    print("-" * 40)
    for classifier_name, result in results.items():
        print(f"{classifier_name:<20}{result[0]:<10.2f}{result[1]:<10.2f}")
    print("-" * 40)
    print("\n")





    


In [None]:
class TsvRFWf(): #class to run the random forest classification
    def __init__(self, data_tsv, columns, output_dir, n_threads=20, n_iterations=250, test_size=0.2,
                 grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400), max_depth_range=[None],
                 min_samples_split_range=[2], max_features_range=('auto', 0.25, 0.5), splits_indices=None,
                 inner_cv=False):

        self._output_dir = output_dir
        self._n_threads = n_threads
        self._n_iterations = n_iterations
        self._test_size = test_size
        self._grid_search_folds = grid_search_folds
        self._balanced = balanced
        self._n_estimators_range = n_estimators_range
        self._max_depth_range = max_depth_range
        self._min_samples_split_range = min_samples_split_range
        self._max_features_range = max_features_range
        self._splits_indices = splits_indices
        self._inner_cv = inner_cv
        self._columns=columns

        self._dataframe = data_tsv
        self._validation = None
        self._algorithm = None

    def run(self):
        x = self._dataframe[self._columns].to_numpy()
        unique = list(set(self._dataframe["diagnosis"]))
        y = np.array([unique.index(x) for x in self._dataframe["diagnosis"]])
        
        #apply random forest from algorithm on the parameters given by the user 
        parameters_dict = {
            "balanced": self._balanced,
            "grid_search_folds":  self._grid_search_folds,
            "n_estimators_range": self._n_estimators_range,
            "max_depth_range": self._max_depth_range,
            "min_samples_split_range": self._min_samples_split_range,
            "max_features_range": self._max_features_range,
            "n_threads": self._n_threads,
        }
        
        self._algorithm = algorithm.RandomForest(x, y,algorithm_params=parameters_dict)
                                                       
        parameters_dict = {
            "n_iterations": self._n_iterations,
            "test_size": self._test_size,
            "n_threads": self._n_threads,
            "splits_indices": self._splits_indices,
            "inner_cv": self._inner_cv,
        }
      
        self._validation = validation.RepeatedHoldOut(self._algorithm,validation_params=parameters_dict)
        
        classifier, best_params, results = self._validation.validate(y)

        
        classifier_dir = os.path.join(self._output_dir, 'classifier')
        if not path.exists(classifier_dir):
            os.makedirs(classifier_dir)

        self._algorithm.save_classifier(classifier, classifier_dir)
        self._algorithm.save_parameters(best_params, classifier_dir)
        weights = self._algorithm.save_weights(classifier, classifier_dir,self._output_dir)

        self._validation.save_results(self._output_dir)
        

def rf_classifications(model_name, columns, data_tsv_template, output_dir, months, indices_template=None,n_iterations=250,
                       test_size=0.2, n_threads=40, balanced=True, n_estimators_range=100, max_depth_range=5,
                       min_samples_split_range=2, max_features_range='auto', inner_cv=False):
   
    for i in months:
        if indices_template is None:splits_indices=None
        else:
            with open(indices_template, 'rb') as ind:splits_indices = pickle.load(ind,encoding='iso-8859-1')

        classification_dir = path.join(output_dir, '%s_months' % i, model_name)

        if not path.exists(classification_dir): os.makedirs(classification_dir)

        print("Running %s" % classification_dir)
        
        wf = TsvRFWf(data_tsv_template, columns, classification_dir, n_threads=n_threads, n_iterations=n_iterations,
                     test_size=test_size, balanced=balanced, n_estimators_range=n_estimators_range,
                     max_depth_range=max_depth_range, min_samples_split_range=min_samples_split_range,
                     max_features_range=max_features_range, splits_indices=splits_indices, inner_cv=inner_cv)
        
        wf.run()


def xgboost(file,columns,splits_indices,output):
    dataframe = pd.io.parsers.read_csv(file, sep='\t')
    x = dataframe[columns].to_numpy()
    unique = list(set(dataframe["diagnosis"]))
    y = np.array([unique.index(x) for x in dataframe["diagnosis"]])
    
    #apply random forest from algorithm on the parameters given by the user 
    parameters_dict = {
            "balanced": True,
            "grid_search_folds": 10,
            "max_depth_range": 10,
            "learning_rate_range": 0.01,
            "n_estimators_range": 150,
            "colsample_bytree_range": 0.5,
            "reg_alpha": 0,
            "reg_lambda": 1,
            "n_threads": 20,
        }
    
    algorithm1 = algorithm.XGBoost(x,y,parameters_dict)
                                                    
    
    
    parameters_dict = {
        "n_iterations": 250,
        "test_size": 0.2,
        "n_threads": 20,
        "splits_indices": splits_indices,
        "inner_cv": False,
    }
    
    validation1 = validation.RepeatedHoldOut(algorithm1,validation_params=parameters_dict)
    classifier, best_params, results = validation1.validate(y)    
    
    classifier_dir = os.path.join(output, 'classifier')
    if not path.exists(classifier_dir):
        os.makedirs(classifier_dir)

    algorithm1.save_classifier(classifier, classifier_dir)
    algorithm1.save_parameters(best_params, classifier_dir)
    weights = algorithm1.save_weights(classifier, classifier_dir,output)

    validation1.save_results(output)

def lreg(file,columns,splits_indices,output):

    dataframe = pd.io.parsers.read_csv(file, sep='\t')
    x = dataframe[columns].to_numpy()
    unique = list(set(dataframe["diagnosis"]))
    y = np.array([unique.index(x) for x in dataframe["diagnosis"]])
    
    #apply random forest from algorithm on the parameters given by the user 
    parameters_dict = {
            "penalty": "l2",
            "balanced": True,
            "grid_search_folds": 10,
            "c_range": np.logspace(-6, 2, 17),
            "n_threads": 20,
        }
    
    algorithm1 = algorithm.LogisticReg(x,y,parameters_dict)
                                                    
    
    
    parameters_dict = {
        "n_iterations": 250,
        "test_size": 0.2,
        "n_threads": 20,
        "splits_indices": splits_indices,
        "inner_cv": True,
    }
    
    validation1 = validation.RepeatedHoldOut(algorithm1,validation_params=parameters_dict)
    
    classifier, best_params, results = validation1.validate(y)

    
    
    classifier_dir = os.path.join(output, 'classifier')
    if not path.exists(classifier_dir):
        os.makedirs(classifier_dir)

    algorithm1.save_classifier(classifier, classifier_dir)
    algorithm1.save_parameters(best_params, classifier_dir)
    weights = algorithm1.save_weights(classifier, classifier_dir,output)

    validation1.save_results(output)      
       



In [None]:
# ANN  model 
#model with 3 hidden layers, 128, 64 and 5 neurons respectively
classification_dir = os.path.join('output')
if not os.path.exists(classification_dir):os.makedirs(classification_dir)

model=tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    tf.keras.layers.Dense(5, activation=tf.nn.sigmoid)
])
#compile model, learning rate=0.001, loss function=sparse cross entropy, optimizer=adam
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
#custom callback to save model after each epoch
class SaveBestModel(tf.keras.callbacks.Callback):
    #get the best model
    def on_epoch_end(self, epoch, logs=None):
        print("Epoch %d" % epoch)

        if logs['val_accuracy'] > self.best:
            self.best = logs['val_accuracy']
            model.save_weights(os.path.join(classification_dir, 'best_model.h5'))

output_dir = os.path.join('outputdir')
n_threads = 8
months = [36]
callbacks = [SaveBestModel()]
for i in models:
    print('model:',model)
    x = df[models[i]].to_numpy()
    y = df['diagnosis'].to_numpy()
    #split data into train and test
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train, epochs=200, callbacks=callbacks, validation_split=0.2)

    best_model = tf.keras.models.load_model('best_model.h5')
    
    features = best_model.predict(X_train)
    columns=['Feature1','Feature2','Feature3','Feature4','Feature5']
    new_df = pd.DataFrame(features, columns=columns)#'Feature1','Feature2','Feature3','Feature4','Feature5'
    new_df['diagnosis'] = y_train
    rf_classifications(i, columns, new_df,  output_dir, months, n_threads=n_threads)
    


In [4]:
for model_name, features in models.items():
    tabnet = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                              optimizer_params=dict(lr=2e-2),
                              scheduler_params={"step_size":10, # how to use learning rate scheduler
                                                "gamma":0.9},
                              scheduler_fn=torch.optim.lr_scheduler.StepLR,
                              mask_type='entmax', # "sparsemax"
                              n_steps=5,
                              n_a=8,
                              n_d=8,
                              gamma=1.3,
                              cat_idxs=[0, 1, 4],
                              cat_dims=[2, 3, 2],
                              cat_emb_dim=[1, 1, 1],
                              n_independent=2,
                              n_shared=2,
                              lambda_sparse=1e-4,
                              seed=0,
                              verbose=1,
                              )
    print(f"Model: {model_name}, Features: {features}")
    X = df[features]
    y = df['diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    tabnet.fit(X_train.values, y_train.values, eval_set=[(X_test.values, y_test.values)], max_epochs=100, patience=10, batch_size=1024)
    y_pred = tabnet.predict(X_test.values)
    accuracy = accuracy_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{'TabNet':<20}{accuracy:<10.2f}{mse:<10.2f}")
    print("-" * 40)
    print("\n")
    

Device used : cpu


In [12]:
import os

project_structure = { "AD_project/": { "data/": ["preprocess.py", "imputation.py", "smote.py"], "models/": ["classifiers.py", "train_classifiers.py", "train_ann.py", "train_tabnet.py", "validation.py"], "utils/": ["plot.py", "save_model.py"], "main/": ["main_train_ml.py", "main_train_ann.py", "main_train_tabnet.py", "main_plot.py"], "requirements.txt": "", "README.md": "", "LICENSE": "", }}

def create_folders_files(structure, root="."):
    for key, value in structure.items():
        path = os.path.join(root, key)
        if isinstance(value, dict):
            os.makedirs(path, exist_ok=True)
            create_folders_files(value, path)
        else:
            os.makedirs(path, exist_ok=True)
            for file in value:
                open(os.path.join(path, file), 'a').close()

create_folders_files(project_structure)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: '.\\AD_project/requirements.txt'