# Classification using Machine Learning Methods

## Data preprocessing

In [None]:
#import all the libraries
import pandas as pd

In [None]:
#create a new dataframe with only the columns of the selected features
import pandas as pd
# Names of the feature columns retained for classification.
# The order of this list is the column order of every frame selected with it.
columns_to_keep = [
    'spectralFluxUV_sma3nz_amean',
    'shimmerLocaldB_sma3nz_stddevNorm',
    'HNRdBACF_sma3nz_amean',
    'shimmerLocaldB_sma3nz_amean',
    'HNRdBACF_sma3nz_stddevNorm',
    'slopeUV500-1500_sma3nz_amean',
    'F2frequency_sma3nz_stddevNorm',
    'loudness_sma3_percentile20.0',
    'jitterLocal_sma3nz_amean',
    'jitterLocal_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_stddevNorm',
    'spectralFluxV_sma3nz_amean',
    'spectralFlux_sma3_amean',
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
    'loudness_sma3_stddevRisingSlope',
    'slopeUV0-500_sma3nz_amean',
]

In [None]:
#create a function that selects the most relevant columns and creates the target variable
def import_and_clean(file_name, label):
    """Load a tab-separated file, keep the selected features and tag the rows.

    Parameters
    ----------
    file_name : path of the tab-separated file to read.
    label : value written to every row of the added ``target`` column.

    Returns
    -------
    pandas.DataFrame holding the columns named in the module-level
    ``columns_to_keep`` list (in that order) followed by ``target``.
    """
    raw = pd.read_csv(file_name, sep='\t')
    # Selecting with a list of labels raises KeyError if a feature is missing.
    selected = raw[columns_to_keep]
    return selected.assign(target=label)

In [None]:
#apply import_and_clean to the two input files (df_ASD and df_TD);
#the second argument is the class label stored in the "target" column
df_ASD = import_and_clean("ASD_children.tsv", "ASD")
df_TD = import_and_clean("TD_children.tsv", "Controls")

#stack the two datasets row-wise; ignore_index=True gives the merged frame a
#fresh 0..n-1 index instead of repeating each file's own row labels, so a
#label-based lookup can never match two rows at once
df = pd.concat([df_ASD, df_TD], axis=0, ignore_index=True)

## Data processing

In [None]:
#import the libraries
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
#encode the class labels as integers, then build the feature matrix X and target y
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

#LabelEncoder numbers the classes in sorted order, so with the labels used in
#this notebook "ASD" becomes 0 and "Controls" becomes 1
df["target"] = le.fit_transform(df["target"])

y = df["target"]

#normalize() with its default arguments rescales every ROW (sample) to unit L2
#norm; it does not standardise the individual feature columns.
#NOTE(review): confirm per-sample scaling is intended here rather than
#per-feature scaling (e.g. StandardScaler fitted on the training split only)
X = preprocessing.normalize(df[columns_to_keep])

In [None]:
#hold out 20% of the samples as the test set
from sklearn.model_selection import train_test_split

#stratify=y keeps the class proportions of the full dataset in both partitions,
#consistent with the stratified folds used later for cross-validation;
#random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.2,
    shuffle=True,
    stratify=y,
    random_state=5,
)

## Supervised Learning Methods

In [None]:
#import all the libraries
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets, linear_model
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             mean_squared_error, precision_score, r2_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [None]:
def _fold_scores(model, X, y, prefix):
    """Return accuracy, recall, precision and F1 of `model` on (X, y), in percent.

    Keys are ``<prefix>_accuracy``, ``<prefix>_recall``, ``<prefix>_precision``
    and ``<prefix>_f1``. Recall, precision and F1 are called with scikit-learn's
    default arguments.
    """
    y_pred = model.predict(X)
    return {
        prefix + "_accuracy": model.score(X, y) * 100,
        prefix + "_recall": recall_score(y, y_pred) * 100,
        prefix + "_precision": precision_score(y, y_pred) * 100,
        prefix + "_f1": f1_score(y, y_pred) * 100,
    }


def model_trainer(model_kind, n_splits=3, *, cv_random_state=42, **args):
    """Cross-validate one estimator class on the training split.

    Reads the module-level ``X_train`` (indexed as a 2-D array) and ``y_train``
    (indexed with ``.iloc``). A fresh ``model_kind(**args)`` is fitted on each
    stratified fold and scored on both the fitting part and the held-out part.

    Parameters
    ----------
    model_kind : estimator class (not an instance); called as ``model_kind(**args)``.
    n_splits : number of stratified folds.
    cv_random_state : seed for the fold shuffle, so the same folds (and the same
        row order in the result) are produced on every run. Pass ``None`` for
        an unseeded shuffle.
    **args : keyword arguments forwarded unchanged to ``model_kind``.

    Returns
    -------
    pandas.DataFrame with one row per fold and the columns ``model``,
    ``train_accuracy``, ``train_recall``, ``train_precision``, ``train_f1``,
    ``val_accuracy``, ``val_recall``, ``val_precision``, ``val_f1``.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                          random_state=cv_random_state)

    models = []
    for train_index, test_index in skf.split(X_train, y_train):
        Xfold_train, Xfold_test = X_train[train_index, :], X_train[test_index, :]
        yfold_train, yfold_test = y_train.iloc[train_index], y_train.iloc[test_index]

        # A new, unfitted estimator per fold so no state leaks between folds.
        model = model_kind(**args)
        model.fit(Xfold_train, yfold_train)

        current = {"model": model}
        # Scores on the data the model was fitted on ...
        current.update(_fold_scores(model, Xfold_train, yfold_train, "train"))
        # ... and on the held-out validation fold.
        current.update(_fold_scores(model, Xfold_test, yfold_test, "val"))
        models.append(current)

    return pd.DataFrame(models)

### Decision Tree

In [None]:
#train the classifier using model_trainer function
#show the folds obtained by the cross-validation on the train set

from sklearn.tree import DecisionTreeClassifier

#random_state is forwarded to DecisionTreeClassifier: tree construction is
#randomised, so an unseeded tree can change from one run to the next
tree_models = model_trainer(DecisionTreeClassifier, random_state=42)
tree_models

In [None]:
#choose the best model between the folds: take the row with the highest
#validation accuracy instead of a hard-coded row number, so the choice follows
#the scores actually obtained in this run (the first row wins a tie)
best_tree_model = tree_models.loc[tree_models["val_accuracy"].idxmax(), "model"]

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#cross-validate a k-nearest-neighbours classifier built with its default
#constructor arguments; the last line displays one row of scores per fold
knn_models = model_trainer(model_kind=KNeighborsClassifier)
knn_models

In [None]:
#pick the fold with the highest validation accuracy rather than a fixed row
#number, so the selection follows the scores of the current run
best_knn_model = knn_models.loc[knn_models["val_accuracy"].idxmax(), "model"]

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

#a small forest: 4 trees of depth <= 3. random_state is forwarded to the
#estimator so its random sampling is the same on every run
rf_models = model_trainer(RandomForestClassifier, max_depth=3, n_estimators=4,
                          random_state=42)
rf_models

In [None]:
#pick the fold with the highest validation accuracy rather than a fixed row
#number, so the selection follows the scores of the current run
best_rf_model = rf_models.loc[rf_models["val_accuracy"].idxmax(), "model"]

### Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC

#random_state is forwarded to LinearSVC so that any data shuffling done by its
#solver is repeatable between runs
svm_models = model_trainer(LinearSVC, random_state=42)
svm_models

In [None]:
#pick the fold with the highest validation accuracy rather than a fixed row
#number, so the selection follows the scores of the current run
best_svm_model = svm_models.loc[svm_models["val_accuracy"].idxmax(), "model"]

### Calculate and print the evaluation metrics for each model, on both the train and test sets

In [None]:
#MODEL NAME and NUMBER OF RUNNING(NoR)
#template cell: bind best_model_NoR to the model under evaluation
#(best_tree_model, best_knn_model, best_rf_model or best_svm_model) and re-run
best_model_NoR = best_tree_model

y_train_model_NoR = best_model_NoR.predict(X_train)
y_pred_model_NoR  = best_model_NoR.predict(X_test)

#NOTE(review): recall, precision and F1 are called with scikit-learn's defaults,
#i.e. they score the class encoded as 1. With the LabelEncoder used in this
#notebook that class is "Controls" - confirm this is the intended positive class.

#calculate the accuracy
#train set
model_NoR_train_accuracy=accuracy_score(y_train,y_train_model_NoR)
print("Model accuracy of the model on train set: %.2f" %model_NoR_train_accuracy)
#test set
model_NoR_test_accuracy=accuracy_score(y_test,y_pred_model_NoR)
print("Model accuracy of the model on test set: %.2f" %model_NoR_test_accuracy)

#calculate the recall
#train set
model_NoR_train_recall=recall_score(y_train,y_train_model_NoR)
print("Model recall of the model on train set: %.2f" 
      %model_NoR_train_recall)
#test set
model_NoR_test_recall=recall_score(y_test,y_pred_model_NoR)
print("Model recall of the model on test set: %.2f" 
      %model_NoR_test_recall)

# calculate precision
#train set
model_NoR_train_precision=precision_score(y_train,y_train_model_NoR)
print("Model precision of the model on train set: %.2f" 
      %model_NoR_train_precision)
#test set
model_NoR_test_precision=precision_score(y_test,y_pred_model_NoR)
print("Model precision of the model on test set: %.2f" 
      %model_NoR_test_precision)

# calculate f1-score
#train set
model_NoR_train_f1=f1_score(y_train,y_train_model_NoR)
print("Model F1-score of the model on train set: %.2f" 
      %model_NoR_train_f1)
#test set (scored against y_test: the predictions were made on X_test)
model_NoR_test_f1=f1_score(y_test,y_pred_model_NoR)
print("Model F1-score of the model on test set: %.2f" 
      %model_NoR_test_f1)

# calculate AUC
#NOTE: the AUC is computed from the hard 0/1 predictions, not from class
#probabilities or decision scores
#train set
model_NoR_train_AUC=roc_auc_score(y_train,y_train_model_NoR)
print("Model AUC of the model on the train set: %.2f" 
      %model_NoR_train_AUC)
#test set (scored against y_test: the predictions were made on X_test)
model_NoR_test_AUC=roc_auc_score(y_test,y_pred_model_NoR)
print("Model AUC of the model on the test set: %.2f" 
      %model_NoR_test_AUC)

### Save the models

In [None]:
import os
import pickle

#output folder for the pickled results; created if it does not exist yet
directory = "saved_models"
os.makedirs(directory, exist_ok=True)

#NOTE(review): each *_models object is the full cross-validation table returned
#by model_trainer (one fitted model plus its scores per fold), not a single
#classifier, although the file names end in "_clf" - confirm this is what
#should be saved
objects_to_save = {
    "decision_tree_clf.pkl": tree_models,
    "knn_clf.pkl": knn_models,
    "random_forest_clf.pkl": rf_models,
    "svc.pkl": svm_models,
}

for file_name, cv_table in objects_to_save.items():
    #the with-block closes (and flushes) the file even if pickling fails
    with open("{}/{}".format(directory, file_name), 'wb') as fh:
        pickle.dump(cv_table, fh)

### Confusion Matrix

In [None]:
def confusion_metrics_plot(y_test, y_preds, vmax=5):
    """Draw an annotated heatmap of the 2x2 confusion matrix.

    Parameters
    ----------
    y_test : true labels.
    y_preds : predicted labels, same length as ``y_test``.
    vmax : upper end of the colour scale passed to the heatmap; cells whose
        count is at or above this value share the darkest colour.

    Each cell shows its group name, its count and its share of all samples.
    The function draws on a new figure and returns nothing. It also calls
    ``sns.set(font_scale=1.3)``, which changes seaborn's global settings.
    """
    cfm = confusion_matrix(y_test, y_preds)

    # confusion_matrix puts true labels on rows and predictions on columns, in
    # sorted label order; flatten() therefore yields
    # [true 0/pred 0, true 0/pred 1, true 1/pred 0, true 1/pred 1].
    # NOTE(review): the names below assume label 0 is ASD and label 1 is TD.
    group_names = ['True ASD','False TD','False ASD','True TD']
    flat_counts = cfm.flatten()
    group_counts = ["{0:0.0f}".format(value) for value in flat_counts]
    group_percentages = ["{0:.2%}".format(value)
                         for value in flat_counts/np.sum(cfm)]
    labels = [v1+'\n'+v2+'\n'+v3
              for v1, v2, v3 in zip(group_names,
                                    group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    sns.set(font_scale=1.3)

    fig, ax = plt.subplots(figsize=(7,5))
    sns.heatmap(cfm, vmin = 0, vmax = vmax, annot=labels, fmt='',
                cmap='Blues', ax=ax)
    # Label the heatmap axes explicitly, then fit the layout so that the
    # labels are taken into account.
    ax.set_ylabel('Gold standard labels', size=15)
    ax.set_xlabel('Classifier output labels', size=15)
    fig.tight_layout()

In [None]:
#apply the function 'confusion_metrics_plot' to each best model,
#both on train and test set

confusion_metrics_plot(y_train,y_train_model_NoR)
confusion_metrics_plot(y_test,y_pred_model_NoR)