# Notebook for ML grid search + cross-validation

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

# Limit NumPy's printed floating-point precision to 3 digits so arrays
# printed later in the notebook stay compact.
from numpy import set_printoptions
set_printoptions(precision=3)

In [None]:
# Load the two feature spreadsheets. Columns, as described by the author:
# - FILE (a unique designator)
# - GROUP (1 = control, 2 = aMCI, 3 = AD)
# - ANIMALS / TRANSCRIPT (the human transcription of animal fluency / free speech)
# - <features>

af = pd.read_excel('animal_fluency_features.xlsx')
fs = pd.read_excel('free_speech_features.xlsx')

# Combine the two feature sets into one table, matching rows on FILE.
df = af.merge(fs, on='FILE')

# Remove the columns that should not take part in the feature analysis.
# Add or remove names here to change what is excluded.
df = df.drop(columns=[
    'participant_parse_depth_per_sentence',
    'TRANSCRIPT',
    'ANIMALS',
])

# Feature analysis

In [None]:
# Top features ranked by ANOVA F-value.

dfarray = df.values
X = dfarray[:, 1:-1]  # everything between the first column (FILE) and the last (GROUP)
Y = dfarray[:, -1]    # last column, used as the class label (GROUP)

# Score every feature against the labels with a univariate F-test.
test = SelectKBest(score_func=f_classif, k=10)
fit = test.fit(X, Y)
scores = fit.scores_

# Positions of the 10 highest-scoring features, best first.
res = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:10]

# Print the selected positions, then each one with its column name. The 1:-1
# slice of df.columns mirrors the slice used for X so positions line up.
print("Indices list of max 10 elements is : " + str(res))
for i, ind in enumerate(res):
    print(f"{i} : {df.columns[1:-1][ind]}")
    

In [None]:
# Top features chosen by recursive feature elimination (RFE).
#
# Reuses X (feature matrix) and Y (labels) built in the ANOVA cell above, so
# that cell has to be run first.

model = LogisticRegression()
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only the estimator positionally, so RFE(model, 10) raises TypeError.
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X, Y)

print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

# support_ is a boolean mask over the columns of X; df.columns[1:-1] mirrors
# the slice used to build X, so names and mask entries pair up one-to-one.
chosenfeatures = [name for name, keep in zip(df.columns[1:-1], fit.support_) if keep]
print('chosen features: ', chosenfeatures)

In [None]:
# Top features by the feature importance reported by an extra-trees classifier.
# X and Y come from the ANOVA cell earlier in the notebook.

model = ExtraTreesClassifier(n_estimators=10)
model.fit(X, Y)
scores = model.feature_importances_

# Positions of the 10 most important features, most important first.
res = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:10]

# Print the selected positions, then each one with its column name. The 1:-1
# slice of df.columns mirrors the slice used for X so positions line up.
print("Indices list of max 10 elements is : " + str(res))
for i, ind in enumerate(res):
    print(f"{i} : {df.columns[1:-1][ind]}")

In [None]:
# Heat map of pairwise correlations between columns, including the group label.

# Correlate numeric/boolean columns only. pandas >= 2.0 no longer drops
# non-numeric columns (for example a text FILE identifier) silently and
# raises instead; selecting the dtypes explicitly works on old and new pandas.
corr = df.select_dtypes(include=['number', 'bool']).corr()

# Last expression of the cell: the notebook renders the styled table.
corr.style.background_gradient()

# Grid search

In [None]:
class EstimatorSelectionHelper:
    """Grid-search several estimators and collect their scores in one table.

    Each estimator in ``models`` is wrapped in its own ``GridSearchCV`` using
    the parameter grid stored under the same key in ``params``.

    Attributes:
        models: dict mapping a display name to an unfitted estimator.
        params: dict mapping the same names to a parameter grid (passed to
            ``GridSearchCV`` unchanged).
        keys: the estimator names, as the key view of ``models``.
        grid_searches: dict of name -> fitted ``GridSearchCV``; filled by
            ``fit`` and read by ``score_summary``.
    """

    def __init__(self, models, params):
        """Store the estimators and their grids.

        Raises:
            ValueError: if an estimator in ``models`` has no entry in
                ``params``.
        """
        missing_params = set(models) - set(params)
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % list(missing_params))
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Run one ``GridSearchCV`` per estimator and keep the fitted searches.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are
        forwarded to ``GridSearchCV`` as given. Results are stored in
        ``self.grid_searches`` under the estimator's name; nothing is returned.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination).

        For every fitted search, the per-split test scores of each parameter
        combination are reduced to min / max / mean / std.

        Args:
            sort_by: column used to order the rows, highest value first.

        Returns:
            A DataFrame whose first columns are ``estimator``, ``min_score``,
            ``mean_score``, ``max_score`` and ``std_score``, followed by one
            column per parameter name.

        Raises:
            ValueError: if no grid search has been fitted yet.
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            params = gs.cv_results_['params']
            # Take the number of splits from the fitted search rather than a
            # fixed constant, so the summary is correct for any data size and
            # any cv strategy (k-fold, leave-one-out, ...).
            split_scores = [
                gs.cv_results_["split{}_test_score".format(i)].reshape(len(params), 1)
                for i in range(gs.n_splits_)
            ]

            # Shape (n_param_combinations, n_splits): one row of per-split
            # scores for each parameter combination.
            all_scores = np.hstack(split_scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        if not rows:
            raise ValueError("No grid search results to summarise; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Put the summary statistics first, then every parameter column.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]


# Candidate estimators, keyed by the name shown in the score summary.
models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'SVC': SVC(),
    'LogisticRegressionClassifier': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
}

# Parameter grids, keyed by the same names as models1. The trailing comments
# give the number of parameter combinations each grid expands to.
params1 = {
    'ExtraTreesClassifier': {
        'n_estimators': [16, 32, 64, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(3, 15),
    },  # 4 * 2 * 12 = 96
    'RandomForestClassifier': {
        'n_estimators': [16, 32, 64, 100],
        'max_depth': np.arange(3, 15),
    },  # 4 * 12 = 48
    # A list of grids: each kernel gets only the parameters that apply to it.
    'SVC': [
        {'kernel': ['linear'], 'C': [0.025, .1, 1, 10]},
        {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [2, 1, .1, .01, 0.001, 0.0001]},
    ],  # 4 + 2 * 6 = 16
    'LogisticRegressionClassifier': {
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'fit_intercept': [True, False],
        'C': [.5, 1.5, 3, 4, 3.5, 4.5, 5, 10],
        'tol': [1e-20, 1e-10, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4],
    },  # 4 * 2 * 8 * 7 = 448
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 11, 19],
        'weights': ['uniform', 'distance'],
        'metric': ['manhattan'],
    },  # 4 * 2 * 1 = 8
    'DecisionTreeClassifier': {
        'max_depth': np.arange(3, 15),
    },  # 12
    # Empty grid: only the estimator's default parameters are evaluated.
    'GradientBoostingClassifier': {},
}


In [None]:
# Run the grid search for every estimator in models1 with leave-one-out
# cross-validation and display the ranked score table.

dfarray = df.values
X = dfarray[:,1:-1]  # feature columns: everything except the first and last column
Y = dfarray[:,-1]  # last column, used as the class label

# Scale each feature with MinMaxScaler before the search.
# NOTE(review): the scaler is fit on all rows before cross-validation, so each
# held-out row contributes to the scaling applied to its training split.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

helper1 = EstimatorSelectionHelper(models1, params1)
# cv=LeaveOneOut() holds out one row per split; n_jobs=-1 is forwarded to GridSearchCV.
helper1.fit(X_scaled, Y, cv=LeaveOneOut(), n_jobs=-1)
helper1.score_summary(sort_by='mean_score')

# Repeat this cell with X restricted to the feature subsets selected in the
# feature-analysis cells above.

# Leave-one-out cross-validation

In [None]:
# Depending on the scenario, the group labels may need to be collapsed to
# 0s/1s before running this cell. E.g., for predicting control vs. aMCI and
# AD, map controls to 0 and aMCI/AD to 1:
#
#     df['GROUP'] = df['GROUP'].replace({1: 0, 2: 1, 3: 1})

# An explicit way to do leave-one-out cross-validation:

dfarray = df.values
X = dfarray[:, 1:-1]  # can be narrowed down to a smaller set of features here,
                      # e.g. X = df[['feature1', 'feature2', 'feature3']].values
Y = dfarray[:, -1]

# Scale the full data set. The loop below fits its own scaler on every
# training split and does not read X_scaled.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Train on all rows but one, test on the row that was held out.
predictions = {}  # row index -> predicted label
y_true = []
y_pred = []

for row_index, (test_x, test_y) in enumerate(zip(X, Y)):

    # Training split: every row except the held-out one.
    train_x = np.delete(X, row_index, 0)
    train_y = np.delete(Y, row_index, 0)

    # Fit the scaler on the training rows only, then apply it to the
    # held-out row (reshaped to a single-row 2-D array).
    scaler = MinMaxScaler()
    Xtrain_scaled = scaler.fit_transform(train_x)
    Xtest_scaled = scaler.transform(test_x.reshape(1, -1))

    # Depending on the classification scenario, the most appropriate model
    # changes. No random_state is set, so results can vary between runs.
    rfc = ExtraTreesClassifier(
        bootstrap=True,
        class_weight='balanced',
        criterion='gini',
        max_depth=11,
        n_estimators=100,
    )
    rfc.fit(Xtrain_scaled, train_y)

    prediction = rfc.predict(Xtest_scaled)[0]

    y_true.append(test_y)
    y_pred.append(prediction)
    predictions[row_index] = prediction


In [None]:
# Compare the leave-one-out predictions with the true labels.
#
# labels=[1, 2, 3] fixes which GROUP code each target name belongs to
# (1 = control, 2 = aMCI, 3 = AD), matching the confusion-matrix cell. Without
# it, the report fails when a class is missing from the results, because the
# number of names no longer matches the number of classes found.
# The report is a multi-line string, so it is printed instead of being shown
# as an escaped repr.
print(classification_report(y_true, y_pred, labels=[1, 2, 3],
                            target_names=['healthy control', 'aMCI', 'AD']))

In [None]:
# Confusion matrix over the three GROUP codes (1 = control, 2 = aMCI, 3 = AD).
# Per the scikit-learn convention, rows are true labels and columns are
# predicted labels, both in the order given by `labels`.
confusion_matrix(y_true, y_pred, labels=[1,2,3])