# Pipelines for classifiers

For each dataset, classifier and number of folds:
- scaling
- feature selection
- outerCV

In [None]:
# IPython magics: enable automatic reloading of imported modules (mode 2)
# and draw matplotlib figures inline in the notebook output
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# silence DeprecationWarning messages so they do not clutter the cell output
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel, SelectPercentile, f_classif

### Define script parameters

In [None]:
# random seed shared by every run so results are reproducible
seed = 42

# name(s) of the output (target) column; one full run is done per entry
outVars = ['Class']

# numbers of CV folds to evaluate, e.g. [3, 5, 10]
foldTypes = [10]

# tag embedded in the names of the summary and plot output files
targetName = 's-Std.Outer'

### Function definitions

In [None]:
def set_weights(y_data, option='balanced'):
    """Estimate class weights for an unbalanced dataset.

    Parameters
    ----------
    y_data : array-like
        Class label of every sample.
    option : 'balanced', dict or None, default 'balanced'
        Forwarded as ``class_weight`` to sklearn's ``compute_class_weight``.
        If 'balanced', class weights are n_samples / (n_classes * np.bincount(y)).
        If a dictionary is given, keys are classes and values are the
        corresponding class weights.
        If None, the class weights are uniform.

    Returns
    -------
    dict
        ``{class label: weight}`` with one entry per distinct label in ``y_data``.
    """
    classes = np.unique(y_data)
    # Pass everything by keyword: ``classes`` and ``y`` are keyword-only in
    # current scikit-learn releases, where the positional form raises TypeError.
    # The keyword names are the same in older releases.
    cw = class_weight.compute_class_weight(class_weight=option, classes=classes, y=y_data)
    return dict(zip(classes, cw))

In [None]:
def getDataFromDataset(sFile, OutVar):
    """Load a CSV dataset and split it into features and output.

    Parameters
    ----------
    sFile : path of the CSV file to read
    OutVar : name of the column holding the output values

    Returns
    -------
    tuple
        (feature values, output values, list of feature column names)
    """
    print('\n-> Read dataset', sFile)
    dataset = pd.read_csv(sFile)
    print('Shape', dataset.shape)

    # the output column is taken out; every remaining column is a feature
    target = dataset[OutVar]
    features = dataset.drop(columns=OutVar)

    Xdata = features.values
    Ydata = target.values
    print('Shape X data:', Xdata.shape)
    print('Shape Y data:',Ydata.shape)

    return Xdata, Ydata, features.columns.tolist()

In [None]:
def Pipeline_OuterCV(Xdata, Ydata, label = 'my', class_weights = {0: 1, 1: 1}, folds = 10, seed = 42):
    """Score several binary classifiers with an outer stratified k-fold CV.

    Every classifier is wrapped in a pipeline (StandardScaler + classifier)
    and evaluated with ROC AUC on each fold. The per-fold scores are written
    to a CSV file and to a box plot under ./results/ (created if missing).

    Parameters
    ----------
    Xdata : feature values passed to ``cross_val_score``
    Ydata : output (class) values passed to ``cross_val_score``
    label : text used as prefix of the output file names
    class_weights : dict with keys 0 and 1 giving the weight of each class;
        the default 1:1 corresponds to a balanced dataset. It is only read,
        never modified, so the shared default dict is safe.
    folds : number of CV folds
    seed : random seed for the CV splitter and the classifiers

    Returns
    -------
    pandas.DataFrame
        AUC scores, one column per classifier and one row per fold.
    """
    import os  # function-scope import: only used to create the output folder

    # LDA takes class priors rather than class weights:
    # normalise the two weights so that they sum to 1
    weight_sum = class_weights[0] + class_weights[1]
    priors = [class_weights[0]/weight_sum, class_weights[1]/weight_sum]

    # XGBoost's scale_pos_weight scales the weight of the positive class (1),
    # so it has to be weight(1)/weight(0). For 'balanced' weights this equals
    # n_negative/n_positive, the typical value suggested by the XGBoost docs.
    # (The inverse ratio would down-weight the minority positive class.)
    pos_weight = class_weights[1]/class_weights[0]

    # define classifiers
    names = ['KNN', 'SVM linear', 'SVM', 'LR', 'LDA', 'DT', 'RF', 'XGB']
    classifiers = [KNeighborsClassifier(3),
                   SVC(kernel="linear",random_state=seed,class_weight=class_weights,gamma='scale'),
                   SVC(kernel = 'rbf', random_state=seed,class_weight=class_weights,gamma='scale'),
                   LogisticRegression(solver='lbfgs',random_state=seed,class_weight=class_weights),
                   LinearDiscriminantAnalysis(solver='svd',priors=priors), # LDA has no random_state
                   DecisionTreeClassifier(random_state = seed, class_weight=class_weights),
                   RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=seed,class_weight=class_weights),
                   XGBClassifier(n_jobs=-1,scale_pos_weight=pos_weight,seed=seed)
                  ]
    # results dataframe: each column for a classifier
    df_res = pd.DataFrame(columns=names)

    # make sure the output folder exists before the (long) computation starts,
    # so the scores are not lost on a missing directory at save time
    out_dir = './results/'
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): the message mentions feature selection, but the pipeline
    # built below only contains a scaler and the classifier - confirm wording.
    print('* Building scaling+feature selection+outer '+str(folds)+'-fold CV for '+str(len(names))+' classifiers:', str(names))
    total = time.time()

    # the same stratified, shuffled fold split is used for all the classifiers
    outer_cv = StratifiedKFold(n_splits=folds,shuffle=True,random_state=seed)

    for name, clf in zip(names, classifiers):
        start = time.time()

        # pipeline: scaler + classifier; the scaler is part of the pipeline so
        # cross_val_score applies it within each CV split
        # other scalers: MinMaxScaler(), RobustScaler(), QuantileTransformer(), PowerTransformer()
        model = Pipeline([('Scaler', StandardScaler()),
                          ('Classifier', clf)])

        # evaluate pipeline: one AUC value per fold
        scores = cross_val_score(model, Xdata, Ydata, cv=outer_cv, scoring='roc_auc', n_jobs=-1)

        df_res[name] = scores
        print('%s, MeanAUC=%0.2f, Time:%0.1f mins' % (name, scores.mean(), (time.time() - start)/60))

    # save results
    resFile = out_dir+str(label)+'_Outer-'+str(folds)+'-foldCV.csv'
    df_res.to_csv(resFile, index=False)
    print('* Scores saved', resFile)

    print('Total time:', (time.time() - total)/60, ' mins')

    # generate a boxplot for all classifiers
    boxplot = df_res.boxplot(column=names)

    # save the figure
    # NOTE(review): the 's-MinMax.fs-Best50' tag does not match this pipeline
    # (StandardScaler, no feature selection); file name kept unchanged in case
    # other tooling relies on it - confirm and rename.
    plotFile = out_dir+str(label)+'_s-MinMax.fs-Best50.Outer-'+str(folds)+'-foldCV.jpeg'

    boxplot.figure.savefig(plotFile,format='jpeg',dpi=100)
    print('* Saving plot:', plotFile)

    # clear the figure so the next call does not draw on top of this plot
    boxplot.figure.clf()

    # return AUC scores for all classifiers as dataframe (each column a classifier)
    return df_res

### Calculations

In [None]:
# Run the outer CV for every output variable and number of folds,
# collecting one result table per run
fold_frames = []

for OutVar in outVars:
    sFile = './datasets/ds.raw.csv'

    # load features, output values and feature names
    Xdata, Ydata, Features = getDataFromDataset(sFile, OutVar)

    # class weights derived from the output values
    class_weights = set_weights(Ydata)
    print("Class weights = ", class_weights)

    # one run (and one box plot) per number of folds
    for folds in foldTypes:
        df_fold = Pipeline_OuterCV(Xdata, Ydata,
                                   label=OutVar,
                                   class_weights=class_weights,
                                   folds=folds,
                                   seed=seed)
        # tag the scores with the run they belong to
        df_fold['Dataset'] = OutVar
        df_fold['folds'] = folds
        fold_frames.append(df_fold)

# summary of all runs (None when nothing was run)
df_results = pd.concat(fold_frames) if fold_frames else None

In [None]:
# write the scores of all runs to a single summary CSV
summaryFile = './results/Summary_{}CV.csv'.format(targetName)
print('\n==>> Saving summary', summaryFile)
df_results.to_csv(summaryFile, index=False)
# last expression of the cell: show the table in the notebook
df_results

## Box plots from results

Load the results from file (if you don't want to re-run the previous calculations):

In [None]:
# rebuild the summary path so this cell can run without the cells above it
summaryFile = './results/Summary_{}CV.csv'.format(targetName)
print('\n-> Read summary results', summaryFile)
df_results = pd.read_csv(summaryFile)

Get list of classifiers from output file:

In [None]:
# every column is a classifier except the two run-description columns
classifierNames = df_results.columns.tolist()
for meta_column in ('Dataset', 'folds'):
    classifierNames.remove(meta_column)
classifierNames

Get list of folds:

In [None]:
# distinct numbers of folds found in the results, in ascending order
foldTypes = sorted(set(df_results['folds']))
foldTypes

### Boxplots by Dataset

Create grouped plots for each dataset and number of folds:

In [None]:
# one figure per number of folds, with one panel per dataset
for f in foldTypes:
    plt.figure()
    plt.clf()
    print('==> Fold =', f)

    # keep the rows of this fold count; 'folds' itself is not plotted
    fold_rows = df_results[df_results['folds']==f]
    grouped = fold_rows.drop(columns=['folds']).groupby('Dataset')
    grouped.boxplot(figsize=(16,12), return_type='axes')

    plotFile = './results/byDataset_{}-{}-foldCV.png'.format(targetName, f)
    plt.savefig(plotFile)
    plt.show()

### Boxplot by Classifier

Create grouped plots for each number of folds and classifier:

In [None]:
# one figure per number of folds, with one panel per classifier
for f in foldTypes:
    plt.figure()
    plt.clf()
    print('==> Fold =', f)
    # Group by 'Dataset': the results table has no 'Classifier' column (its
    # columns are the classifier names plus 'Dataset' and 'folds'), so
    # by='Classifier' fails with KeyError. Plotting the classifier columns
    # grouped by dataset gives one panel per classifier.
    df_results[df_results['folds']==f].boxplot(classifierNames,
                                                  by='Dataset',
                                                  figsize=(16,12),vert=False,
                                                  return_type='axes')
    plt.savefig('./results/byClassifier_'+targetName+'-'+str(f)+'-foldCV.png')
    plt.show()