In [34]:
import numpy as np
import scipy 
import scipy.stats
from scipy import stats
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from itertools import groupby
import pickle
import os
import math
from sympy import S, symbols
from collections import Counter
import sklearn 
from sklearn import preprocessing

pd.options.mode.chained_assignment = None 

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Global seaborn look for every figure produced in this notebook.
sns.set(style='white', context='notebook', palette='deep')
#sns.mpl.rcParams['figure.figsize'] = (16, 10)

# Directory to store pickled dataframes
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
# Presumably passed to read_from_pickle(), which concatenates directory + filename,
# so the trailing '/' is required - confirm against the callers.
directory = '/Users/dianaow/Documents/formula-1-race-data/dataframes/'

In [37]:
import statsmodels.api as sm
from sklearn.metrics import r2_score , mean_absolute_error, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.utils import resample
from sklearn.utils import class_weight

from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, accuracy_score, precision_score, average_precision_score, \
classification_report, recall_score, confusion_matrix, f1_score
from sklearn.model_selection import KFold 
from sklearn.base import clone

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import TomekLinks

import statsmodels.graphics.regressionplots 
import mord as mord
from skmultilearn.problem_transform import ClassifierChain

In [None]:
import warnings
# Silences EVERY warning for the rest of the session, including deprecation and
# convergence warnings raised by the libraries used below.
warnings.filterwarnings(action='ignore')

In [None]:
def read_from_pickle(directory, filename):
    """Load and return the object pickled in the file ``directory + filename``.

    The two arguments are joined by plain string concatenation, so ``directory``
    must already end with a path separator.

    NOTE: unpickling can execute arbitrary code - only use this on files that
    come from a trusted source.
    """
    with open(directory + filename, 'rb') as handle:
        return pickle.load(handle)

### Functions to scale features and split the dataset into train and test sets

In [2]:
def scale_data(X_train, X_test, scaler):
    """Scale the train and test feature matrices.

    Parameters
    ----------
    X_train, X_test : array-like feature matrices.
    scaler : 'StandardScaler', 'MinMaxScaler', or False / None for no scaling.

    Returns
    -------
    (Xs_train, Xs_test) : the scaled matrices, or the inputs unchanged when no
    scaling is requested.

    Raises
    ------
    ValueError if ``scaler`` is not one of the accepted values.

    The scaler is fitted on the train set only and then applied to the test set.
    Re-fitting on the test set would scale both sets with different statistics
    and leak test-set information into the evaluation.
    """
    if scaler == 'StandardScaler':
        fitted_scaler = StandardScaler()
    elif scaler == 'MinMaxScaler':
        fitted_scaler = MinMaxScaler()
    elif scaler is None or scaler == False:
        # No scaling requested: hand the inputs back untouched.
        return X_train, X_test
    else:
        raise ValueError("scaler must be 'StandardScaler', 'MinMaxScaler' or False, got %r" % (scaler,))

    Xs_train = fitted_scaler.fit_transform(X_train)
    Xs_test = fitted_scaler.transform(X_test)

    return Xs_train, Xs_test

def build_train_test_set(df, df_test, train_yr, test_yr, index_list, target_var, target_var_list, scaler, 
                         name=None, races_curr_same_cat=None, races_same_cat=None, multilabel=False, print_stats=False):
    
    """
       Split the dataset into a train and a test set, separate features from target(s)
       and scale the features.

       Two ways of splitting:
           Approach 1: Train-test split by year (name=None)
               - Train set: rows of `df` whose 'year' is in `train_yr`. Test set: `df_test` as given.
               - This is not a viable option if the model includes features that are only known
                 pre-race (eg. drivers' selected tyre sets and qualifying position).
           Approach 2: Train-test split by race (name given)
               - Train set: rows of the training years whose 'name' is in races_same_cat['name'],
                 plus the rows of `test_yr` for the races listed before `name` in `races_curr_same_cat`.
               - Test set: rows of `df_test` for race `name`.

       Parameters:
       - index_list / target_var_list: columns that are NOT used as features.
       - target_var: column (single target) used when multilabel=False.
       - scaler: forwarded to scale_data().
       - races_curr_same_cat: list of race names (list.index() is called on it).
       - races_same_cat: object indexable by 'name' (eg. a dataframe with a 'name' column).

       Returns:
       train_set, test_set, Xs_train, Xs_test, Y_train, Y_test
       Targets are returned as integer numpy arrays. They are 1-D, except when multilabel=True
       with more than one target column, in which case the (n_samples, n_targets) matrix is kept
       (flattening it would no longer line up with the rows of the feature matrix).

       Notes:
       - If multilabel=True, input 'target_var' and 'target_var_list' as the same variables.
       - Only columns in index_list and target_var_list are excluded from the features, so
         'target_var' has to be part of 'target_var_list' to keep it out of the feature matrix.
    """
        
    # =============================================================================================
    # BUILD TRAIN AND TEST SET
    train_set = df[df['year'].isin(train_yr)].reset_index(drop=True)
    
    if name is not None:
        train_set = train_set[train_set['name'].isin(races_same_cat['name'])]
        target_index = races_curr_same_cat.index(name)
        races_before = races_curr_same_cat[:target_index]
        addto_train_set = df[(df['year'].isin(test_yr)) & (df['name'].isin(races_before))].reset_index(drop=True)
        # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
        train_set = pd.concat([train_set, addto_train_set])
            
        test_set = df_test[df_test['name'] == name].reset_index(drop=True)
    else:
        # Split by year: the whole of df_test is the test set, whatever the race lists contain
        test_set = df_test
        
    # Separate index, features and target variable
    excluded_columns = list(index_list) + list(target_var_list)
    learning_columns = np.setdiff1d(train_set.columns, np.array(excluded_columns))
    X_train = train_set.loc[:, learning_columns]
    X_test = test_set.loc[:, learning_columns]

    # Ensure target variables are integers as algorithms will throw an error otherwise
    is_multilabel = (multilabel == True)
    if is_multilabel:
        Y_train = train_set[target_var_list].apply(lambda col: col.astype(int)) # Dataframe
        Y_test = test_set[target_var_list].apply(lambda col: col.astype(int))
    else:
        Y_train = train_set[target_var].astype(int) # Series
        Y_test = test_set[target_var].astype(int)
        
    # =============================================================================================
    # Scale the dataset after train-test split
    Xs_train, Xs_test = scale_data(X_train, X_test, scaler)
        
    # =============================================================================================
    # PRINT STATISTICS (the multilabel option generates a dataframe, whereas a single target variable
    # results in a series, so different methods apply). Single-argument print() calls are used so the
    # output is the same under Python 2 and Python 3.
    if (print_stats == True) and (is_multilabel or multilabel == False):
        print('==================================')
        print('Race: %s' % (name,))
        print('==================================')

        if is_multilabel:
            print('Train set:' + str(len(Y_train)))
        else:
            print('Train set:' + str(Y_train.count()))
        print('---------------------------------')
        print('Shape of Train Set:  %s' % (Xs_train.shape,))
        if is_multilabel:
            print(Y_train.apply(lambda col: col.value_counts()).fillna(0))
        else:
            print(Y_train.value_counts())

        print('==================================')
        if is_multilabel:
            print('Test set: ' + str(len(Y_test)))
        else:
            print('Test set: ' + str(Y_test.count()))
        print('---------------------------------')
        print('Shape of Test Set:  %s' % (Xs_test.shape,))
        if is_multilabel:
            print(Y_test.apply(lambda col: col.value_counts()).fillna(0))
        else:
            print(Y_test.value_counts())

    y_train_out = np.array(Y_train)
    y_test_out = np.array(Y_test)
    keep_label_matrix = is_multilabel and y_train_out.ndim == 2 and y_train_out.shape[1] > 1
    if not keep_label_matrix:
        y_train_out = y_train_out.ravel()
        y_test_out = y_test_out.ravel()

    return train_set, test_set, Xs_train, Xs_test, y_train_out, y_test_out

### Functions to fit dataset on model and generate predictions

In [42]:
def MODEL_FITTING(Xs_train, y_train, classifier, sampler, multilabel=False):
    
    """
       Generic function enabling fitting on dataset based on a chosen algorithm (eg. classifier) and sampler.

       Parameters:
       - Xs_train, y_train: training features and target(s).
       - classifier: estimator exposing fit(X, y); it is fitted in place.
       - sampler: resampler exposing fit_resample(X, y) (current imbalanced-learn API) or the
                  older fit_sample(X, y). Pass None to fit on the data as is.
       - multilabel: accepted for call compatibility; not used by this function.

       Returns:
       The (possibly resampled) features and targets the classifier was fitted on.
    """

    if sampler is None:
        cXs_train_sm, cy_train_sm = Xs_train, y_train
    else:
        # fit_sample was renamed to fit_resample and later removed from imbalanced-learn,
        # so prefer the new name and fall back to the old one for older installations.
        resample = getattr(sampler, 'fit_resample', None)
        if resample is None:
            resample = sampler.fit_sample
        cXs_train_sm, cy_train_sm = resample(Xs_train, y_train)

    # Sample first, then fit on the (resampled) train set
    classifier.fit(cXs_train_sm, cy_train_sm)
    
    return cXs_train_sm, cy_train_sm

In [4]:
def PREDICTION(train_set, test_set, Xs_train, Xs_test, y_train, y_test, \
               name, classifier, sampler, generator, index_list, target_var, target_var_list, \
               cross_validate=False, stacking=False, multilabel=False):
    """
       Either cross-validate a classifier on the train set or generate its predictions.

       Parameters:
       - train_set, test_set: dataframes the index / actual target columns are taken from.
       - Xs_train, Xs_test, y_train, y_test: feature matrices and targets.
       - classifier, sampler: indexable pairs; element 0 is used as a label in the output and
                              classifier[1] is the estimator. Outside cross-validation the
                              estimator is only used for predict / predict_proba, so it has to
                              be fitted already.
       - generator: cross-validation splitter passed to cross_val_score.
       - cross_validate: if True only cross-validation accuracy is computed.
       - stacking: if True the prediction frames hold a single column named
                   '<classifier>_<target_var>' and no index columns are merged in.
       - multilabel: if True the predictions are converted with .todense().

       Returns:
       results_stats, pred, pred_train, proba, proba_train
       Frames that a given mode does not produce are returned as empty dataframes
       (all four in cross-validation mode, the two probability frames in multilabel mode).
    """

    # Start from empty frames so that every path returns five values. The multilabel path never
    # fills the probability frames and used to fail on the final return because of that.
    pred = pd.DataFrame()
    pred_train = pd.DataFrame()
    proba = pd.DataFrame()
    proba_train = pd.DataFrame()

    # =============================================================================================
    # Cross-validation on train set
    if (cross_validate==True):

        # NOTE(review): `njobs` is not defined in this function - presumably a notebook-level
        # variable; confirm it is set before this is called.
        cv_results = [cross_val_score(classifier[1], Xs_train, y_train, \
                                      scoring = "accuracy", cv = generator, n_jobs=njobs)]
        
        cv_means = np.mean(cv_results)
        cv_std = np.std(cv_results)
        
        # Store results in dataframe
        results_stats = pd.DataFrame({'Index': name, 'Target Var': target_var, 'Method': classifier[0], 'Resampler':sampler[0], 
                                      "CrossValMeans":cv_means,"CrossValerrors": cv_std}, index=[0])

        # Rearrange columns of dataframe
        results_stats = results_stats[['Index', 'Target Var', 'Method', 'Resampler', "CrossValMeans", "CrossValerrors"]]

        return results_stats, pred, pred_train, proba, proba_train

    # =============================================================================================
    # PREDICTION
    y_pred = classifier[1].predict(Xs_test) # Make prediction on test set 
    y_pred_train = classifier[1].predict(Xs_train) # Make prediction on train set
    y_proba = classifier[1].predict_proba(Xs_test) # Generate prediction probabilities for test set
    y_proba_train = classifier[1].predict_proba(Xs_train) # Generate prediction probabilities for train set

    def rename_some_cols(frame, suffix, col_start):
        # Append `suffix` to the name of every column from position `col_start` onwards
        new_names = dict((col, col + suffix) for col in frame.columns[col_start:])
        return frame.rename(columns=new_names)

    def make_predictions(y, dataset, stacking, merge_index, actual_cols):

        if stacking==True:
            frame = pd.DataFrame(y, columns=[classifier[0] + "_" + str(target_var)])
        else:
            frame = pd.DataFrame(y).rename(columns={0: str(target_var)+'_predicted'})
            frame['tyre'] = target_var
            frame['sampler'] = sampler[0]
            frame['classifier'] = classifier[0]

        # Merge indexes to df of predicted results
        if merge_index==True:
            keep = list(index_list) + list(actual_cols)
            actuals = dataset[keep].reset_index(drop=True)
            # NOTE(review): the hard-coded 3 presumably equals len(index_list), so that only
            # the target column(s) get the '_actual' suffix - confirm.
            actuals = rename_some_cols(actuals, '_actual', 3)
            frame = pd.concat([actuals, frame], axis=1)

        return frame

    # Convert predicted results of both train and test set to a dataframe. 
    # If single target variable, use chosen algorithm as column name.
    single_label = (multilabel==False) and ((stacking==False) or (stacking==True))

    if single_label:
        stacked = (stacking==True)
        merge_index = not stacked # index columns are only merged in when not stacking
        actual_cols = [target_var]
        pred = make_predictions(y_pred, test_set, stacked, merge_index, actual_cols)
        pred_train = make_predictions(y_pred_train, train_set, stacked, merge_index, actual_cols)
        # Column 1 of predict_proba is kept as the probability
        proba = make_predictions(y_proba[:,1], test_set, stacked, merge_index, actual_cols)
        proba_train = make_predictions(y_proba_train[:,1], train_set, stacked, merge_index, actual_cols)

    # If multilabel, use target variables as column name.
    else:
        # NOTE(review): this iterates y_train as if it were a dataframe of target columns and
        # expects predict() to return a sparse matrix - confirm against the multilabel estimator.
        col_train = [i for i in y_train if np.unique(y_train[i]).size > 1]
        pred = pd.DataFrame(y_pred.todense(), columns=col_train)
        pred['classifier'] = classifier[0]
        pred_train = pd.DataFrame(y_pred_train.todense(), columns=col_train)
        pred_train['classifier'] = classifier[0] 
    
    if multilabel==False:
        results_stats = calc_classification_stats(name, target_var, classifier[0], sampler[0], y_test, y_pred, average=None)  
    else:
        results_stats = calc_classification_stats(name, target_var, classifier[0], sampler[0], y_test, y_pred, average='samples')
 
    return results_stats, pred, pred_train, proba, proba_train

### Multi-label Classification

In [5]:
def multilabel_PREDICTION_loop(df, df_test, train_yr, test_yr, index_list, target_var_list, \
                                 scaler, methods, samplers, generator, \
                                 races_curr_same_cat, races_same_cat, all_races, \
                                 zV_nzV_check=False, VIF_check=False, cross_validate=False, multilabel=True, print_stats=False\
                                ):
    """
       Fit and evaluate every (method, sampler) combination on every race in
       `races_curr_same_cat`, predicting all variables of `target_var_list` at once.

       Parameters:
       - methods, samplers: iterables of indexable pairs; element 1 is the estimator / resampler
                            (see fit_and_predict).
       - all_races, zV_nzV_check, VIF_check: accepted for call compatibility; not used here.
       - Remaining arguments are forwarded to fit_and_predict().

       Returns:
       results_all, proba_all, proba_train_all, pred_all, pred_train_all
       (accumulated over all combinations; empty dataframes if nothing was run,
       eg. when multilabel is not True).
    """

    results_all = pd.DataFrame()
    pred_all = pd.DataFrame()
    pred_train_all = pd.DataFrame()
    proba_all = pd.DataFrame()
    proba_train_all = pd.DataFrame()
    
    # =============================================================================================
    # MULTI-LABEL CLASSIFICATION
    if multilabel==True:
        # For multilabel runs the single target and the target list are the same variables
        target_var = target_var_list

        for m, s in itertools.product(methods, samplers):
            for name in races_curr_same_cat:
                # fit_and_predict builds the split, fits the model, predicts and appends the
                # outcome to the accumulators (same helper as the single-target loop).
                results_all, pred_all, pred_train_all, proba_all, proba_train_all = \
                    fit_and_predict(m, s, generator, df, df_test, train_yr, test_yr, index_list, target_var, target_var_list, scaler, \
                        name, races_curr_same_cat, races_same_cat, results_all, proba_all, proba_train_all, pred_all, pred_train_all, \
                        multilabel=multilabel, cross_validate=cross_validate, print_stats=print_stats, stacking=False, concat=True
                    )
                
    return results_all, proba_all, proba_train_all, pred_all, pred_train_all

### Single-label Classification (with ensemble stacking)

In [11]:
def singletarget_PREDICTION_loop(df, df_test, train_yr, test_yr, index_list, target_var_list, 
                                 scaler, methods, samplers, generator, races_curr_same_cat, races_same_cat, 
                                 meta_learner, cross_validate=False, multilabel=False, 
                                 ensemble=False, ensb_use_all=False, print_stats=False 
                            ):
    """
       Fit and evaluate models for one target variable at a time, optionally with stacking.

       For every target in `target_var_list` and every race in `races_curr_same_cat`, each
       (method, sampler) combination is run through fit_and_predict(). `methods` is indexed
       by the position of the race, ie. methods[idx] holds the methods used for the idx-th race.

       ensemble=False:
           Results of every combination are accumulated.
           Returns results_all, pred_all, pred_train_all, proba_all, proba_train_all.

       ensemble=True:
           The probability frames returned by each base model are collected column-wise as meta
           features per race. `meta_learner` (with samplers[0]) is then fitted on that matrix.
           ensb_use_all=True merges all columns of the train/test set into the matrix,
           otherwise only `index_list` and the target column are merged.
           Returns results_all, pred_all, pred_train_all, Pstest, Pstrain
           (the last two are the meta-feature matrices of all races, test first).
    """

    results_all = pd.DataFrame()
    pred_all = pd.DataFrame()
    pred_train_all = pd.DataFrame()
    proba_all = pd.DataFrame()
    proba_train_all = pd.DataFrame()
    Pstrain = pd.DataFrame()
    Pstest = pd.DataFrame()
                                    
    # =============================================================================================
    # SINGLE TARGET VARIABLE
    for target_var in target_var_list:
        
        # NOTE(review): these two frames are assigned but never read in this function.
        P_train_all = pd.DataFrame()
        P_test_all = pd.DataFrame()

        for idx,name in enumerate(races_curr_same_cat):
        
            # Initialize new dataframe for each race
            P_train_race = pd.DataFrame()
            P_test_race = pd.DataFrame()
        
            for m, s in itertools.product(methods[idx], samplers):
                
                if ensemble==True:
                
                    # With concat=False, fit_and_predict returns the split plus the outcome of this
                    # single model; the last two values are its probability frames (test, train).
                    train_set, test_set, results, pred, pred_train, meta_features_test, meta_features_train = \
                        fit_and_predict(m, s, generator, df, df_test, train_yr, test_yr, index_list, target_var, target_var_list, scaler, \
                            name, races_curr_same_cat, races_same_cat, results_all, proba_all, proba_train_all, pred_all, pred_train_all, \
                            multilabel, cross_validate, print_stats, stacking=True, concat=False
                        )

                    # Utilize prediction probabilities as meta features
                    # Generates prediction matrix of meta features -> With each loop, store the new meta feature generated in the step above.
                    # By the end of the nested loop's iteration, this df should store the meta features of ONE target var from all models.
                    P_train_race = pd.concat([P_train_race, meta_features_train], axis=1)
                    P_test_race = pd.concat([P_test_race, meta_features_test], axis=1)

                    # The merges below fire once the matrix holds exactly len(methods[0]) columns.
                    # NOTE(review): the count is taken from methods[0], not methods[idx], and one
                    # column is added per (method, sampler) pair - this presumably assumes the same
                    # number of methods for every race and a single sampler; confirm.
                    if ensb_use_all==True:
                        # Merge rest of features to the prediction matrix 
                        if (len(P_train_race.columns) == len(methods[0])) & (len(P_test_race.columns) == len(methods[0])):
                            P_train_race = pd.concat([train_set.reset_index(drop=True), P_train_race.reset_index(drop=True)], axis=1)
                            P_test_race = pd.concat([test_set.reset_index(drop=True), P_test_race.reset_index(drop=True)], axis=1)

                    else:
                        # Only merge index to the prediction matrix (Do not merge all features)
                        if (len(P_train_race.columns) == len(methods[0])) & (len(P_test_race.columns) == len(methods[0])):
                            arr = list(index_list)
                            arr.extend([target_var])
                            P_train_race = pd.concat([train_set[arr].reset_index(drop=True), P_train_race.reset_index(drop=True)], axis=1)
                            P_test_race = pd.concat([test_set[arr].reset_index(drop=True), P_test_race.reset_index(drop=True)], axis=1)
                
                else:     
                    # No stacking: accumulate the outcome of every model directly
                    results_all, pred_all, pred_train_all, proba_all, proba_train_all = \
                        fit_and_predict(m, s, generator, df, df_test, train_yr, test_yr, index_list, target_var, target_var_list, scaler, \
                            name, races_curr_same_cat, races_same_cat, results_all, proba_all, proba_train_all, pred_all, pred_train_all, \
                            multilabel, cross_validate, print_stats, stacking=False, concat=True
                        )
                        
            if ensemble==True:

                # Second level: fit the meta learner on this race's meta-feature matrices, which
                # take the place of df / df_test in the call.
                results_all, pred_all, pred_train_all, proba_all, proba_train_all = \
                    fit_and_predict(meta_learner, samplers[0], generator, P_train_race, P_test_race, train_yr, test_yr, index_list, target_var, target_var_list, scaler, \
                        name, races_curr_same_cat, races_same_cat, results_all, proba_all, proba_train_all, pred_all, pred_train_all,\
                        multilabel, cross_validate, print_stats, stacking=False, concat=True
                    )
                    
                # Keep the meta-feature matrices of every race
                Pstrain = pd.concat([Pstrain, P_train_race])  
                Pstest = pd.concat([Pstest, P_test_race])  
                
    if ensemble==True:
        return results_all, pred_all, pred_train_all, Pstest, Pstrain
    else:
        return results_all, pred_all, pred_train_all, proba_all, proba_train_all

In [7]:
def fit_and_predict(m, s, generator, df, df_test, train_yr, test_yr, index_list, target_var, target_var_list, scaler, \
                    name, races_curr_same_cat, races_same_cat, results_all, proba_all, proba_train_all, pred_all, pred_train_all,\
                    multilabel=False, cross_validate=False, print_stats=False, stacking=False, concat=False
                    ):
    """
       Run one model end to end: build the train/test split, fit, then predict.

       `m` and `s` are indexable pairs whose element 1 is the estimator / resampler.

       Returns:
       - concat=True : the five accumulator frames with this run appended, in the order
                       results_all, pred_all, pred_train_all, proba_all, proba_train_all
       - otherwise   : train_set, test_set, results, pred, pred_train, proba, proba_train
                       of this run only (the accumulators are left untouched)
    """

    split = build_train_test_set(
                df, df_test, train_yr, test_yr, index_list, target_var, target_var_list, scaler, \
                name, races_curr_same_cat, races_same_cat, multilabel, print_stats)
    train_set, test_set, Xs_train, Xs_test, Y_train, Y_test = split

    # The (possibly resampled) train data is what the model is evaluated on later
    X_fitted, y_fitted = MODEL_FITTING(Xs_train, Y_train, m[1], s[1], multilabel)

    # One model and one target variable, ie. a single column / meta feature per frame
    outcome = PREDICTION(train_set, test_set, X_fitted, Xs_test, y_fitted, Y_test, \
                         name, m, s, generator, index_list, target_var, target_var_list, \
                         cross_validate, stacking, multilabel)
    results, pred, pred_train, proba, proba_train = outcome

    if not (concat==True):
        return train_set, test_set, results, pred, pred_train, proba, proba_train

    # Append this run to the running totals
    results_total = pd.concat([results_all, results])            # performance of each estimator
    proba_total = pd.concat([proba_all, proba])                  # probabilities, test set
    proba_train_total = pd.concat([proba_train_all, proba_train])  # probabilities, train set
    pred_total = pd.concat([pred_all, pred])                     # predictions, test set
    pred_train_total = pd.concat([pred_train_all, pred_train])   # predictions, train set

    return results_total, pred_total, pred_train_total, proba_total, proba_train_total

### Functions to evaluate prediction results

In [9]:
def calc_classification_stats(name, target_var, classifier, sampler, y_test, y_pred, average):
    """
       Summarise the quality of one set of predictions in a one-row dataframe.

       Parameters:
       - name, target_var, classifier, sampler: labels stored in the 'Index', 'Target Var',
         'Method' and 'Resampler' columns (name and target_var are converted to str).
       - y_test, y_pred: true and predicted labels.
       - average: averaging mode handed to f1_score. With None the 'F1 Score' cell holds the
         array of per-class scores.

       Returns:
       Dataframe with the columns Index, Target Var, Method, Resampler, Baseline accuracy,
       Test accuracy, F1 Score, Avg Precision.
       Baseline accuracy is the share of the most frequent class in y_test.
    """

    actual = pd.Series(y_test)
    baseline_accuracy = float(actual.value_counts().max()) / actual.count()
    test_accuracy = accuracy_score(y_test, y_pred)
    avg_precision = average_precision_score(y_test, y_pred)
    # Honour the requested averaging mode (it used to be ignored in favour of average=None)
    f1 = f1_score(y_test, y_pred, average=average)

    # Each metric is wrapped in a one-element list so that an array of per-class scores ends up
    # in a single cell of the one-row dataframe.
    results_stats = pd.DataFrame({'Index': str(name), 'Target Var': str(target_var), 'Method': classifier, 'Resampler':sampler,\
                                  'Avg Precision': [avg_precision], 'F1 Score': [f1], 'Test accuracy': [test_accuracy], \
                                  'Baseline accuracy': [baseline_accuracy]})

    results_stats = results_stats[['Index', 'Target Var', 'Method', 'Resampler', "Baseline accuracy", "Test accuracy", 'F1 Score', 'Avg Precision']]

    return results_stats  

def format_stats_df(df, cols_w_array):
    """
       Expand columns that hold a pair of values per row into two separate columns.

       Every column listed in `cols_w_array` yields '<col> (minority)' and '<col> (majority)',
       appended to the right of the original columns. A 'Distance from baseline' column
       (Test accuracy - Baseline accuracy) is added as well. The result has a fresh 0..n-1 index.
    """
    expanded = [
        pd.DataFrame(df[col].values.tolist(),
                     columns=[str(col)+' (minority)', str(col)+' (majority)']).reset_index(drop=True)
        for col in cols_w_array
    ]

    formatted = pd.concat([df.reset_index(drop=True)] + expanded, axis=1)
    formatted['Distance from baseline'] = formatted['Test accuracy'] - formatted['Baseline accuracy']

    return formatted

def plot_algo_results(df, grp_col, metrics_list, sort_method, race_order=None):
    
    """
       Function to plot statistics of prediction results.

       For every metric in `metrics_list` the mean and the (population) standard deviation per
       value of `grp_col` are calculated and drawn as horizontal bar plots, one metric per row
       (mean on the left, standard deviation on the right).

       Parameters:
       - df: dataframe of results containing `grp_col` and the metric columns.
       - sort_method: "desc" orders the bars by the plotted value, "race" uses a fixed order.
       - race_order: order used with sort_method="race". If omitted, the notebook-level
                     variable `races` is used, as before.

       Returns:
       Dataframe of the grouped statistics that were plotted.

       Raises:
       ValueError if `sort_method` is neither "desc" nor "race".
    """

    def calc(frame, grp_col, col, new_col_name):
        if new_col_name == 'Std':
            frame = pd.DataFrame(frame.groupby([grp_col])[col].apply(lambda x: np.std(x)))
  
        elif new_col_name == 'Mean':
            frame = pd.DataFrame(frame.groupby([grp_col])[col].apply(lambda x: np.mean(x)))
            
        return frame.reset_index().rename(columns={col: col+ " (" + new_col_name + ")"})
    
    # This function only calculates mean and standard deviation
    def create_grp_stats(frame, grp_col, col):
        p_std = calc(frame, grp_col, col, 'Std')
        p_mean = calc(frame, grp_col, col, 'Mean')
        return pd.merge(p_mean, p_std, on=grp_col, how='left')
    
    # Set order 
    def sort_order(frame, col_to_sortby, grp_col):
        if sort_method == "desc":
            ordering = frame.sort_values([col_to_sortby])[grp_col].unique()
        elif sort_method == "race":
            ordering = races if race_order is None else race_order
        else:
            raise ValueError("Only desc or race are accepted keywords for sort_method variable")

        return [str(item) for item in reversed(list(ordering))]
        
    # plot results
    def plot_barplot(frame, x, y, ids, row, col):
        # Draw straight onto the prepared subplot. Opening a new figure here would only leave
        # an empty extra figure behind for every bar plot.
        g = sns.barplot(x=x, y=y, data = frame, order=ids, palette="Set3", orient = "h", ax=axes[row][col])
        g.set_title(x, fontsize=16)
        for p in g.patches:
            width = p.get_width()
            g.text(width*1.05, p.get_y()+0.55*p.get_height(), '{:1.2f}'.format(width), ha='center', va='center')
        
    df_new = pd.DataFrame()
    for metric in metrics_list:
        p = create_grp_stats(df, grp_col, metric) 
        df_new = pd.concat([df_new, p], axis=1)
        # Drops the repeated group column (any column duplicating another one is dropped too)
        df_new = df_new.T.drop_duplicates().T
    
    if len(df_new.columns) == 3:
        nrows = 1
        ncols = 2
    else:
        nrows = len(df_new.columns)-len(metrics_list)-1
        ncols = 2

    to_plot = df_new.columns[1:]
    fig, axes = plt.subplots(nrows = nrows, ncols = ncols, sharex="all", figsize=(15,15), squeeze=False)
    fig.subplots_adjust(wspace=0.5)
    counter = 0
    for row in range(nrows):
        for col in range(ncols):
            if counter >= len(to_plot):
                # Fewer statistics than subplots: hide the unused axes instead of failing
                axes[row][col].set_visible(False)
                continue
            ids = sort_order(df_new, to_plot[counter], grp_col)
            plot_barplot(df_new, to_plot[counter], grp_col, ids, row, col)
            counter += 1
    plt.tight_layout()
            
    return df_new

### Grid Search

In [41]:
def grid_search_wrapper_loop(df, df_test, train_yr, test_yr, index_list, target_var_list, 
                             scaler, races_to_use, races_curr_same_cat, races_same_cat, 
                             clf, param_grid, scorer, refit, print_stats=False, plot_params=False
                            ):
    """
       Run a grid search on the train set of every race in `races_to_use`.

       The train set is built with build_train_test_set() (`target_var_list` is used as the
       target) and the labels are inverted (0 <-> 1) before searching.

       Parameters:
       - clf, param_grid, scorer, refit: forwarded to the grid search helpers.
       - print_stats: forwarded to build_train_test_set().
       - plot_params: if True each parameter is searched and plotted on its own
                      (plot_indiv_params) and no results are collected.

       Returns:
       Dataframe with the top results of search_all_params() for every race
       (empty if plot_params=True or `races_to_use` is empty).
    """

    collected = []
    
    for name in races_to_use:
        
        train_set, test_set, Xs_train, Xs_test, Y_train, Y_test = \
          build_train_test_set(
                            df, df_test, train_yr, test_yr, 
                            index_list, target_var_list, target_var_list, scaler, 
                            name, races_curr_same_cat, races_same_cat, multilabel=False, print_stats=print_stats)
            
        # Invert the labels: every non-zero label becomes 0 and every 0 becomes 1
        Y_train_flip = np.logical_not(Y_train).astype(int).ravel()
        
        if plot_params==True:
            plot_indiv_params(Xs_train, Y_train_flip, clf, param_grid, scorer, refit)
        else:
            collected.append(search_all_params(name, Xs_train, Y_train_flip, clf, param_grid, scorer, refit))
            
    if not collected:
        return pd.DataFrame()

    return pd.concat(collected)

def plot_indiv_params(Xs_train, Y_train_flip, clf, param_grid, scorer, refit):
    """
       Grid-search every parameter of `param_grid` on its own and plot the mean
       cross-validated test score against the parameter value, one subplot per parameter.

       Parameters:
       - Xs_train, Y_train_flip: training features and labels.
       - clf: estimator to tune.
       - param_grid: dict mapping a parameter name to the values to try.
       - scorer, refit: passed to GridSearchCV as `scoring` and `refit`.

       NOTE(review): the cross-validation splitter `skfold` is not defined in this function -
       presumably a notebook-level variable; confirm it is set before this is called.
    """

    # Two plots per row; at least three rows so the layout is unchanged for up to six parameters
    n_rows = max(3, (len(param_grid) + 1) // 2)

    plt.figure(figsize=(16,12))
    for position, (parameter, param_range) in enumerate(param_grid.items(), 1):
        grid_search = GridSearchCV(clf, param_grid = {parameter: param_range}, scoring=scorer, refit=refit, cv=skfold, return_train_score=True, n_jobs=-1)
        grid_search.fit(Xs_train, Y_train_flip)

        # cv_results_ is the supported result container; grid_scores_ only exists in old
        # scikit-learn releases and is kept as a fallback.
        cv_results = getattr(grid_search, 'cv_results_', None)
        if cv_results is not None:
            # With several scorers the key is named after the scorer selected by `refit`
            mean_key = 'mean_test_score' if 'mean_test_score' in cv_results else 'mean_test_%s' % refit
            score_by_value = dict((params[parameter], mean)
                                  for params, mean in zip(cv_results['params'], cv_results[mean_key]))
        else:
            score_by_value = dict((score[0][parameter], score[1]) for score in grid_search.grid_scores_)

        scores = pd.DataFrame.from_dict(score_by_value, orient='index')
        scores = scores.reset_index(level=0).sort_values(by='index')

        plt.subplot(n_rows, 2, position)
        plt.plot(scores['index'], scores[0])
        plt.title(parameter)
    
    return

def search_all_params(name, Xs_train, Y_train_flip, clf, param_grid, scorer, refit):
    """
       Grid-search `param_grid` for `clf` and return the five best parameter combinations.

       Returns:
       Dataframe (best first, at most 5 rows) with the columns
       parameters, mean_validation_score, cv_validation_scores, name, scorer,
       classifiers (an unfitted copy of `clf` with the row's parameters) and best score.

       NOTE(review): the cross-validation splitter `skfold` is not defined in this function -
       presumably a notebook-level variable; confirm it is set before this is called.
    """

    grid_search = GridSearchCV(clf, param_grid, scoring=scorer, refit=refit, cv=skfold, return_train_score=True, n_jobs=-1)
    grid_search.fit(Xs_train, Y_train_flip)

    # cv_results_ is the supported result container; grid_scores_ only exists in old
    # scikit-learn releases and is kept as a fallback. The frame built from cv_results_
    # keeps the column names grid_scores_ used to provide.
    cv_results = getattr(grid_search, 'cv_results_', None)
    if cv_results is not None:
        # With several scorers the keys are named after the scorer selected by `refit`
        mean_key = 'mean_test_score' if 'mean_test_score' in cv_results else 'mean_test_%s' % refit
        suffix = mean_key[len('mean'):]

        # Collect the per-split test scores: split0_..., split1_..., until a key is missing
        split_scores = []
        while ('split%d%s' % (len(split_scores), suffix)) in cv_results:
            split_scores.append(cv_results['split%d%s' % (len(split_scores), suffix)])

        n_candidates = len(cv_results['params'])
        if split_scores:
            per_candidate = list(np.column_stack(split_scores)) # one array of split scores per candidate
        else:
            per_candidate = [None] * n_candidates

        search_results = pd.DataFrame({'parameters': list(cv_results['params']),
                                       'mean_validation_score': list(cv_results[mean_key]),
                                       'cv_validation_scores': per_candidate},
                                      columns=['parameters', 'mean_validation_score', 'cv_validation_scores'])
    else:
        search_results = pd.DataFrame(grid_search.grid_scores_)

    search_results['name'] = name
    search_results['scorer'] = scorer
    # Clone first, then set the parameters: the caller's estimator stays untouched and no
    # parameter leaks from one candidate to the next.
    search_results['classifiers'] = [clone(clf).set_params(**params) for params in search_results['parameters']]
    search_results['best score'] = grid_search.best_score_
    search_results = search_results.sort_values('mean_validation_score', ascending=False).head(5)

    return search_results.reset_index(drop=True)

### Determine predictors with near zero or zero variance (Required for regression)

In [24]:
def nearZeroVariance(X, freqCut = 95 / 5, uniqueCut = 5):
    '''
    Determine predictors with near zero or zero variance.
    Inputs:
    X: pandas data frame
    freqCut: the cutoff for the ratio of the most common value to the second most common value
    uniqueCut: the cutoff for the percentage of distinct values out of the number of total samples
    Returns a tuple containing a list of column names: (zeroVar, nzVar)

    zeroVar: columns with at most one distinct non-missing value (this includes columns that
             contain missing values only).
    nzVar:   columns whose percentage of distinct values is below uniqueCut AND whose frequency
             ratio is above freqCut.
    Both lists follow the column order of X.
    '''

    zeroVar = list()
    nzVar = list()

    for column in X.columns.values.tolist():
        values = X[column]
        # Occurrences of each distinct value, most frequent first (missing values are not counted)
        counts = values.value_counts().sort_values(ascending = False).values

        if len(counts) <= 1:
            # Nothing varies in this column; there is no second value to build a ratio from
            zeroVar.append(column)
            continue

        # float() keeps the ratio exact under Python 2, where int / int truncates
        freq_ratio = float(counts[0]) / counts[1]
        unique_pct = (float(len(counts)) / len(values)) * 100

        if unique_pct < uniqueCut and freq_ratio > freqCut:
            nzVar.append(column)

    return(zeroVar, nzVar)

In [25]:
def zV_nzV_filter(df, index_list, target_var_list):
    """
       Return the column names that remain after dropping the predictors with zero or
       near zero variance (as reported by nearZeroVariance with its default cutoffs).

       Index columns and target variables are never checked. The returned list holds the
       surviving predictors, followed by `index_list` and `target_var_list`.

       NOTE: this function is defined in two consecutive cells of this notebook.
    """
    # Work on a copy so the caller's index_list is never modified
    index_cols = list(index_list)

    # Everything that is neither a target nor an index column is a predictor
    targets_in_df = [col for col in df.columns if col in target_var_list]
    excluded = targets_in_df + index_cols
    predictors = df.loc[:, np.setdiff1d(df.columns, np.array(excluded))]

    # Predictors failing the variance check
    zero_var, near_zero_var = nearZeroVariance(predictors)
    rejected = list(zero_var) + list(near_zero_var)

    # Predictors that passed, then the original index and target variables
    kept = [col for col in predictors.columns.tolist() if col not in rejected]
    kept.extend(index_cols)
    kept.extend(target_var_list)

    return kept

In [26]:
def zV_nzV_filter(df, index_list, target_var_list):
    """
       Return the column names that remain after dropping the predictors with zero or
       near zero variance (as reported by nearZeroVariance with its default cutoffs).

       Index columns and target variables are never checked. The returned list holds the
       surviving predictors, followed by `index_list` and `target_var_list`.

       NOTE: this function is defined in two consecutive cells of this notebook; this later
       definition is the one in effect once both cells have run.
    """
    # Work on a copy so the caller's index_list is never modified
    index_cols = list(index_list)

    # Everything that is neither a target nor an index column is a predictor
    targets_in_df = [col for col in df.columns if col in target_var_list]
    excluded = targets_in_df + index_cols
    predictors = df.loc[:, np.setdiff1d(df.columns, np.array(excluded))]

    # Predictors failing the variance check
    zero_var, near_zero_var = nearZeroVariance(predictors)
    rejected = list(zero_var) + list(near_zero_var)

    # Predictors that passed, then the original index and target variables
    kept = [col for col in predictors.columns.tolist() if col not in rejected]
    kept.extend(index_cols)
    kept.extend(target_var_list)

    return kept

In [27]:
def VIF_filter(df, index_list, target_var_list, vif_threshold=10):
    """
       Return the column names that remain after dropping the predictors whose
       Variance Inflation Factor (VIF) is not below `vif_threshold`.

       Parameters:
       - df: dataframe holding index columns, predictors and target variables.
       - index_list, target_var_list: columns that are never checked.
       - vif_threshold: predictors are kept only if their VIF is strictly below this value
                        (default 10).

       Returns:
       List with the surviving predictors, followed by `index_list` and `target_var_list`.
    """
    
    # IMPORTANT! copy the index_list variable or else it will be modified 
    idxes = list(index_list)
    
    # Select predictor columns
    no_check_cols = [c for c in df.columns if c in target_var_list] + idxes
    learning_columns = np.setdiff1d(df.columns, np.array(no_check_cols))
    df_tocheck = df.loc[:, list(learning_columns)]
             
    # Check for Variance Inflation Factor
    # For each X, calculate VIF against all other predictors and keep the ones below the threshold
    design_matrix = df_tocheck.values
    features_below_threshold = [
        feature for position, feature in enumerate(df_tocheck.columns)
        if variance_inflation_factor(design_matrix, position) < vif_threshold
    ]
    
    # Append original index and target variable
    features_below_threshold.extend(idxes)
    features_below_threshold.extend(target_var_list)
        
    return features_below_threshold