In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn

In [25]:
#transforming
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

#dimensionality reduction
from sklearn.decomposition import PCA

#resampling (need to use imblearn pipeline)
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline


#performance measures
from sklearn.metrics import fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer


#ML models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [17]:
full_frame = pd.read_csv("/Users/FelixHoffmann/Desktop/GreenLeaders/shared datasets/Final_Training_Data_65515.csv",index_col=0)

In [13]:
#function to create 120 pipelines for each ML Model it is applied to

#5 resampling options
resampling=[('None'),
            ('Rand_Over',('over', RandomOverSampler())),
            ('Rand_Under',('under', RandomUnderSampler())),
            ('SMOTE',('over', SMOTE())),
            ('SMOTE_Under',('over',SMOTE()),('under',RandomUnderSampler()))]

#6 transformation options
transformations=[('None'),
                ('Yeo_Johnson',('t1', PowerTransformer())),
                ('Box_Cox',('t1', MinMaxScaler(feature_range=(1, 2))),('t2', PowerTransformer(method='box-cox'))),
                ('MinMax',('t1', MinMaxScaler())),
                ('Standardize',('t1', StandardScaler())),
                ('Robust_Scaler',('t1', RobustScaler()))]

#4 dimensionality reduction options
dim_reductions=[('None'),
                ('PCA4',('pca',PCA(n_components=4))),
                ('PCA8',('pca',PCA(n_components=8))),
                ('PCA17',('pca',PCA(n_components=17)))]


def get_mixed_pipelines(model):

    pipelines = list()

    #for each dimensionality reduction
    for lijn in dim_reductions:

        #for each resampling technique
        for line in resampling:    

            #for each transformation
            for zeile in transformations:
                
                if lijn == 'None':
            
                    if line == 'None':
                        if zeile == 'None':
                            pipelines.append((f"{zeile}_{line}_{lijn}", imblearn.pipeline.Pipeline([('m',model)])))
                        elif len(zeile)<3:
                            pipelines.append((f"{zeile[0]}_{line}_{lijn}", imblearn.pipeline.Pipeline([zeile[1],('m',model)])))
                        else: 
                            pipelines.append((f"{zeile[0]}_{line}_{lijn}", imblearn.pipeline.Pipeline([zeile[1],zeile[2],('m',model)])))


                    elif len(line)<3:
                        if zeile == 'None':
                            pipelines.append((f"{zeile}_{line[0]}_{lijn}", imblearn.pipeline.Pipeline([line[1],('m',model)])))
                        elif len(zeile)<3:
                            pipelines.append((f"{zeile[0]}_{line[0]}_{lijn}", imblearn.pipeline.Pipeline([zeile[1],line[1],('m',model)])))
                        else: 
                            pipelines.append((f"{zeile[0]}_{line[0]}_{lijn}", imblearn.pipeline.Pipeline([zeile[1],zeile[2],line[1],('m',model)])))

                    else:
                        if zeile == 'None':
                            pipelines.append((f"{zeile}_{line[0]}_{lijn}", imblearn.pipeline.Pipeline([line[1],line[2],('m',model)])))
                        elif len(zeile)<3:
                            pipelines.append((f"{zeile[0]}_{line[0]}_{lijn}", imblearn.pipeline.Pipeline([zeile[1],line[1],line[2],('m',model)])))
                        else:
                            pipelines.append((f"{zeile[0]}_{line[0]}_{lijn}", imblearn.pipeline.Pipeline([zeile[1],zeile[2],line[1],line[2],('m',model)])))

                else: 
                    if line == 'None':
                        if zeile == 'None':
                            pipelines.append((f"{zeile}_{lijn[0]}_{line}", imblearn.pipeline.Pipeline([lijn[1],('m',model)])))
                        elif len(zeile)<3:
                            pipelines.append((f"{zeile[0]}_{lijn[0]}_{line}", imblearn.pipeline.Pipeline([zeile[1],lijn[1],('m',model)])))
                        else: 
                            pipelines.append((f"{zeile[0]}_{lijn[0]}_{line}", imblearn.pipeline.Pipeline([zeile[1],zeile[2],lijn[1],('m',model)])))

                    elif len(line)<3:
                        if zeile == 'None':
                            pipelines.append((f"{zeile}_{lijn[0]}_{line[0]}", imblearn.pipeline.Pipeline([lijn[1],line[1],('m',model)])))
                        elif len(zeile)<3:
                            pipelines.append((f"{zeile[0]}_{lijn[0]}_{line[0]}", imblearn.pipeline.Pipeline([zeile[1],lijn[1],line[1],('m',model)])))
                        else: 
                            pipelines.append((f"{zeile[0]}_{lijn[0]}_{line[0]}", imblearn.pipeline.Pipeline([zeile[1],zeile[2],lijn[1],line[1],('m',model)])))

                    else:
                        if zeile == 'None':
                            pipelines.append((f"{zeile}_{lijn[0]}_{line[0]}", imblearn.pipeline.Pipeline([lijn[1],line[1],line[2],('m',model)])))
                        elif len(zeile)<3:
                            pipelines.append((f"{zeile[0]}_{lijn[0]}_{line[0]}", imblearn.pipeline.Pipeline([zeile[1],lijn[1],line[1],line[2],('m',model)])))
                        else:
                            pipelines.append((f"{zeile[0]}_{lijn[0]}_{line[0]}", imblearn.pipeline.Pipeline([zeile[1],zeile[2],lijn[1],line[1],line[2],('m',model)])))
                            
                            
    return pipelines

In [14]:
#example: Get 120 pipelines for LDA:
get_mixed_pipelines(LinearDiscriminantAnalysis())

[('None_None_None', Pipeline(steps=[('m', LinearDiscriminantAnalysis())])),
 ('Yeo_Johnson_None_None', Pipeline(steps=[('t1', PowerTransformer()),
                  ('m', LinearDiscriminantAnalysis())])),
 ('Box_Cox_None_None',
  Pipeline(steps=[('t1', MinMaxScaler(feature_range=(1, 2))),
                  ('t2', PowerTransformer(method='box-cox')),
                  ('m', LinearDiscriminantAnalysis())])),
 ('MinMax_None_None',
  Pipeline(steps=[('t1', MinMaxScaler()), ('m', LinearDiscriminantAnalysis())])),
 ('Standardize_None_None',
  Pipeline(steps=[('t1', StandardScaler()), ('m', LinearDiscriminantAnalysis())])),
 ('Robust_Scaler_None_None',
  Pipeline(steps=[('t1', RobustScaler()), ('m', LinearDiscriminantAnalysis())])),
 ('None_Rand_Over_None', Pipeline(steps=[('over', RandomOverSampler()),
                  ('m', LinearDiscriminantAnalysis())])),
 ('Yeo_Johnson_Rand_Over_None',
  Pipeline(steps=[('t1', PowerTransformer()), ('over', RandomOverSampler()),
                  ('m', L

In [26]:
#Create table containing training time, Recall, F2, ROCAUC of each pipeline for logistic regression

ftwo_scorer = make_scorer(fbeta_score, beta=2)

X = full_frame.drop(columns='GreenLeaderBinary')
y = full_frame['GreenLeaderBinary']


model=LogisticRegression()
pipelines = get_mixed_pipelines(model)

spotchecks_LG = []

for name, pipeline in pipelines:
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

    scores = sklearn.model_selection.cross_validate(pipeline, X, y, scoring = {'recall': 'recall','ftwo_scorer': make_scorer(fbeta_score, beta=2),'roc_auc':'roc_auc'}, cv=cv, n_jobs=-1)
    
    print(name, np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3))
    spotchecks_LG.append((name,np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3)))

pd.DataFrame(spotchecks_LG).to_csv('Evaluation_List_LogReg.csv')

None_None_None 4.795 0.04 0.048 0.798
Yeo_Johnson_None_None 12.166 0.108 0.128 0.876
Box_Cox_None_None 11.389 0.106 0.126 0.875
MinMax_None_None 5.456 0.097 0.115 0.858
Standardize_None_None 4.59 0.1 0.118 0.858
Robust_Scaler_None_None 6.453 0.095 0.112 0.852
None_Rand_Over_None 12.405 0.705 0.372 0.838
Yeo_Johnson_Rand_Over_None 21.568 0.795 0.398 0.878
Box_Cox_Rand_Over_None 20.692 0.797 0.4 0.878
MinMax_Rand_Over_None 12.204 0.758 0.385 0.865
Standardize_Rand_Over_None 9.69 0.759 0.392 0.867
Robust_Scaler_Rand_Over_None 12.078 0.761 0.385 0.865
None_Rand_Under_None 0.648 0.72 0.376 0.838
Yeo_Johnson_Rand_Under_None 10.641 0.803 0.391 0.875
Box_Cox_Rand_Under_None 8.135 0.804 0.393 0.874
MinMax_Rand_Under_None 0.632 0.769 0.369 0.859
Standardize_Rand_Under_None 0.882 0.767 0.382 0.863
Robust_Scaler_Rand_Under_None 0.951 0.766 0.378 0.863
None_SMOTE_None 13.231 0.651 0.353 0.81
Yeo_Johnson_SMOTE_None 21.687 0.779 0.395 0.874
Box_Cox_SMOTE_None 19.218 0.785 0.399 0.874
MinMax_SMOTE_Non

In [None]:
#Create table containing training time, Recall, F2, ROCAUC of each pipeline for LDA

ftwo_scorer = make_scorer(fbeta_score, beta=2)

X = full_frame.drop(columns='GreenLeaderBinary')
y = full_frame['GreenLeaderBinary']


model=LinearDiscriminantAnalysis()
pipelines = get_mixed_pipelines(model)

spotchecks_LG = []

for name, pipeline in pipelines:
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

    scores = sklearn.model_selection.cross_validate(pipeline, X, y, scoring = {'recall': 'recall','ftwo_scorer': make_scorer(fbeta_score, beta=2),'roc_auc':'roc_auc'}, cv=cv, n_jobs=-1)
    
    print(name, np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3))
    spotchecks_LG.append((name,np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3)))

pd.DataFrame(spotchecks_LG).to_csv('Evaluation_List_LDA.csv')

In [None]:
#Create table containing training time, Recall, F2, ROCAUC of each pipeline for QDA

ftwo_scorer = make_scorer(fbeta_score, beta=2)

X = full_frame.drop(columns='GreenLeaderBinary')
y = full_frame['GreenLeaderBinary']


model=QuadraticDiscriminantAnalysis()
pipelines = get_mixed_pipelines(model)

spotchecks_LG = []

for name, pipeline in pipelines:
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

    scores = sklearn.model_selection.cross_validate(pipeline, X, y, scoring = {'recall': 'recall','ftwo_scorer': make_scorer(fbeta_score, beta=2),'roc_auc':'roc_auc'}, cv=cv, n_jobs=-1)
    
    print(name, np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3))
    spotchecks_LG.append((name,np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3)))

pd.DataFrame(spotchecks_LG).to_csv('Evaluation_List_QDA.csv')

In [None]:
#Create table containing training time, Recall, F2, ROCAUC of each pipeline for Random Forest

ftwo_scorer = make_scorer(fbeta_score, beta=2)

X = full_frame.drop(columns='GreenLeaderBinary')
y = full_frame['GreenLeaderBinary']


model=RandomForestClassifier()
pipelines = get_mixed_pipelines(model)

spotchecks_LG = []

for name, pipeline in pipelines:
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

    scores = sklearn.model_selection.cross_validate(pipeline, X, y, scoring = {'recall': 'recall','ftwo_scorer': make_scorer(fbeta_score, beta=2),'roc_auc':'roc_auc'}, cv=cv, n_jobs=-1)
    
    print(name, np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3))
    spotchecks_LG.append((name,np.round(np.mean(scores['fit_time']),3),np.round(np.mean(scores['test_recall']),3),np.round(np.mean(scores['test_ftwo_scorer']),3),np.round(np.mean(scores['test_roc_auc']),3)))

pd.DataFrame(spotchecks_LG).to_csv('Evaluation_List_RandomForest.csv')

In [36]:
# from the above tables, the highest performing pipelines for each metric were determined.
# in order to better understand their qualities, averaged confusion matrices are calculated
# the calculation of metrics was done using 10-fold cross validation
# here, 10 confusion matrices calculated based on the sub-groups from cross validation are averaged

#get y and X
y=np.array(full_frame['GreenLeaderBinary'])
X=np.array(full_frame.drop(columns=['GreenLeaderBinary']))

#create empty list to save confusion matrices
conf_matrix_list_of_arrays = []

#set cross validation parameters
kf=RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)


#Highest Recall
#pipeline= imblearn.pipeline.Pipeline(steps=[
#                  ('under', RandomUnderSampler()),
#                  ('m', QuadraticDiscriminantAnalysis())])


#Highest F2 Score
#pipeline= imblearn.pipeline.Pipeline(steps=[('t1', RobustScaler()), ('over', SMOTE()),
#                  ('under', RandomUnderSampler()),
#                  ('m', QuadraticDiscriminantAnalysis())])

        
#highest ROC AUC score        
pipeline= imblearn.pipeline.Pipeline(steps=[('t1', MinMaxScaler()),
                  ('over', SMOTE()),
                  ('m', RandomForestClassifier())])

        
        
#use groups of observations determined by cross validation function        
for train_index, test_index in kf.split(X,y):
    
    #get y and X of subsample
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #fit model and calculate confusion matrix for one of the 10 cases
    pipeline.fit(X_train, y_train)
    conf_matrix = confusion_matrix(y_test, pipeline.predict(X_test))
    
    #save confusion matrices in list
    conf_matrix_list_of_arrays.append(conf_matrix)
    
#return averaged and rounded confusion matrix
np.round(np.mean(conf_matrix_list_of_arrays,axis=0))

array([[6252.,   62.],
       [ 177.,   61.]])

# F2, recall, ROCAUC metric leaders confusion matrices

In [33]:
#QDA for best F2
np.round(np.mean(conf_matrix_list_of_arrays,axis=0))

array([[5369.,  945.],
       [  76.,  162.]])

In [35]:
#QDA best for recall
np.round(np.mean(conf_matrix_list_of_arrays,axis=0))

array([[2900., 3414.],
       [  46.,  192.]])

In [37]:
#Random Forest for best ROCAUC
np.round(np.mean(conf_matrix_list_of_arrays,axis=0))

array([[6252.,   62.],
       [ 177.,   61.]])