# Example Evaluation Code

This notebook will be very __similar__ to the code I use to evaluate your results - it is provided for __your convenience__ so that you can use it to evaluate your preprocessing results at any time before your __final submission__.

Please note that the results here will __NOT__ be the same as my evaluation results.

Let's start with loading the required packages.

In [12]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import shapiro
import seaborn as sns
from scipy.stats import zscore, skew
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from itertools import chain
import xgboost
from sklearn import metrics, model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.decomposition import PCA

from sklearn.metrics import roc_curve, confusion_matrix

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 500)

Next you should load __your__ data. In this case, I am using a sample dataset (`GroupX.csv`) which contains 6 predictors (`X1 - X6`) and two target variables (`Y1, Y2`).

Please make sure you change the data to your __OWN__ dataset when using this code.

__NOTE__:
1. Your dataset maybe very different from the sample dataset.
2. Please follow this structure when submitting your dataset.

In [8]:
feat_mat = pd.read_csv('feat_mat.csv', header=0).drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'])
tar = pd.read_csv('target_df.csv')
feat_mat.head()

Unnamed: 0,C1,C3_imputed,C4,C7_imputed,words_per_sentence,real_word_per,non_real_word_per,short_sentences,short_sentences_per,long_sentences_per,...,outstanding_share_per_iqr_standardized_normalized,offering_share_per_iqr_standardized_normalized,C5_prime_iqr_standardized_normalized,C6_prime_iqr_standardized_normalized,C2,data_updated,C3_prime,industry_bin__Manufacturing,industry_bin__Other,industry_bin__Services
0,0.053924,0.169794,0.749962,0.001671,0.372945,0.944727,0.055273,0.200306,0.224992,0.775008,...,0.139724,-0.127188,0.043286,0.920862,1.0,0,1,1,0,0
1,0.119884,0.168708,0.583748,0.000843,0.394163,0.934175,0.065825,0.37156,0.219755,0.780245,...,2.201454,-2.26593,1.745398,-0.448096,0.0,0,0,1,0,0
2,0.038517,0.16879,0.717213,0.000238,0.334588,0.94522,0.05478,0.053517,0.22936,0.77064,...,-0.217267,0.226597,-0.308389,-2.056332,1.0,0,0,1,0,0
3,0.095811,0.168861,0.714502,0.000275,0.291162,0.953234,0.046766,0.172783,0.346964,0.653036,...,-0.273287,0.281867,-0.35944,-1.170267,1.0,0,0,1,0,0
4,0.033702,0.169072,0.499347,0.020605,0.323613,0.923515,0.076485,0.302752,0.289146,0.710854,...,0.047082,-0.03509,-0.052653,0.537671,1.0,0,1,0,1,0


Now you need to specify your targets and predictors. __NOTE__ we have two targets here (`Y1, Y2`).

In [9]:
y2 = tar.Y2

Check the shape of the data.

It is very possible that you will use different sets of the predictors for `Y1` and `Y2`. Now let's define them.

First, let's define predictors for `Y1` - which will be the first 5 features in `data`.

Use below code to select the first 5 features as predictors for `Y1`.

Upon investigation of the data, we know we have __six__ features (`X1 - X6`) predicting `Y2`. Use similar code (as below) to select them.

In [13]:
def get_important_features_rfe(xTrain, yTrain, num_feats):
    '''
    Takes training values and returns the most important features in order
    '''
    clf = LogisticRegression(random_state=123, max_iter = 1000)
    selector = RFE(clf, num_feats, step=1)
    selector.fit(xTrain, yTrain)
    
    return [x for _,x in sorted(zip(selector.ranking_, list(xTrain.columns)))]
    
    
feat_rfe = get_important_features_rfe(feat_mat[[i for i in list(feat_mat.columns) 
                                     if i != 'I1'and 'iqr_standardized_normalized' in i]],
                          y2, 10)

In [50]:
predictors_y2 = feat_mat[feat_rfe[:10]]

# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []


#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    #Model building
    clf1 = LogisticRegression()

    
    # Splitting data into testing and training
    X2_train, X2_test, y2_train, y2_test = train_test_split(predictors_y2, y2, test_size=0.2, random_state=123)
    
    # Begin oversampling
    oversample = pd.concat([X2_train,y2_train],axis=1)
    max_size = oversample['Y2'].value_counts().max()
    lst = [oversample]
    for class_index, group in oversample.groupby('Y2'):
        lst.append(group.sample(max_size-len(group), replace=True))
    X2_train = pd.concat(lst)
    y2_train=pd.DataFrame.copy(X2_train['Y2'])
    del X2_train['Y2']
    
    # fitting model on oversampled data
    clf1.fit(X2_train, y2_train)
    
    y2_pred = clf1.predict(X2_test)
    
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=123)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(clf1, X2_train, y2_train, cv=kfold, scoring=scoring)
    
    #calculate f1-score and AUC
    
    clf1_roc_auc = roc_auc_score(y2_test, y2_pred)
    
    
    #calculate average f1-score and AUC
    f1_score_lst.append(precision_recall_fscore_support(y2_test, y2_pred, average='weighted')[2])
    auc_lst.append(clf1_roc_auc)
    
    
print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

confusion_matrix_y2 = confusion_matrix(y2_test, y2_pred)


print('Accuracy of classifier on test set: {:.3f}'.format(clf1.score(X2_test, y2_test)))

print("10-fold cross validation average accuracy of clf1: %.3f" % (results.mean()))

print('Confusion Matrix for Classfier:')
print(confusion_matrix_y2)

print('Classification Report for Classfier:')
print(classification_report(y2_test, y2_pred))

imp = pd.DataFrame(zip(predictors_y2.columns, clf1.coef_[0]))





F1 0.6741; AUC 0.6122 
Accuracy of classifier on test set: 0.659
10-fold cross validation average accuracy of clf1: 0.620
Confusion Matrix for Classfier:
[[17 16]
 [29 70]]
Classification Report for Classfier:
              precision    recall  f1-score   support

           0       0.37      0.52      0.43        33
           1       0.81      0.71      0.76        99

    accuracy                           0.66       132
   macro avg       0.59      0.61      0.59       132
weighted avg       0.70      0.66      0.68       132



In [45]:
imp.columns = ['field', 'imp']
imp.sort_values(by = 'imp')
imp.sort_values('imp')

Unnamed: 0,field,imp
5,outstanding_share_per_iqr_standardized_normalized,-0.11344
7,real_word_per_iqr_standardized_normalized,-0.081528
3,non_real_word_per_iqr_standardized_normalized,-0.073219
8,short_sentences_iqr_standardized_normalized,-0.063225
6,pos_words_percent_iqr_standardized_normalized,0.061861
9,uncertain_words_percent_iqr_standardized_norma...,0.146997
0,C4_iqr_standardized_normalized,0.206251
4,offering_share_per_iqr_standardized_normalized,0.316794
2,C6_prime_iqr_standardized_normalized,0.5641
1,C5_prime_iqr_standardized_normalized,0.630189


### PCA

In [51]:
X2_train

Unnamed: 0,C4_iqr_standardized_normalized,C5_prime_iqr_standardized_normalized,C6_prime_iqr_standardized_normalized,non_real_word_per_iqr_standardized_normalized,offering_share_per_iqr_standardized_normalized,outstanding_share_per_iqr_standardized_normalized,pos_words_percent_iqr_standardized_normalized,real_word_per_iqr_standardized_normalized,short_sentences_iqr_standardized_normalized,uncertain_words_percent_iqr_standardized_normalized
303,-1.986528,-0.702535,0.227854,0.105486,0.682907,-0.681289,-1.185910,-0.091443,-0.881927,-0.885924
75,0.052305,1.476454,0.850828,1.744737,-1.205474,1.204023,-0.915965,-1.767273,-0.945286,2.061087
267,-1.102548,0.191677,0.554070,0.925739,-0.263660,0.276583,-1.870109,-0.934757,-0.205251,-2.056006
491,0.082544,0.747594,-0.280780,1.733440,-0.718293,0.728345,0.424286,-1.756046,-0.494828,-0.567506
101,-0.705530,0.464080,0.705885,-0.002472,-0.496854,0.509157,-1.783850,0.018488,-0.554385,2.061087
...,...,...,...,...,...,...,...,...,...,...
628,-1.027777,1.745398,0.603822,1.177085,-1.666626,1.644745,0.011439,-1.193086,-0.319479,0.681187
599,-1.419823,-1.102998,0.938617,-1.275447,1.227557,-1.237775,0.386357,1.270932,0.866624,-0.553741
247,0.961533,-0.839893,-0.837195,-0.835533,0.859497,-0.861561,0.249806,0.848932,-0.224144,0.306613
251,-1.378213,1.745398,-1.472387,-0.921725,-1.564697,1.548202,-0.158397,0.932630,1.161151,-1.149096


In [46]:
pca = PCA(0.95)
pca_transformed = pca.fit_transform(feat_mat[[i for i in feat_rfe[:10]]])


In [75]:
X2_train

Unnamed: 0,0,1,2,3,4,5,6
0,1.226255,-0.132458,0.228051,-0.835661,1.461639,-1.022270,1.719716
1,-1.927918,-2.241339,2.962121,-0.205626,-0.224036,-0.239944,-0.491821
2,-0.071858,-1.697794,-0.012033,0.311111,1.412772,0.011048,2.493536
3,-0.703054,-2.672157,0.077995,-0.608705,-0.553440,0.428641,0.156327
4,-1.058241,0.388617,2.608713,-0.062291,1.018400,-0.579937,-0.030870
...,...,...,...,...,...,...,...
52,-1.761582,0.353911,-0.578827,1.489498,0.103392,1.290776,-0.390665
97,-0.426711,-0.171188,1.026408,-0.009798,0.479818,-0.278247,-0.531803
343,-0.784074,-0.612558,-2.258011,-0.664215,0.514634,0.301464,0.632781
237,-0.832929,0.543464,-1.837011,0.379254,-1.326105,0.924952,-0.345398


In [74]:
predictors_y2 = pca_transformed

# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []


#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    #Model building
    clf1 = LogisticRegression()

    
    # Splitting data into testing and training
    X2_train, X2_test, y2_train, y2_test = train_test_split(predictors_y2, y2, test_size=0.2, random_state=123)
    X2_train = pd.DataFrame(X2_train)
    X2_train['Y2'] = y2_train
    # Begin oversampling
    oversample = X2_train
    max_size = oversample['Y2'].value_counts().max()
    lst = [oversample]
    for class_index, group in oversample.groupby('Y2'):
        lst.append(group.sample(max_size-len(group), replace=True))
    X2_train = pd.concat(lst)
    y2_train=pd.DataFrame.copy(X2_train['Y2'])
    del X2_train['Y2']
    
    # fitting model on oversampled data
    clf1.fit(pd.DataFrame(X2_train), y2_train)
    
    y2_pred = clf1.predict(X2_test)
    
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=123)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(clf1, X2_train, y2_train, cv=kfold, scoring=scoring)
    
    #calculate f1-score and AUC
    
    clf1_roc_auc = roc_auc_score(y2_test, y2_pred)
    
    
    #calculate average f1-score and AUC
    f1_score_lst.append(precision_recall_fscore_support(y2_test, y2_pred, average='weighted')[2])
    auc_lst.append(clf1_roc_auc)
    
    
print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

confusion_matrix_y2 = confusion_matrix(y2_test, y2_pred)


print('Accuracy of classifier on test set: {:.3f}'.format(clf1.score(X2_test, y2_test)))

print("10-fold cross validation average accuracy of clf1: %.3f" % (results.mean()))

print('Confusion Matrix for Classfier:')
print(confusion_matrix_y2)

print('Classification Report for Classfier:')
print(classification_report(y2_test, y2_pred))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [78]:
X2_train['Y2'] = y2_train

In [81]:
clf1 = LogisticRegression()


# Splitting data into testing and training
X2_train, X2_test, y2_train, y2_test = train_test_split(predictors_y2, y2, test_size=0.2, random_state=123)
X2_train = pd.DataFrame(X2_train)
X2_train['Y2'] = y2_train
# Begin oversampling
oversample = X2_train
max_size = oversample['Y2'].value_counts().max()
lst = [oversample]
for class_index, group in oversample.groupby('Y2'):
    lst.append(group.sample(max_size-len(group), replace=True))
X2_train = pd.concat(lst)
y2_train=pd.DataFrame.copy(X2_train['Y2'])
del X2_train['Y2']

# fitting model on oversampled data
clf1.fit(pd.DataFrame(X2_train), y2_train)

y2_pred = clf1.predict(X2_test)


#10-fold cross validation
kfold = model_selection.KFold(n_splits=10, random_state=123)
scoring = 'accuracy'
results = model_selection.cross_val_score(clf1, X2_train, y2_train, cv=kfold, scoring=scoring)

#calculate f1-score and AUC

clf1_roc_auc = roc_auc_score(y2_test, y2_pred)


#calculate average f1-score and AUC
f1_score_lst.append(precision_recall_fscore_support(y2_test, y2_pred, average='weighted')[2])
auc_lst.append(clf1_roc_auc)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').