# Evaluation Code: Pipeline 3.1

In [11]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression

In [12]:
data = pd.read_csv('pipeline3_1.csv')
data.head()

Unnamed: 0,C1_skew,C4_skew,T3_skew,T4_skew,T5_skew,S1_skew,S2_skew,S3_skew,C5_skew,C6_skew,Imputed_C7_skew,C3_,Imputed_C2,I3_bins_new,Y1,Y2
0,0.075166,0.634522,0.622036,0.163401,0.317875,-0.150781,0.220314,-0.11856,0.038988,1.269109,-0.60728,1,1.0,2,0,1
1,1.784572,-0.677744,0.1054,0.225731,-0.101333,-1.358504,0.867934,-0.14078,1.988896,-0.788429,-0.946172,0,0.0,2,1,0
2,-0.482648,0.359251,0.64728,0.11149,1.3915,2.089019,-1.047739,0.076708,-0.284815,-0.788429,-1.452379,0,1.0,2,1,0
3,1.244371,0.336847,1.073356,-1.26046,1.372745,2.089019,-0.63043,0.805082,-0.332517,-0.788429,-1.401754,0,1.0,2,1,1
4,-0.681012,-1.268502,-0.371534,-0.592161,-0.525495,-0.394125,0.523762,0.01905,-0.048561,1.265166,1.092803,1,1.0,1,0,1


In [13]:
data.dtypes

C1_skew            float64
C4_skew            float64
T3_skew            float64
T4_skew            float64
T5_skew            float64
S1_skew            float64
S2_skew            float64
S3_skew            float64
C5_skew            float64
C6_skew            float64
Imputed_C7_skew    float64
C3_                  int64
Imputed_C2         float64
I3_bins_new          int64
Y1                   int64
Y2                   int64
dtype: object

In [14]:
# specify targets and predictors
y1 = data.Y1
y2 = data.Y2

In [15]:
data.shape

(660, 16)

In [16]:
# Define predictors for `Y1` 
# top first 5 features in `data`

cols = (data.columns)
cols

Index(['C1_skew', 'C4_skew', 'T3_skew', 'T4_skew', 'T5_skew', 'S1_skew',
       'S2_skew', 'S3_skew', 'C5_skew', 'C6_skew', 'Imputed_C7_skew', 'C3_',
       'Imputed_C2', 'I3_bins_new', 'Y1', 'Y2'],
      dtype='object')

In [17]:
# define predictors for Y1

predictors_y1 = data[['T3_skew','T4_skew','S2_skew','Imputed_C7_skew','Imputed_C2']]
predictors_y1.head()

Unnamed: 0,T3_skew,T4_skew,S2_skew,Imputed_C7_skew,Imputed_C2
0,0.622036,0.163401,0.220314,-0.60728,1.0
1,0.1054,0.225731,0.867934,-0.946172,0.0
2,0.64728,0.11149,-1.047739,-1.452379,1.0
3,1.073356,-1.26046,-0.63043,-1.401754,1.0
4,-0.371534,-0.592161,0.523762,1.092803,1.0


In [18]:
# define predictors for Y2

predictors_y2 = data[['C4_skew','S2_skew','C6_skew','Imputed_C2','I3_bins_new']]
predictors_y2.head()

Unnamed: 0,C4_skew,S2_skew,C6_skew,Imputed_C2,I3_bins_new
0,0.634522,0.220314,1.269109,1.0,2
1,-0.677744,0.867934,-0.788429,0.0,2
2,0.359251,-1.047739,-0.788429,1.0,2
3,0.336847,-0.63043,-0.788429,1.0,2
4,-1.268502,0.523762,1.265166,1.0,1


In [19]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []


#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    #Model building
    clf = LogisticRegression()
    X1_train, X1_test, y1_train, y1_test = train_test_split(predictors_y1, y1, test_size=0.2, random_state=123)
    clf.fit(X1_train, y1_train)

    y1_pred = clf.predict(X1_test)

    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=7)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(clf, X1_train, y1_train, cv=kfold, scoring=scoring)

    

    
    #calculate f1-score and AUC
    
    clf_roc_auc = roc_auc_score(y1_test, y1_pred)
    f1_score_lst.append(precision_recall_fscore_support(y1_test, y1_pred, average='weighted')[2])
    auc_lst.append(clf_roc_auc)


print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

#result=logit_model.fit()
confusion_matrix_y1 = confusion_matrix(y1_test, y1_pred)


#print(result.summary())
print('Accuracy of classifier on test set: {:.2f}'.format(clf.score(X1_test, y1_test)))

print("10-fold cross validation average accuracy of classifier: %.3f" % (results.mean()))

print('Confusion Matrix for Logistic Regression Classfier:')
print(confusion_matrix_y1)

print('Classification Report for Logistic Regression Classfier:')
print(classification_report(y1_test, y1_pred))




F1 0.6085; AUC 0.6182 
Accuracy of classifier on test set: 0.61
10-fold cross validation average accuracy of classifier: 0.596
Confusion Matrix for Logistic Regression Classfier:
[[43 35]
 [17 37]]
Classification Report for Logistic Regression Classfier:
              precision    recall  f1-score   support

           0       0.72      0.55      0.62        78
           1       0.51      0.69      0.59        54

    accuracy                           0.61       132
   macro avg       0.62      0.62      0.61       132
weighted avg       0.63      0.61      0.61       132





In [10]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []


#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    #Model building
    clf1 = LogisticRegression()

    
    # Splitting data into testing and training
    X2_train, X2_test, y2_train, y2_test = train_test_split(predictors_y2, y2, test_size=0.2, random_state=123)
    
    # Begin oversampling
    oversample = pd.concat([X2_train,y2_train],axis=1)
    max_size = oversample['Y2'].value_counts().max()
    lst = [oversample]
    for class_index, group in oversample.groupby('Y2'):
        lst.append(group.sample(max_size-len(group), replace=True))
    X2_train = pd.concat(lst)
    y2_train=pd.DataFrame.copy(X2_train['Y2'])
    del X2_train['Y2']
    
    # fitting model on oversampled data
    clf1.fit(X2_train, y2_train)
    
    y2_pred = clf1.predict(X2_test)
    
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=123)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(clf1, X2_train, y2_train, cv=kfold, scoring=scoring)
    
    #calculate f1-score and AUC
    
    clf1_roc_auc = roc_auc_score(y2_test, y2_pred)
    
    
    #calculate average f1-score and AUC
    f1_score_lst.append(precision_recall_fscore_support(y2_test, y2_pred, average='weighted')[2])
    auc_lst.append(clf1_roc_auc)
    
    
print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

confusion_matrix_y2 = confusion_matrix(y2_test, y2_pred)


print('Accuracy of classifier on test set: {:.3f}'.format(clf1.score(X2_test, y2_test)))

print("10-fold cross validation average accuracy of clf1: %.3f" % (results.mean()))

print('Confusion Matrix for Classfier:')
print(confusion_matrix_y2)

print('Classification Report for Classfier:')
print(classification_report(y2_test, y2_pred))




F1 0.6206; AUC 0.5972 
Accuracy of classifier on test set: 0.598
10-fold cross validation average accuracy of clf1: 0.589
Confusion Matrix for Classfier:
[[20 16]
 [37 59]]
Classification Report for Classfier:
              precision    recall  f1-score   support

           0       0.35      0.56      0.43        36
           1       0.79      0.61      0.69        96

    accuracy                           0.60       132
   macro avg       0.57      0.59      0.56       132
weighted avg       0.67      0.60      0.62       132

