# BT4222 Assignment 1

Author: Loh Hong Tak Edmund

Python Version: 3.8.5

In [52]:
# Loading Packages

# EDA
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

# Machine Learning
from sklearn import metrics, cluster
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  roc_curve, auc, log_loss 
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV


# Settings
import string
import itertools

## Helper Functions

In [2]:
def getRFEfeatures(model, x, y, n_features_to_select):
    rfe = RFE(model, n_features_to_select)
    rfe = rfe.fit(x,y)
    selected_features = list(x.columns[rfe.support_])
    print('Selected features: %s' % selected_features)
    return selected_features

def get_auc(model, x, y):
    y_pred_proba = model.predict_proba(x)[:,1]
    print(y_pred_proba)
    [fpr, tpr, thr] = roc_curve(y, y_pred_proba)
    return auc(fpr, tpr)

def get_logloss(model, x, y):
    y_pred_proba = model.predict_proba(x)[:,1]
    return log_loss(y, y_pred_proba)

def print_train_score(model, x_train, y_train, auc=True):
    pred = model.predict(x_train)
    model_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
    print("Train Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{model_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
    if auc:
        print("_______________________________________________")
        print(f"AUC Score: \n {get_auc(model, x_train, y_train)}\n")
    

def print_test_score(model, x_test, y_test, auc=True):
    pred = model.predict(x_test)
    model_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
    print("Test Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{model_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")
    if auc:
        print("_______________________________________________")
        print(f"AUC Score: \n {get_auc(model, x_test, y_test)}\n")    

## Importing Datasets

In [3]:
# Import data

df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Exploratory Data Analysis

In [4]:
df.head()

Unnamed: 0,index,age,race,sex,education,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,y
0,1,26,White,Male,Some-college,a_2,b_4,c_0,d_39,e_4,53833,10,0,0,42,0
1,2,35,White,Male,Some-college,a_2,b_3,c_0,d_39,e_4,67728,10,0,2051,45,0
2,3,61,White,Male,HS-grad,a_2,b_12,c_0,d_39,e_6,268831,9,0,0,53,0
3,4,48,Asian-Pac-Islander,Female,Bachelors,a_5,b_1,c_4,d_30,e_4,238360,13,0,0,40,0
4,5,20,White,Male,Some-college,a_4,b_6,c_3,d_39,e_4,199011,10,0,0,12,0


In [5]:
test.head()

Unnamed: 0,index,age,race,sex,education,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10
0,1,39,White,Male,HS-grad,a_2,b_3,c_0,d_39,e_4,366757,9,0,0,35
1,2,56,White,Male,Some-college,a_2,b_0,c_0,d_27,e_0,275943,10,0,0,40
2,3,17,White,Male,12th,a_4,b_0,c_3,d_39,e_0,103810,8,0,0,40
3,4,58,White,Female,HS-grad,a_6,b_1,c_4,d_39,e_4,151910,9,0,0,40
4,5,59,White,Male,Bachelors,a_4,b_1,c_1,d_39,e_2,161944,13,0,0,38


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26561 entries, 0 to 26560
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      26561 non-null  int64 
 1   age        26561 non-null  int64 
 2   race       26561 non-null  object
 3   sex        26561 non-null  object
 4   education  26561 non-null  object
 5   f1         26561 non-null  object
 6   f2         26561 non-null  object
 7   f3         26561 non-null  object
 8   f4         26561 non-null  object
 9   f5         26561 non-null  object
 10  f6         26561 non-null  int64 
 11  f7         26561 non-null  int64 
 12  f8         26561 non-null  int64 
 13  f9         26561 non-null  int64 
 14  f10        26561 non-null  int64 
 15  y          26561 non-null  int64 
dtypes: int64(8), object(8)
memory usage: 3.2+ MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      6000 non-null   int64 
 1   age        6000 non-null   int64 
 2   race       6000 non-null   object
 3   sex        6000 non-null   object
 4   education  6000 non-null   object
 5   f1         6000 non-null   object
 6   f2         6000 non-null   object
 7   f3         6000 non-null   object
 8   f4         6000 non-null   object
 9   f5         6000 non-null   object
 10  f6         6000 non-null   int64 
 11  f7         6000 non-null   int64 
 12  f8         6000 non-null   int64 
 13  f9         6000 non-null   int64 
 14  f10        6000 non-null   int64 
dtypes: int64(7), object(8)
memory usage: 703.2+ KB


In [8]:
df.describe()

Unnamed: 0,index,age,f6,f7,f8,f9,f10,y
count,26561.0,26561.0,26561.0,26561.0,26561.0,26561.0,26561.0,26561.0
mean,13281.0,38.610933,190232.8,10.073981,1070.548662,87.575129,40.488573,0.239712
std,7667.644586,13.671748,105646.4,2.573905,7320.365123,404.081775,12.358773,0.426916
min,1.0,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,6641.0,28.0,118399.0,9.0,0.0,0.0,40.0,0.0
50%,13281.0,37.0,178383.0,10.0,0.0,0.0,40.0,0.0
75%,19921.0,48.0,237670.0,12.0,0.0,0.0,45.0,0.0
max,26561.0,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [9]:
df = df.drop('index', axis=1)

In [10]:
for col in df.columns:
    fig = px.histogram(df, x=col,  title="Histogram of "+str(col))
    #fig.show()

## Converting all categorical variable to dummy variables

In [11]:
for col in df.columns:
    if df[col].dtypes == object:
        dummy = pd.get_dummies(df[col])
        df = df.drop(col,axis=1)
        df = pd.concat([df,dummy], axis=1)

df.head()
        

Unnamed: 0,age,f6,f7,f8,f9,f10,y,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,...,d_9,e_0,e_1,e_2,e_3,e_4,e_5,e_6,e_7,e_8
0,26,53833,10,0,0,42,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,35,67728,10,0,2051,45,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,61,268831,9,0,0,53,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,48,238360,13,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,20,199011,10,0,0,12,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Splitting Training and Testing Data

In [12]:
X = df.drop('y', axis=1)
y = df.y.values

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

## Building Logistic Regression Model

Building a logistic regression model with recursive feature elimination (RFE) to select the most important features.

### Feature Selection using RFE

In [13]:
logRegModel = LogisticRegression(max_iter=10000)
logRegModel_features = getRFEfeatures(logRegModel, X, y, 50)



Pass n_features_to_select=50 as keyword args. From version 0.25 passing these as positional arguments will result in an error



Selected features: ['f7', ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White', ' Female', ' Male', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Prof-school', ' Some-college', 'a_0', 'a_2', 'a_3', 'a_4', 'a_5', 'a_6', 'b_0', 'b_1', 'b_12', 'b_14', 'b_3', 'b_4', 'b_5', 'b_6', 'b_7', 'b_8', 'b_9', 'c_1', 'c_2', 'c_3', 'c_5', 'd_0', 'd_26', 'd_33', 'd_39', 'e_0', 'e_1', 'e_5', 'e_6', 'e_7']


In [14]:
x_train_LR = x_train[logRegModel_features]
x_test_LR = x_test[logRegModel_features]

## Logistic Regression Model Building and Fitting

In [15]:
logRegModel = LogisticRegression(max_iter=10000)
logRegModel.fit(x_train_LR, y_train)

LogisticRegression(max_iter=10000)

## Results for Logistics Regression Model

### Classification Report for Train Data

In [16]:
print_train_score(logRegModel, x_train_LR,y_train)

Train Result:
Accuracy Score: 83.22%
_______________________________________________
CLASSIFICATION REPORT:
                      0            1  accuracy     macro avg  weighted avg
precision      0.864090     0.695219  0.832172      0.779655      0.823287
recall         0.924041     0.543825  0.832172      0.733933      0.832172
f1-score       0.893061     0.610273  0.832172      0.751667      0.824733
support    16114.000000  5134.000000  0.832172  21248.000000  21248.000000
_______________________________________________
Confusion Matrix: 
 [[14890  1224]
 [ 2342  2792]]

_______________________________________________
[0.71049526 0.83641746 0.05514418 ... 0.2780625  0.00336335 0.14759615]
AUC Score: 
 0.8798386015127221



### Classification Report for Test Data

In [17]:
print_test_score(logRegModel, x_test_LR,y_test)

Test Result:
Accuracy Score: 83.63%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.871872     0.682046  0.836251     0.776959      0.827819
recall        0.922304     0.551500  0.836251     0.736902      0.836251
f1-score      0.896379     0.609865  0.836251     0.753122      0.829887
support    4080.000000  1233.000000  0.836251  5313.000000   5313.000000
_______________________________________________
Confusion Matrix: 
 [[3763  317]
 [ 553  680]]

_______________________________________________
[0.01344695 0.04286195 0.01912045 ... 0.34165605 0.00854516 0.00654709]
AUC Score: 
 0.8812271400855558



In [18]:
get_auc(logRegModel, x_train_LR, y_train)

[0.71049526 0.83641746 0.05514418 ... 0.2780625  0.00336335 0.14759615]


0.8798386015127221

## Support Vector Machine



In [19]:
scaling = MinMaxScaler(feature_range=(-1,1)).fit(x_train)
x_train_svm = scaling.transform(x_train)
x_test_svm = scaling.transform(x_test)

### Linear SVM

In [20]:
linearSVM = LinearSVC(loss='hinge', dual=True,max_iter=1000000)
linearSVM.fit(x_train_svm,y_train)


LinearSVC(loss='hinge', max_iter=1000000)

In [21]:
print_train_score(linearSVM, x_train_svm, y_train, auc=False)

Train Result:
Accuracy Score: 85.13%
_______________________________________________
CLASSIFICATION REPORT:
                      0            1  accuracy     macro avg  weighted avg
precision      0.875878     0.745953  0.851327      0.810915      0.844485
recall         0.936701     0.583366  0.851327      0.760033      0.851327
f1-score       0.905269     0.654716  0.851327      0.779993      0.844730
support    16114.000000  5134.000000  0.851327  21248.000000  21248.000000
_______________________________________________
Confusion Matrix: 
 [[15094  1020]
 [ 2139  2995]]



In [22]:
print_test_score(linearSVM, x_test_svm, y_test, auc=False)

Test Result:
Accuracy Score: 86.05%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.886906     0.746493  0.860531     0.816700      0.854320
recall        0.937990     0.604217  0.860531     0.771104      0.860531
f1-score      0.911733     0.667862  0.860531     0.789798      0.855137
support    4080.000000  1233.000000  0.860531  5313.000000   5313.000000
_______________________________________________
Confusion Matrix: 
 [[3827  253]
 [ 488  745]]



## Polynomial Kernel SVM

In [23]:
polynomialSVM = SVC(kernel="poly")
polynomialSVM.fit(x_train_svm,y_train)

SVC(kernel='poly')

In [24]:
print_train_score(polynomialSVM, x_train_svm, y_train, auc=False)

Train Result:
Accuracy Score: 86.93%
_______________________________________________
CLASSIFICATION REPORT:
                      0            1  accuracy     macro avg  weighted avg
precision      0.892011     0.778145  0.869305      0.835078      0.864498
recall         0.941666     0.642189  0.869305      0.791927      0.869305
f1-score       0.916166     0.703660  0.869305      0.809913      0.864820
support    16114.000000  5134.000000  0.869305  21248.000000  21248.000000
_______________________________________________
Confusion Matrix: 
 [[15174   940]
 [ 1837  3297]]



In [25]:
print_test_score(polynomialSVM, x_test_svm, y_test, auc=False)

Test Result:
Accuracy Score: 85.92%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.889616     0.733848  0.859213     0.811732      0.853467
recall        0.932353     0.617194  0.859213     0.774773      0.859213
f1-score      0.910483     0.670485  0.859213     0.790484      0.854786
support    4080.000000  1233.000000  0.859213  5313.000000   5313.000000
_______________________________________________
Confusion Matrix: 
 [[3804  276]
 [ 472  761]]



## Radial Kernel SVM

In [26]:
rbfSVM = SVC(kernel="rbf")
rbfSVM.fit(x_train_svm,y_train)

SVC()

In [27]:
print_train_score(rbfSVM, x_train_svm, y_train, auc=False)

Train Result:
Accuracy Score: 84.48%
_______________________________________________
CLASSIFICATION REPORT:
                      0            1  accuracy     macro avg  weighted avg
precision      0.870626     0.732120  0.844832      0.801373      0.837160
recall         0.934219     0.564277  0.844832      0.749248      0.844832
f1-score       0.901302     0.637334  0.844832      0.769318      0.837521
support    16114.000000  5134.000000  0.844832  21248.000000  21248.000000
_______________________________________________
Confusion Matrix: 
 [[15054  1060]
 [ 2237  2897]]



In [28]:
print_test_score(rbfSVM, x_test_svm, y_test, auc=False)

Test Result:
Accuracy Score: 84.34%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.873849     0.706914  0.843403     0.790382      0.835108
recall        0.930392     0.555556  0.843403     0.742974      0.843403
f1-score      0.901235     0.622162  0.843403     0.761698      0.836469
support    4080.000000  1233.000000  0.843403  5313.000000   5313.000000
_______________________________________________
Confusion Matrix: 
 [[3796  284]
 [ 548  685]]



## Adaboost

In [30]:
adaRFE = AdaBoostClassifier(n_estimators=100,random_state=0)
ada_features = getRFEfeatures(adaRFE, X, y, 50)


Pass n_features_to_select=50 as keyword args. From version 0.25 passing these as positional arguments will result in an error



Selected features: ['age', 'f6', 'f7', 'f8', 'f9', 'f10', ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White', ' Female', ' Male', ' Bachelors', ' HS-grad', ' Masters', 'a_1', 'a_2', 'a_6', 'b_0', 'b_1', 'b_10', 'b_11', 'b_12', 'b_13', 'b_4', 'b_5', 'b_6', 'b_7', 'b_8', 'b_9', 'c_0', 'c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'd_0', 'd_1', 'd_22', 'd_7', 'd_8', 'd_9', 'e_0', 'e_1', 'e_2', 'e_3', 'e_4', 'e_5', 'e_6']


In [58]:
x_train_ada = x_train[ada_features]
x_test_ada = x_test[ada_features]

In [59]:
ada = AdaBoostClassifier(n_estimators=100,random_state=0)
ada.fit(x_train_ada, y_train)

AdaBoostClassifier(n_estimators=100, random_state=0)

In [61]:
print_train_score(ada, x_train_ada,y_train)

Train Result:
Accuracy Score: 86.47%
_______________________________________________
CLASSIFICATION REPORT:
                      0            1  accuracy     macro avg  weighted avg
precision      0.887135     0.772420   0.86474      0.829777      0.859417
recall         0.941417     0.624075   0.86474      0.782746      0.864740
f1-score       0.913470     0.690368   0.86474      0.801919      0.859564
support    16114.000000  5134.000000   0.86474  21248.000000  21248.000000
_______________________________________________
Confusion Matrix: 
 [[15170   944]
 [ 1930  3204]]

_______________________________________________
[0.50302379 0.50361685 0.48960574 ... 0.49460401 0.47957197 0.4922542 ]
AUC Score: 
 0.9225629812111495



In [62]:
print_test_score(ada, x_test_ada,y_test)

Test Result:
Accuracy Score: 87.35%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.893897     0.784195  0.873518     0.839046      0.868438
recall        0.947794     0.627737  0.873518     0.787766      0.873518
f1-score      0.920057     0.697297  0.873518     0.808677      0.868361
support    4080.000000  1233.000000  0.873518  5313.000000   5313.000000
_______________________________________________
Confusion Matrix: 
 [[3867  213]
 [ 459  774]]

_______________________________________________
[0.48831676 0.49124126 0.4880783  ... 0.50016367 0.48410922 0.48037058]
AUC Score: 
 0.9262318512157499



## Hyperparameter tuning for Adaboost using GridSearchCV

In [56]:
ada_tuned = AdaBoostClassifier()

grid = dict()
grid['n_estimators'] = [10, 50, 100, 500, 1000, 2000]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

grid_search = GridSearchCV(estimator=ada_tuned, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')

grid_result = grid_search.fit(x_train_ada, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.868615 using {'learning_rate': 1.0, 'n_estimators': 1000}


In [68]:
ada_tuned = grid_result.best_estimator_
print_test_score(ada_tuned, x_test_ada,y_test)

Test Result:
Accuracy Score: 88.07%
_______________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.901632     0.792766   0.88067     0.847199      0.876367
recall        0.948039     0.657745   0.88067     0.802892      0.880670
f1-score      0.924253     0.718972   0.88067     0.821612      0.876613
support    4080.000000  1233.000000   0.88067  5313.000000   5313.000000
_______________________________________________
Confusion Matrix: 
 [[3868  212]
 [ 422  811]]

_______________________________________________
[0.49875252 0.49897431 0.4988345  ... 0.49999985 0.49835283 0.49819099]
AUC Score: 
 0.9328135187570568

