In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

#classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('white')  # plot formatting

I will use one variation of the dataset used in caruana\
Letters that are "O" will be treated as positive class, very unbalanced data, we will see how our algorithms perform

# Loading in dataset

In [2]:
column_names=["letter", "xbox", "ybox", "width", "height", "onpix",
             "xbar", "ybar", "x2bar", "y2bar", "xybar", "x2ybar", "xy2bar",
             "xedge", "xedgey", "yedge", "yedgex"]
df = pd.read_csv("datasets/letter-recognition.data", header=0, names=column_names, index_col=False)
df.head()

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7


In [3]:
df.shape

(19999, 17)

In [4]:
df.isnull().sum()

letter    0
xbox      0
ybox      0
width     0
height    0
onpix     0
xbar      0
ybar      0
x2bar     0
y2bar     0
xybar     0
x2ybar    0
xy2bar    0
xedge     0
xedgey    0
yedge     0
yedgex    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  19999 non-null  object
 1   xbox    19999 non-null  int64 
 2   ybox    19999 non-null  int64 
 3   width   19999 non-null  int64 
 4   height  19999 non-null  int64 
 5   onpix   19999 non-null  int64 
 6   xbar    19999 non-null  int64 
 7   ybar    19999 non-null  int64 
 8   x2bar   19999 non-null  int64 
 9   y2bar   19999 non-null  int64 
 10  xybar   19999 non-null  int64 
 11  x2ybar  19999 non-null  int64 
 12  xy2bar  19999 non-null  int64 
 13  xedge   19999 non-null  int64 
 14  xedgey  19999 non-null  int64 
 15  yedge   19999 non-null  int64 
 16  yedgex  19999 non-null  int64 
dtypes: int64(16), object(1)
memory usage: 2.6+ MB


In [6]:
df.describe()

Unnamed: 0,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
count,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0
mean,4.023651,7.035452,5.121956,5.372469,3.505975,6.897545,7.500175,4.628831,5.178609,8.282164,6.453823,7.928996,3.046252,8.338867,3.691935,7.80119
std,1.913206,3.304631,2.014568,2.261445,2.190441,2.026071,2.325087,2.699837,2.380875,2.488485,2.631016,2.080671,2.3325,1.546759,2.567004,1.61751
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,5.0,4.0,4.0,2.0,6.0,6.0,3.0,4.0,7.0,5.0,7.0,1.0,8.0,2.0,7.0
50%,4.0,7.0,5.0,6.0,3.0,7.0,7.0,4.0,5.0,8.0,6.0,8.0,3.0,8.0,3.0,8.0
75%,5.0,9.0,6.0,7.0,5.0,8.0,9.0,6.0,7.0,10.0,8.0,9.0,4.0,9.0,5.0,9.0
max,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0


In [7]:
df['letter'].value_counts()

U    813
D    805
P    803
T    795
M    792
A    789
X    787
Y    786
Q    783
N    783
F    775
G    773
E    768
B    766
V    764
L    761
R    758
I    755
O    753
W    752
S    748
J    747
K    739
C    736
Z    734
H    734
Name: letter, dtype: int64

In [8]:
def standarizeLetter(letter):
    """
    makes letters "O" positive
    """
    if letter == 'O':
        return 1
    else:
        return -1
    
df['letter'] = df['letter'].apply(standarizeLetter)

In [9]:
df['letter'].value_counts()

-1    19246
 1      753
Name: letter, dtype: int64

In [10]:
X = df.drop(['letter'], axis=1)
y = df['letter']

# Algorithm 1: Logistic Regression

In [11]:
def LG_model(X, y):
    '''returns the scores of each trial'''
    
    pipe = Pipeline(steps=[('classifier', LogisticRegression())])

    search_space = [{'classifier': [LogisticRegression(max_iter=5000)],
                     'classifier__solver': ['saga'],
                     'classifier__penalty': ['l1', 'l2'],
                     'classifier__C': np.logspace(-8, 4, 13)},
                    {'classifier': [LogisticRegression(max_iter=5000)],
                     'classifier__solver': ['lbfgs'],
                     'classifier__penalty': ['l2'],
                     'classifier__C': np.logspace(-8, 4, 13)},
                    {'classifier': [LogisticRegression(max_iter=5000)],
                     'classifier__solver': ['lbfgs','saga'],
                     'classifier__penalty': ['none']}
                    ]
    
    TRIALS = 5#DOING 5 TRIALS
    log_test_acc = []#saving the accuracy for each logistic regression trial on test set
    log_test_auc = []#saving the roc auc for each logistic regression trial on test set
    log_test_f1 = []#saving the f1 score for each logistic regression trial on test set
    train_acc = []
    train_auc = []
    train_f1 = []

    for i in range(TRIALS):
        #sampling 5000 training size for k-fold
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000)
        
        # Create grid search |
        clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                       verbose=1, n_jobs=-1)
        best_model = clf.fit(X_train, y_train)

        #top 3 best params model parameters
        model_acc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
        model_auc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc'])]
        model_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1'])]

        model_params = [model_acc, model_auc, model_f1]

        #gets the scores for each model and append them to corresponding array
        for index, params in enumerate(model_params):
            C = 0
            try:
                C = params['classifier__C']
            except KeyError:
               pass

            solver = params['classifier__solver']
            penalty = params['classifier__penalty']

            #training best models on validation set
            logit_model = LogisticRegression(max_iter=5000)
            logit_model.set_params(C=C, solver=solver, penalty=penalty)
            logit_model.fit(X_train, y_train) 

            #get roc_auc, acc and f1 scores on test set
            if index == 0:
                score = logit_model.score(X_test, y_test)
                log_test_acc.append(score)
                train_score = logit_model.score(X_train, y_train)
                train_acc.append(train_score)

            elif index == 1:
                roc_score = roc_auc_score(y_test, logit_model.predict_proba(X_test)[:, 1])
                log_test_auc.append(roc_score)
                train_roc = roc_auc_score(y_train, logit_model.predict_proba(X_train)[:, 1])
                train_auc.append(train_roc)

            #appending to f1
            else:
                y_predict = logit_model.predict(X_test)#predictions on test set
                f_score = f1_score(y_test, y_predict)
                log_test_f1.append(f_score)
                f_train = f1_score(y_train, logit_model.predict(X_train))
                train_f1.append(f_train)
    
    return log_test_acc, log_test_auc, log_test_f1, train_acc, train_auc, train_f1

In [12]:
log_acc, log_auc, log_f1, log_train_acc, log_train_auc, log_train_f1 = LG_model(X, y)

Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 190 out of 205 | elapsed:   13.1s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:   13.9s finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:   13.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 190 out of 205 | elapsed:   11.8s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:   12.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 190 out of 205 | elapsed:   12.7s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:   13.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 190 out of 205 | elapsed:   12.8s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:   13.8s finished


# Algorithm 2: KNN

In [13]:
def KNN_model(X, y):
    classifier = KNeighborsClassifier()
    num_k = np.ceil(np.logspace(0, 2.7, 3)).astype(int)
    
    search_space = {'n_neighbors': num_k, 
                      'weights': ['uniform', 'distance']}

    TRIALS = 5#DOING 5 TRIALS
    acc_scores = []#saving the accuracy for each trial on test set
    auc_scores = []#saving the roc auc for each trial on test set
    f1_scores = []#saving the f1 score for each trial on test set
    train_acc = []
    train_auc = []
    train_f1 = []
    
    for i in range(TRIALS):
        #sampling 5000 training size for k-fold
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000)
        
        # Create grid search
        clf = GridSearchCV(classifier, search_space, cv=StratifiedKFold(n_splits=5), 
                           scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                           verbose=1, n_jobs=-1)
        best_model = clf.fit(X_train, y_train)

        #top 3 best params model parameters
        model_acc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
        model_auc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc'])]
        model_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1'])]

        model_params = [model_acc, model_auc, model_f1]

        #gets the scores for each model and append them to corresponding array
        for index, params in enumerate(model_params):
            k = params['n_neighbors']
            weight = params['weights']

            #training best models on validation set
            KNN_model = KNeighborsClassifier()
            KNN_model.set_params(n_neighbors=k, weights=weight)
            KNN_model.fit(X_train, y_train)#training on 5000 points

            #model == model_acc
            if index == 0:
                score = KNN_model.score(X_test, y_test)
                acc_scores.append(score)
                train_score = KNN_model.score(X_train, y_train)
                train_acc.append(train_score)
                
            #model_auc
            elif index == 1:
                roc_score = roc_auc_score(y_test, KNN_model.predict_proba(X_test)[:, 1])
                auc_scores.append(roc_score)
                train_roc = roc_auc_score(y_train, KNN_model.predict_proba(X_train)[:, 1])
                train_auc.append(train_roc)

            #appending to f1
            else:
                y_predict = KNN_model.predict(X_test)#predictions on test set
                f_score = f1_score(y_test, y_predict)
                f1_scores.append(f_score)
                f_train = f1_score(y_train, KNN_model.predict(X_train))
                train_f1.append(f_train)
    
    return acc_scores, auc_scores, f1_scores, train_acc, train_auc, train_f1

In [14]:
knn_acc, knn_auc, knn_f1, knn_train_acc, knn_train_auc, knn_train_f1 = KNN_model(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.3s finished


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.8s finished


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.3s finished


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.4s finished


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.7s finished


# Algorithm 3: Random Forest

In [15]:
def RF_model(X, y):
    classifier = RandomForestClassifier()
    max_features = [1, 2, 4, 6, 8, 12, 16, 20]
    
    for feature in max_features:
        if feature > len(X.columns):
            max_features.remove(feature)
            
    search_space = {'n_estimators': [1024], 'max_features': max_features}
    
    TRIALS = 5#DOING 5 TRIALS
    acc_scores = []#saving the accuracy for each trial on test set
    auc_scores = []#saving the roc auc for each trial on test set
    f1_scores = []#saving the f1 score for each trial on test set
    train_acc = []
    train_auc = []
    train_f1 = []
    
    for i in range(TRIALS):
        #sampling 5000 training size for k-fold
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000)
        
        # Create grid search
        clf = GridSearchCV(classifier, search_space, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                       verbose=2, n_jobs=-1)
        best_model = clf.fit(X_train, y_train)
        
        #top 3 best params model parameters
        model_acc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
        model_auc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc'])]
        model_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1'])]
        
        model_params = [model_acc, model_auc, model_f1]
        
        #gets the scores for each model and append them to corresponding array
        for index, params in enumerate(model_params):
            n_estimators = params['n_estimators']
            max_features = params['max_features']
            
            #train once more on entire validation set
            RF_model = RandomForestClassifier(n_estimators=n_estimators, 
                                                  max_features=max_features)
            RF_model.fit(X_train, y_train)
            
            #scoring on test set using multiple metrics
            #model == model_acc
            if index == 0:
                score = RF_model.score(X_test, y_test)
                acc_scores.append(score)
                train_score = RF_model.score(X_train, y_train)
                train_acc.append(train_score)
            
            #model_auc
            elif index == 1:
                roc_score = roc_auc_score(y_test, RF_model.predict_proba(X_test)[:, 1])
                auc_scores.append(roc_score)
                train_roc = roc_auc_score(y_train, RF_model.predict_proba(X_train)[:, 1])
                train_auc.append(train_roc)

            #appending to f1
            else:
                y_predict = RF_model.predict(X_test)#predictions on test set
                f_score = f1_score(y_test, y_predict)
                f1_scores.append(f_score)
                f_train = f1_score(y_train, RF_model.predict(X_train))
                train_f1.append(f_train)
                
    return acc_scores, auc_scores, f1_scores, train_acc, train_auc, train_f1

In [16]:
rf_acc, rf_auc, rf_f1, rftrain_acc, rftrain_auc, rftrain_f1= RF_model(X, y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   22.3s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   22.2s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   21.0s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   20.6s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   23.2s finished


# Results

## Logistic Regression Results:

In [17]:
print("""TEST SET PERFORMANCE:""")
scores = pd.DataFrame([log_acc, log_auc, log_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(scores)
print()

logt_scores = pd.DataFrame([log_train_acc, log_train_auc, log_train_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
logt_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print("TRAIN SET PERFORMANCE:")
print(logt_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.962331  0.963064  0.962064  0.961997  0.962331
ROC  0.858444  0.859928  0.857098  0.862397  0.853068
FSC  0.000000  0.000000  0.000000  0.000000  0.000000

TRAIN SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.962400  0.960200  0.963200  0.963400  0.962400
ROC  0.864321  0.879413  0.860877  0.867247  0.871137
FSC  0.000000  0.000000  0.000000  0.000000  0.000000


## KNN Results:

In [18]:
print("""TEST SET PERFORMANCE:""")
k_scores = pd.DataFrame([knn_acc, knn_auc, knn_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
k_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(k_scores)
print()

print("TRAIN SET PERFORMANCE:")
kt_scores = pd.DataFrame([knn_train_acc, knn_train_auc, knn_train_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
kt_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(kt_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.992133  0.989866  0.990933  0.991933  0.990733
ROC  0.991464  0.994721  0.995173  0.994100  0.994624
FSC  0.895760  0.865724  0.880282  0.893953  0.879654

TRAIN SET PERFORMANCE:
     Trial 1  Trial 2  Trial 3  Trial 4  Trial 5
ACC      1.0      1.0      1.0      1.0      1.0
ROC      1.0      1.0      1.0      1.0      1.0
FSC      1.0      1.0      1.0      1.0      1.0


## Random Forest Results:

In [19]:
print("TEST SET PERFORMANCE:")
rf_scores = pd.DataFrame([rf_acc, rf_auc, rf_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
rf_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(rf_scores)
print()
print("TRAIN SET PERFORMANCE:")
rft_scores = pd.DataFrame([rftrain_acc, rftrain_auc, rftrain_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
rft_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(rft_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.988066  0.987799  0.987866  0.985666  0.988866
ROC  0.997133  0.995457  0.997112  0.994531  0.997793
FSC  0.814668  0.817374  0.818725  0.790650  0.828571

TRAIN SET PERFORMANCE:
     Trial 1  Trial 2  Trial 3  Trial 4  Trial 5
ACC      1.0      1.0      1.0      1.0      1.0
ROC      1.0      1.0      1.0      1.0      1.0
FSC      1.0      1.0      1.0      1.0      1.0
