In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

#classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('white')  # plot formatting

# Load in Dataset

In [2]:
column_names = ["age", "workclass", "fnlwgt", "education", "education_num",
                "marital_status", "occupation", "relationship", "race", "sex",
                "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
               ]
df = pd.read_csv("datasets/adult.data", header=0, names=column_names, index_col=False)

In [3]:
#getting the shape of dataset, num observations and features
df.shape

(32560, 15)

In [4]:
df.head()#data looks good

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [5]:
#checking for any null values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [6]:
def transformIncome(string):
    income = df['income'].unique() #[' <=50K', ' >50K']
    
    #classify <=50k is 0 and >50k is 1 pos
    if string == income[1]:
        return 1
    else:
        return 0
    
df['income'] = df['income'].apply(transformIncome)

In [7]:
#positive and negative classes
df['income'].value_counts()

0    24719
1     7841
Name: income, dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32560 non-null  int64 
 1   workclass       32560 non-null  object
 2   fnlwgt          32560 non-null  int64 
 3   education       32560 non-null  object
 4   education_num   32560 non-null  int64 
 5   marital_status  32560 non-null  object
 6   occupation      32560 non-null  object
 7   relationship    32560 non-null  object
 8   race            32560 non-null  object
 9   sex             32560 non-null  object
 10  capital_gain    32560 non-null  int64 
 11  capital_loss    32560 non-null  int64 
 12  hours_per_week  32560 non-null  int64 
 13  native_country  32560 non-null  object
 14  income          32560 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [9]:
#getting features and target
X = df.drop(['income'], axis=1)
y = df['income']

In [10]:
categoric_features = ["workclass", "education", "marital_status", 
                      "occupation", "relationship", "race", "sex", "native_country"]
numeric_features = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]

#one hot encoder and scaler
scaler = StandardScaler()
ohe    = OneHotEncoder(sparse=False)

#scaling numeric columns
scaled_columns = pd.DataFrame(scaler.fit_transform(X[numeric_features]), 
                                columns=numeric_features, 
                                index=X.index)
encoded_columns = ohe.fit_transform(X[categoric_features])#turns to dense matrix

# Concatenate them back together
for index, category in enumerate(np.concatenate(ohe.categories_)):
    scaled_columns[category] = encoded_columns[:, index]
    
scaled_columns

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,?,Federal-gov,Local-gov,Never-worked,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,0.837097,-1.008742,1.134779,-0.145914,-0.216663,-2.222120,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.042640,0.245046,-0.420027,-0.145914,-0.216663,-0.035430,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.057031,0.425770,-1.197429,-0.145914,-0.216663,-0.035430,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.775755,1.408146,1.134779,-0.145914,-0.216663,-0.035430,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.115952,0.898170,1.523480,-0.145914,-0.216663,-0.035430,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,-0.849066,0.639710,0.746077,-0.145914,-0.216663,-0.197407,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32556,0.103982,-0.335466,-0.420027,-0.145914,-0.216663,-0.035430,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32557,1.423589,-0.358811,-0.420027,-0.145914,-0.216663,-0.035430,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32558,-1.215624,0.110927,-0.420027,-0.145914,-0.216663,-1.655200,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
X_preprocessed=scaled_columns
X_preprocessed.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,?,Federal-gov,Local-gov,Never-worked,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,0.837097,-1.008742,1.134779,-0.145914,-0.216663,-2.22212,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.04264,0.245046,-0.420027,-0.145914,-0.216663,-0.03543,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.057031,0.42577,-1.197429,-0.145914,-0.216663,-0.03543,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.775755,1.408146,1.134779,-0.145914,-0.216663,-0.03543,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.115952,0.89817,1.52348,-0.145914,-0.216663,-0.03543,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
X.head()#unchanged for random forest, and decision tree

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States


In [13]:
X_labelencode = X.apply(LabelEncoder().fit_transform)
print(X_labelencode.shape)
print(len(X.columns))
X_labelencode.head()

(32560, 14)
14


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,33,6,2925,9,12,2,4,0,4,1,0,0,12,39
1,21,4,14085,11,8,0,6,1,4,1,0,0,39,39
2,36,4,15335,1,6,2,6,0,2,1,0,0,39,39
3,11,4,19354,9,12,2,10,5,2,0,0,0,39,5
4,20,4,17699,12,13,2,4,5,4,0,0,0,39,39


# Algorithm 1: Logistic Regression

Logistic Regression (LOGREG): we train both
unregularized and regularized models, varying the
ridge (regularization) parameter by factors of 10 from
10^−8
to 10^4


In [14]:
def LOGREG(X, y):
    '''returns the scores of each trial'''
    
    pipe = Pipeline(steps=[('classifier', LogisticRegression())])

    search_space = [{'classifier': [LogisticRegression(max_iter=5000)],
                     'classifier__solver': ['saga'],
                     'classifier__penalty': ['l1', 'l2'],
                     'classifier__C': np.logspace(-8, 4, 13)},
                    {'classifier': [LogisticRegression(max_iter=5000)],
                     'classifier__solver': ['lbfgs'],
                     'classifier__penalty': ['l2'],
                     'classifier__C': np.logspace(-8, 4, 13)},
                    {'classifier': [LogisticRegression(max_iter=5000)],
                     'classifier__solver': ['lbfgs','saga'],
                     'classifier__penalty': ['none']}
                    ]
    
    TRIALS = 5#DOING 5 TRIALS
    log_test_acc = []#saving the accuracy for each logistic regression trial on test set
    log_test_auc = []#saving the roc auc for each logistic regression trial on test set
    log_test_f1 = []#saving the f1 score for each logistic regression trial on test set
    train_acc = []
    train_auc = []
    train_f1 = []

    for i in range(TRIALS):
        #sampling 5000 training size for k-fold
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000)
        
        # Create grid search |
        clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                       verbose=1, n_jobs=-1)
        best_model = clf.fit(X_train, y_train)

        #top 3 best params model parameters
        model_acc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
        model_auc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc'])]
        model_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1'])]

        model_params = [model_acc, model_auc, model_f1]

        #gets the scores for each model and append them to corresponding array
        for index, params in enumerate(model_params):
            C = 0
            try:
                C = params['classifier__C']
            except KeyError:
               pass

            solver = params['classifier__solver']
            penalty = params['classifier__penalty']

            #training best models on validation set
            logit_model = LogisticRegression(max_iter=5000)
            logit_model.set_params(C=C, solver=solver, penalty=penalty)
            logit_model.fit(X_train, y_train) 

            #get roc_auc, acc and f1 scores on test set
            if index == 0:
                score = logit_model.score(X_test, y_test)
                log_test_acc.append(score)
                train_score = logit_model.score(X_train, y_train)
                train_acc.append(train_score)

            elif index == 1:
                roc_score = roc_auc_score(y_test, logit_model.predict_proba(X_test)[:, 1])
                log_test_auc.append(roc_score)
                train_roc = roc_auc_score(y_train, logit_model.predict_proba(X_train)[:, 1])
                train_auc.append(train_roc)

            #appending to f1
            else:
                y_predict = logit_model.predict(X_test)#predictions on test set
                f_score = f1_score(y_test, y_predict)
                log_test_f1.append(f_score)
                f_train = f1_score(y_train, logit_model.predict(X_train))
                train_f1.append(f_train)
                
    return log_test_acc, log_test_auc, log_test_f1, train_acc, train_auc, train_f1

In [15]:
log_acc, log_auc, log_f1, log_train_acc, log_train_auc, log_train_f1 = LOGREG(X_preprocessed, y)
print(log_acc)
print(log_auc)
print(log_f1)

Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:  1.4min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:  1.7min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:  1.2min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:  1.4min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 205 out of 205 | elapsed:  1.5min finished


[0.8474963715529753, 0.8495645863570392, 0.8502177068214805, 0.8472060957910015, 0.8487300435413643]
[0.9038779499298359, 0.9041393855794552, 0.9039679811502224, 0.9051781480667745, 0.9048461707394269]
[0.6586187073991951, 0.662414210738797, 0.6607142857142857, 0.6488575268817204, 0.6511005105029709]


In [16]:
print("""TEST SET PERFORMANCE:""")
scores = pd.DataFrame([log_acc, log_auc, log_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(scores)
print()

logt_scores = pd.DataFrame([log_train_acc, log_train_auc, log_train_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
logt_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print("TRAIN SET PERFORMANCE:")
print(logt_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.847496  0.849565  0.850218  0.847206  0.848730
ROC  0.903878  0.904139  0.903968  0.905178  0.904846
FSC  0.658619  0.662414  0.660714  0.648858  0.651101

TRAIN SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.863600  0.850200  0.853400  0.850000  0.860200
ROC  0.913989  0.910883  0.911407  0.909116  0.912944
FSC  0.673807  0.669319  0.666364  0.666976  0.671368


# Algorithm 2: KNN

eplores 30 different values of k, evenly spaced from a range between 1 to 500
With different weights: uniform, and distance


In [17]:
def KNN_model(X, y):
    classifier = KNeighborsClassifier()
    num_k = np.ceil(np.logspace(0, 2.7, 30)).astype(int)
    
    search_space = {'n_neighbors': num_k, 
                      'weights': ['uniform', 'distance']}

    TRIALS = 5#DOING 5 TRIALS
    acc_scores = []#saving the accuracy for each trial on test set
    auc_scores = []#saving the roc auc for each trial on test set
    f1_scores = []#saving the f1 score for each trial on test set
    train_acc = []
    train_auc = []
    train_f1 = []
    
    for i in range(TRIALS):
        #sampling 5000 training size for k-fold
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000)
        
        # Create grid search
        clf = GridSearchCV(classifier, search_space, cv=StratifiedKFold(n_splits=5), 
                           scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                           verbose=1, n_jobs=-1)
        best_model = clf.fit(X_train, y_train)

        #top 3 best params model parameters
        model_acc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
        model_auc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc'])]
        model_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1'])]

        model_params = [model_acc, model_auc, model_f1]

        #gets the scores for each model and append them to corresponding array
        for index, params in enumerate(model_params):
            k = params['n_neighbors']
            weight = params['weights']

            #training best models on validation set
            KNN_model = KNeighborsClassifier()
            KNN_model.set_params(n_neighbors=k, weights=weight)
            KNN_model.fit(X_train, y_train)#training on 5000 points

            #model == model_acc
            if index == 0:
                score = KNN_model.score(X_test, y_test)
                acc_scores.append(score)
                train_score = KNN_model.score(X_train, y_train)
                train_acc.append(train_score)
                
            #model_auc
            elif index == 1:
                roc_score = roc_auc_score(y_test, KNN_model.predict_proba(X_test)[:, 1])
                auc_scores.append(roc_score)
                train_roc = roc_auc_score(y_train, KNN_model.predict_proba(X_train)[:, 1])
                train_auc.append(train_roc)

            #appending to f1
            else:
                y_predict = KNN_model.predict(X_test)#predictions on test set
                f_score = f1_score(y_test, y_predict)
                f1_scores.append(f_score)
                f_train = f1_score(y_train, KNN_model.predict(X_train))
                train_f1.append(f_train)
    
    return acc_scores, auc_scores, f1_scores, train_acc, train_auc, train_f1

In [18]:
knn_acc, knn_auc, knn_f1, knn_train_acc, knn_train_auc, knn_train_f1 = KNN_model(X_preprocessed, y) #using onhotencoded and stnadarized
print(knn_acc)
print(knn_auc)
print(knn_f1)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.0min finished


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.0min finished


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


[0.8376269956458636, 0.8365021770682148, 0.8368650217706821, 0.833055152394775, 0.836066763425254]
[0.8898338722653379, 0.8880254825549282, 0.8901120016859949, 0.8878525728610537, 0.8877675680377131]
[0.6352516857456484, 0.6282397187474451, 0.6174626145682586, 0.6325940212150434, 0.6326530612244897]


In [19]:
print("""TEST SET PERFORMANCE:""")
k_scores = pd.DataFrame([knn_acc, knn_auc, knn_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
k_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(k_scores)
print()

print("TRAIN SET PERFORMANCE:")
kt_scores = pd.DataFrame([knn_train_acc, knn_train_auc, knn_train_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
kt_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(kt_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.837627  0.836502  0.836865  0.833055  0.836067
ROC  0.889834  0.888025  0.890112  0.887853  0.887768
FSC  0.635252  0.628240  0.617463  0.632594  0.632653

TRAIN SET PERFORMANCE:
      Trial 1  Trial 2   Trial 3   Trial 4  Trial 5
ACC  1.000000    0.847  1.000000  0.999800   0.8488
ROC  1.000000    1.000  1.000000  0.889163   1.0000
FSC  0.691813    1.000  0.684473  0.681641   1.0000


# Algorithm 3: Random Forest

from caruana paper: "The
forests have 1024 trees. The size of the feature set
considered at each split is 1,2,4,6,8,12,16 or 20."

n_estimators: 1024\
max_features: 1,2,4,6,8,12,16 or 20

In [20]:
def RF_model(X, y):
    classifier = RandomForestClassifier()
    max_features = [1, 2, 4, 6, 8, 12, 16, 20]
    
    for feature in max_features:
        if feature > len(X.columns):
            max_features.remove(feature)
            
    search_space = {'n_estimators': [1024], 'max_features': max_features}
    
    TRIALS = 5#DOING 5 TRIALS
    acc_scores = []#saving the accuracy for each trial on test set
    auc_scores = []#saving the roc auc for each trial on test set
    f1_scores = []#saving the f1 score for each trial on test set
    train_acc = []
    train_auc = []
    train_f1 = []
    
    for i in range(TRIALS):
        #sampling 5000 training size for k-fold
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000)
        
        # Create grid search
        clf = GridSearchCV(classifier, search_space, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                       verbose=2, n_jobs=-1)
        best_model = clf.fit(X_train, y_train)
        
        #top 3 best params model parameters
        model_acc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
        model_auc = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc'])]
        model_f1 = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_f1'])]
        
        model_params = [model_acc, model_auc, model_f1]
        
        #gets the scores for each model and append them to corresponding array
        for index, params in enumerate(model_params):
            n_estimators = params['n_estimators']
            max_features = params['max_features']
            
            #train once more on entire validation set
            RF_model = RandomForestClassifier(n_estimators=n_estimators, 
                                                  max_features=max_features)
            RF_model.fit(X_train, y_train)
            
            #scoring on test set using multiple metrics
            #model == model_acc
            if index == 0:
                score = RF_model.score(X_test, y_test)
                acc_scores.append(score)
                train_score = RF_model.score(X_train, y_train)
                train_acc.append(train_score)
            
            #model_auc
            elif index == 1:
                roc_score = roc_auc_score(y_test, RF_model.predict_proba(X_test)[:, 1])
                auc_scores.append(roc_score)
                train_roc = roc_auc_score(y_train, RF_model.predict_proba(X_train)[:, 1])
                train_auc.append(train_roc)

            #appending to f1
            else:
                y_predict = RF_model.predict(X_test)#predictions on test set
                f_score = f1_score(y_test, y_predict)
                f1_scores.append(f_score)
                f_train = f1_score(y_train, RF_model.predict(X_train))
                train_f1.append(f_train)
                
    return acc_scores, auc_scores, f1_scores, train_acc, train_auc, train_f1

In [21]:
rf_acc, rf_auc, rf_f1, rftrain_acc, rftrain_auc, rftrain_f1= RF_model(X_labelencode, y) #one hot encoding bad, use label ecodeded
print(rf_acc)
print(rf_auc)
print(rf_f1)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   28.7s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   27.3s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   28.7s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   27.8s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   27.9s finished


[0.8506894049346879, 0.8527213352685051, 0.8497460087082729, 0.8520682148040638, 0.8507982583454281]
[0.9030347730155646, 0.9012716670900147, 0.9024817060261783, 0.9014701705455497, 0.9015870721841605]
[0.6571358146413606, 0.669200291097275, 0.6575698505523065, 0.6643126950238134, 0.6610057708161583]


In [22]:
print("TEST SET PERFORMANCE:")
rf_scores = pd.DataFrame([rf_acc, rf_auc, rf_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
rf_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(rf_scores)
print()
print("TRAIN SET PERFORMANCE:")
rft_scores = pd.DataFrame([rftrain_acc, rftrain_auc, rftrain_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
rft_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(rft_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.850689  0.852721  0.849746  0.852068  0.850798
ROC  0.903035  0.901272  0.902482  0.901470  0.901587
FSC  0.657136  0.669200  0.657570  0.664313  0.661006

TRAIN SET PERFORMANCE:
     Trial 1  Trial 2  Trial 3  Trial 4  Trial 5
ACC      1.0      1.0      1.0      1.0      1.0
ROC      1.0      1.0      1.0      1.0      1.0
FSC      1.0      1.0      1.0      1.0      1.0


# Logistic Regression Results:

In [23]:
print("""TEST SET PERFORMANCE:""")
scores = pd.DataFrame([log_acc, log_auc, log_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(scores)
print()

logt_scores = pd.DataFrame([log_train_acc, log_train_auc, log_train_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
logt_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print("TRAIN SET PERFORMANCE:")
print(logt_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.847496  0.849565  0.850218  0.847206  0.848730
ROC  0.903878  0.904139  0.903968  0.905178  0.904846
FSC  0.658619  0.662414  0.660714  0.648858  0.651101

TRAIN SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.863600  0.850200  0.853400  0.850000  0.860200
ROC  0.913989  0.910883  0.911407  0.909116  0.912944
FSC  0.673807  0.669319  0.666364  0.666976  0.671368


# KNN Results:

In [24]:
print("""TEST SET PERFORMANCE:""")
k_scores = pd.DataFrame([knn_acc, knn_auc, knn_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
k_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(k_scores)
print()

print("TRAIN SET PERFORMANCE:")
kt_scores = pd.DataFrame([knn_train_acc, knn_train_auc, knn_train_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
kt_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(kt_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.837627  0.836502  0.836865  0.833055  0.836067
ROC  0.889834  0.888025  0.890112  0.887853  0.887768
FSC  0.635252  0.628240  0.617463  0.632594  0.632653

TRAIN SET PERFORMANCE:
      Trial 1  Trial 2   Trial 3   Trial 4  Trial 5
ACC  1.000000    0.847  1.000000  0.999800   0.8488
ROC  1.000000    1.000  1.000000  0.889163   1.0000
FSC  0.691813    1.000  0.684473  0.681641   1.0000


# Random Forest Results:

In [25]:
print("TEST SET PERFORMANCE:")
rf_scores = pd.DataFrame([rf_acc, rf_auc, rf_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
rf_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(rf_scores)
print()
print("TRAIN SET PERFORMANCE:")
rft_scores = pd.DataFrame([rftrain_acc, rftrain_auc, rftrain_f1], 
                     columns= ["Trial 1", "Trial 2", "Trial 3", "Trial 4", "Trial 5"])
rft_scores.rename(index={0: "ACC", 1: "ROC", 2: "FSC"}, inplace=True)
print(rft_scores)

TEST SET PERFORMANCE:
      Trial 1   Trial 2   Trial 3   Trial 4   Trial 5
ACC  0.850689  0.852721  0.849746  0.852068  0.850798
ROC  0.903035  0.901272  0.902482  0.901470  0.901587
FSC  0.657136  0.669200  0.657570  0.664313  0.661006

TRAIN SET PERFORMANCE:
     Trial 1  Trial 2  Trial 3  Trial 4  Trial 5
ACC      1.0      1.0      1.0      1.0      1.0
ROC      1.0      1.0      1.0      1.0      1.0
FSC      1.0      1.0      1.0      1.0      1.0
