# Modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [3]:
df = pd.read_csv('./data/large_clean.csv')

## Baseline Model

In [4]:
df['wage'].value_counts(normalize=True)

0    0.75919
1    0.24081
Name: wage, dtype: float64

## Split Data

In [5]:
# seperating the data
X = df.drop(columns = 'wage')
y = df['wage']

In [6]:
# train/test splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)

## XGboost

In [7]:
params = {
    'n_estimators': [50, 75, 100, 125],
    'max_features': [None, 'auto', 'log2'],
    'max_depth': [2, 3, 4]
}

sXGB = XGBClassifier()
gXGB = GridSearchCV(sXGB, params, n_jobs=4)
gXGB.fit(X_train, y_train);

In [8]:
print(f"XGBoost Training Accuracy: {gXGB.score(X_train, y_train)} ")
print(f"XGBoost Testing Accuracy: {gXGB.score(X_test, y_test)} ")

XGBoost Training Accuracy: 0.8618755118755119 
XGBoost Testing Accuracy: 0.8565286819801007 


In [9]:
gXGB.best_params_

{'max_depth': 2, 'max_features': None, 'n_estimators': 125}

## Logistic Regression

In [10]:
# Scale data
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

# Fit model
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train_sc, y_train)

print(f'Train: {round(logreg.score(X_train_sc, y_train), 4)}')
print(f'Test: {round(logreg.score(X_test_sc, y_test), 4)}')

Train: 0.8422
Test: 0.8441


## Coefficients

In [11]:
coefs = zip(X_train.columns, logreg.coef_)
coef_df = pd.DataFrame(logreg.coef_, columns = X_train.columns).T

coef_df.sort_values(by=0)

Unnamed: 0,0
education-num^2 hours-per-week,-0.576253
age education-num^2,-0.537581
fnlwgt education-num^2,-0.391873
relationship_Own-child,-0.355963
age fnlwgt hours-per-week,-0.268905
age hours-per-week,-0.219709
marital-status_Never-married,-0.19154
fnlwgt capital-gain,-0.107518
education_Bachelors,0.029738
sex_Male,0.057148


## Decision Tree

In [12]:
rf = RandomForestClassifier()
et = ExtraTreesClassifier()

In [13]:
print(f'RF score: {cross_val_score(rf, X_train, y_train, cv=5).mean()}')
print(f'ET score: {cross_val_score(et, X_train, y_train, cv=5).mean()}')

RF score: 0.8346437346437346
ET score: 0.822972972972973


Random forest model performs better.

In [14]:
model_params = {}
count = 0

In [15]:
rf = RandomForestClassifier(random_state=33, n_jobs = 4)

params = {
    'n_estimators': [50, 75, 100, 125],
    'max_features': [None, 'auto', 'log2'],
    'max_depth': [3, 4, 5]
}

gs = GridSearchCV(rf, param_grid=params, cv=5)

gs.fit(X_train, y_train)

count += 1

gs.best_params_['score'] = gs.best_score_
model_params[f'model_{count}'] = gs.best_params_

model_df = pd.DataFrame.from_dict(model_params, orient='index')

In [16]:
model_params

{'model_1': {'max_depth': 5,
  'max_features': None,
  'n_estimators': 50,
  'score': 0.8464373464373465}}

In [17]:
gs.score(X_test, y_test)

0.8505097653850878

## Gaussian NB Modeling

In [18]:
# Scale
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.fit_transform(X_test)

gnb = GaussianNB()
gnb.fit(X_train_sc, y_train)

print(f'Train: {gnb.score(X_train_sc, y_train)}')
print(f'Test: {gnb.score(X_test_sc, y_test)}')

Train: 0.826986076986077
Test: 0.830241985014126
