# Modeling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve

In [None]:
# Load the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='marriage.csv')

# Preprocessing

### Split dataset into features and target variables

In [None]:
# Columns used as predictors, and the column the models try to predict.
feature_columns = [
    'age', 'workclass', 'education_num', 'occupation', 'race',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income',
]
target_column = 'marital_status'

X = df.loc[:, feature_columns]
y = df.loc[:, target_column]

Create lists of the categorical and continuous columns (currently disabled)

In [None]:
# cat_columns = ['workclass', 'education_num', 'occupation', 'race', 'native_country', 'income']
# cont_columns = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']

Encode each categorical feature separately

In [None]:
# label_encoders = {}
# for col in cat_columns:
#     print('Encoding {}'.format(col))
#     new_LE = LabelEncoder()
#     X[col] = new_LE.fit_transform(df[col])
#     label_encoders[col] = new_LE

In [None]:
# cat_columns_index = [X.columns.get_loc(col) for col in cat_columns]

In [None]:
# # One-hot encode the training data and show the resulting DataFrame with proper column names
# ohe = OneHotEncoder(categories=cat_columns, sparse=False, handle_unknown='ignore')

# X_np = ohe.fit_transform(df)
# # X_train_ohe = ohe.transform(X_train)#.toarray()

This was from Dan's GoT presentation

In [None]:
# df_ohe = pd.concat([X[cont_columns], pd.get_dummies(X[cat_columns])], axis=1)

In [None]:
# df_ohe.head(1)

### Get dummy variables for categorical data

In [None]:
# Expand the categorical columns of X into indicator columns; drop_first=True
# omits one level per categorical column.
df_dummies = pd.get_dummies(data=X, drop_first=True)

In [None]:
# Show the first encoded row to check the generated column names.
df_dummies.iloc[:1]

### Train Test Split (70/30)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_dummies,
    y,
    test_size=0.3,
    random_state=42,
)

### Scale/Normalize data

Standardize the continuous features with a `ColumnTransformer`; all other columns are passed through unchanged

In [None]:
# Only the continuous columns are standardized; every remaining column is
# passed through unchanged.
continuous_features = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']
ct = ColumnTransformer(
    transformers=[('standardized', StandardScaler(), continuous_features)],
    remainder='passthrough',
)

# The scaler is fitted on the training split only and then reused on the test split.
Z_train = ct.fit_transform(X_train)
Z_test = ct.transform(X_test)

# Model with Multiple Classifiers

In [None]:
# Try a collection of classifiers.
# `names` and `classifiers` are parallel lists: names[i] labels classifiers[i].
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "XGBoost", "Neural Net", "Logistic Regression"]

classifiers = [
#     3 or 5 KNN seem ideal
    KNeighborsClassifier(3),
#     ['linear', 'poly', 'rbf', 'sigmoid'] all produce similar results
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
#     random_state is pinned like the other stochastic estimators in this list,
#     so the fitted tree (and its scores) are the same on every run
    DecisionTreeClassifier(criterion='entropy', random_state=42),
    RandomForestClassifier(criterion='gini', max_depth=5, n_estimators=10, max_features=3, random_state=42),
    XGBClassifier(n_estimators=1000, learning_rate=.02, random_state=42),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    LogisticRegression(solver='liblinear', random_state=42)
]

In [None]:
# Untuned classifiers: fit each model and report its test-set metrics.
for name, clf in zip(names, classifiers):
    # Train on the standardized matrix (Z_train) so the model sees features on
    # the same scale as the Z_test rows it is scored on. Fitting on the raw
    # X_train while predicting on Z_test mixes two different feature scales.
    clf.fit(Z_train, y_train)

    # Predict once and reuse the result for all four metrics.
    test_preds = clf.predict(Z_test)

    print(f'       F1 - {name}: {f1_score(y_test, test_preds):.2f}')
    print(f'Precision - {name}: {precision_score(y_test, test_preds):.2f}')
    print(f'   Recall - {name}: {recall_score(y_test, test_preds):.2f}')
    print(f' Accuracy - {name}: {accuracy_score(y_test, test_preds):.2f}')
    print('\n')

# Hyperparameter Tuning with GridSearchCV

K Nearest Neighbors

In [None]:
# Base KNN estimator with library defaults; it is handed to the grid search
# below, which varies its hyperparameters.
KNN_clf = KNeighborsClassifier()

In [None]:
# Hyperparameter grid for KNN.
param_grid = {
    'n_neighbors': range(1,5),
    'weights': ['uniform', 'distance'],
    # 'minkowski' is left out: with the estimator's default p=2 it is the same
    # distance as 'euclidean', so it only repeated identical fits.
    'metric': ['manhattan', 'euclidean'],
    'n_jobs': [-1]
}

# KNN is distance-based, so the search runs on the standardized features
# (Z_train); on the raw frame the large-valued columns dominate the distance.
gs_tree = GridSearchCV(KNN_clf, param_grid, cv=3)
gs_tree.fit(Z_train, y_train)

gs_tree.best_params_

Support Vector Classifier

In [None]:
# Base support vector classifier with library defaults; the grid search below
# varies C, degree, gamma and kernel.
SVC_clf = SVC()

In [None]:
# Hyperparameter grid for the support vector classifier.
param_grid = {
    'C': np.linspace(.1,1,10),
    'degree': [2,3,4,5],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# SVMs are sensitive to feature scale, so the search runs on the standardized
# features (Z_train) rather than the raw dummy-encoded frame.
gs_tree = GridSearchCV(SVC_clf, param_grid, cv=3)
gs_tree.fit(Z_train, y_train)

gs_tree.best_params_

Decision Tree Classifier

In [None]:
# Base decision tree for the grid search below. random_state is pinned (same
# seed as the rest of the notebook) so the selected parameters are repeatable.
dtree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Search over the split criterion and the maximum tree depth (2 through 19).
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 20),
)

# 3-fold cross-validated search; fit() returns the fitted search object.
gs_tree = GridSearchCV(estimator=dtree_clf, param_grid=param_grid, cv=3).fit(X_train, y_train)

gs_tree.best_params_

Random Forest

In [None]:
# Base random forest for the grid search below. random_state is pinned (same
# seed as the rest of the notebook) so the selected parameters are repeatable.
rforest_clf = RandomForestClassifier(random_state=42)

In [None]:
# Search over bootstrapping, split criterion, tree depth and forest size.
param_grid = dict(
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
    max_depth=range(2, 20, 2),
    n_estimators=range(10, 100, 10),
)

# 3-fold cross-validated search; fit() returns the fitted search object.
gs_tree = GridSearchCV(estimator=rforest_clf, param_grid=param_grid, cv=3).fit(X_train, y_train)

gs_tree.best_params_

XGBoost

In [None]:
# Base XGBoost classifier with library defaults; the grid search below varies
# its learning rate, number of estimators, gamma and max depth.
XGB_clf = XGBClassifier()

In [None]:
# List the hyperparameter names the XGBoost estimator exposes.
list(XGB_clf.get_params())

In [None]:
# Hyperparameter grid for the XGBoost classifier.
param_grid = dict(
    learning_rate=np.linspace(.3, .7, 5),
    n_estimators=range(100, 1000, 100),
    gamma=[.5, 1, 1.5, 2],
    max_depth=[2, 3, 4, 5, 6],
)

# 3-fold cross-validated search using all available cores, with progress output.
gs_tree = GridSearchCV(
    estimator=XGB_clf,
    param_grid=param_grid,
    cv=3,
    verbose=True,
    n_jobs=-1,
).fit(X_train, y_train)

gs_tree.best_params_

Neural Net

In [None]:
# Base neural net for the grid search below. random_state is pinned (same seed
# as the rest of the notebook) so weight initialisation, and therefore the
# selected parameters, are repeatable.
MLP_clf = MLPClassifier(max_iter=100, random_state=42)

In [None]:
# List the hyperparameter names the MLP estimator exposes.
list(MLP_clf.get_params())

In [None]:
# Hyperparameter grid for the neural net.
param_grid = {
    'alpha': 10.0**-np.arange(1,10), 
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['logistic', 'relu', 'tanh'],
    'solver': ['sgd', 'adam'],
}

# The MLP is trained by gradient descent and is sensitive to feature scale, so
# the search runs on the standardized features (Z_train), not the raw frame.
gs_tree = GridSearchCV(MLP_clf, param_grid, cv=3, verbose=True, n_jobs=-1)
gs_tree.fit(Z_train, y_train)

gs_tree.best_params_

### Logistic Regression ROC/AUC

In [None]:
# Fit a logistic regression on the unscaled training frame; the cells below
# predict on X_test, so train and test inputs stay on the same scale.
# max_iter is raised to 1000 iterations.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test rows.
y_pred_class = logreg.predict(X_test)

In [None]:
# Peek at the first ten predicted labels.
logreg.predict(X_test)[:10]

In [None]:
# predict_proba returns one column per class; keep the second column
# (index 1) as the score fed to the ROC curve.
class_probabilities = logreg.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

In [None]:
# IMPORTANT: first argument is true values, second argument is predicted probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Draw the curve on an explicit Axes, with both axes clipped to [0, 1].
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_title('ROC curve for marriage classifier')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.grid(True)