In [None]:
# math
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

# models
from sklearn.model_selection import train_test_split
#from rulefit import RuleFit
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# metrics
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score

# visual
import matplotlib.pyplot as plt
from xgboost import plot_tree
import os
# Presumably lets xgboost's plot_tree locate the Graphviz binaries; this is a
# machine-specific Windows install path -- adjust it for your environment.
GRAPHVIZ_BIN = 'C:/Program Files/Graphviz/bin/'
# Append only once so re-running this cell does not keep growing PATH.
if GRAPHVIZ_BIN not in os.environ.get("PATH", ""):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + GRAPHVIZ_BIN

# utils
import random
SEED = 42
# Seed both Python's and NumPy's global RNGs so that any code relying on the
# global generators (rather than an explicit random_state) is reproducible.
random.seed(SEED)
np.random.seed(SEED)
import warnings
# Silences every warning for the rest of the session (including deprecation
# and feature-name warnings) -- remove while debugging.
warnings.filterwarnings('ignore')

import sys
# explainability
#import shap
# print the JS visualization code to the notebook
#shap.initjs()

In [None]:
# Experiment configuration: where the raw data lives, which dataset to load,
# which column is the label and which classifier to train.
folder = "../data/raw/"  # Colab alternative: "/content/gdrive/Shareddrives/Personalized Recourse/data/raw/"
dataset_name = "adult_income"  # alternatives: "GiveMeSomeCredit", "HELOC"
target_variable = "income"  # alternatives: "RiskPerformance", "SeriousDlqin2yrs" (must match the dataset)
model_type = "rf"  # one of: "lr", "mlp", "xgb", "rf"

# The CSV is expected to be named after the dataset.
file_name = dataset_name + ".csv"

In [None]:
# Load the configured CSV into a DataFrame and preview its first rows.
csv_path = f"{folder}{file_name}"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show the column index of the loaded frame (the target column is still included here).
df.columns

In [None]:
# Report the dataset size; one column is the target, hence the "- 1" for features.
n_instances, n_columns = df.shape
print(f"The dataset has {n_instances} instances and {n_columns - 1} features.")
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Encode the string class labels of the target column as 0/1.
# Datasets without an entry here are left untouched.
if "HELOC" in dataset_name:
    label_map = {"Bad": 0, "Good": 1}
elif "adult" in dataset_name:
    # The leading space is part of the raw label text in this CSV.
    label_map = {" <=50K": 0, " >50K": 1}
else:
    label_map = {}

if label_map:
    # Only the target column is rewritten, so feature values that happen to
    # equal a label string are never clobbered. infer_objects() turns the
    # resulting object column into a proper integer column explicitly instead
    # of relying on pandas' deprecated implicit downcasting in replace();
    # labels that do not match the map are left as they are.
    encoded_target = df[target_variable].replace(label_map).infer_objects()
    # assign() returns a new frame, so re-running this cell is harmless.
    df = df.assign(**{target_variable: encoded_target})

### Train the classifier

In [None]:
# Separate the label vector from the predictor columns.
y = df[target_variable].values
X = df.drop(columns=target_variable)
features = X.columns

# Hold out 30% of the rows for testing; the split is seeded and stratified on
# the label so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)

In [None]:
# Build the classifier selected by `model_type`, then fit it on the training split.
if model_type == "xgb":
    model = xgb.XGBClassifier(
        # 'binary:logistic' is the binary-classification objective
        # ('reg:logistic' is its regression counterpart).
        objective='binary:logistic',
        colsample_bytree=0.2,
        # L1 regularisation, spelled with the sklearn-wrapper parameter name.
        reg_alpha=10,
        learning_rate=0.1,
        max_depth=4,
        n_estimators=100,
        # Seeded like every other model so column subsampling is reproducible.
        random_state=SEED,
    )
elif model_type == "rf":
    model = RandomForestClassifier(random_state=SEED)
elif model_type == "lr": #Logistic Regression
    model = LogisticRegression(random_state=SEED, penalty='l1', solver='liblinear')
elif model_type == "mlp": #Multi-layer Perceptron
    model = MLPClassifier(random_state=SEED, max_iter=100, verbose=True)
else:
    raise ValueError("Model type not supported")

# Fit on the raw ndarray, so the model stores no feature names; later
# predict() calls should therefore pass `.values` as well.
model.fit(X_train.values, y_train)

In [None]:
%%time
print('Training accuracy:', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy:', accuracy_score(y_test, model.predict(X_test)))

In [None]:
# Save the fitted model (whichever `model_type` was trained) to file
import pickle

model_path = f"../out/models/{dataset_name}_{model_type}_model.pkl"
# Create the output directory on first use instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager guarantees the file is flushed and closed.
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Load the previously saved model from file
# NOTE: unpickling can execute arbitrary code -- only load model files that
# were produced by this notebook or another trusted source.
with open(f"../out/models/{dataset_name}_{model_type}_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict the test labels; `.values` matches the ndarray input the model was fitted on.
y_pred = model.predict(X_test.values)

In [None]:
# Confusion matrix of the held-out predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)