In [6]:
#Importing Libraries and moduels 
import pickle
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

from sklearn.model_selection import train_test_split

# Classification models
from sklearn.linear_model import LogisticRegression as c_lm
from sklearn.neighbors import KNeighborsClassifier as c_knn
from sklearn.tree import DecisionTreeClassifier as c_dt
from sklearn.ensemble import RandomForestClassifier as c_rf, GradientBoostingClassifier as c_gbf
from sklearn.svm import SVC as c_svm
from sklearn.neural_network import MLPClassifier as c_mlp

# Hyperparam tuning
from sklearn.model_selection import GridSearchCV

# Model evaluation
from ml_pipeline.model_evaluation import evaluate_classification

In [7]:
# Import data, perform SAME (match random state) train/test split
path = "data/dirty_credit_data (1).csv"
data = pd.read_csv(path)

# Split target from data
target = 'Defaulted'

# Drop records w/o target (can't train if no target)
data.dropna(subset = [target], inplace=True)

# Drop duplicates
data.drop_duplicates(inplace=True)

# Separate target from the rest of the data
cols = list(data.columns)
cols.remove(target)

# Define dependent and independent variables
y = data[target]
X = data[cols]

# Immediately train test split
# ! THE RANDOM SEED MUST MATCH WHAT YOU DID DURING DATA PREP!!!
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Import pickle and transform data
with open("./pickles/data_pipeline.pickle", 'rb') as handle:
    pipe = pickle.load(handle)
    x_tr = pipe.transform(x_tr)
    x_te = pipe.transform(x_te)

x_tr.head()



Unnamed: 0,age,credit_score,income,wtd_ave_debt_interest,ln(total_debt),ln(loan_value),employment_status_full-time,employment_status_part-time,employment_status_unemployed,employment_status_full-tme,...,loan_purpose_credit_card,loan_purpose_auto,loan_purpose_business,loan_purpose_mortgage,married_single,prior_default_n,education_level_2,education_level_3,education_level_1,education_level_0
9069,35.9,690.0,105072.0,0.1457,10.476245,10.476273,0,1,0,0,...,1,0,0,0,1,1,1,0,0,0
2603,36.6,643.0,47074.0,0.1337,9.019664,9.028219,1,0,0,0,...,1,0,0,0,0,1,0,0,1,0
7738,46.3,740.0,54489.0,0.1041,8.470102,8.494129,1,0,0,0,...,0,0,0,0,1,1,1,0,0,0
1579,54.7,737.0,131456.0,0.1269,11.85014,11.850176,1,0,0,0,...,0,0,0,1,1,1,1,0,0,0
5058,44.8,633.0,67539.0,0.1557,8.916506,8.913819,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0


In [5]:
# Model Two
# Linear Regression
# Instantiate    
    # No Hyperparameters for this model
lm_model = c_lm()

# Train the model 
lm_model.fit(x_tr, y_tr)

# Generate Predictions
lm_tr = lm_model.predict(x_tr)
lm_te = lm_model.predict(x_te)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Evaluate
metrics, blindcm, traincm, testcm = evaluate_classification(y_tr, lm_tr, y_te, lm_te)
metrics

In [None]:
# Model 3 
# Grid Search and GBF Model 
model = gbf()

# Define parameter ranges
param_grid = {
              'n_estimators': [100, 150, 200, 250],
              'max_depth': [2,3,4,6],
              'max_features': ['sqrt', 'log2', None]
             }

# Create GridSearchCV object
folds = 5
grid_search = GridSearchCV(model, param_grid, cv=folds)

# Fit the model to the data
grid_search.fit(x_tr, y_tr)

# Print the best parameters
best = grid_search.best_params_
print("Best parameters: ", best)

# Generate Predictions
grid_tr = grid_search.predict(x_tr)
grid_te = grid_search.predict(x_te)

In [None]:
# Evaluate
metrics, blindcm, traincm, testcm = evaluate_classification(y_tr, gbf_tr, y_te, gbf_te)
metrics

In [None]:
# Training set confusion matrix
traincm.confusion_matrix

In [None]:
traincm.confusion_matrix_rel

In [None]:
# Test Set confusion matrix
testcm.confusion_matrix_rel

In [None]:
# No clue what this does but needed for gbf 

import matplotlib.pyplot as plt
import seaborn as sns

lbl = list(gbf_model.feature_names_in_)
imp = list(gbf_model.feature_importances_)
impdf = pd.DataFrame({'variable':lbl,'importance':imp})
impdf.sort_values('importance', ascending=False, inplace=True)

fig = plt.figure(figsize = (12,12))
sns.barplot(x=impdf['importance'], y=impdf['variable'])

In [None]:
# Finalize the model 
X = pd.concat([x_tr, x_te], axis = 0, ignore_index = True)
y = pd.concat([y_tr, y_te], axis = 0, ignore_index = True)

# We use the same exact hyperparameters from tuning
gbf_model = c_gbf(n_estimators = 50, max_depth = 2)

# Train the model 
gbf_model.fit(X, y)

In [None]:
# Pickle and save the model
with open("./pickles/supply_chain_classifier.pickle", 'wb') as handle:
    pickle.dump(gbf_model, handle)