In [25]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.ticker as mtick
from matplotlib.ticker import MultipleLocator, FuncFormatter
import seaborn as sns
import dataframe_image as dfi
from scipy.stats import loguniform
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Plot style and pandas display configuration
plt.style.use('ggplot')
pd.options.display.float_format = '{:,.0f}'.format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [12]:
#Import Accepted_Rejected
# Load the ordinal-encoded accepted/rejected applications table from a CSV
# in the current working directory.
# NOTE(review): the head() preview shows a leading 'Unnamed: 0' column. If that
# is a row index saved into the CSV (rather than just the displayed index),
# read with index_col=0 so it does not end up among the features — confirm.
Accepted_Rejected_Ord = pd.read_csv('Accepted_Rejected_OrdEncoded.csv')

In [13]:
# Preview the first rows of the loaded table to check columns and values.
Accepted_Rejected_Ord.head()

Unnamed: 0,Loan_Amount,Risk_Score,Debt-Income,State,Employment_Length,Year,Credit_Policy,Application
0,3600,677,6,PA,10,2015,1,0
1,24700,717,16,SD,10,2015,1,0
2,20000,697,11,IL,10,2015,1,0
3,35000,787,17,NJ,10,2015,1,0
4,10400,697,25,PA,3,2015,1,0


In [14]:
# Target: the Application column.
Y = Accepted_Rejected_Ord['Application']
# State is handled separately; keep it as a one-column DataFrame.
X_Nominal = Accepted_Rejected_Ord[['State']]
# Every remaining column (all but the target and State) is used as-is.
X_Num_Ord = Accepted_Rejected_Ord.drop(columns=['Application', 'State'])

In [15]:
#One Hot Encode Nominal (State)
# Dense dummy matrix for State; drop='first' omits the first category of the
# feature so it acts as the reference level.
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_Nominal_One_Hot = encoder.fit_transform(X_Nominal)

# Wrap the encoded array in a frame that reuses the source row index so it
# lines up row-for-row with the remaining feature columns.
X_Nominal_One_Hot_df = pd.DataFrame(
    data=X_Nominal_One_Hot,
    index=Accepted_Rejected_Ord.index,
    columns=encoder.get_feature_names_out(X_Nominal.columns),
)

#Merge
# Index-on-index left join of the non-State features with the State dummies.
X = X_Num_Ord.merge(
    X_Nominal_One_Hot_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [16]:
# Check the (rows, columns) dimensions of the assembled feature matrix.
X.shape

(11004999, 56)

In [26]:
#Train Test Split
# Stage 1: hold out 20% of all rows as the final test set, stratified on the
# target so the class ratio is preserved.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    stratify=Y,
    random_state=2,
)

# Stage 2: split the remaining training rows again (80/20, stratified) to get
# a fitting set and a validation set.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=.2,
    stratify=Y_train,
    random_state=2,
)

In [27]:
#Logistic Regression (Base)
# Baseline classifier: L2-penalised logistic regression with balanced class
# weights. random_state pins the SAGA solver's data shuffling so the reported
# AUCs are reproducible between runs.
logreg = LogisticRegression(
    max_iter=100,
    class_weight='balanced',
    solver='saga',
    penalty='l2',
    n_jobs=-1,
    tol=1e-3,
    random_state=2
)

# SAGA only converges quickly when the features share a similar scale. The raw
# matrix mixes 0/1 state dummies with loan amounts in the tens of thousands,
# so standardise inside a pipeline: the scaler statistics are learned from
# X_tr only and then re-applied to the validation and test rows.
# `logreg` is fitted in place as the pipeline's final step (on scaled inputs),
# so always predict through `logreg_scaled`, not through `logreg` directly.
logreg_scaled = make_pipeline(StandardScaler(), logreg)
logreg_scaled.fit(X_tr, Y_tr)

# ROC-AUC is computed from the predicted probability of the positive class.
logreg_val_auc = roc_auc_score(Y_val, logreg_scaled.predict_proba(X_val)[:, 1])  #Validation
logreg_test_auc = roc_auc_score(Y_test, logreg_scaled.predict_proba(X_test)[:, 1])  #Final test

print('Validation ROC-AUC:', logreg_val_auc)
print('Test ROC-AUC:', logreg_test_auc)

KeyboardInterrupt: 

In [None]:
#Logistic Regression (RandomizedSearchCV)
# Tune the inverse regularisation strength C over a log-uniform range.
# The scaler is part of the searched estimator so that it is re-fit on each
# training fold (nothing is learned from the held-out fold) and so that the
# SAGA solver sees comparably scaled features. The search clones the pipeline
# for every fit, so the `logreg` object itself is left untouched.
logreg_pipe = make_pipeline(StandardScaler(), logreg)

# make_pipeline names each step after its lower-cased class name, hence the
# 'logisticregression__' prefix on the tuned parameter.
param_dist = {
    'logisticregression__C': loguniform(1e-3, 1e2)
}

logreg_rand = RandomizedSearchCV(
    estimator=logreg_pipe,
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    random_state=5,
    # error_score must be 'raise' or a number; the string 'nan' is rejected.
    error_score=np.nan
)

logreg_rand.fit(X_train, Y_train)
# Best pipeline, already refit on all of X_train by the search.
best_logreg = logreg_rand.best_estimator_

# Re-score the selected configuration with a separate shuffled 5-fold split.
# These folds come from the same rows used for selection, so treat the mean
# as slightly optimistic; the held-out test AUC below is the unbiased figure.
best_logreg_cv_scores = cross_val_score(
    best_logreg,
    X_train,
    Y_train,
    cv=StratifiedKFold(5, shuffle=True, random_state=2),
    scoring='roc_auc',
    n_jobs=-1
)

best_logreg_test_auc = roc_auc_score(
    Y_test,
    best_logreg.predict_proba(X_test)[:, 1]
)

print('Test ROC-AUC:', best_logreg_test_auc)
print('Mean CV ROC-AUC:', best_logreg_cv_scores.mean())
print('CV ROC-AUC Scores:', best_logreg_cv_scores)
print('Best Params:', logreg_rand.best_params_)