In [None]:
import pandas as pd
import numpy as np
import time
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Directory that holds the pre-split loan data CSVs. Change this single value
# to run the notebook against a different location.
DATA_DIR = "/Users/admin/Downloads/LGD/PD_LGD_EAD_model"

# index_col=0: use the first CSV column as the row index of each frame.
X_train = pd.read_csv(f"{DATA_DIR}/loan_data_train.csv", index_col=0)
X_test = pd.read_csv(f"{DATA_DIR}/loan_data_test.csv", index_col=0)
y_train = pd.read_csv(f"{DATA_DIR}/loan_data_label_train.csv", index_col=0)
y_test = pd.read_csv(f"{DATA_DIR}/loan_data_label_test.csv", index_col=0)

Select the features that have already been preprocessed.

In [None]:
def cols_to_use(df):
    """Pick the model input columns out of a preprocessed loan frame.

    Columns are gathered by substring match against a fixed list of name
    fragments, then narrowed down in three steps: columns whose name matches
    ``sub_grade|loan_status|categories|cats`` are removed, duplicated column
    names are collapsed, and three fixed lists of columns (raw/unused dummies,
    raw numeric helpers, and the reference categories) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in the drop lists below;
        ``DataFrame.drop`` raises ``KeyError`` if one of them is missing.

    Returns
    -------
    tuple
        ``(df, inputs_train, model_inputs, reference_category)`` where ``df``
        is the unchanged input, ``inputs_train`` is the substring-matched
        selection (it can contain repeated columns because fragments overlap
        and ``'emp_length_'`` is listed twice), ``model_inputs`` is the final
        narrowed frame and ``reference_category`` is the list of dropped
        reference-category column names.
    """
    # Name fragments, in the order their matches should appear in the output.
    name_fragments = [
        'grade_', 'home_ownership', 'verification_status__', 'loan_status_',
        'purpose_', 'addr_state_', 'initial_list_status_',
        'earliest_cr_line_weeks_', 'acc_now_delinq_', 'total_acc_', 'pub_rec_',
        'open_acc_', 'inq_last_6mths_', 'delinq_2yrs_', 'emp_length_',
        'emp_length_', 'term_', 'issue_d_', 'int_rate_', 'annual_inc_',
        'mths_since_last_delinq_', 'dti_', 'mths_since_last_record_',
    ]
    matched_names = [
        name
        for fragment in name_fragments
        for name in df.filter(like=fragment).columns.values.tolist()
    ]
    inputs_train = df[matched_names]

    # Remove excluded name patterns first, then collapse repeated columns.
    excluded = inputs_train.columns.str.contains('sub_grade|loan_status|categories|cats')
    model_inputs = inputs_train.loc[:, ~excluded]
    model_inputs = model_inputs.loc[:, ~model_inputs.columns.duplicated()]

    unused_dummies = [
        'home_ownership', 'home_ownership_ANY', 'home_ownership_NONE',
        'home_ownership_OTHER', 'home_ownership_RENT', 'purpose_car',
        'purpose_small_business', 'purpose_educational', 'purpose_moving',
        'purpose_house', 'purpose_renewable_energy', 'purpose_medical',
        'purpose_wedding', 'purpose_vacation', 'purpose_major_purchase',
        'purpose_car',
        'addr_state_AK', 'addr_state_AL', 'addr_state_AR', 'addr_state_AZ',
        'addr_state_CO', 'addr_state_CT', 'addr_state_DC', 'addr_state_DE',
        'addr_state_FL', 'addr_state_GA', 'addr_state_HI', 'addr_state_IA',
        'addr_state_ID', 'addr_state_IL', 'addr_state_IN', 'addr_state_KS',
        'addr_state_KY', 'addr_state_LA', 'addr_state_MA', 'addr_state_MD',
        'addr_state_ME', 'addr_state_MI', 'addr_state_MN', 'addr_state_MO',
        'addr_state_MS', 'addr_state_MT', 'addr_state_NC', 'addr_state_NE',
        'addr_state_NH', 'addr_state_NJ', 'addr_state_NM', 'addr_state_NV',
        'addr_state_OH', 'addr_state_OK', 'addr_state_OR', 'addr_state_PA',
        'addr_state_RI', 'addr_state_SC', 'addr_state_SD', 'addr_state_TN',
        'addr_state_UT', 'addr_state_VA', 'addr_state_VT', 'addr_state_WA',
        'addr_state_WI', 'addr_state_WV', 'addr_state_WY', 'addr_state_ND',
    ]
    raw_helpers = [
        'open_acc_6m', 'emp_length_int', 'term_int', 'issue_d_days',
        'issue_d_weeks', 'annual_inc_joint', 'dti_joint',
    ]
    reference_category = [
        'grade_G', 'home_ownership:RENT_OTHER_ANY_NONE', 'purpose_worst',
        'addr_state_worst', 'initial_list_status_w',
        'earliest_cr_line_weeks_(-2.552, 255.2]', 'total_acc_<9',
        'emp_length_0', 'term_60', 'open_acc_<4', 'inq_last_6mths_>3',
        'issue_d_weeks_>391', 'int_rate_(23.996, 26.06]',
        'annual_inc_<38K', 'dti_(37.991, 39.99]',
    ]

    # Three separate drops, in the same order as before, so a missing column
    # is reported against the list it belongs to.
    for doomed in (unused_dummies, raw_helpers, reference_category):
        model_inputs = model_inputs.drop(columns=doomed)

    return df, inputs_train, model_inputs, reference_category

In [None]:
# Run the same column selection on the training and the test features.
# Each call returns (original frame, substring-matched frame, final model
# inputs, reference categories).
main_train_df, sub_train_df, sub_main_cats_train_df, reference_cats = cols_to_use(X_train)
main_test_df, sub_test_df, sub_main_cats_test_df, reference_cats = cols_to_use(X_test)

# The models below are fitted on the training columns and then applied to the
# test columns, so both frames must expose the same features in the same order.
# Fail loudly here instead of scoring misaligned features later.
if list(sub_main_cats_train_df.columns) != list(sub_main_cats_test_df.columns):
    mismatched = sorted(
        set(sub_main_cats_train_df.columns) ^ set(sub_main_cats_test_df.columns)
    )
    raise ValueError(
        "Train and test feature columns differ "
        f"(columns present in only one frame: {mismatched}; "
        "an empty list means only the column order differs)."
    )

PD MODEL ESTIMATION

In [None]:
# scikit-learn expects a 1-D label vector, so flatten the single-column frame.
train_labels = y_train.to_numpy().ravel()

reg = LogisticRegression(max_iter=1000)
reg.fit(sub_main_cats_train_df, train_labels)

In [None]:
# Intercept term of the fitted scikit-learn logistic regression.
reg.intercept_

In [None]:
# Fitted coefficients, one per input column of sub_main_cats_train_df.
reg.coef_

In [None]:
# Column names of the training inputs, used to label the coefficients below.
feature_names = sub_main_cats_train_df.columns.values

In [None]:
# Coefficient table for the scikit-learn model: the intercept sits in row 0,
# followed by one row per feature in training-column order.
summary_table = pd.DataFrame({
    'Feature names': ['Intercept', *feature_names],
    'Coefficients': [reg.intercept_[0], *reg.coef_.ravel()],
})
summary_table

LOGISTIC REGRESSION MODEL WITH P-VALUES

In [None]:
import statsmodels.api as sm

In [None]:
# Inspect the dtypes of the labels and the model inputs before fitting with
# statsmodels (presumably to spot the bool dummy columns that the next cell
# casts to int).
print(y_train.dtypes)
print(sub_main_cats_train_df.dtypes)

In [None]:
# Cast every bool-dtype column to int in both design matrices; all other
# columns are left as they are.
train_bool_casts = {
    name: int for name, dtype in sub_main_cats_train_df.dtypes.items() if dtype == 'bool'
}
test_bool_casts = {
    name: int for name, dtype in sub_main_cats_test_df.dtypes.items() if dtype == 'bool'
}
sub_main_cats_train_df = sub_main_cats_train_df.astype(train_bool_casts)
sub_main_cats_test_df = sub_main_cats_test_df.astype(test_bool_casts)

In [None]:
# statsmodels does not add an intercept on its own, so add the constant
# column to both the training and the test design matrices.
sub_main_cats_train_df = sm.add_constant(sub_main_cats_train_df)
sub_main_cats_test_df = sm.add_constant(sub_main_cats_test_df)
# Fit a logistic regression of the training labels on the training inputs;
# `result` carries the estimated parameters and their p-values.
model = sm.Logit(y_train, sub_main_cats_train_df)
result = model.fit()

In [None]:
# Full statsmodels regression report for the fitted Logit model.
result.summary()

In [None]:
# One row per fitted parameter: its name, coefficient and p-value.
summary_table = (
    result.params.rename('Coefficients')
    .to_frame()
    .assign(p_values=result.pvalues.values)
    .rename_axis('Features')
    .reset_index()
)
summary_table.to_csv("PD_model_params.csv")

After observing the p-values, we remove the `open_acc` and `total_acc` features.

In [None]:
# Remove every column whose name contains 'open_acc' or 'total_acc' from both
# design matrices.
train_cols_to_drop = sub_main_cats_train_df.filter(regex='open_acc|total_acc').columns
test_cols_to_drop = sub_main_cats_test_df.filter(regex='open_acc|total_acc').columns
sub_main_cats_train_df = sub_main_cats_train_df.drop(columns=train_cols_to_drop)
sub_main_cats_test_df = sub_main_cats_test_df.drop(columns=test_cols_to_drop)

We fit the model again

In [None]:
# Make sure both design matrices carry the constant column, then refit the
# Logit model on the reduced feature set.
sub_main_cats_train_df = sm.add_constant(sub_main_cats_train_df)
sub_main_cats_test_df = sm.add_constant(sub_main_cats_test_df)
model = sm.Logit(y_train, sub_main_cats_train_df)
result = model.fit()

# One row per fitted parameter: its name, coefficient and p-value.
summary_table = (
    result.params.rename('Coefficients')
    .to_frame()
    .assign(p_values=result.pvalues.values)
    .rename_axis('Features')
    .reset_index()
)
summary_table.to_csv("PD_model_params_inclusive_p_vals.csv")

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Probability cut-off used to turn predicted probabilities into 0/1 classes:
# observations scoring strictly above it are labelled 1. Named here so the
# choice is visible and easy to tune.
CLASSIFICATION_THRESHOLD = 0.85

# Predicted probabilities from the refitted Logit model on the test inputs.
predictions_prob = result.predict(sub_main_cats_test_df)
# Convert probabilities to binary predictions.
predictions = (predictions_prob > CLASSIFICATION_THRESHOLD).astype(int)

# Share of test observations whose thresholded prediction matches the label.
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# Per-class text report for the thresholded test-set predictions.
report = classification_report(y_test, predictions)
print(report)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Raw ROC output: false positive rates, true positive rates and thresholds.
roc_curve(y_test, predictions_prob)

In [None]:
# Keep the ROC components for plotting; `tr` holds the decision thresholds.
fpr, tpr, tr = roc_curve(y_test, predictions_prob)

In [None]:
# ROC curve of the PD model on the test set, with the chance diagonal for
# reference. Labels and a title let the figure stand on its own.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Model')
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve')
ax.legend();

In [None]:
# Area under the ROC curve for the predicted probabilities on the test set.
auc = roc_auc_score(y_test, predictions_prob)
auc

GINI AND KOLMOGOROV SMIRNOV PERFORMANCE MEASURE

In [None]:
# Pair each test label with its predicted probability (aligned on the index).
# Only the first column of y_test (the label) is used, and the column names
# are set through `keys`, so re-running this cell rebuilds the same two-column
# frame instead of failing on an already-extended y_test.
y_test = pd.concat(
    [y_test.iloc[:, 0], pd.Series(predictions_prob)],
    axis=1,
    keys=['actual_class', 'predicted_probabilities'],
)

In [None]:
# Order observations from lowest to highest predicted probability.
y_test = y_test.sort_values('predicted_probabilities')

In [None]:
# Work on a re-indexed copy so that row position equals rank by predicted
# probability (the old index is kept as an 'index' column).
df_test = y_test.reset_index()

total_count = len(df_test)
good_count = df_test['actual_class'].sum()
bad_count = total_count - good_count

# Running counts down the sorted table; actual_class is summed as "good".
running_total = pd.Series(df_test.index + 1, index=df_test.index)
running_good = df_test['actual_class'].cumsum()
running_bad = running_total - running_good

df_test = df_test.assign(**{
    'cumulative_N_population': running_total,
    'cumulative_N_good': running_good,
    'cumulative_N_bad': running_bad,
    'cumulative_%_population': running_total / total_count,
    'cumulative_%_good': running_good / good_count,
    'cumulative_%_bad': running_bad / bad_count,
})
df_test


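The Gini curve plots the cumulative % of the population (x-axis) against the cumulative % of bad borrowers (y-axis); the Gini coefficient measures how far this curve lies from the diagonal.

$\text{Gini} = 2 \times \text{AUROC} - 1$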

In [None]:
# Gini curve: cumulative share of bad borrowers against cumulative share of
# the population, with the diagonal as the reference line.
fig, ax = plt.subplots()
ax.plot(df_test['cumulative_%_population'], df_test['cumulative_%_bad'], label='Model')
ax.plot(df_test['cumulative_%_population'], df_test['cumulative_%_population'],
        linestyle='--', label='Diagonal')
ax.set(xlabel='Cumulative % of population', ylabel='Cumulative % of bad', title='Gini curve')
ax.legend();

The Kolmogorov-Smirnov (K-S) statistic measures the maximum difference between the cumulative distributions of good and bad borrowers.

perfect model - 
K-S=1

predicting by chance - 
K-S=0

In [None]:
# Cumulative shares of bad (red) and good (blue) borrowers as the predicted
# probability increases; the K-S statistic is the largest vertical gap.
fig, ax = plt.subplots()
ax.plot(df_test['predicted_probabilities'], df_test['cumulative_%_bad'], color='r', label='Bad')
ax.plot(df_test['predicted_probabilities'], df_test['cumulative_%_good'], color='b', label='Good')
ax.set(xlabel='Predicted probability', ylabel='Cumulative %', title='Kolmogorov-Smirnov')
ax.legend();

In [None]:
# K-S statistic: the largest gap between the cumulative share of bad and the
# cumulative share of good borrowers. Series.max() is vectorised and skips
# NaN, whereas the built-in max() gives order-dependent results when a NaN is
# present. float() keeps KS a plain Python float, as before.
ks_gap = df_test['cumulative_%_bad'] - df_test['cumulative_%_good']
KS = float(ks_gap.max())
KS