In [1]:
#%pip install mlxtend

In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegressionCV 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
import swifter
from sklearn.multioutput import MultiOutputClassifier 
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier.rocauc import roc_auc
import numpy as np
from yellowbrick.classifier.threshold import discrimination_threshold
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_confusion_matrix as mlx_plot_cm
from sklearn.preprocessing import StandardScaler

In [None]:
import os

# Location of the credit-card default workbook. Set the CREDIT_DEFAULT_XLS
# environment variable to point at your own copy of the file; the original
# author's local path is kept as the fallback so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "CREDIT_DEFAULT_XLS",
    r"C:\Users\David gathara marigi\Downloads\default of credit card clients.xls",
)

# skiprows=1 drops the first sheet row before the header is read, and
# index_col=0 uses the first column as the row index.
df = pd.read_excel(DATA_PATH, skiprows=1, index_col=0)

# Split the frame into the predictor columns and the binary target column.
target_col = 'default payment next month'
features = df.drop(columns=target_col)
targets = df[target_col]

In [None]:
# Hold out 25% of the rows for evaluation. Stratifying on the target keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split (and every metric computed from it in later cells) reproducible after
# a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    targets,
    train_size=0.75,
    stratify=targets,
    random_state=42,
)

In [None]:
# Baseline classifier: a logistic regression fitted on the training split.
# The iteration cap is raised to 1000 (presumably so the solver converges on
# the unscaled features -- confirm against the convergence warnings).
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X=x_train, y=y_train)

In [None]:
# Accuracy of the baseline logistic regression on the train and test splits

In [None]:
# Mean accuracy of the baseline model: training split first, then test split.
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(lr_model.score(split_x, split_y))

In [None]:
# Fraction of rows in each target class (class counts divided by row count).
df['default payment next month'].value_counts().div(len(df))

In [None]:
# Standardised copy of the full feature matrix, used by the scaled-KNN
# comparison in the cross-validation cell.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Candidate models compared with cross-validation.
gnb = GaussianNB()

# Search the regularisation strength C with built-in cross-validation, then
# build a plain logistic regression that uses the selected C.
# max_iter=1000 matches every other logistic regression in this notebook;
# leaving it at the library default gave these two models a smaller iteration
# budget than the baseline they are compared with.
lr_cv = LogisticRegressionCV(max_iter=1000)
lr_cv.fit(features, targets)
lr_best_c = LogisticRegression(C=lr_cv.C_[0], max_iter=1000)

knn = KNeighborsClassifier()

In [None]:
 
from sklearn.pipeline import make_pipeline

# Mean cross-validated accuracy for each candidate model on the raw features.
print(cross_val_score(gnb, features, targets, n_jobs=-1).mean())
print(cross_val_score(lr_best_c, features, targets, n_jobs=-1).mean())
print(cross_val_score(knn, features, targets, n_jobs=-1).mean())

# KNN on standardised features. The scaler is placed inside a pipeline so it
# is re-fitted on the training folds of every split; scoring on features that
# were standardised once on the whole dataset would leak validation-fold
# statistics into training.
scaled_knn = make_pipeline(StandardScaler(), knn)
print(cross_val_score(scaled_knn, features, targets, n_jobs=-1).mean())

In [None]:
# Cohen's kappa between the true test labels and the baseline model's
# predicted labels.
lr_test_labels = lr_model.predict(x_test)
cohen_kappa_score(y_test, lr_test_labels)

In [None]:
# Cross-validated Cohen's kappa for the baseline model on the full dataset.
# make_scorer wraps the metric so cross_val_score can use it for scoring.
kappa_scorer = make_scorer(cohen_kappa_score)
kappa_cv_scores = cross_val_score(
    lr_model, features, targets, scoring=kappa_scorer, n_jobs=-1
)
print(kappa_cv_scores.mean())

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrix of the baseline model on the test split, with readable
# class names in place of the raw labels.
default_labels = ['not defaulted', 'defaulted']
ConfusionMatrixDisplay.from_estimator(
    lr_model,
    x_test,
    y_test,
    display_labels=default_labels,
    cmap=plt.cm.Blues,
    colorbar=False,
)

In [None]:
# Multiclass target built from PAY_0: every code from 1 through 8 is merged
# into the single value 1; all other codes are left unchanged.
late_code_map = dict.fromkeys(range(1, 9), 1)
pay_0_target = df['PAY_0'].replace(late_code_map)

# Predictors are all columns except PAY_0 itself and the default flag.
pay_0_features = df.drop(columns=['PAY_0', 'default payment next month'])

lr_multi = LogisticRegression(max_iter=1000)
lr_multi.fit(X=pay_0_features, y=pay_0_target)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the PAY_0 multiclass model, evaluated on the same data
# it was fitted on.
pay_0_labels = [
    'no consumption',
    'paid on time',
    'revolving credit',
    'late',
]
disp = ConfusionMatrixDisplay.from_estimator(
    lr_multi,
    pay_0_features,
    pay_0_target,
    display_labels=pay_0_labels,
    cmap=plt.cm.Blues,
    colorbar=False,
)
# Turn the axes grid off so grid lines do not cross the matrix cells.
disp.ax_.grid(False)

In [None]:
# The six repayment-status columns (the source data has no PAY_1 column name).
pay_cols = ['PAY_0'] + [f'PAY_{i}' for i in range(2, 7)]

# Binary multi-output targets: 1 where the status code is positive, else 0.
# A single vectorised comparison gives the same frame as applying the test
# row by row, without the per-row Python overhead.
mo_targets = (df[pay_cols] > 0).astype(int)

# Predictors are every column except the repayment-status columns and the
# default flag; drop() keeps the remaining columns in their original order.
mo_features = df.drop(columns=pay_cols + ['default payment next month'])

In [None]:
# One logistic regression per target column, fitted in parallel (n_jobs=-1).
base_lr = LogisticRegression(max_iter=1000)
mo_clf = MultiOutputClassifier(base_lr, n_jobs=-1)
mo_clf.fit(mo_features, mo_targets)

In [None]:
# One 2x2 confusion matrix per repayment-status column, computed on the same
# data the multi-output model was fitted on.
mo_predictions = mo_clf.predict(mo_features)
ml_cm = multilabel_confusion_matrix(mo_targets, mo_predictions)

# Draw each matrix on its own figure, titled with the column name.
for cl, label_cm in zip(mo_targets.columns, ml_cm):
    cm_display = ConfusionMatrixDisplay(label_cm, display_labels=['on time', 'late'])
    cm_display.plot(cmap=plt.cm.Blues, colorbar=False)
    cm_display.ax_.set_title(cl)
    cm_display.ax_.grid(False)

In [None]:
# Text classification report for the baseline model on the test split.
binary_report = classification_report(
    y_test,
    lr_model.predict(x_test),
    target_names=['no default', 'default'],
)
print(binary_report)

In [None]:
# Graphical classification report (yellowbrick) for the baseline model on the
# test split.
# A fresh figure is created first; `ax` is not passed to the visualizer, so
# it presumably draws on the current axes -- TODO confirm.
f, ax = plt.subplots(1, 1) 
viz = ClassificationReport(lr_model, 
                           support=True, 
                           classes=['no default', 'default'], 
                           cmap='Blues') 
viz.score(x_test, y_test) 
# Remove the second axes of the figure (presumably the colorbar added by the
# visualizer -- confirm). This must run after score() and before show().
plt.gcf().delaxes(f.axes[1]) 
viz.show()

In [None]:
# Text classification report for the PAY_0 multiclass model, evaluated on the
# same data it was fitted on.
pay_0_predictions = lr_multi.predict(pay_0_features)
pay_0_class_names = ['no consumption', 'on time', 'credit', 'late']
print(
    classification_report(
        pay_0_target,
        pay_0_predictions,
        target_names=pay_0_class_names,
    )
)

In [None]:
# ROC curves for the baseline model on the test split, with the micro- and
# macro-averaged curves switched off.
binary_class_names = ['no default', 'default']
roc = roc_auc(
    lr_model,
    x_train, y_train,
    x_test, y_test,
    classes=binary_class_names,
    micro=False,
    macro=False,
)

In [None]:
# ROC curves for the PAY_0 multiclass model; only the fitting data is passed
# (no separate test split), and the averaged curves are switched off.
averaging_off = {'macro': False, 'micro': False}
roc = roc_auc(lr_multi, pay_0_features, pay_0_target, **averaging_off)

In [None]:
# ROC curves for the baseline model on the test split, using the visualizer's
# default averaging settings.
roc_class_names = ['no default', 'default']
roc = roc_auc(
    lr_model,
    x_train, y_train,
    x_test, y_test,
    classes=roc_class_names,
)

In [None]:
from sklearn.metrics import RocCurveDisplay

In [None]:
# ROC curve of the baseline model on the test split, with two reference
# curves drawn on the same axes: the diagonal and the top-left corner path.
roc = RocCurveDisplay.from_estimator(lr_model, x_test, y_test)
roc_ax = roc.ax_
roc_ax.plot([0, 1], [0, 1], c='k', linestyle='dotted', label='random model')
roc_ax.plot([0, 0, 1], [0, 1, 1], c='k', linestyle='dashed', label='perfect model')
roc_ax.legend()

In [None]:
# Threshold that maximises Youden's J statistic (TPR - FPR) on the test split.
# drop_intermediate=False keeps every point of the ROC curve.
roc = RocCurveDisplay.from_estimator(lr_model, x_test, y_test, drop_intermediate=False)
youdens_j = roc.tpr - roc.fpr
youdens_idx = np.argmax(youdens_j)

# Rebuild the threshold list by hand: the distinct positive-class
# probabilities in descending order, preceded by 1.
# NOTE(review): this assumes the list lines up index-for-index with
# roc.fpr / roc.tpr -- confirm.
positive_probs = lr_model.predict_proba(x_test)[:, 1]
thresholds = [1] + list(np.unique(positive_probs)[::-1])
y_thresh = thresholds[youdens_idx]

In [None]:
# Threshold whose ROC point lies closest (Euclidean distance) to the
# top-left corner (FPR=0, TPR=1).
n_roc_points = roc.tpr.shape[0]
upper_left_array = np.tile([0.0, 1.0], (n_roc_points, 1))
roc_curve_points = np.column_stack((roc.fpr, roc.tpr))
corner_distances = np.linalg.norm(upper_left_array - roc_curve_points, axis=1)
topleft_idx = np.argmin(corner_distances)
tl_thresh = thresholds[topleft_idx]

In [None]:
# Threshold that maximises the F-score curve produced by yellowbrick's
# discrimination-threshold visualizer on the training split.
# The visualizer scores the model over repeated random splits; seeding it
# makes the curve, and therefore f_thresh, reproducible between runs.
dt = discrimination_threshold(lr_model, x_train, y_train, random_state=42)
f_idx = dt.cv_scores_['fscore'].argmax()
f_thresh = dt.thresholds_[f_idx]

In [None]:
# Test-split accuracy when the positive class is predicted at each of the
# three candidate thresholds (Youden, top-left, F-score).
default_probs = lr_model.predict_proba(x_test)[:, 1]
for t in (y_thresh, tl_thresh, f_thresh):
    thresholded_labels = default_probs >= t
    print(accuracy_score(y_test, thresholded_labels))

In [None]:
from mlxtend.plotting import plot_confusion_matrix as mlx_plot_cm 

In [None]:
# Confusion matrix of the baseline model on the test split when the Youden
# threshold is used as the decision cut-off.
positive_class_probs = lr_model.predict_proba(x_test)[:, 1]
predictions = positive_class_probs >= y_thresh
youden_cm = confusion_matrix(y_test, predictions)
mlx_plot_cm(youden_cm)

In [None]:
# Downsampling: work with fewer rows (a stratified sample, then random undersampling)

In [None]:
# Stratified 10% sample of the full dataset. Only the "test" halves of the
# split are kept; the remaining 90% is discarded.
sample_split = train_test_split(
    features,
    targets,
    test_size=0.1,
    stratify=targets,
    random_state=42,
)
x_sample, y_sample = sample_split[1], sample_split[3]

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Random undersampling with a fixed seed, applied to the full dataset.
# NOTE(review): this resamples every row, including those held out in
# x_test -- confirm that is intended before training on the result.
rus = RandomUnderSampler(random_state=0)
x_resampled, y_resampled = rus.fit_resample(X=features, y=targets)

In [None]:
# Oversampling: resample the training split, then compare test ROC AUC with and without it

In [None]:
from sklearn.metrics import roc_auc_score 
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Compare test-split ROC AUC for a model trained on the original training
# data against one trained on a randomly oversampled copy of it.
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)

# NOTE: this rebinds lr_model, which earlier cells fitted as a plain
# LogisticRegression; re-running those cells afterwards uses this CV model.
lr_model = LogisticRegressionCV(max_iter=5000)
lr_model.fit(x_train, y_train)
print('unmodified:',
      roc_auc_score(y_test,
                    lr_model.predict_proba(x_test)[:, 1])
     )

# Same iteration budget as the unmodified model, so the two scores differ
# only in the training data and not in how far the solver was allowed to run.
lr_model_rs = LogisticRegressionCV(max_iter=5000)
lr_model_rs.fit(x_resampled, y_resampled)
print('resampled:',
      roc_auc_score(y_test,
                    lr_model_rs.predict_proba(x_test)[:, 1])
     )

In [None]:
from imblearn.over_sampling import KMeansSMOTE

In [None]:
# Oversample the training split with KMeans-SMOTE, refit the CV logistic
# regression on the result, and report ROC AUC on the untouched test split.
kmSMOTE = KMeansSMOTE(
    k_neighbors=5,
    cluster_balance_threshold=0.2,
    random_state=42,
    n_jobs=-1,
)
x_resampled, y_resampled = kmSMOTE.fit_resample(x_train, y_train)

lr_model_rs = LogisticRegressionCV(max_iter=5000)
lr_model_rs.fit(x_resampled, y_resampled)

kmsmote_test_probs = lr_model_rs.predict_proba(x_test)[:, 1]
print('resampled:', roc_auc_score(y_test, kmsmote_test_probs))

In [None]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN 
from imblearn.combine import SMOTEENN, SMOTETomek

In [None]:
# Resampling strategies to compare, all built with the same seed.
sampler_seed = 42
samplers = [
    SMOTE(random_state=sampler_seed),
    BorderlineSMOTE(random_state=sampler_seed, kind="borderline-1"),
    BorderlineSMOTE(random_state=sampler_seed, kind="borderline-2"),
    SVMSMOTE(random_state=sampler_seed),
    ADASYN(random_state=sampler_seed),
    SMOTEENN(random_state=sampler_seed),
    SMOTETomek(random_state=sampler_seed),
]

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features using statistics learned from the training split
# only, then apply the same transformation to the test split.
# Both frames are cast to float64 before scaling.
x_train_float = x_train.astype(np.float64)
x_test_float = x_test.astype(np.float64)

scaler = StandardScaler().fit(x_train_float)
x_train_scaled = scaler.transform(x_train_float)
x_test_scaled = scaler.transform(x_test_float)


In [None]:
# For every sampler: resample the scaled training split, fit a CV logistic
# regression (saga solver, loosened tolerance) and print the sampler's repr
# next to its ROC AUC on the scaled test split.
for s in samplers:
    x_resampled, y_resampled = s.fit_resample(x_train_scaled, y_train)
    lr_model_rs = LogisticRegressionCV(solver='saga', tol=1e-3, max_iter=5000)
    lr_model_rs.fit(x_resampled, y_resampled)
    scaled_test_probs = lr_model_rs.predict_proba(x_test_scaled)[:, 1]
    ra_score = roc_auc_score(y_test, scaled_test_probs)
    print(f'{str(s):<55} {ra_score}')