In [1]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')

In [6]:
df = pd.read_excel('Data/Default.xlsx')
df.drop(columns=df.columns[0], inplace=True)
print(df.shape)
df.head()

  warn("Workbook contains no default style, apply openpyxl's default")


(10000, 4)


Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [14]:
df.replace({"Yes": 1, "No": 0}, inplace=True)
df.head()

Unnamed: 0,default,student,balance,income
0,0,0,729.526495,44361.625074
1,0,1,817.180407,12106.1347
2,0,0,1073.549164,31767.138947
3,0,0,529.250605,35704.493935
4,0,0,785.655883,38463.495879


In [54]:
from sklearn.model_selection import train_test_split

def split_df(random_state=42):
    df_train, df_test = train_test_split(
        df, test_size=0.2, 
        stratify=df[["default", "student"]],
        shuffle=True,
        random_state=random_state
    )
    X_train = df_train.drop(columns="default")
    X_test = df_test.drop(columns="default")
    y_train = df_train["default"]
    y_test = df_test["default"]
    return X_train, X_test, y_train, y_test

# for col in ["default", "student"]:
#     print(
#         "Train proportion of " + col + "s: ", 
#         df_train[col].mean(), "\n",
#         "Test proportion of " + col + "s: ", 
#         df_test[col].mean(), "\n",
#     )

### Using `income` and `balance`

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import zero_one_loss

res=[]

for X_train, X_test, y_train, y_test in [split_df(i) for i in range(100)]:
    logit = LogisticRegression().fit(X_train[["income", "balance"]], y_train)
    pred_test = logit.predict(X_test[["income", "balance"]])
    error_rate = zero_one_loss(y_test, pred_test)
#     print("Test error rate:", error_rate)
    res.append(error_rate)
    
plt.plot(np.arange(1, len(res)+1), res)
plt.gca().set_xlabel("split")
plt.gca().set_ylabel("error rate")

Conclusion: test error differs depending on the particular choice of the train/validation partition. </br>
(test set here really refers to the validation set) </br></br>
Though this is true, it doesn't affect the model selection process, since different partitions tend to agree on their argmin value (where minimization is w.r.t to all candidate models) (Figure 5.2 on p.200)

In [73]:
# corresponds to the 42 (default) train test split
test_error = res[42]

### Adding `student`

In [74]:
X_train, X_test, y_train, y_test = split_df()
logit = LogisticRegression().fit(X_train, y_train)
pred_test = logit.predict(X_test)
test_error_student = zero_one_loss(y_test, pred_test)

test_error, test_error_student

(0.027000000000000024, 0.02749999999999997)

Conclusion: the test error for the logit model with `student` is higher, so the model is not preferable.

### Bootstraping the estimates
In particular the SD of the coefficients of our Logit model. </br>
We'll compare those to the theoretically obtained (the ones which rely on several assumptions)

In [75]:
X_train.drop(columns="student", inplace=True)
X_test.drop(columns="student", inplace=True)

In [103]:
logit_sm = sm.Logit(y_train, X_train).fit()
logit_sm.summary()

Optimization terminated successfully.
         Current function value: 0.174151
         Iterations 8


0,1,2,3
Dep. Variable:,default,No. Observations:,8000.0
Model:,Logit,Df Residuals:,7998.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 05 Sep 2022",Pseudo R-squ.:,-0.1905
Time:,15:41:01,Log-Likelihood:,-1393.2
converged:,True,LL-Null:,-1170.3
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
balance,0.0004,7.84e-05,5.204,0.000,0.000,0.001
income,-0.0001,4.12e-06,-30.498,0.000,-0.000,-0.000


In [114]:
B = 100
logit = LogisticRegression()

coeffs = pd.DataFrame(np.zeros((B, 3)), columns=["balance", "income", "intercept"])
for k in range(B):
    # resample
    index_resamp_k = np.random.choice(X_train.index, size=X_train.shape[0])
    X_train_resamp_k = X_train.loc[index_resamp_k, :]
    y_train_resamp_k = y_train.loc[index_resamp_k]
    logit.fit(X_train_resamp_k, y_train_resamp_k)
    coeffs.loc[k, :] = np.r_[logit.coef_.ravel(), logit.intercept_]

In [115]:
coeffs.agg(["mean", "std"])

Unnamed: 0,balance,income,intercept
mean,0.002745,-5.8e-05,-5.218478
std,0.002602,7.5e-05,5.805531


the bootstrapped mean is coeff is significantly (in informal sense) different from the coefficients estimated by the model. </br>
This suggests there is a bias. (?) </br>
Stds are also different from the estimated.
This suggests that some of the assumptions are not satisfied. </br>
Perhaps the structure of the model $$logodds = \beta_0 + \beta_{balance}X_1 + \beta_{income}X_2$$
is not satisfied, or the error is not normally distributed. (?)