In [2]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
      QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from functools import partial
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

### Question 5

We will estimate the test error on the logistic regression model from Ch 04 Exercises.

a) Fit a logistic regression model that uses income and balance to
 predict default.

In [5]:
default = pd.read_csv("Default.csv")
default

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.134700
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879
...,...,...,...,...
9995,No,No,711.555020,52992.378914
9996,No,No,757.962918,19660.721768
9997,No,No,845.411989,58636.156984
9998,No,No,1569.009053,36669.112365


In [11]:
np.unique(default[['default','student']])

array(['No', 'Yes'], dtype=object)

In [13]:
np.unique(np.isnan(default[['balance','income']]))

array([False])

No missing values to worry about.

In [22]:
allvars = default[['balance','income']]
design = MS(allvars)
X = design.fit_transform(default)
y = default.default == 'Yes'
glm = sm.GLM(y,
             X,
             family=sm.families.Binomial())
results = glm.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.5405,0.435,-26.544,0.0
balance,0.0056,0.0,24.835,0.0
income,2.1e-05,5e-06,4.174,0.0


b) Using the validation set approach, estimate the test error of this  model.

In [109]:
#split train/test data - 5000/5000
def_train, def_valid = train_test_split(default[['balance','income','default']],
                                          test_size=5000,
                                          random_state=3)

#retrain logistic model on training data
X_train = design.fit_transform(def_train)
y_train = def_train.default == 'Yes'
glm = sm.GLM(y_train,
             X_train,
             family=sm.families.Binomial())
results = glm.fit()

#prediction of test data and classifying predictions
X_test = design.fit_transform(def_valid)
y_test = def_valid.default == 'Yes'
probs = results.predict(exog=X_test)

y_test = np.where(y_test==0,"No","Ye")
labels = np.array(["No"]*5000)
labels[probs>0.5] = 'Yes'
confusion_table(labels, y_test)

Truth,No,Ye
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4817,110
Ye,14,59


In [111]:
np.unique(labels)

array(['No', 'Ye'], dtype='<U2')

I don't know why labels is only assigning "Ye" instead of "Yes," so we're going to work with some funny labeling.

In [114]:
(14+110)/5000

0.0248

The validation set error is 2.5%

c) Repeat the process in (b) three times, using three different splits  of the observations into a training set and a validation set. Co
ment on the results obtaine.


In [136]:
#split train/test data - 2500/7500
def_train, def_valid = train_test_split(default[['balance','income','default']],
                                          test_size=7500,
                                          random_state=26)

#retrain logistic model on training data
X_train = design.fit_transform(def_train)
y_train = def_train.default == 'Yes'
glm = sm.GLM(y_train,
             X_train,
             family=sm.families.Binomial())
results = glm.fit()

#prediction of test data and classifying predictions
X_test = design.fit_transform(def_valid)
y_test = def_valid.default == 'Yes'
probs = results.predict(exog=X_test)

y_test = np.where(y_test==0,"No","Ye")
labels = np.array(["No"]*7500)
labels[probs>0.5] = 'Yes'
confusion_table(labels, y_test)

Truth,No,Ye
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,7211,181
Ye,29,79


In [138]:
#split train/test data - 7500/2500
def_train, def_valid = train_test_split(default[['balance','income','default']],
                                          test_size=2500,
                                          random_state=7)

#retrain logistic model on training data
X_train = design.fit_transform(def_train)
y_train = def_train.default == 'Yes'
glm = sm.GLM(y_train,
             X_train,
             family=sm.families.Binomial())
results = glm.fit()

#prediction of test data and classifying predictions
X_test = design.fit_transform(def_valid)
y_test = def_valid.default == 'Yes'
probs = results.predict(exog=X_test)

y_test = np.where(y_test==0,"No","Ye")
labels = np.array(["No"]*2500)
labels[probs>0.5] = 'Yes'
confusion_table(labels, y_test)

Truth,No,Ye
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,2416,55
Ye,6,23


In [140]:
#split train/test data - 9000/1000
def_train, def_valid = train_test_split(default[['balance','income','default']],
                                          test_size=1000,
                                          random_state=31)

#retrain logistic model on training data
X_train = design.fit_transform(def_train)
y_train = def_train.default == 'Yes'
glm = sm.GLM(y_train,
             X_train,
             family=sm.families.Binomial())
results = glm.fit()

#prediction of test data and classifying predictions
X_test = design.fit_transform(def_valid)
y_test = def_valid.default == 'Yes'
probs = results.predict(exog=X_test)

y_test = np.where(y_test==0,"No","Ye")
labels = np.array(["No"]*1000)
labels[probs>0.5] = 'Yes'
confusion_table(labels, y_test)

Truth,No,Ye
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,968,20
Ye,1,11


In [142]:
print("Split 1:",(14+110)/5000,'\n',"Split 2:",(29+181)/7500,'\n',"Split 3:",(6+55)/2500,'\n',"Split 4:",(21)/1000)

Split 1: 0.0248 
 Split 2: 0.028 
 Split 3: 0.0244 
 Split 4: 0.021


The validation set error varies a lot depending on the proportion of observations held out.  the fewer observations used for testing, the more likely the model is overfitting the data.

d) Now consider a logistic regression model that predicts the prob
ability of default using income, balance, and a dummy variabl 
 for student. Estimate the test error for this model using the vl
idation set approach. Comment on whether or not includin a
 d ummyvariable for student leads to a reduction in the test e or
 rate.

In [163]:
#split train/test data - 5000/5000
default['student'] = default['student'].astype("category")
def_train, def_valid = train_test_split(default,
                                          test_size=5000,
                                          random_state=3)

allvars = default[['balance','income','student']]
design = MS(allvars)

#retrain logistic model on training data
X_train = design.fit_transform(def_train)
y_train = def_train.default == 'Yes'
glm = sm.GLM(y_train,
             X_train,
             family=sm.families.Binomial())
results = glm.fit()

#prediction of test data and classifying predictions
X_test = design.fit_transform(def_valid)
y_test = def_valid.default == 'Yes'
probs = results.predict(exog=X_test)

y_test = np.where(y_test==0,"No","Ye")
labels = np.array(["No"]*5000)
labels[probs>0.5] = 'Yes'
confusion_table(labels, y_test)

Truth,No,Ye
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4815,110
Ye,16,59


In [165]:
(16+110)/5000

0.0252

The validation set error is 2.5% , which with rounding is the same as the logistic model without using student as adummy variable.
### Question 6