In [27]:
from ISLP import load_data

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

import statsmodels.api as sm

In [2]:
Default = load_data('Default')
Default.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [8]:
# Fit Logistic Regression model using income and balance to predict default.

predictors = ['income', 'balance']
X = Default[predictors]
y = Default['default']

clf = LogisticRegression(random_state=0).fit(X, y)
clf.score(X, y)

0.9737

In [9]:
# Validation set approach: split the sample into train and validation set.
s = 0.2 # n_validation / n
n_train = int(X.shape[0]*(1-s))
n_test = X.shape[0] - n_train

X_train, y_train = X.iloc[:n_train], y.iloc[:n_train]
X_test, y_test = X.iloc[:n_test], y.iloc[:n_test]

In [15]:
# Fit logistic regression using only train set

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [19]:
print('Classes:', clf.classes_)
probs_pred = clf.predict_proba(X_test)

correct = 0
for p, y in zip(probs_pred, y_test):
    if (p[1] > 0.5 and y == 'Yes') or (p[1] <= 0.5 and y == 'No'):
        correct += 1

print('score:', correct/y_test.shape[0])

Classes: ['No' 'Yes']
score: 0.9735


In [20]:
clf.score(X_test, y_test)

0.9735

In [24]:
# Choose three different splits
def split_train(X, y, s=0.2):
    '''
    s: fraction of validation observations relative to the entire population
    '''

    n_train = int(X.shape[0]*(1-s))
    n_test = X.shape[0] - n_train

    X_train, y_train = X.iloc[:n_train], y.iloc[:n_train]
    X_test, y_test = X.iloc[:n_test], y.iloc[:n_test]

    clf = LogisticRegression(random_state=0).fit(X_train, y_train)

    probs_pred = clf.predict_proba(X_test)

    correct = 0
    for p, y in zip(probs_pred, y_test):
        if (p[1] > 0.5 and y == 'Yes') or (p[1] <= 0.5 and y == 'No'):
            correct += 1

    print(f'score for s={s}: {correct/y_test.shape[0]}')

In [25]:
predictors = ['income', 'balance']
X = Default[predictors]
y = Default['default']

for s in (0.2, 0.3, 0.4):
    split_train(X, y, s)

score for s=0.2: 0.9735
score for s=0.3: 0.9663333333333334
score for s=0.4: 0.96525


In [32]:
# Encode Student variable
student_dummies = pd.get_dummies(Default.student, prefix='student')
Default = pd.concat((Default, student_dummies), 1)

Default.head()

  Default = pd.concat((Default, student_dummies), 1)


Unnamed: 0,default,student,balance,income,student_No,student_Yes
0,No,No,729.526495,44361.625074,1,0
1,No,Yes,817.180407,12106.1347,0,1
2,No,No,1073.549164,31767.138947,1,0
3,No,No,529.250605,35704.493935,1,0
4,No,No,785.655883,38463.495879,1,0


In [34]:
predictors = ['income', 'balance', 'student_Yes']
X = Default[predictors]
y = Default['default']

for s in (0.2, 0.3, 0.4):
    split_train(X, y, s)

score for s=0.2: 0.964
score for s=0.3: 0.966
score for s=0.4: 0.96625


In [35]:
# The score of the model does not seem to improve when including student variable (dummy)