In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
import matplotlib as mpl
import seaborn as sns 
mpl.rcParams['figure.dpi'] = 400 

In [5]:
df = pd.read_csv('../data/chap_1_cleaned_data.csv')
df.head(5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT,graduate school,high school,others,university
0,798fc410-45c1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,1,university,0,0,0,1
1,8a8c8f3b-8eb4,120000,2,2,2,26,-1,2,0,0,...,1000,1000,0,2000,1,university,0,0,0,1
2,85698822-43f5,90000,2,2,2,34,0,0,0,0,...,1000,1000,1000,5000,0,university,0,0,0,1
3,0737c11b-be42,50000,2,2,1,37,0,0,0,0,...,1200,1100,1069,1000,0,university,0,0,0,1
4,3b7f77cc-dbc0,50000,1,2,1,57,-1,0,-1,0,...,10000,9000,689,679,0,university,0,0,0,1


In [6]:
def sigmoid(X):
    Y = 1 / (1 + np.exp(-X))
    return Y

In [15]:
X = df[['PAY_1', 'LIMIT_BAL']].to_numpy()
X

array([[     2,  20000],
       [    -1, 120000],
       [     0,  90000],
       ...,
       [     4,  30000],
       [     1,  80000],
       [     0,  50000]])

In [16]:
y = df['default payment next month'].to_numpy()
y

array([1, 1, 0, ..., 1, 1, 1])

In [7]:
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=24)

In [18]:
print(X_train.shape)
print(X_test.shape)

(21331, 2)
(5333, 2)


In [19]:
from sklearn.linear_model import LogisticRegression

In [21]:
lr_model = LogisticRegression(solver='liblinear')
lr_model

In [22]:
lr_model.fit(X_train, y_train)

In [25]:
y_pred = lr_model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
y_pred_proba = lr_model.predict_proba(X_test)
y_pred_proba

array([[0.74826924, 0.25173076],
       [0.584297  , 0.415703  ],
       [0.79604453, 0.20395547],
       ...,
       [0.584297  , 0.415703  ],
       [0.82721498, 0.17278502],
       [0.66393435, 0.33606565]])

In [27]:
print(lr_model.coef_, lr_model.intercept_)

[[ 8.27451187e-11 -6.80876727e-06]] [-6.57647457e-11]


In [28]:
np.ones((X_test.shape[0],1)).shape

(5333, 1)

In [29]:
ones_and_features = np.hstack([np.ones((X_test.shape[0],1)), X_test])
ones_and_features

array([[ 1.0e+00,  2.0e+00,  1.6e+05],
       [ 1.0e+00,  1.0e+00,  5.0e+04],
       [ 1.0e+00, -1.0e+00,  2.0e+05],
       ...,
       [ 1.0e+00, -1.0e+00,  5.0e+04],
       [ 1.0e+00,  1.0e+00,  2.3e+05],
       [ 1.0e+00,  2.0e+00,  1.0e+05]])

In [30]:
intercept_and_coefs = np.concatenate([lr_model.intercept_.reshape(1,1), lr_model.coef_], axis=1)
intercept_and_coefs

array([[-6.57647457e-11,  8.27451187e-11, -6.80876727e-06]])

In [34]:
X_lin_comb = np.dot(intercept_and_coefs, np.transpose(ones_and_features))
X_lin_comb

array([[-1.08940276, -0.34043836, -1.36175345, ..., -0.34043836,
        -1.56601647, -0.68087673]])

In [35]:
y_pred_proba_manual = sigmoid(X_lin_comb)
y_pred_proba_manual

array([[0.25173076, 0.415703  , 0.20395547, ..., 0.415703  , 0.17278502,
        0.33606565]])

In [37]:
y_pred_manual = y_pred_proba_manual >= 0.5
y_pred_manual

array([[False, False, False, ..., False, False, False]])

In [38]:
y_pred.shape

(5333,)

In [39]:
y_pred_manual.shape

(1, 5333)

In [40]:
np.array_equal(y_pred.reshape(1,-1), y_pred_manual)

True

In [41]:
from sklearn.metrics import roc_auc_score

In [42]:
y_test.shape

(5333,)

In [43]:
y_pred_proba_manual.shape

(1, 5333)

In [44]:
roc_auc_score(y_test, y_pred_proba_manual.reshape(y_pred_proba_manual.shape[1],))

0.627207450280691

In [45]:
roc_auc_score(y_test, y_pred_proba[:,1])

0.627207450280691