In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline 
mpl.rcParams['figure.dpi'] = 200

In [3]:
df = pd.read_csv('cleaned_data.csv')

df.shape, df.columns

((26664, 30),
 Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
        'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
        'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
        'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
        'default payment next month', 'EDUCATION_CAT', 'graduate school',
        'high school', 'others', 'university'],
       dtype='object'))

Now to find proportion of the positive class for the target variable. As this is a binary classification, __.mean()__ suffices.

In [4]:
df['default payment next month'].mean()

0.2217971797179718

In [13]:
df.groupby('default payment next month')['ID'].count() #this gives us the actual count (i.e. 5914/(20750+5914) = 0.2217..)

default payment next month
0    20750
1     5914
Name: ID, dtype: int64

As the class fraction (proportion of positive and negative samples) are not equal (a balanced data set := 50/50 split), this dataset is (somewhat) __imbalanced__. <br>
Various methods to undertake: <br>
1. __Undersampling__ the majority class: randomly throwing out samples from the majority class until the class fractions are equal ot less imbalanced. <br>
2. __Oversampling__ the minority class: randomly adding duplicate samples of the minority class to achieve the same goal.
3. __Weighting samples__: This is performed as part of the training step, so minority class collectively has as much *emphasis* as the majority class in this fitted model.

In [44]:
from sklearn.linear_model import LogisticRegression 

my_lr = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1,
max_iter=100,multi_class= 'auto', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001,
verbose=0, warm_start=False) #explicitly instantiating the attributes/parameters (in this case default settings)
my_lr

LogisticRegression(solver='warn')

In [45]:
my_lr.C = 0.1
my_lr.solver = 'liblinear'
my_lr

LogisticRegression(C=0.1, solver='liblinear')

We will create take a sample of our data set to use for modeling.

In [46]:
X = df['EDUCATION'][0:10].values.reshape(-1,1) #reshape into column vector
X

array([[2],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [2],
       [3],
       [3]])

In [47]:
y = df['default payment next month'][0:10].values
y #this remains a row vector (shape (10,) = (10,1) )

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [48]:
my_lr.fit(X,y)

LogisticRegression(C=0.1, solver='liblinear')

In [50]:
new_X = df['EDUCATION'][10:20].values.reshape(-1,1)
new_X.shape

(10, 1)

In [51]:
my_lr.predict(new_X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [53]:
df['default payment next month'][10:20].values

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0])