In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
%matplotlib inline

# Logistic Regression case study: classify credit card defaults

__dataset:__ UC Irvine Machine Learning Repository

   - https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
   - a dataset of customers defaulting on their next credit card payment
    
__modeling exercise__

   - can we use machine learning to predict whether a customer will default on their next credit card payment, based on the features in this dataset?
   - since defaults have been encoded as a binary response variable (0 or 1) this is a _classification_ problem

__download and prepare the dataset for working on it with python__

    1. download the data
        - follow the link and click Download: Data Folder
    
    2. move the data to the data/ directory in this directory 
        - try it with your linux shell
            - list your Downloads folder 
            - We see the filename is full of spaces, let's clean that up and move the file in one line
            - open the .xls file
                - .xls files are not ASCII readable (can't read it with Pandas or a text editor)
                - all the data is in the first sheet, so export it as a .csv
                    - in Excel: SaveAs --> .csv

~~~
linux:
$ ls ~/Downloads
default of credit card clients.xls

/ move the file from ~/Downloads/ to data/ AND rename it
$ mv ~/Downloads/default\ of\ credit\ card\ clients.xls data/cred_card_defaults.xls

/ check the data/ folder after exporting .xls as .csv
$ ls data
cred_card_defaults.csv	cred_card_defaults.xls
~~~
    



## Read in the dataset with pandas
   - what does header=1 do?

In [2]:
datafile = 'data/cred_card_defaults.csv'
data = pd.read_csv(datafile, header=1) 
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## Explore the data

In [3]:
# look at datatypes and missing data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID                            30000 non-null int64
LIMIT_BAL                     30000 non-null int64
SEX                           30000 non-null int64
EDUCATION                     30000 non-null int64
MARRIAGE                      30000 non-null int64
AGE                           30000 non-null int64
PAY_0                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
BILL_AMT1                     30000 non-null int64
BILL_AMT2                     30000 non-null int64
BILL_AMT3                     30000 non-null int64
BILL_AMT4                     30000 non-null int64
BILL_AMT5                     30000 non-null int64
BILL_AMT6               

In [4]:
# look at data ranges
data.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


### our response variable has a long column name, lets rename it
   - I can use the DataFrame.rename() method and pass a dict with {old_name:new_name}
   - what is the argument 'axis=1' doing?
   - what is the argument 'inplace=True' doing?

In [5]:

data.rename({'default payment next month':'default'}, axis=1, inplace=True)
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [6]:
# sns.pairplot(data) # takes forever

### import some useful linear modeling libraries

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

### Separate the dataset into features and a target
   - helps to look at the column names
   - you can copy and paste from this list
   - we can slice the DataFrame with 2 lists of columns: one for features, on for target
   - think critically about which columns are legit features
       - do you think the feature can be predictive?
       - e.g. should ID be a feature?

In [9]:
data.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default'],
      dtype='object')

In [11]:
feat_cols = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
targ_col = 'default'

X = data[feat_cols]
y = data[targ_col]

# check the shapes. Do these make sense?
print(f'feature matrix shape: {X.shape}')
print(f'target vector shape: {y.shape}')

feature matrix shape: (30000, 23)
target vector shape: (30000,)


### randomly split the dataset into train and test sets

In [12]:
test_pcnt = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pcnt)

### fit a logistic regression model to the training data

In [18]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

# make predictions
preds = logistic.predict(X_test)



In [21]:
# take a peek to see what the true values and predictions look like
print('true: predicted')
[print(x) for x in zip(y_test[:10], preds[:10])]

true: predicted
(0, 0)
(1, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(0, 0)
(1, 0)
(1, 0)
(1, 0)


[None, None, None, None, None, None, None, None, None, None]

In [23]:
# score with a few methods
acc = accuracy_score(y_test, preds)
rec = recall_score(y_test, preds)
prec = precision_score(y_test, preds)

for label, score in zip(['accuracy','recall','precision'],[acc, rec, prec]):
    print(f'{label}, {round(score,3)}')

accuracy, 0.777
recall, 0.0
precision, 0.0


  'precision', 'predicted', average, warn_for)


In [None]:
## what's going on? The model never predicted 1. So it's pretty accurate, but 

In [27]:
n = data.shape[0]
n_0 = data.loc[data.default == 0].shape[0]
n_1 = data.loc[data.default == 1].shape[0]
print(n_0, n_1, n_1/n)

23364 6636 0.2212


In [28]:
## the data is only 22% fraud cases. We have a _class imbalance_ problem

In [34]:
## simple solution: just remove exampes of the 0 class
data_0 = data.loc[data.default == 0].copy()
data_1 = data.loc[data.default == 1].copy()

# n to reach 60%

idx = np.random.choice(n_0, size=8000)
data_0_undersamp = data_0.iloc[idx, :]
print(data_0_undersamp.shape[0])

# concat them together
data_bal = pd.concat([data_0_undersamp, data_1])

8000


In [36]:
## model again
# feature and target 
X = data_bal[feat_cols]
y = data_bal[targ_col]

# train test split 
test_pcnt = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pcnt)

logistic = LogisticRegression()
logistic.fit(X_train, y_train)

# make predictions
preds = logistic.predict(X_test)



In [37]:
# score with a few methods
acc = accuracy_score(y_test, preds)
rec = recall_score(y_test, preds)
prec = precision_score(y_test, preds)

for label, score in zip(['accuracy','recall','precision'],[acc, rec, prec]):
    print(f'{label}, {round(score,3)}')

accuracy, 0.67
recall, 0.563
precision, 0.668


In [None]:
# tada!