# Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
pd.set_option('display.max_columns', 500)

# Load the dataframe

In [4]:
df = pd.read_csv('cs1 - train_dataset.csv',low_memory=False)

In [5]:
# see dataframe
df.head()

Unnamed: 0,client_id,d_open,d_start,d_last,d_close,client_type,loan_purpose,max_amount,interest,period,category,cur,overdue,pmt_due,amt_paid,amt_overdue,max_overdue,delay,npl_5,npl_5_29,npl_30,npl_30_59,npl_60_89,npl_90plus,pmt_history,bki_id,d_request,d_confirm,default
0,12608,40425,40611.0,42251.0,,1,9,146398,0,3,0,810,1360357.0,,,0.0,172778.0,0.0,1,0,1,0,0,0,11111111,1,40641,40610,0
1,17459,40344,40497.0,41440.0,,1,9,250000,0,0,0,810,,,42600.0,0.0,0.0,29.0,0,2,2,0,0,0,XAA110,3,40501,40475,0
2,10050,40279,40489.0,72697.0,,1,7,15000,0,3,0,810,12941.0,280.0,6146.0,0.0,0.0,0.0,0,0,0,0,0,0,X1111111,3,40495,40488,0
3,16804,40121,40305.0,40304.0,40305.0,1,9,11490,0,0,13,810,0.0,,0.0,0.0,0.0,,0,0,0,0,0,0,1111111,3,40581,40313,0
4,19775,39603,40029.0,39933.0,40029.0,1,7,200000,0,3,13,810,,0.0,234469.0,0.0,0.0,,0,0,0,0,0,0,XXXXXXXXXXXXXXX,3,40511,40473,0


### Datatype and missing value of each columns

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135000 entries, 0 to 134999
Data columns (total 29 columns):
client_id       135000 non-null int64
d_open          135000 non-null int64
d_start         127379 non-null float64
d_last          126170 non-null float64
d_close         66120 non-null float64
client_type     135000 non-null int64
loan_purpose    135000 non-null int64
max_amount      135000 non-null int64
interest        135000 non-null int64
period          134901 non-null object
category        135000 non-null int64
cur             135000 non-null object
overdue         108872 non-null float64
pmt_due         61524 non-null float64
amt_paid        49294 non-null float64
amt_overdue     134985 non-null float64
max_overdue     134985 non-null float64
delay           109435 non-null float64
npl_5           135000 non-null int64
npl_5_29        135000 non-null int64
npl_30          135000 non-null int64
npl_30_59       135000 non-null int64
npl_60_89       135000 non-null int6

## Select columns

In [7]:
df = df[['client_type','loan_purpose','max_amount','interest','category','overdue','max_overdue','delay','npl_5','npl_5_29','npl_30','npl_30_59','npl_60_89','npl_90plus','default']]

In [8]:
# See Distribution
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135000 entries, 0 to 134999
Data columns (total 15 columns):
client_type     135000 non-null int64
loan_purpose    135000 non-null int64
max_amount      135000 non-null int64
interest        135000 non-null int64
category        135000 non-null int64
overdue         108872 non-null float64
max_overdue     134985 non-null float64
delay           109435 non-null float64
npl_5           135000 non-null int64
npl_5_29        135000 non-null int64
npl_30          135000 non-null int64
npl_30_59       135000 non-null int64
npl_60_89       135000 non-null int64
npl_90plus      135000 non-null int64
default         135000 non-null int64
dtypes: float64(3), int64(12)
memory usage: 15.4 MB


In [9]:
# Drop columns which have max missing values

df = df.dropna(subset=['delay','overdue'])

## Label Distribution

In [10]:
df['default'].value_counts()

0    83641
1     9677
Name: default, dtype: int64

## Spilt the dataset to train and test

In [11]:
# Take 80% data as traing set and 20% as testing set

msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [12]:
# Training set distribution

train['default'].value_counts()

0    66891
1     7761
Name: default, dtype: int64

In [13]:
# Testing set distribution

test['default'].value_counts()

0    16750
1     1916
Name: default, dtype: int64

In [14]:
# Separate input features (X) and target variable (y)

y_test = test.default
X_test = test.drop('default', axis=1)

# Modeling

## Apply Logistic Regression model 

In [15]:
# Separate input features (X) and target variable (y)
labels = train.default
features = train.drop('default', axis=1)

# Train model
clf_0 = LogisticRegression().fit(features, labels)
 
# Predict on testing set
pred_y_0 = clf_0.predict(X_test)

In [16]:
# How's the accuracy?
print( accuracy_score(pred_y_0, y_test) )

0.8973534769098896


### Confusion matrix 

TP TN


FP FN

In [17]:
confusion_matrix(pred_y_0, y_test)

array([[16750,  1916],
       [    0,     0]])

### Roc Score

In [18]:
prob_y_0 = clf_0.predict_proba(X_test)
prob_y_0 = [p[1] for p in prob_y_0]
 
print( roc_auc_score(y_test, prob_y_0) )

0.5230723522263422


### Only '0' class predicted as we have uneven distibution of data

In [19]:
print( np.unique( pred_y_0 ) )
# [0]

[0]


# Use Sampling method to make equal distribution

### 1. Up-sample Minority Class

In [20]:
from sklearn.utils import resample

In [21]:
# Separate majority and minority classes
df_majority = train[train.default==0]
df_minority = train[train.default==1]

In [22]:
df_majority.shape

(66891, 15)

In [23]:
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=66924,    # to match majority class
                                 random_state=123) # reproducible results

In [24]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [25]:
df_upsampled.default.value_counts()

1    66924
0    66891
Name: default, dtype: int64

In [26]:
labels = df_upsampled['default']
features = df_upsampled.drop(['default'],axis=1)

In [27]:
from sklearn.linear_model import LogisticRegression


#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)


# Train model
clf_1 = LogisticRegression().fit(features, labels)
 
# Predict on training set
pred_y_1 = clf_1.predict(X_test)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_1 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y_test, pred_y_1) )

[0 1]
0.8709418193506911


In [28]:
# What about Area Under ROC?
prob_y_1 = clf_1.predict_proba(X_test)
prob_y_1 = [p[1] for p in prob_y_1]
print( roc_auc_score(y_test, prob_y_1) )

0.5358999158695043


## Slightly better result

In [29]:
confusion_matrix(y_test, pred_y_1)

array([[16171,   579],
       [ 1830,    86]])

# 2. Down-sample Majority Class

In [30]:
# Separate majority and minority classes
df_majority = train[train.default==0]
df_minority = train[train.default==1]

In [31]:
df_minority.shape

(7761, 15)

In [32]:
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=7680,     # to match minority class
                                 random_state=123) # reproducible results

In [33]:
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.default.value_counts()

1    7761
0    7680
Name: default, dtype: int64

In [34]:
# Separate input features (X) and target variable (y)
labels = df_downsampled.default
features = df_downsampled.drop('default', axis=1)
 
    

#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)
    
    
# Train model
clf_2 = LogisticRegression()
clf_2.fit(features, labels)
 
# Predict on training set
pred_y_2 = clf_2.predict(X_test)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_2 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y_test, pred_y_2) )
# 0.581632653061

[0 1]
0.721257902067931


In [35]:
# Predict class probabilities
prob_y_2 = clf_2.predict_proba(X_test)
 
# Keep only the positive class
prob_y_2 = [p[1] for p in prob_y_2]

print( roc_auc_score(y_test, prob_y_2))

0.538518929361543


In [36]:
confusion_matrix(y_test, pred_y_2)

array([[12918,  3832],
       [ 1371,   545]])

# 4. Penalize Algorithms (Cost-Sensitive Training)

In [42]:
from sklearn.svm import SVC

In [None]:

#```Python

# Separate input features (X) and target variable (y)
#labels = train.default
#features = train.drop('default', axis=1)


#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)    

# Train model
clf_3 = SVC(kernel='linear', 
            class_weight='balanced', # penalize
            probability=True)

clf_3.fit(features, labels)

# Predict on training set
pred_y_3 = clf_3.predict(X_test)

# Is our model still predicting just one class?
print( np.unique( pred_y_3 ) )
# [0 1]

# How's our accuracy?
print( accuracy_score(y_test, pred_y_3) )


# What about AUROC?
prob_y_3 = clf_3.predict_proba(X_test)
prob_y_3 = [p[1] for p in prob_y_3]
print( roc_auc_score(y_test, prob_y_3) )
#```

# 5. Use Tree-Based Algorithms

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
# Separate input features (X) and target variable (y)
#labels = train.default
#features = train.drop('default', axis=1)
 
    
#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)   
 
# Train model
clf_4 = RandomForestClassifier()
clf_4.fit(features, labels)
 
# Predict on training set
pred_y_4 = clf_4.predict(X_test)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_4 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y_test, pred_y_4) )

# What about AUROC?
prob_y_4 = clf_4.predict_proba(X_test)
prob_y_4 = [p[1] for p in prob_y_4]
print( roc_auc_score(y_test, prob_y_4) )

[0 1]
0.5804671595414121
0.5942468295266881


## Better Result with downsampling and decision Tree

In [41]:
confusion_matrix(y_test, pred_y_4)

array([[9760, 6990],
       [ 841, 1075]])