In [2]:
#import dataset
import pandas as pd
import numpy as np
#some variables are numeric but in different formats as int and float
#casting all as float
df = pd.read_csv('data/loan.csv',dtype = np.float64)
df.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f770,f771,f772,f773,f774,f775,f776,f777,f778,loss
0,1.0,126.0,10.0,0.686842,1100.0,3.0,13699.0,7201.0,4949.0,126.75,...,5.0,2.14,-1.54,1.18,0.1833,0.7873,1.0,0.0,5.0,0.0
1,2.0,121.0,10.0,0.782776,1100.0,3.0,84645.0,240.0,1625.0,123.52,...,6.0,0.54,-0.24,0.13,0.1926,-0.6787,1.0,0.0,5.0,0.0
2,3.0,126.0,10.0,0.50008,1100.0,3.0,83607.0,1800.0,1527.0,127.76,...,13.0,2.89,-1.73,1.04,0.2521,0.7258,1.0,0.0,5.0,0.0
3,4.0,134.0,10.0,0.439874,1100.0,3.0,82642.0,7542.0,1730.0,132.94,...,4.0,1.29,-0.89,0.66,0.2498,0.7119,1.0,0.0,5.0,0.0
4,5.0,109.0,9.0,0.502749,2900.0,4.0,79124.0,89.0,491.0,122.72,...,26.0,6.11,-3.82,2.51,0.2282,-0.5399,0.0,0.0,5.0,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105471 entries, 0 to 105470
Columns: 771 entries, id to loss
dtypes: float64(771)
memory usage: 620.4 MB


In [4]:
#Check for missing values
df.isna().sum()


id         0
f1         0
f2         0
f3         0
f4         0
        ... 
f775    1525
f776       0
f777       0
f778       0
loss       0
Length: 771, dtype: int64

In [15]:
#Fill the missing values with mean
df.fillna(df.mean(),inplace=True)

In [16]:
df.isna().sum().mean()

0.0

In [30]:
#import numpy as np
#df.describe(include = np.object).transpose()

In [6]:
df.describe(include='all').transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,105471.0,52736.000000,30446.999458,1.000000,26368.50000,52736.000000,79103.500000,105471.000000
f1,105471.0,134.603171,14.725467,103.000000,124.00000,129.000000,148.000000,176.000000
f2,105471.0,8.246883,1.691535,1.000000,8.00000,9.000000,9.000000,11.000000
f3,105471.0,0.499066,0.288752,0.000006,0.24895,0.498267,0.749494,0.999994
f4,105471.0,2678.488874,1401.010943,1100.000000,1500.00000,2200.000000,3700.000000,7900.000000
...,...,...,...,...,...,...,...,...
f775,105471.0,0.014797,1.031897,-18.439600,-0.69885,0.353600,0.732900,11.092000
f776,105471.0,0.310246,0.462597,0.000000,0.00000,0.000000,1.000000,1.000000
f777,105471.0,0.322847,0.467567,0.000000,0.00000,0.000000,1.000000,1.000000
f778,105471.0,175.951589,298.294043,2.000000,19.00000,40.000000,104.000000,1212.000000


In [17]:
#we will do classification of loan approval so making loss binary
df['bin_loss'] = df.loss>0

In [18]:
df.bin_loss

0         False
1         False
2         False
3         False
4         False
          ...  
105466    False
105467    False
105468    False
105469    False
105470    False
Name: bin_loss, Length: 105471, dtype: bool

In [19]:
#checking the value_counts to realise the data is highly imbalanced
df.bin_loss.value_counts()

False    95688
True      9783
Name: bin_loss, dtype: int64

In [20]:
#extract independent and dependent vars
X = df.drop(['id','loss','bin_loss'],axis=1)#id, loss are not going to be used
y = df.bin_loss

In [21]:
#split to train and test for cross-validation
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [22]:
#scale the features using training set
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = pd.DataFrame(sc.fit_transform(X_train),columns=X.columns)
X_test_scaled = pd.DataFrame(sc.transform(X_test),columns=X.columns)

In [23]:
#import the model
from sklearn.linear_model import LogisticRegression
#We use saga solver for large dataset 
#and class_weight to deal with the imbalance
model = LogisticRegression(max_iter=500,tol=0.001,solver='saga', 
                           class_weight={False:1, True:9})

In [14]:
#Training the model 
model.fit(X_train_scaled,y_train)

LogisticRegression(C=1.0, class_weight={False: 1, True: 9}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.001, verbose=0,
                   warm_start=False)

In [15]:
#Validating the model

model.score(X_test_scaled,y_test)

0.6829530371025851

In [32]:
from sklearn.metrics import confusion_matrix, classification_report

In [16]:
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test,y_pred))
confusion_matrix(y_test,y_pred)

              precision    recall  f1-score   support

       False       0.95      0.69      0.80     28660
        True       0.17      0.62      0.27      2982

    accuracy                           0.68     31642
   macro avg       0.56      0.66      0.53     31642
weighted avg       0.87      0.68      0.75     31642



array([[19753,  8907],
       [ 1125,  1857]], dtype=int64)

The above exercise showed us that with more than 700 features it will take very long to run a simple model like Logistic Regression. 

One option is to apply Dimensionality reduction



In [24]:
'''
PCA uses Singular Value Decomposition to calculate 
new features that help discrimate based on their differences
'''
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
pca.fit(X_train_scaled)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [25]:
#Explained variance shows discriminating power of each new feature 
#the first five are chosen as the values are significant
pca.explained_variance_ratio_

array([0.16439148, 0.14749705, 0.07365848, 0.05880162, 0.05526846])

In [26]:
#the new features (principal components) do not have much 
#physical interpretation as the original features 
#but are calculated/derived using the original features 
X_train_pca = pd.DataFrame(pca.transform(X_train_scaled))
X_train_pca.head()

Unnamed: 0,0,1,2,3,4
0,-10.350902,7.415833,8.910038,1.542881,11.834651
1,6.671817,-0.574774,5.034939,-2.590924,-2.23814
2,-3.985748,-2.494332,3.732511,-5.164439,6.421786
3,-0.967143,-2.798823,4.178289,-5.402487,7.732958
4,-13.773151,4.306439,-2.717403,-2.801944,1.575983


In [29]:
#remodelling in the new feature space
model_pca = LogisticRegression(solver='saga',class_weight={False:1, True:9})
model_pca.fit(X_train_pca,y_train)

LogisticRegression(C=1.0, class_weight={False: 1, True: 9}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
#the test set needs to be transformed as well
X_test_pca = pd.DataFrame(pca.transform(X_test_scaled))
model_pca.score(X_test_pca,y_test)

0.6543202073193857

In [33]:
#scoring the model on the new featureset
y_pred_pca = model_pca.predict(X_test_pca)
print(classification_report(y_test,y_pred_pca))
confusion_matrix(y_test,y_pred_pca)

              precision    recall  f1-score   support

       False       0.93      0.67      0.78     28660
        True       0.14      0.54      0.23      2982

    accuracy                           0.65     31642
   macro avg       0.54      0.60      0.50     31642
weighted avg       0.86      0.65      0.73     31642



array([[19100,  9560],
       [ 1378,  1604]], dtype=int64)

The results above show that we can arrive at similar test-scores after we reduce the data dimension significantly using PCA. The scores are slightly less the ones when we used all the 700+ features but the model is much faster in execution. So we need to do the trade-off between time of computation and model performance in terms of accuracy, precision etc. 