## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, jaccard_similarity_score, log_loss

## Reading Data

In [2]:
df = pd.read_csv("loans.csv")
df.head(10)

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0.0,0.0,0.0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0.0,0.0,0.0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1.0,0.0,0.0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1.0,0.0,0.0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0.0,1.0,0.0,0
5,1,credit_card,0.0788,125.13,11.904968,16.98,727,6120.041667,50807,51.0,0.0,0.0,0.0,0
6,1,debt_consolidation,0.1496,194.02,10.714418,4.0,667,3180.041667,3839,76.8,0.0,0.0,1.0,1
7,1,all_other,0.1114,131.22,11.0021,11.08,722,5116.0,24220,68.6,0.0,0.0,0.0,1
8,1,home_improvement,0.1134,87.19,11.407565,17.25,682,3989.0,69909,51.1,1.0,0.0,0.0,0
9,1,debt_consolidation,0.1221,84.12,10.203592,10.0,707,2730.041667,5630,23.0,1.0,0.0,0.0,0


## Data Cleaning

In [3]:
print ("Shape of dataset before cleaning: ", df.size)

df = df[pd.notnull(df)]
df = df.reset_index(drop=True)

print ("Shape of dataset after cleaning: ", df.size)

Shape of dataset before cleaning:  134092
Shape of dataset after cleaning:  134092


In [4]:
df = df.fillna(df.mean())
np.isnan(df.values.any())

False

## Feature Selection

In [5]:
df.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid'],
      dtype='object')

In [6]:
X = df[['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec']].values
Y = df['not.fully.paid'].values
X[:3]

array([[1, 'debt_consolidation', 0.1189, 829.1, 11.35040654, 19.48, 737,
        5639.958333, 28854, 52.1, 0.0, 0.0, 0.0],
       [1, 'credit_card', 0.1071, 228.22, 11.08214255, 14.29, 707,
        2760.0, 33623, 76.7, 0.0, 0.0, 0.0],
       [1, 'debt_consolidation', 0.1357, 366.86, 10.37349118, 11.63, 682,
        4710.0, 3511, 25.6, 1.0, 0.0, 0.0]], dtype=object)

## Preprocessing

In [7]:
from sklearn import preprocessing

print(df['purpose'].unique())

le_purpose = preprocessing.LabelEncoder()
le_purpose.fit(['debt_consolidation','credit_card','all_other','home_improvement',
                'small_business','major_purchase','educational'])
X[:,1] = le_purpose.transform(X[:,1])

X[:3]

['debt_consolidation' 'credit_card' 'all_other' 'home_improvement'
 'small_business' 'major_purchase' 'educational']


array([[1, 2, 0.1189, 829.1, 11.35040654, 19.48, 737, 5639.958333, 28854,
        52.1, 0.0, 0.0, 0.0],
       [1, 1, 0.1071, 228.22, 11.08214255, 14.29, 707, 2760.0, 33623,
        76.7, 0.0, 0.0, 0.0],
       [1, 2, 0.1357, 366.86, 10.37349118, 11.63, 682, 4710.0, 3511,
        25.6, 1.0, 0.0, 0.0]], dtype=object)

## Train Test Split

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

## kNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier
k = 22
neigh = KNeighborsClassifier(n_neighbors = k).fit(x_train, y_train)

In [10]:
y_hat_knn = neigh.predict(x_test)

In [11]:
knn_f1 = f1_score(y_test, y_hat_knn, average='weighted')
knn_jaccard = jaccard_similarity_score(y_test, y_hat_knn)
knn_logloss = log_loss(y_test, y_hat_knn)

print("F1 Score", knn_f1)
print("Jaccard index", knn_jaccard)
print("LogLoass", knn_logloss)

F1 Score 0.7835941563714334
Jaccard index 0.8517745302713987
LogLoass 5.119526354986761


  'precision', 'predicted', average, warn_for)


## Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

Ltree = DecisionTreeClassifier(criterion = "entropy", max_depth = 12)
Ltree

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [13]:
Ltree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [14]:
y_hat_dt = Ltree.predict(x_test)
print(y_hat_dt)

[0 0 0 ... 0 1 0]


In [15]:
dt_f1 = f1_score(y_test, y_hat_dt, average='weighted')
dt_jaccard = jaccard_similarity_score(y_test, y_hat_dt)
dt_logloss = log_loss(y_test, y_hat_dt)

print(dt_f1)
print(dt_jaccard)
print(dt_logloss)

0.7924760814885411
0.8147181628392485
6.399456353598745




## SVM

In [16]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(x_train, y_train) 



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [17]:
y_hat_svm = clf.predict(x_test)
y_hat_svm [0:5]

array([0, 0, 0, 0, 0])

In [18]:
svm_f1 = f1_score(y_test, y_hat_svm, average='weighted')
svm_jaccard = jaccard_similarity_score(y_test, y_hat_svm)
svm_logloss = log_loss(y_test, y_hat_svm)

print(svm_f1)
print(svm_jaccard)
print(svm_logloss)

0.7835941563714334
0.8517745302713987
5.119526354986761


  'precision', 'predicted', average, warn_for)


## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(C = 0.01, solver='liblinear').fit(x_train, y_train)
LR

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_hat_lr = LR.predict(x_test)

In [21]:
lr_f1 = f1_score(y_test, y_hat_lr, average='weighted')
lr_jaccard = jaccard_similarity_score(y_test, y_hat_lr)
lr_logloss = log_loss(y_test, y_hat_lr)

print(lr_f1)
print(lr_jaccard)
print(lr_logloss)

0.7855758904330514
0.8517745302713987
5.119527189639611




## Comparing Results

In [22]:
print("knn Results - ")
print("F1 Score", knn_f1)
print("Jaccard index", knn_jaccard)
print("LogLoass", knn_logloss,"\n")

print("Decisio Tree Results - ")
print("F1 Score", dt_f1)
print("Jaccard index", dt_jaccard)
print("LogLoass", dt_logloss,"\n")

print("SVM Results - ")
print("F1 Score", svm_f1)
print("Jaccard index", svm_jaccard)
print("LogLoass", svm_logloss,"\n")

print("Logestic Regressio Results - ")
print("F1 Score", lr_f1)
print("Jaccard index", lr_jaccard)
print("LogLoass", lr_logloss,"\n")

knn Results - 
F1 Score 0.7835941563714334
Jaccard index 0.8517745302713987
LogLoass 5.119526354986761 

Decisio Tree Results - 
F1 Score 0.7924760814885411
Jaccard index 0.8147181628392485
LogLoass 6.399456353598745 

SVM Results - 
F1 Score 0.7835941563714334
Jaccard index 0.8517745302713987
LogLoass 5.119526354986761 

Logestic Regressio Results - 
F1 Score 0.7855758904330514
Jaccard index 0.8517745302713987
LogLoass 5.119527189639611 

