In [1]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/train.csv')

In [24]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,annual_inc,loan_status,pymnt_plan,...,MORTGAGE,NONE,OTHER,OWN,RENT,Not Verified,Source Verified,Verified,issue_d_le,earliest_cr_line_le
0,15000.0,0,11.99,498.15,1,7,4,70000.0,1,0,...,1,0,0,0,0,0,0,1,2013.75,1991.916667
1,3725.0,0,6.03,113.38,0,0,0,52260.0,1,0,...,1,0,0,0,0,0,1,0,2012.75,2000.75
2,16000.0,0,11.14,524.89,1,6,3,67500.0,1,0,...,0,0,0,0,1,0,1,0,2013.25,2001.5
3,4200.0,0,13.33,142.19,2,12,1,21600.0,0,0,...,1,0,0,0,0,1,0,0,2015.166667,2003.333333
4,6500.0,0,12.69,218.05,1,9,10,41000.0,1,0,...,0,0,0,0,1,1,0,0,2012.0,1990.666667


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200189 entries, 0 to 200188
Data columns (total 43 columns):
loan_amnt                     200189 non-null float64
term                          200189 non-null int64
int_rate                      200189 non-null float64
installment                   200189 non-null float64
grade                         200189 non-null int32
sub_grade                     200189 non-null int32
emp_length                    200189 non-null int32
annual_inc                    200189 non-null float64
loan_status                   200189 non-null int64
pymnt_plan                    200189 non-null int32
purpose                       200189 non-null int32
zip_code                      200189 non-null int32
addr_state                    200189 non-null int32
dti                           200189 non-null float64
delinq_2yrs                   200189 non-null float64
inq_last_6mths                200189 non-null float64
mths_since_last_delinq        200189 non-nu

In [5]:
df.emp_length.fillna(value=0,inplace=True)
df.revol_util.fillna(value=0,inplace=True)
df.collections_12_mths_ex_med.fillna(value=df.collections_12_mths_ex_med.mean(),inplace=True)
df.mths_since_last_delinq.fillna(value=df.mths_since_last_delinq.mean(),inplace=True)
df.tot_coll_amt.fillna(value=df.tot_coll_amt.mean(),inplace=True)
df.tot_cur_bal.fillna(value=df.tot_coll_amt.mean(),inplace=True)
df.total_rev_hi_lim.fillna(value=df.total_rev_hi_lim.mean(),inplace=True)
df['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
df['emp_length'] = df['emp_length'].astype(int)
df['term'] = pd.factorize(df['term'])[0]

In [6]:
df['is_title_known'] = df['emp_title'].map(lambda x: 0 if x == 'n/a' else 1)
df.drop('emp_title', axis=1, inplace=True)

In [7]:
df['zip_code'] = df['zip_code'].where(df['zip_code'].str.len() == 4, 
                                               df['zip_code'].str[:3])
df['zip_code'] = df['zip_code'].astype(int)

In [8]:
label_encoder = LabelEncoder()
categorical_columns = df[['grade','sub_grade','pymnt_plan','initial_list_status','purpose','application_type','addr_state']]
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

In [9]:
one_hot_encoded_home_ownership = pd.get_dummies(df.home_ownership)
df = df.drop('home_ownership',axis = 1)

df = df.join(one_hot_encoded_home_ownership)

In [10]:
one_hot_encoded_verification_status = pd.get_dummies(df.verification_status)
df = df.drop('verification_status',axis = 1)

df = df.join(one_hot_encoded_verification_status)

In [11]:
def month_to_decimal(month):
    month_dict = {'Jan':0, 'Feb':1/12., 'Mar':2/12., 'Apr':3/12., 'May':4/12., 'Jun':5/12., 
     'Jul':6/12., 'Aug':7/12., 'Sep':8/12., 'Oct':9/12., 'Nov':10/12., 'Dec':11/12.}
    return month_dict[month]

def convert_date(month_year):
    month_and_year = month_year.split('-')
    return float(month_and_year[1]) + month_to_decimal(month_and_year[0])

def encode_with_func(df, column_name, func_name):
    df[column_name+'_le'] = df[column_name].map(func_name)
    df.drop(column_name, axis=1, inplace=True)

encode_with_func(df, 'issue_d', convert_date)

In [12]:
encode_with_func(df, 'earliest_cr_line', convert_date)

In [13]:
Y = df['loan_status'].values

X = df.drop(['loan_status'], axis=1)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X, Y = ros.fit_resample(X, Y)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [15]:
train = list(zip(x_train,y_train))
test = list(zip(x_test, y_test))

In [16]:
import math

def get_distance(data1, data2):
    points = zip(data1, data2)
    diffs_squared_distance = [pow(a - b, 2) for (a, b) in points]
    return math.sqrt(sum(diffs_squared_distance))

In [17]:
get_distance(train[0][0], train[1][0])

7.182771078973325

In [18]:
from operator import itemgetter
 
def get_neighbours(training_set, test_instance, k):
    distances = [_get_tuple_distance(training_instance, test_instance) for training_instance in training_set]
 
    sorted_distances = sorted(distances, key=itemgetter(1))
 
    sorted_training_instances = [tuple[0] for tuple in sorted_distances]
 
    return sorted_training_instances[:k]
 
def _get_tuple_distance(training_instance, test_instance):
    return (training_instance, get_distance(test_instance, training_instance[0]))

In [19]:
_get_tuple_distance(train[0], test[0][0])

((array([-0.49089867, -0.6215305 , -0.92188138, -0.42547464, -0.76845234,
         -0.77652616,  1.20301746, -0.0039659 , -0.00605898, -0.33115486,
          1.19717364, -1.2250574 , -1.13959708, -0.34958176,  1.02333407,
          0.01331555, -1.02226144, -0.33211692, -0.37061741, -0.79014721,
         -1.60696257, -0.65532249, -0.08660204,  0.        , -0.00479001,
         -0.05714052, -0.12211125, -0.49804979, -0.22614945,  0.33531945,
          0.        ,  0.        , -0.94724692, -0.01303119, -0.0213188 ,
          3.18589241, -0.88059253,  1.50300789, -0.6681186 , -0.79044445,
          0.9693762 ,  0.54855416]), 0), 9.49961755650653)

In [20]:
from collections import Counter
 
def get_majority_vote(neighbours):
    classes = [neighbour[1] for neighbour in neighbours]
    count = Counter(classes)
    return count.most_common()[0][0]

In [None]:
predictions = [] 
k = 3
for x in range(len(x_test)):
    print ('Classifying test instance number ' + str(x) + ":")
    neighbours = get_neighbours(training_set=train, test_instance=test[x][0], k=3)
    majority_vote = get_majority_vote(neighbours)
    predictions.append(majority_vote) 
    print ('Predicted label=' + str(majority_vote) + ', Actual label=' + str(test[x][1]))
print ('\nThe overall accuracy of the model is: ' + str(accuracy_score(y_test, predictions)) + "\n")
print("Precision:",metrics.precision_score(y_test, predictions))
print("Recall:",metrics.recall_score(y_test, predictions))

Classifying test instance number 0:
Predicted label=0, Actual label=0
Classifying test instance number 1:
Predicted label=0, Actual label=1
Classifying test instance number 2:
Predicted label=0, Actual label=0
Classifying test instance number 3:
Predicted label=0, Actual label=0
Classifying test instance number 4:
Predicted label=0, Actual label=1
Classifying test instance number 5:
Predicted label=1, Actual label=1
Classifying test instance number 6:
Predicted label=1, Actual label=1
Classifying test instance number 7:
Predicted label=0, Actual label=0
Classifying test instance number 8:
Predicted label=1, Actual label=0
Classifying test instance number 9:
Predicted label=1, Actual label=1
Classifying test instance number 10:
Predicted label=0, Actual label=1
Classifying test instance number 11:
Predicted label=1, Actual label=1
Classifying test instance number 12:
Predicted label=1, Actual label=1
Classifying test instance number 13:
Predicted label=1, Actual label=1
Classifying test