In [173]:
# Importing relevant packages
import pandas as pd
import numpy as np
import matplotlib as plt

In [174]:
# Load the training split from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="Downloads/data/train.csv")

In [175]:
# Fill missing Self_Employed values with 'No', the majority class (above 80%).
# The result is assigned back to the column: fillna(inplace=True) called on the
# selection df['Self_Employed'] acts on an intermediate object, which pandas
# deprecates and which leaves df unchanged when Copy-on-Write is enabled.
df['Self_Employed'] = df['Self_Employed'].fillna('No')

In [176]:
# Median LoanAmount for every Self_Employed (rows) x Education (columns) pair.
# The string alias 'median' is used instead of np.median: recent pandas
# releases warn when a NumPy reducer is passed as aggfunc, and the alias
# selects the same group-wise median.
table = df.pivot_table(values='LoanAmount', index='Self_Employed', columns='Education', aggfunc='median')
table

Education,Graduate,Not Graduate
Self_Employed,Unnamed: 1_level_1,Unnamed: 2_level_1
No,130.0,113.0
Yes,157.5,130.0


In [177]:
# Look up the pivot-table value that matches one row
def fage(x):
    """Return the entry of the global pivot ``table`` selected by a row.

    x: a row (any object indexable by column name) providing the keys
       'Self_Employed' and 'Education'. The first selects the row label of
       ``table``, the second its column label.
    """
    employment = x['Self_Employed']
    schooling = x['Education']
    return table.loc[employment, schooling]

In [178]:
# Impute each missing LoanAmount with the pivot-table median that matches the
# row's Self_Employed / Education combination (looked up by fage).
# Writing through df.loc updates df itself, whereas a chained
# fillna(inplace=True) acts on an intermediate object. The guard skips the
# row-wise apply when nothing is missing, because apply on an empty selection
# does not return a Series of fill values.
loan_missing = df['LoanAmount'].isnull()
if loan_missing.any():
    df.loc[loan_missing, 'LoanAmount'] = df.loc[loan_missing].apply(fage, axis=1)

In [179]:
# Log-transform LoanAmount (as demonstrated in class) and plot its distribution
df['LoanAmount_log'] = df['LoanAmount'].pipe(np.log)
df['LoanAmount_log'].hist(bins=20)

<matplotlib.axes._subplots.AxesSubplot at 0x96e0a58>

In [180]:
# Log-transform the combined applicant + co-applicant income (as demonstrated in class)
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['TotalIncome_log'] = np.log(df['TotalIncome'])
# Plot the column created in this cell (TotalIncome_log), not LoanAmount_log
df['TotalIncome_log'].hist(bins=20)

<matplotlib.axes._subplots.AxesSubplot at 0x96e0a58>

In [181]:
# Fill missing Loan_Amount_Term values with 360, the majority value (above 80%).
# Assigned back to the column because a chained fillna(inplace=True) acts on
# an intermediate object and may leave df unchanged.
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(360)

In [182]:
# Fill missing Credit_History values with 1, the majority value (above 80%).
# Assigned back to the column because a chained fillna(inplace=True) acts on
# an intermediate object and may leave df unchanged.
df['Credit_History'] = df['Credit_History'].fillna(1)

In [183]:
# Impute missing Gender from income: an ApplicantIncome of at most 4643
# (the mean wage of women) yields 'Female', anything else yields 'Male'.
# Rows that already have a Gender keep it.
gender_from_income = np.where(df['ApplicantIncome'] <= 4643.0, 'Female', 'Male')
df['Gender'] = df['Gender'].where(df['Gender'].notnull(), gender_from_income)

In [184]:
# Drop the rows whose Married value is missing.
# .copy() makes df an independent frame instead of a slice of the previous
# one, so the label-based assignments in the following cells modify df itself
# and do not raise SettingWithCopyWarning.
df = df[df['Married'].notnull()].copy()

In [185]:
# Unmarried applicants with a missing Dependents value get 0 dependents,
# since the majority of persons without dependents were not married
dependents_missing = df['Dependents'].isnull()
not_married = df['Married'] == 'No'
df.loc[dependents_missing & not_married, 'Dependents'] = 0

In [186]:
# Fill the remaining missing Dependents values with 0, the majority value.
# Assigned back to the column because a chained fillna(inplace=True) acts on
# an intermediate object and may leave df unchanged.
df['Dependents'] = df['Dependents'].fillna(0)

In [187]:
# Count the missing values left in each column (all should be filled or removed)
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
LoanAmount_log       0
TotalIncome          0
TotalIncome_log      0
dtype: int64

In [188]:
# Encode the categorical columns and the target as integer labels
from sklearn.preprocessing import LabelEncoder
var_mod = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
le = LabelEncoder()
for column in var_mod:
    # Cast to str first so mixed values (e.g. the filled-in 0) encode uniformly
    as_text = df[column].astype(str)
    df[column] = le.fit_transform(as_text)

In [189]:
# Train a linear SVM classifier on the selected predictors
from sklearn import svm
clf = svm.LinearSVC()
predictor_var = ['TotalIncome_log', 'LoanAmount_log', 'Credit_History', 'Dependents', 'Property_Area']
outcome_var = ['Loan_Status']
# scikit-learn expects a 1-D target. df[outcome_var] is a one-column DataFrame
# (a column vector) and triggers DataConversionWarning, so flatten it first.
clf.fit(df[predictor_var], df[outcome_var].values.ravel())

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [198]:
# Train a k-nearest-neighbours classifier (k = 5)
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(5)
predictor_var = ['TotalIncome_log', 'LoanAmount_log', 'Credit_History', 'Dependents', 'Property_Area']

# outcome_var is defined in the SVM training cell. The one-column DataFrame is
# flattened to the 1-D target scikit-learn expects (avoids DataConversionWarning).
knn.fit(df[predictor_var], df[outcome_var].values.ravel())




KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [191]:
# Perform the pre-processing tasks on the test set
dftest = pd.read_csv("Downloads/data/test.csv")
# Fill missing Self_Employed with 'No'. Assigned back to the column because a
# chained fillna(inplace=True) acts on an intermediate object.
dftest['Self_Employed'] = dftest['Self_Employed'].fillna('No')
# Median LoanAmount per Self_Employed x Education pair of the test set, used
# to impute LoanAmount. The 'median' alias replaces the deprecated np.median
# callable and gives the same result.
table2 = dftest.pivot_table(values='LoanAmount', index='Self_Employed', columns='Education', aggfunc='median')

def fage2(x):
    """Return the entry of the global pivot ``table2`` selected by a row.

    x: a row (any object indexable by column name) providing the keys
       'Self_Employed' and 'Education'. The first selects the row label of
       ``table2``, the second its column label.
    """
    employment = x['Self_Employed']
    schooling = x['Education']
    return table2.loc[employment, schooling]

# Impute each missing LoanAmount with the table2 median matching the row's
# Self_Employed / Education combination (looked up by fage2).
# Writing through dftest.loc updates dftest itself, whereas a chained
# fillna(inplace=True) acts on an intermediate object. The guard skips the
# row-wise apply when nothing is missing.
test_loan_missing = dftest['LoanAmount'].isnull()
if test_loan_missing.any():
    dftest.loc[test_loan_missing, 'LoanAmount'] = dftest.loc[test_loan_missing].apply(fage2, axis=1)
# Log-transform LoanAmount and plot its distribution
dftest['LoanAmount_log'] = np.log(dftest['LoanAmount'])
dftest['LoanAmount_log'].hist(bins=20)

# Log-transform the combined applicant + co-applicant income of the test set
dftest['TotalIncome'] = dftest['ApplicantIncome'] + dftest['CoapplicantIncome']
dftest['TotalIncome_log'] = np.log(dftest['TotalIncome'])
# Plot the column created here (TotalIncome_log), not LoanAmount_log
dftest['TotalIncome_log'].hist(bins=20)

# Fill missing Loan_Amount_Term with 360 and missing Credit_History with 1.
# Both are assigned back to their columns because a chained
# fillna(inplace=True) acts on an intermediate object and may leave dftest
# unchanged.
dftest['Loan_Amount_Term'] = dftest['Loan_Amount_Term'].fillna(360)
dftest['Credit_History'] = dftest['Credit_History'].fillna(1)

# Impute missing Gender from income: an ApplicantIncome of at most 4643
# yields 'Female', anything else yields 'Male'. Rows that already have a
# Gender keep it.
test_gender_from_income = np.where(dftest['ApplicantIncome'] <= 4643.0, 'Female', 'Male')
dftest['Gender'] = dftest['Gender'].where(dftest['Gender'].notnull(), test_gender_from_income)
                              
# Keep every test row: dropping rows with a missing Married value would leave
# their Loan_IDs without a prediction in the output files. Missing values are
# imputed with the most frequent Married value instead. Married is not among
# the model predictors, so this does not alter the features the models see.
test_married_missing = dftest['Married'].isnull()
if test_married_missing.any():
    dftest.loc[test_married_missing, 'Married'] = dftest['Married'].mode().iloc[0]
                              
# Every missing Dependents value becomes 0. Filling 0 for unmarried applicants
# first and then 0 for all remaining nulls gives exactly this result, so one
# assignment covers both steps. Assigning back to the column also avoids the
# chained fillna(inplace=True), which acts on an intermediate object.
dftest['Dependents'] = dftest['Dependents'].fillna(0)

In [192]:
# Encode the categorical columns of the test set as integer labels
# NOTE(review): the encoder is refit on the test data, so the integer codes
# match the training codes only if both files contain the same set of
# categories in each column -- confirm, or reuse the encoders fitted on train.
from sklearn.preprocessing import LabelEncoder
var_mod = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
le = LabelEncoder()
for column in var_mod:
    as_text = dftest[column].astype(str)
    dftest[column] = le.fit_transform(as_text)

In [193]:
# Predict Loan_Status for the test set with the trained linear SVM
predictor_var = ['TotalIncome_log', 'LoanAmount_log', 'Credit_History', 'Dependents', 'Property_Area']
svmPredict = clf.predict(dftest.loc[:, predictor_var])

In [194]:
# Build the SVM submission frame, mapping predicted label 1 to 'Y' and every
# other label to 'N' to comply with the submission rules.
# The mapping is done in one vectorised step before the frame is built, so no
# strings are written cell by cell into an integer column (an incompatible-
# dtype assignment that recent pandas versions warn about).
c = pd.DataFrame({
    'Loan_ID': dftest['Loan_ID'],
    'Loan_Status': np.where(svmPredict == 1, 'Y', 'N'),
})


In [195]:
# Write the SVM predictions to CSV without the DataFrame index column
# NOTE(review): absolute, user-specific Windows path -- the cell will not run
# unchanged on another machine; consider a configurable output directory.
c.to_csv(path_or_buf=r'C:\Users\Shahar\Desktop\New folder\svm.csv', index=False)

In [196]:
# Predict Loan_Status for the test set with the trained KNN model
k = knn.predict(dftest.loc[:, predictor_var])

In [197]:
# Build the KNN submission frame, mapping predicted label 1 to 'Y' and every
# other label to 'N' to comply with the submission rules, then write it to CSV.
# The mapping is done in one vectorised step before the frame is built, so no
# strings are written cell by cell into an integer column (an incompatible-
# dtype assignment that recent pandas versions warn about).
s = pd.DataFrame({
    'Loan_ID': dftest['Loan_ID'],
    'Loan_Status': np.where(k == 1, 'Y', 'N'),
})
# NOTE(review): absolute, user-specific Windows path -- confirm before running elsewhere
s.to_csv(r'C:\Users\Shahar\Desktop\New folder\knn.csv',index = False)