In [None]:
# Run SVM algorithm on the loan dataset

In [2]:
# all necessary imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# Plot styling: white grid background with seaborn's short colour codes enabled.
# A single call is enough -- each sns.set() call resets the whole theme, so an
# earlier sns.set(style="white") would be overridden by this one anyway.
sns.set(style="whitegrid", color_codes=True)

In [3]:
# Load the loan data from the Excel workbook in the working directory.
dataset = pd.read_excel('loan.xlsx')
# NOTE: this is a second name for the SAME DataFrame, not a copy --
# writes through `new_dataset` also change `dataset`.
new_dataset = dataset
# preview the first rows
dataset.head()

Unnamed: 0,Sex,Age,Time_at_address,Res_status,Telephone,Occupation,Job_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision
0,M,50.75,0.585,owner,given,unemploye,unemploye,0,0,f,given,145,0,reject
1,M,19.67,10.0,rent,not_given,labourer,governmen,0,0,t,given,140,0,reject
2,F,52.830002,15.0,owner,given,creative_,private_s,5,14,f,given,0,2200,accept
3,M,22.67,2.54,rent,not_given,creative_,governmen,2,0,f,given,0,0,accept
4,M,29.25,13.0,owner,given,driver,governmen,0,0,f,given,228,0,reject


In [4]:
# The preprocessing steps for loan.xlsx were worked out in the
# previous assignment; the cells below follow that earlier work.

# dataset summary (count / mean / std / quartiles of the numeric columns)
dataset.describe()

Unnamed: 0,Age,Time_at_address,Time_employed,Time_bank,Home_Expn,Balance
count,429.0,429.0,429.0,429.0,429.0,429.0
mean,31.510163,4.650758,1.871795,2.27972,176.727273,898.382284
std,11.843595,4.804037,3.254023,3.966105,142.590659,3814.56534
min,15.17,0.0,0.0,0.0,0.0,0.0
25%,22.67,1.0,0.0,0.0,80.0,0.0
50%,28.5,2.75,1.0,0.0,160.0,10.0
75%,38.25,7.0,2.0,3.0,272.0,484.0
max,76.75,25.209999,20.0,23.0,760.0,51100.0


In [5]:
# count the null (missing) values in each column
dataset.isnull().sum()

# inference => every count is 0, so there is no missing data

Sex                0
Age                0
Time_at_address    0
Res_status         0
Telephone          0
Occupation         0
Job_status         0
Time_employed      0
Time_bank          0
Liab_ref           0
Acc_ref            0
Home_Expn          0
Balance            0
Decision           0
dtype: int64

In [6]:
dataset.shape

# before calculating correlation, the categorical columns are encoded:
#   - two-valued columns (Sex, Res_status, Telephone, Acc_ref, Liab_ref, Decision) are label-encoded
#   - multi-valued columns (Job_status, Occupation) are one-hot encoded

(429, 14)

In [7]:
# one hot encoding -- used below for the multi-valued columns (Job_status, Occupation)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# label encoder -- one shared instance that is re-fitted for each two-valued column,
# so after the encoding cells it only holds the mapping of the last column it was fitted on
label_encoder = LabelEncoder()

In [8]:
# Sex: two-valued categorical column -> integer codes
print('Sex: ', dataset['Sex'].nunique())

# Assign through `dataset` like the other encoding cells. `new_dataset` is only
# another name for the same DataFrame (it was bound without a copy when the file
# was loaded), so this updates exactly the same frame without relying on the alias.
dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])

Sex:  2


In [9]:
# Res_status: report the number of distinct values, then replace the
# column with its integer label codes
res_status_raw = dataset['Res_status']
print('Res_status: ', res_status_raw.nunique())

dataset['Res_status'] = label_encoder.fit_transform(res_status_raw)

Res_status:  2


In [10]:
# Telephone: report the number of distinct values, then replace the
# column with its integer label codes
telephone_raw = dataset['Telephone']
print('Telephone: ', telephone_raw.nunique())

dataset['Telephone'] = label_encoder.fit_transform(telephone_raw)

Telephone:  2


In [11]:
# Acc_ref: report the number of distinct values, then replace the
# column with its integer label codes
acc_ref_raw = dataset['Acc_ref']
print('Acc_ref: ', acc_ref_raw.nunique())

dataset['Acc_ref'] = label_encoder.fit_transform(acc_ref_raw)

Acc_ref:  2


In [12]:
# Liab_ref: report the number of distinct values, then replace the
# column with its integer label codes
liab_ref_raw = dataset['Liab_ref']
print('Liab_ref: ', liab_ref_raw.nunique())

dataset['Liab_ref'] = label_encoder.fit_transform(liab_ref_raw)

Liab_ref:  2


In [13]:
# Decision (the target): show its dtype and number of distinct values,
# then replace the column with its integer label codes
decision_raw = dataset['Decision']
print(decision_raw.dtype)
print('Decision: ', decision_raw.nunique())

dataset['Decision'] = label_encoder.fit_transform(decision_raw)

object
Decision:  2


In [14]:
# Job_status: list the distinct categories present in the column
job_status_values = dataset['Job_status'].unique()

print('Job_status: ', job_status_values)

Job_status:  ['unemploye' 'governmen' 'private_s' 'self_empl' 'retired' 'student'
 'military']


In [15]:
# Occupation: list the distinct categories present in the column
print('Occupation: ', dataset['Occupation'].unique())

# Occupation is not label-encoded here; it is one-hot encoded together with Job_status

Occupation:  ['unemploye' 'labourer' 'creative_' 'driver' 'professio' 'manager'
 'guard_etc' 'executive' 'office_st' 'productio' 'semi_pro' 'sales']


In [None]:
# One-hot encode the two multi-valued categorical columns and turn the
# encoder's sparse result into a dense array
multi_valued_columns = dataset[['Job_status', 'Occupation']]
encoded_sparse = onehot_encoder.fit_transform(multi_valued_columns)
features_array = encoded_sparse.toarray()

In [None]:
# Column names for the one-hot array, listed per source column.
# The order must match the column order of features_array: the Job_status
# categories first, then the Occupation categories.
job_status_labels = [
    'governmen', 'military', 'private_s', 'retired',
    'self_empl', 'student', 'unemploye',
]
occupation_labels = [
    'creative_', 'driver', 'executive', 'guard_etc', 'labourer', 'manager',
    'office_st', 'productio', 'professio', 'sales', 'semi_pro', 'unemploye',
]
# NOTE: 'unemploye' occurs in both groups, so the combined list holds that
# label twice (once for Job_status, once for Occupation).
feature_labels = job_status_labels + occupation_labels
feature_labels

In [None]:
# wrap the dense one-hot array in a DataFrame, naming its columns with feature_labels
features = pd.DataFrame(features_array, columns = feature_labels)

In [None]:
# append the one-hot columns to the right of the existing columns (axis=1);
# assumes both frames share the same row index -- TODO confirm
dataset = pd.concat([dataset, features], axis=1)

In [None]:
# the original text columns are now represented by the one-hot columns,
# so remove them from the frame
dataset = dataset.drop(columns=['Occupation', 'Job_status'])

In [None]:
# inspect the column labels after the one-hot columns replaced Occupation / Job_status
dataset.columns

In [None]:
# data prepared for correlation: every remaining column has been
# label-encoded or one-hot encoded above

# pairwise correlation between the columns
corr = dataset.corr()
corr

In [None]:
# Remove columns judged unhelpful: Telephone is dropped based on the
# correlations, and Time_at_address because it plays a trivial role in
# decision making.
columns_to_remove = ['Telephone', 'Time_at_address']
dataset = dataset.drop(columns=columns_to_remove)

In [None]:
dataset.columns

# inference => 29 columns remain (28 features + Decision):
# 14 original - 2 (Occupation, Job_status) + 19 one-hot - 2 (Telephone, Time_at_address)

In [None]:
# preview the fully encoded frame
dataset.head()

In [None]:
# Feature matrix: every column except the target.
# Dropping 'Decision' (rather than listing the columns by name) takes each
# column exactly once. The frame contains two columns labelled 'unemploye'
# (one from Job_status, one from Occupation); selecting that label by name
# returns both columns for every mention, which would duplicate features.
feature_frame = dataset.drop(columns=['Decision'])
X = feature_frame.values

# Target as a 1-D array of encoded labels (scikit-learn estimators expect
# shape (n_samples,), not an (n_samples, 1) column vector).
y = dataset['Decision'].values

# sanity checks: one row per sample, one column per non-target column
assert X.shape == (len(dataset), dataset.shape[1] - 1)
assert y.shape == (len(dataset),)

In [None]:
# Fit four SVM variants on one shared train/test split.
# 2 popular kernels are Linear and RBF; a polynomial kernel and LinearSVC
# are fitted as well for comparison.
# NOTE(review): the features are used unscaled; SVMs are usually sensitive to
# feature scale, so standardising the inputs may be worth trying.

# random_state pins the shuffle so the split (and every result below) is
# reproducible across runs. np.ravel guarantees a 1-D target even if y was
# built as a column vector.
X_train, X_test, y_train, y_test = train_test_split(
    X, np.ravel(y), test_size=0.20, random_state=42)

C = 1.0  # SVM regularization parameter
svc_classifier = svm.SVC(kernel='linear', C=C)
svc = svc_classifier.fit(X_train, y_train)

rbf_svc_classifier = svm.SVC(kernel='rbf', gamma=0.7, C=C)
rbf_svc = rbf_svc_classifier.fit(X_train, y_train)

# fit exactly once: construct the estimator here and train it on the next line
# (calling .fit() in both places would train the same model twice)
poly_svc_classifier = svm.SVC(kernel='poly', degree=3, C=C)
poly_svc = poly_svc_classifier.fit(X_train, y_train)

lin_svc_classifier = svm.LinearSVC(C=C, random_state=42)
lin_svc = lin_svc_classifier.fit(X_train, y_train)

In [None]:
# Evaluate the linear-kernel SVC on the held-out test split
y_pred_svc = svc.predict(X_test)
print("SVC Prediction: ", y_pred_svc)

# confusion_matrix comes from the notebook's import cell;
# arguments are (true labels, predicted labels)
cm_svc = confusion_matrix(y_test, y_pred_svc)
print("\nSVC Confusion Matrix: \n", cm_svc)


In [None]:
# Evaluate the RBF-kernel SVC on the held-out test split
y_pred_rbf_svc = rbf_svc.predict(X_test)
print("RBF_SVC Prediction: ", y_pred_rbf_svc)

# confusion_matrix comes from the notebook's import cell;
# arguments are (true labels, predicted labels)
cm_rbf_svc = confusion_matrix(y_test, y_pred_rbf_svc)
print("\nRBF_SVC Confusion Matrix: \n", cm_rbf_svc)

In [None]:
# Evaluate the polynomial-kernel SVC on the held-out test split
y_pred_poly_svc = poly_svc.predict(X_test)
print("poly_svc Prediction: ", y_pred_poly_svc)

# confusion_matrix comes from the notebook's import cell;
# arguments are (true labels, predicted labels)
cm_poly_svc = confusion_matrix(y_test, y_pred_poly_svc)
print("\npoly_svc Confusion Matrix: \n", cm_poly_svc)

In [None]:
# Evaluate LinearSVC on the held-out test split
y_pred_lin_svc = lin_svc.predict(X_test)
print("lin_svc Prediction: ", y_pred_lin_svc)

# confusion_matrix comes from the notebook's import cell;
# arguments are (true labels, predicted labels)
cm_lin_svc = confusion_matrix(y_test, y_pred_lin_svc)
print("\nlin_svc Confusion Matrix: \n", cm_lin_svc)

In [None]:
# Once fitted, a model can be used to predict new values.
#
# An SVM's decision function depends only on a subset of the training
# data: the support vectors.

# print the support vectors of each kernel SVC
for model_name, fitted_model in (("svc", svc), ("rbf_svc", rbf_svc), ("poly_svc", poly_svc)):
    print(model_name + ": ", fitted_model.support_vectors_)

In [None]:
# print, for each kernel SVC, the training-set indices of its support vectors
for model_name, fitted_model in (("svc", svc), ("rbf_svc", rbf_svc), ("poly_svc", poly_svc)):
    print(model_name + ": ", fitted_model.support_)

In [None]:
# print, for each kernel SVC, how many support vectors belong to each class
for model_name, fitted_model in (("svc", svc), ("rbf_svc", rbf_svc), ("poly_svc", poly_svc)):
    print(model_name + ": ", fitted_model.n_support_)

In [None]:
# A linear classifier is a simpler model than an RBF classifier:
# the RBF kernel is an exponential function of the distance between
# samples, which gives a non-linear decision boundary.

# LinearSVC does not expose support_vectors_, support_ or n_support_,
# so the three properties printed above cannot be shown for lin_svc.

In [None]:
# Decision-region plot for a linear SVC on two feature columns.
#
# The classifiers fitted earlier were trained on every feature column, so they
# cannot score a two-column mesh. A separate two-feature linear SVC is fitted
# here purely for this picture; it is NOT one of the evaluated models.
plot_cols = [0, 1]      # the two columns of X_train shown on the x / y axes
GRID_POINTS = 300       # mesh resolution per axis (keeps the grid at 300 x 300)

X_Set = X_train[:, plot_cols]
Y_Set = np.ravel(y_train)   # 1-D labels so the boolean masks below index rows

svc_2d = svm.SVC(kernel='linear', C=C).fit(X_Set, Y_Set)

# Each axis spans its own column's range (padded by 1). Using one global
# min/max with a fixed 0.01 step would produce an unmanageably large mesh
# whenever any column has a wide value range.
x1_axis = np.linspace(X_Set[:, 0].min() - 1, X_Set[:, 0].max() + 1, GRID_POINTS)
x2_axis = np.linspace(X_Set[:, 1].min() - 1, X_Set[:, 1].max() + 1, GRID_POINTS)
X1, X2 = np.meshgrid(x1_axis, x2_axis)
grid_points = np.column_stack([X1.ravel(), X2.ravel()])
grid_pred = svc_2d.predict(grid_points).reshape(X1.shape)

class_colors = ('red', 'green')
fig, ax = plt.subplots()
ax.contourf(X1, X2, grid_pred, alpha = 0.75, cmap = ListedColormap(class_colors))
ax.set_xlim(X1.min(), X1.max())
ax.set_ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(Y_Set)):
    # color= (not c=) because a single colour is given for the whole group
    ax.scatter(X_Set[Y_Set == j, 0], X_Set[Y_Set == j, 1],
               color = class_colors[i], label = j)
ax.set(title='Linear SVC decision regions (training set)',
       xlabel='X column {}'.format(plot_cols[0]),
       ylabel='X column {}'.format(plot_cols[1]))
ax.legend(title='Decision')
plt.show()