### PAVANA LAKSHMI VENUGOPAL
### UBIT ID- 50464513

In [137]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, mean_squared_error

from dmba import classificationSummary

In [138]:
%matplotlib inline

## Part 1: Data exploration and preprocessing

In [139]:
# read in data file

bank = pd.read_csv('UniversalBank_unprocessed.csv')
bank

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard,Personal Loan
0,1,39,13.0,58.0,3,2.10,Undergraduate,169,0,1,0,0
1,2,51,25.0,18.0,1,0.30,Advanced,93,0,0,1,0
2,3,43,13.0,38.0,3,2.00,Advanced,0,0,1,0,0
3,4,37,12.0,60.0,4,2.10,Advanced,217,0,1,0,0
4,5,23,,149.0,1,6.33,Undergraduate,305,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1112,1113,37,12.0,123.0,4,3.10,Masters,253,1,1,1,1
1113,1114,37,13.0,158.0,2,2.30,Masters,0,1,1,1,1
1114,1115,53,29.0,120.0,4,2.70,Masters,111,1,1,0,1
1115,1116,26,0.0,179.0,4,2.10,Masters,0,0,0,0,1


In [140]:
# print the data types of each column
bank.dtypes

ID                 int64
Age                int64
Experience       float64
Income           float64
Family             int64
CCAvg            float64
Education         object
Mortgage           int64
CD Account         int64
Online             int64
CreditCard         int64
Personal Loan      int64
dtype: object

In [141]:
# print the shape of the dataframe
bank.shape

(1117, 12)

In [142]:
#1
predictors_df = bank[['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 
              'CD Account', 'Online', 'CreditCard']]
response_df = bank['Personal Loan']

In [143]:
# 2. look at each category in "Personal Loan"
print(bank["Personal Loan"].value_counts())

0    637
1    480
Name: Personal Loan, dtype: int64


In [144]:
#3 check for null values
bank.isnull().sum()

ID               0
Age              0
Experience       4
Income           3
Family           0
CCAvg            0
Education        0
Mortgage         0
CD Account       0
Online           0
CreditCard       0
Personal Loan    0
dtype: int64

In [145]:
#4
print(bank["Education"].value_counts())

Undergraduate    389
Advanced         383
Masters          345
Name: Education, dtype: int64


In [146]:
#5
predictors_df.corr()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard
Age,1.0,0.994236,-0.043482,-0.046942,-0.036068,0.020851,0.024537,0.034029,0.024464
Experience,0.994236,1.0,-0.03969,-0.05389,-0.038741,0.026466,0.025706,0.028488,0.033042
Income,-0.043482,-0.03969,1.0,-0.045998,0.629227,0.25096,0.257815,0.016826,-0.013544
Family,-0.046942,-0.05389,-0.045998,1.0,-0.012103,0.029502,0.025519,0.024708,0.012454
CCAvg,-0.036068,-0.038741,0.629227,-0.012103,1.0,0.131017,0.197364,0.011636,0.00341
Mortgage,0.020851,0.026466,0.25096,0.029502,0.131017,1.0,0.14219,0.00424,0.037236
CD Account,0.024537,0.025706,0.257815,0.025519,0.197364,0.14219,1.0,0.26287,0.377198
Online,0.034029,0.028488,0.016826,0.024708,0.011636,0.00424,0.26287,1.0,0.002696
CreditCard,0.024464,0.033042,-0.013544,0.012454,0.00341,0.037236,0.377198,0.002696,1.0


In [147]:
# 6 Removing column which is not needed
bank = bank.drop(["ID"], axis = 1)

In [148]:
#6
predictors_df = predictors_df.drop(["Age"], axis = 1)

In [149]:
predictors_df.corr()

Unnamed: 0,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard
Experience,1.0,-0.03969,-0.05389,-0.038741,0.026466,0.025706,0.028488,0.033042
Income,-0.03969,1.0,-0.045998,0.629227,0.25096,0.257815,0.016826,-0.013544
Family,-0.05389,-0.045998,1.0,-0.012103,0.029502,0.025519,0.024708,0.012454
CCAvg,-0.038741,0.629227,-0.012103,1.0,0.131017,0.197364,0.011636,0.00341
Mortgage,0.026466,0.25096,0.029502,0.131017,1.0,0.14219,0.00424,0.037236
CD Account,0.025706,0.257815,0.025519,0.197364,0.14219,1.0,0.26287,0.377198
Online,0.028488,0.016826,0.024708,0.011636,0.00424,0.26287,1.0,0.002696
CreditCard,0.033042,-0.013544,0.012454,0.00341,0.037236,0.377198,0.002696,1.0


In [150]:
# 7 flag categorical varibales
predictors_df = pd.get_dummies(predictors_df, columns=['Education'])

In [151]:
predictors_df.dtypes

Experience                 float64
Income                     float64
Family                       int64
CCAvg                      float64
Mortgage                     int64
CD Account                   int64
Online                       int64
CreditCard                   int64
Education_Advanced           uint8
Education_Masters            uint8
Education_Undergraduate      uint8
dtype: object

In [152]:
#impute NA values with k-NN imputer 
# in the code below, be sure you change "predictors_df" if you use a different name for your predictors DataFrame

imputer = KNNImputer(n_neighbors=5)
predictors_df = pd.DataFrame(imputer.fit_transform(predictors_df), columns = predictors_df.columns)
predictors_df

Unnamed: 0,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Advanced,Education_Masters,Education_Undergraduate
0,13.0,58.0,3.0,2.10,169.0,0.0,1.0,0.0,0.0,0.0,1.0
1,25.0,18.0,1.0,0.30,93.0,0.0,0.0,1.0,1.0,0.0,0.0
2,13.0,38.0,3.0,2.00,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,12.0,60.0,4.0,2.10,217.0,0.0,1.0,0.0,1.0,0.0,0.0
4,17.8,149.0,1.0,6.33,305.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1112,12.0,123.0,4.0,3.10,253.0,1.0,1.0,1.0,0.0,1.0,0.0
1113,13.0,158.0,2.0,2.30,0.0,1.0,1.0,1.0,0.0,1.0,0.0
1114,29.0,120.0,4.0,2.70,111.0,1.0,1.0,0.0,0.0,1.0,0.0
1115,0.0,179.0,4.0,2.10,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [156]:
predictors_df.isnull().sum()

Experience                 0
Income                     0
Family                     0
CCAvg                      0
Mortgage                   0
CD Account                 0
Online                     0
CreditCard                 0
Education_Advanced         0
Education_Masters          0
Education_Undergraduate    0
dtype: int64

In [157]:
# create train and test splits

X_classifier = predictors_df
y_classifier = response_df
train_X_classifier, test_X_classifier, train_y_classifier, test_y_classifier = train_test_split(X_classifier, 
                                                            y_classifier, test_size=0.3, random_state=616)

In [159]:
# 8 normalize predictors in data using standardization (we are using only the training data to calcualte the 
# means and standard deviations)

z_score_norm1 = preprocessing.StandardScaler()
z_score_norm1.fit(train_X_classifier)
train_X_classifier = pd.DataFrame(z_score_norm1.transform(train_X_classifier), 
                                          columns = predictors_df.columns)
test_X_classifier = pd.DataFrame(z_score_norm1.transform(test_X_classifier), 
                                          columns = predictors_df.columns)
test_X_classifier

Unnamed: 0,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Advanced,Education_Masters,Education_Undergraduate
0,-0.559880,-0.271739,0.462597,-0.704687,0.242704,-0.423979,0.791886,1.547789,1.411502,-0.674074,-0.739280
1,-0.647431,-0.709504,-0.407977,-0.152407,0.114159,-0.423979,0.791886,-0.646083,-0.708465,1.483516,-0.739280
2,-1.085188,1.369879,1.333170,-0.750710,-0.566378,2.358607,0.791886,1.547789,1.411502,-0.674074,-0.739280
3,-1.785598,1.005075,-0.407977,2.056711,-0.566378,-0.423979,-1.262808,-0.646083,-0.708465,-0.674074,1.352668
4,-1.085188,1.369879,-0.407977,1.826595,0.204897,-0.423979,0.791886,-0.646083,-0.708465,-0.674074,1.352668
...,...,...,...,...,...,...,...,...,...,...,...
331,1.103595,-0.818946,1.333170,-0.106384,-0.566378,-0.423979,0.791886,-0.646083,-0.708465,1.483516,-0.739280
332,0.403184,1.333399,-1.278550,-1.026850,-0.566378,-0.423979,0.791886,-0.646083,-0.708465,1.483516,-0.739280
333,-0.472329,-1.037828,1.333170,-0.474570,0.174651,-0.423979,-1.262808,-0.646083,-0.708465,-0.674074,1.352668
334,0.928492,-1.092549,-0.407977,-1.118897,-0.566378,2.358607,0.791886,1.547789,-0.708465,-0.674074,1.352668


## Part 2: $k$-NN

In [160]:
# 1 train the k-NN model and look at performance on train data

knn = KNeighborsClassifier(n_neighbors=5).fit(train_X_classifier, train_y_classifier)
predicted_y_training = knn.predict(train_X_classifier)
f1_score(train_y_classifier, predicted_y_training)

0.9427710843373495

In [161]:
# 2 performance of k-NN on test data

predicted_y_test = knn.predict(test_X_classifier)
f1_score(test_y_classifier, predicted_y_test)

0.8960573476702508

In [162]:
#3
results = []
for k in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=k).fit(train_X_classifier, train_y_classifier)
    results.append({
        'k': k,
        'f1_score': f1_score(test_y_classifier, knn2.predict(test_X_classifier))
    })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

     k  f1_score
0    1  0.884058
1    2  0.854962
2    3  0.902527
3    4  0.883019
4    5  0.896057
5    6  0.882353
6    7  0.878571
7    8  0.874074
8    9  0.873646
9   10  0.880597
10  11  0.880866
11  12  0.868914
12  13  0.890511
13  14  0.865672
14  15  0.875912
15  16  0.865672
16  17  0.883212
17  18  0.864662
18  19  0.874074


In [174]:
knn = KNeighborsClassifier(n_neighbors=3).fit(train_X_classifier, train_y_classifier)
predicted_y_training = knn.predict(train_X_classifier)
f1_score(train_y_classifier, predicted_y_training)

0.9610778443113772

In [175]:
predicted_y_test = knn.predict(test_X_classifier)
f1_score(test_y_classifier, predicted_y_test)

0.9025270758122744

## Part 3: Logistic regression and model comparison

In [164]:
# train the LR model

logistic_model = LogisticRegression()
logistic_model = logistic_model.fit(train_X_classifier, train_y_classifier)
predicted_y_training2 = logistic_model.predict(train_X_classifier)
f1_score(train_y_classifier, predicted_y_training2)

0.8828828828828829

In [165]:
#1
predicted_y_test2 = logistic_model.predict(test_X_classifier)
f1_score(test_y_classifier, predicted_y_test2)

0.861111111111111

In [179]:
# print performance metrics (i.e., confusion matrix and accuracy) on training set

predicted_y_training = logistic_model.predict(train_X_classifier)
classificationSummary(train_y_classifier, predicted_y_training)

Confusion Matrix (Accuracy 0.9001)

       Prediction
Actual   0   1
     0 409  38
     1  40 294


In [183]:
predicted_y_test = logistic_model.predict(test_X_classifier)
classificationSummary(test_y_classifier, predicted_y_test)

Confusion Matrix (Accuracy 0.8810)

       Prediction
Actual   0   1
     0 172  18
     1  22 124
