In [1]:
#importing libraries

import pandas as pd
import numpy as np

In [2]:
#reading the csv file

data = pd.read_csv(r'https://raw.githubusercontent.com/KiranmayiR/Credit_Shiny/master/Credit_Approval.csv')

In [3]:
#summary
data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [4]:
#information on data inside
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
Gender            690 non-null object
Age               690 non-null object
Debt              690 non-null float64
Married           690 non-null object
BankCustomer      690 non-null object
EducationLevel    690 non-null object
Ethnicity         690 non-null object
YearsEmployed     690 non-null float64
PriorDefault      690 non-null object
Employed          690 non-null object
CreditScore       690 non-null int64
DriversLicense    690 non-null object
Citizen           690 non-null object
ZipCode           690 non-null object
Income            690 non-null int64
Approved          690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [5]:
#statistical summary of numerical data
data.describe()

Unnamed: 0,Debt,YearsEmployed,CreditScore,Income
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [6]:
#creating a shallow copy
df = data.copy()

In [7]:
df.tail()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260,0,-
686,a,22.67,0.75,u,g,c,v,2.0,f,t,2,t,g,200,394,-
687,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1,t,g,200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280,750,-
689,b,35.0,3.375,u,g,c,h,8.29,f,f,0,t,g,0,0,-


In [8]:
#inspecting for the missing values

df.isnull().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
Approved          0
dtype: int64

In [9]:
#there might be some special characters which must have definitely been recognised as values.
#hence we need to rectify those first

df = df.replace(to_replace = '?', value = np.NaN)



In [10]:
df.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
Approved           0
dtype: int64

In [11]:
#now we see there are missing values once we replaced it with a '?'
#hence to handle missing values data

#imputing missing values with mean imputation
df.fillna(df.mean(), inplace = True)

In [12]:
#inspecting the missing values status
df.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
Approved           0
dtype: int64

In [13]:
#from the results above, comparing we understand no correction has been made to the numerical data so fa.
#hence we need to look out for the categorical data

#we perform modal imputation where we impute the mode of the columns
#wherein we iterate over each column and check the frequency of the data points in the row
#and impute it with the one having highest frequency.


In [14]:
#iterating over each column in dataframe
#selecting columns of object datatype
#selecting the highest frequency f objects, ie, mode
#and then replacing it with the mode

for i in list(df):
    if df[i].dtypes == 'object':
        df = df.fillna(df[i].value_counts().index[0])

In [15]:
#rechecking missing values
df.isnull().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
Approved          0
dtype: int64

In [None]:
#label encoding the features by looping over the values present in rows

In [23]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for cols in df:
    if df[cols].dtypes == 'object':
        df[cols] = label_encoder.fit_transform(df[cols])
    

In [24]:
df.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,156,0.0,2,1,13,8,1.25,1,1,1,0,0,68,0,0
1,0,328,4.46,2,1,11,4,3.04,1,1,6,0,0,11,560,0
2,0,89,0.5,2,1,11,4,1.5,1,0,0,0,0,96,824,0
3,1,125,1.54,2,1,13,8,3.75,1,1,5,1,0,31,3,0
4,1,43,5.625,2,1,13,8,1.71,1,0,0,0,2,37,0,0


In [25]:
df.tail()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
685,1,52,10.085,3,3,5,4,1.25,0,0,0,0,0,90,0,1
686,0,71,0.75,2,1,2,8,2.0,0,1,2,1,0,67,394,1
687,0,97,13.5,3,3,6,3,2.0,0,1,1,1,0,67,1,1
688,1,20,0.205,2,1,0,8,0.04,0,0,0,0,0,96,750,1
689,1,197,3.375,2,1,2,4,8.29,0,0,0,1,0,0,0,1


In [26]:
#removing features which are not useful

df = df.drop(['DriversLicense','ZipCode'], axis = 1)

In [27]:
#converting to numpy

df = df.values

In [29]:
#splitting dataset
#first segregating

x,y = df[:, 0:13] , df[:,13]

In [30]:
from sklearn.model_selection import train_test_split

#splitting into train and test sets

train_x, test_x, train_y, test_y= train_test_split(x,y, test_size = 0.3 , random_state = 123)

In [36]:
#performing feature scaling

from sklearn.preprocessing import MinMaxScaler

#scaling x_train, x_test

scaler = MinMaxScaler(feature_range = (0,1))
rescaledtrain_x = scaler.fit_transform(train_x)
rescaledtest_x = scaler.transform(test_x)

In [39]:
#using logistic regression

from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

logistic = LogisticRegression()

logistic.fit(rescaledtrain_x , train_y )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
#predicting 

prediction_y = logistic.predict(rescaledtest_x)

In [42]:
prediction_y.shape

(207,)

In [43]:
#creating the confusion matrix

from sklearn.metrics import confusion_matrix

confusion_matrix(test_y, prediction_y)

array([[87, 11],
       [11, 98]], dtype=int64)

In [44]:
#getting accuracy


accuracy = logistic.score(rescaledtest_x, test_y)

print(accuracy)

0.893719806763285


In [45]:
#hence we built a logistic regression classifier which gave us an accuracy of 89.37%
#which helped us in determining credit card approval detector