# 1. Credit card approvals

In this notebook we will build a credit card approval predictor using machine learning techniques.
First, we will load the credit card approval dataset from the UCI repository.

In [1]:
import pandas as pd
import numpy as np

# Read the raw file; header=None tells pandas the file has no header row,
# so the columns receive integer labels
DATA_FILE = "crx.csv"
cca = pd.read_csv(DATA_FILE, header=None)

# Preview the first rows
cca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


# 2. Inspecting and Handling missing data

 The dataset has been anonymized to protect private data, so the column names were replaced with numbers. We need to inspect the data further.

In [2]:
# Summary statistics for the numeric columns
print(cca.describe())

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
cca.info()

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-

In [3]:
# Show the last 17 rows to look for missing-value placeholders in the dataset
cca.tail(n=17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


Our dataset contains both numeric and non-numeric data: pandas read columns 2, 7, 10 and 14 as numeric and the remaining columns as non-numeric. Looking at the tail of the data, we can see a '?' symbol. It stands for a missing value, so we need to replace it with NaN.

In [4]:
# Count the values pandas already recognises as missing. The "?" placeholders
# are ordinary strings at this point, so they are not counted here.
print(cca.isnull().sum())

# Replace the "?" placeholder with NaN. `np.nan` is the canonical spelling;
# the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
cca = cca.replace("?", np.nan)
cca.tail(17)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


In [5]:
# Impute missing values in the numeric columns with the column mean.
# numeric_only=True restricts the mean to numeric columns; without it
# pandas >= 2.0 raises TypeError on the object columns (older versions only
# emitted a FutureWarning and skipped them).
cca = cca.fillna(cca.mean(numeric_only=True))

# Count the number of NaN values that remain (object columns are not imputed here)
print(cca.isnull().values.sum())

# info() prints its own report and returns None, so it is not wrapped in print()
cca.info()

67
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       678 non-null    object 
 1   1       678 non-null    object 
 2   2       690 non-null    float64
 3   3       684 non-null    object 
 4   4       684 non-null    object 
 5   5       681 non-null    object 
 6   6       681 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      677 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB
None


  cca = cca.fillna(cca.mean())


In [6]:
# Impute the non-numeric (object) columns: every NaN is replaced by the
# most frequent value of its column.
object_columns = [col for col in cca.columns if cca[col].dtypes == 'object']
for col in object_columns:
    # value_counts() orders values by descending frequency, so index[0] is the most common one
    most_frequent = cca[col].value_counts().index[0]
    cca[col] = cca[col].fillna(most_frequent)

# Total number of NaNs left in the frame
print(cca.isnull().values.sum())

0


# 3. Data pre-processing

Now we need to convert non-numeric values to numeric and scale these feature values to a uniform range.

In [7]:
# Importing labelencoder
from sklearn.preprocessing import LabelEncoder

# One encoder instance, re-fitted for every column it is applied to
le = LabelEncoder()

# Collect the object-typed columns first, then integer-encode each of them
categorical_columns = [col for col in cca.columns if cca[col].dtype == 'object']
for col in categorical_columns:
    cca[col] = le.fit_transform(cca[col])

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Remove the columns at positions 10 and 13, then switch from a DataFrame
# to a plain NumPy array
unused_columns = [cca.columns[10], cca.columns[13]]
cca = cca.drop(columns=unused_columns).values

# Separate the feature matrix (first 13 columns) from the label vector (column 13)
X = cca[:, :13]
y = cca[:, 13]

# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features; consider fitting it on the
# training split only.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# 4. Splitting and training model

Let's split the data and apply logistic regression.

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    rescaledX,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
# Importing Logistic Regression
from sklearn.linear_model import LogisticRegression

# Classifier created with scikit-learn's default settings
model = LogisticRegression()

# Train on the training split; fit() returns the estimator, which is shown as the cell output
model.fit(X=x_train, y=y_train)

LogisticRegression()

# 5. Making predictions and evaluating performance

We'll now evaluate our model on the test set.

In [11]:
# Importing confusion_matrix
from sklearn.metrics import confusion_matrix

# Predicted labels for the held-out rows
y_pred = model.predict(x_test)

# Mean accuracy on the test split
test_accuracy = model.score(x_test, y_test)
print("Accuracy of logistic regression classifier:", test_accuracy)

# Confusion matrix (rows: true labels, columns: predicted labels), shown as the cell output
confusion_matrix(y_true=y_test, y_pred=y_pred)


Accuracy of logistic regression classifier: 0.8421052631578947


array([[94,  9],
       [27, 98]], dtype=int64)

# 6. Grid searching and improving the model

Our model performed well. Let's see if we can improve it further.

In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the two hyper-parameters explored by the grid search
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Map each hyper-parameter name to its list of candidates
param_grid = {"tol": tol, "max_iter": max_iter}
print(param_grid)

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}


In [13]:
# 5-fold cross-validated search over param_grid, using `model` as the base estimator
grid_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Run the search on the full rescaled feature matrix and labels
grid_model_result = grid_model.fit(rescaledX, y)

# Report the best mean cross-validated score and the parameters that achieved it
best_score = grid_model_result.best_score_
best_params = grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.850725 using {'max_iter': 100, 'tol': 0.01}
