> Objective: build a credit card approval predictor <br>
> Strategy: logistic regression and GridSearching <br>
> Data Source: UCI Machine Learning Repository - http://archive.ics.uci.edu/ml/datasets/credit+approval

In [58]:
# Import pandas
import pandas as pd

# Load dataset
df= pd.read_csv("crx.data", header=None)

# Inspect data
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
5,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+
6,b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,164,31285,+
7,a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,80,1349,+
8,b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,180,314,+
9,b,42.5,4.915,y,p,w,v,3.165,t,f,0,t,g,52,1442,+


In [59]:
# Print summary statistics
print(df.describe())

# Print DataFrame information
print(df.info())

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
0     690 non-null object
1     690 non-null object
2     690 non-null float64
3     690 non-null object
4     690 non-null object
5     690 non-null object
6     690 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null object
14    690 non-null int64
1

> Dealing with missing values

In [60]:
# Import numpy
import numpy as np

# Replace the '?'s with NaN
df = df.replace('?', np.NaN)

# Impute the missing values with mean imputation
df.fillna(df.mean(), inplace=True)
print(df.isnull().sum())

# Iterate over each column of cc_apps
for col in df:
    # Check if the column is of object type
    if df[col].dtypes == 'object':
        # Impute with the most frequent value
        df = df.fillna(df[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
df.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [61]:
# Iterate over each column of cc_apps
for col in df:
    # Check if the column is of object type
    if df[col].dtypes == 'object':
        # Impute with the most frequent value
        df = df.fillna(df[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
df.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

> Preprocessing the data <br>

In [62]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Instantiate LabelEncoder
le=LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in df.columns:
    # Compare if the dtype is object
    if df[col].dtype=='object':
    # Use LabelEncoder to do the numeric transformation
        df[col]=le.fit_transform(df[col])

> Splitting the dataset into train and test sets <br>

In [63]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Drop features of DriversLicense and ZipCode, 
df = df.drop([df.columns[11],df.columns[13]], axis=1)
# Convert it to a NumPy array
df = df.values

# Segregate features and labels into separate variables
X,y = df[:,0:13], df[:,13]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

> Scaling of the data <br>

In [64]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0,1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)


> Logistic regression fitting and prediction

In [65]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred = logreg.predict(rescaledX_test)

> Evaluating the performance

In [66]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Accuracy score of the logreg model 
print("Accuracy of logreg mode: ", logreg.score(rescaledX_test, y_test).round(3))

# Confusion matrix of the logreg model
confusion_matrix(y_test, y_pred)

Accuracy of logreg mode:  0.842


array([[95,  8],
       [28, 97]], dtype=int64)

> Grid searching

In [67]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary 
param_grid = dict(tol=tol, max_iter=max_iter)
param_grid

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}

> Finding the best performing model

In [68]:
# Instantiate GridSearchCV 
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Use scaler to rescale X 
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summary
best_score, best_params = grid_model_result.best_score_,grid_model_result.best_params_
print("Best: %.3f using %s" % (best_score, best_params))

Best: 0.854 using {'max_iter': 100, 'tol': 0.01}
