Credit Scoring is a statistical analysis method to gauge the credit worthiness of a loan applicant.
In simple terms, it helps find the credit score of the borrower.

In [46]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import statistics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score
from matplotlib.colors import ListedColormap
import sklearn.metrics as metrics

Load the dataset

In [47]:
df = pd.read_csv('Dataset_CreditScoring.csv')
df.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,92%,21%,2,3,7,21%,4,4,0%
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,80%,0%,0,0,0,100%,12,0,100%
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,36%,65%,0,1,1,73%,1,1,53%
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,91%,25%,1,1,1,75%,7,1,133%
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,125%,0%,0,1,4,14%,3,1,0%


In [48]:
df.drop("ID", axis = 1, inplace = True)

Check if there are any null values

In [49]:
df.isna().sum()

TARGET               0
DerogCnt             0
CollectCnt           0
BanruptcyInd         0
InqCnt06             0
InqTimeLast        188
InqFinanceCnt24      0
TLTimeFirst          0
TLTimeLast           0
TLCnt03              0
TLCnt12              0
TLCnt24              0
TLCnt                3
TLSum               40
TLMaxSum            40
TLSatCnt             4
TLDel60Cnt           0
TLBadCnt24           0
TL75UtilCnt         99
TL50UtilCnt         99
TLBalHCPct          41
TLSatPct             4
TLDel3060Cnt24       0
TLDel90Cnt24         0
TLDel60CntAll        0
TLOpenPct            3
TLBadDerogCnt        0
TLDel60Cnt24         0
TLOpen24Pct          3
dtype: int64

In [50]:
int_columns = df.select_dtypes(include=['object']).columns.tolist()
int_columns

['TLSum', 'TLMaxSum', 'TLBalHCPct', 'TLSatPct', 'TLOpenPct', 'TLOpen24Pct']

In [51]:
# import label encoder
from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object

label_encoder = LabelEncoder()

In [52]:
df['TLSum'] = label_encoder.fit_transform(df['TLSum'])
df['TLMaxSum'] = label_encoder.fit_transform(df['TLMaxSum'])
df['TLBalHCPct'] = label_encoder.fit_transform(df['TLBalHCPct'])
df['TLSatPct'] = label_encoder.fit_transform(df['TLSatPct'])
df['TLOpenPct'] = label_encoder.fit_transform(df['TLOpenPct'])
df['TLOpen24Pct'] = label_encoder.fit_transform(df['TLOpen24Pct'])

Fill the missing values with mean of the column using simple impute

In [53]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df)
df = pd.DataFrame(imputer.transform(df), columns = df.columns)

In [54]:
df.isna().sum()

TARGET             0
DerogCnt           0
CollectCnt         0
BanruptcyInd       0
InqCnt06           0
InqTimeLast        0
InqFinanceCnt24    0
TLTimeFirst        0
TLTimeLast         0
TLCnt03            0
TLCnt12            0
TLCnt24            0
TLCnt              0
TLSum              0
TLMaxSum           0
TLSatCnt           0
TLDel60Cnt         0
TLBadCnt24         0
TL75UtilCnt        0
TL50UtilCnt        0
TLBalHCPct         0
TLSatPct           0
TLDel3060Cnt24     0
TLDel90Cnt24       0
TLDel60CntAll      0
TLOpenPct          0
TLBadDerogCnt      0
TLDel60Cnt24       0
TLOpen24Pct        0
dtype: int64

Predict the target (the first column) given the remaining columns. There are a total of 30 columns.

In [55]:
y = df.iloc[:, 0].values
X = df.iloc[:, 1:29].values

In [56]:
print(X.shape)
print(y.shape)

(3000, 28)
(3000,)


Divided the train and test dataset into 80:20 ratio 

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0,
                                                    stratify=y)

In [58]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Import linear Classifier from models directory

In [59]:
%run models/linear_classifier.ipynb

classifier = train_linear_classifier(X_train, y_train)

# Predicting on the test set
X_test_with_bias = np.column_stack((np.ones(len(X_test)), X_test))
predictions = classifier.predict(X_test_with_bias)

# Evaluating the accuracy
accuracy = np.mean((predictions >= 0.5) == y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 82.33%


Import KNN classifier from models directory

In [60]:
%run models/knn.ipynb

knn_classifier = train_knn_classifier(X_train, y_train)

# Predicting on the test set
predictions = knn_classifier.predict(X_test)

# Evaluating the accuracy
accuracy = np.mean(predictions == y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 81.50%


Import Decision Tree from models directory

In [61]:
%run models/decision_tree.ipynb

# Convert labels to integers
y_train = y_train.astype(int)
y_test = y_test.astype(int)

decision_tree = train_decision_tree(X_train, y_train)

# Predict using the trained tree
predictions = decision_tree.predict(X_test)

# Calculate accuracy
accuracy = np.mean(predictions == y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 82.50%


Import Support Vector Machine from models directory

In [33]:
%run models/svm.ipynb

svm = train_svm_classifier(X_train, y_train)

predictions = svm._predict(X_test)
accuracy = np.mean(np.sign(predictions) == y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 83.33%
