In [43]:
# General libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#modelling libraries

#Data splitting
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor,AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB

# Import metrics

# Classification metrics
from sklearn.metrics import classification_report

# Regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



Data Acquisition, Cleaning & Feature Engineering

Load the data

In [44]:
# Load the KOPA loan records; the relative path is resolved against the
# kernel's current working directory.
kopa = pd.read_csv("KOPA.csv")
# Preview the first rows as the cell output
kopa.head()

Unnamed: 0,Product,CustomerGender,Location,Region,TotalPrice,StartDate,Deposit,DailyRate,TotalDays,AmountPaid30,AmountPaid60,AmountPaid360,LoanStatus360
0,Product B,Male,Location 2,Region 2,16600,7/4/2014,2000,40,365,3121,4241,13621,Active
1,Product B,Male,Location 3,Region 3,16600,1/2/2015,2000,40,365,3061,4171,15041,Active
2,Product C,Female,Location 5,Region 5,15600,12/8/2012,1000,40,365,2160,3280,15340,Active
3,Product B,Male,Location 6,Region 1,16600,2/20/2015,2000,40,365,3041,4241,15321,Active
4,Product D,Female,Location 7,Region 2,20250,10/28/2013,2000,50,365,3470,4820,13720,Active


In [45]:
# Summarise the raw frame: column names, non-null counts and dtypes
kopa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9195 entries, 0 to 9194
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Product         9195 non-null   object
 1   CustomerGender  9195 non-null   object
 2   Location        9195 non-null   object
 3   Region          9195 non-null   object
 4   TotalPrice      9195 non-null   int64 
 5   StartDate       9195 non-null   object
 6   Deposit         9195 non-null   int64 
 7   DailyRate       9195 non-null   int64 
 8   TotalDays       9195 non-null   int64 
 9   AmountPaid30    9195 non-null   int64 
 10  AmountPaid60    9195 non-null   int64 
 11  AmountPaid360   9195 non-null   int64 
 12  LoanStatus360   9195 non-null   object
dtypes: int64(7), object(6)
memory usage: 934.0+ KB


Cleaning the Data

In [46]:
# Check for irregularities in the data
# Build a profiling report over the raw (uncleaned) frame ...
report = ProfileReport(kopa)
# ... and render it inside the notebook through the widget interface
report.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Remove the columns that are not carried forward into modelling.
# The raw frame `kopa` is left untouched; the result is a new frame.
to_drop = ['TotalDays', 'Location', 'StartDate', 'AmountPaid30', 'DailyRate']
kopa_clean = kopa.drop(columns=to_drop)

Feature Engineering

Convert categorical data into numerical form

In our case we use the `LabelEncoder` from scikit-learn.

In [None]:
# Work on two independent copies of the cleaned frame, one per task
# (regression and classification), so that encoding one never alters the other.
kopa_regression, kopa_classification = kopa_clean.copy(), kopa_clean.copy()

In [None]:
# Regression dataset: convert every text column into integer codes.
# One LabelEncoder instance is refitted for each column, so once the loop
# finishes it only holds the classes of the last column it processed.
encoder = LabelEncoder()

# Object-dtype columns are treated as the categorical ones
cat_columns = [name for name, dtype in kopa_regression.dtypes.items() if dtype == 'O']

# Overwrite each categorical column with its encoded values
for column in cat_columns:
    kopa_regression[column] = encoder.fit_transform(kopa_regression[column])

In [None]:
# Preview the regression frame after label encoding
kopa_regression.head()

Unnamed: 0,Product,CustomerGender,Region,TotalPrice,Deposit,AmountPaid60,AmountPaid360,LoanStatus360
0,1,1,10,16600,2000,4241,13621,0
1,1,1,11,16600,2000,4171,15041,0
2,2,0,13,15600,1000,3280,15340,0
3,1,1,0,16600,2000,4241,15321,0
4,3,0,10,20250,2000,4820,13720,0


In [None]:
# Classification dataset: encode the target explicitly, then the features.

# Build the label -> integer code mapping for the target column.
# The labels are sorted first so the codes do not depend on the order in
# which rows happen to appear in the CSV, and so they follow the same sorted
# ordering LabelEncoder applies to this column in the regression frame.
target_values = sorted(kopa_classification['LoanStatus360'].unique())
mapper = {label: code for code, label in enumerate(target_values)}

print(mapper)

# Apply the mapping with Series.map, which yields an integer column directly.
# (Series.replace with a dict relies on implicit object -> int downcasting,
# which recent pandas versions deprecate.)
kopa_classification['LoanStatus360'] = kopa_classification['LoanStatus360'].map(mapper)

# Remaining object-dtype columns are the categorical features
cat_columns = [column for column in kopa_classification.columns if kopa_classification[column].dtype == 'O']

# Label encode each categorical feature.
# `encoder` is the LabelEncoder instantiated in the regression-encoding cell.
for column in cat_columns:
    kopa_classification[column] = encoder.fit_transform(kopa_classification[column])


{'Active': 0, 'Blocked': 1, 'Finished Payment': 2}


Split data into train, validation and test

It is an iterative process

In [None]:
# Separate the classification target from the predictor columns
y_c = kopa_classification['LoanStatus360']
x_c = kopa_classification.drop(columns='LoanStatus360')

# First split: 70% of the rows for training, 30% held out
x_train_c, x_holdout_c, y_train_c, y_holdout_c = train_test_split(
    x_c, y_c, test_size=0.3, random_state=42
)

# Second split: divide the holdout in half into a test set and a validation set
x_test_c, x_val_c, y_test_c, y_val_c = train_test_split(
    x_holdout_c, y_holdout_c, test_size=0.5, random_state=42
)



In [None]:
# Split the regression data into predictors and the numeric target
x_r = kopa_regression.drop('AmountPaid360', axis = 1)
y_r = kopa_regression['AmountPaid360']

# First split: 70% of the rows for training, 30% held out
x_train_r, x_holdout_r, y_train_r, y_holdout_r = train_test_split(x_r, y_r, test_size = 0.3, random_state = 42)

# Second split: divide the holdout evenly into test and validation sets.
# test_size is 0.5 here (it was 0.3) so the regression split uses the same
# train/test/validation proportions as the classification split.
x_test_r, x_val_r, y_test_r, y_val_r = train_test_split(x_holdout_r, y_holdout_r, test_size = 0.5, random_state = 42)

Modelling

In [None]:
# Classification problem

# Instantiate the model.
# random_state is fixed (same seed as the data splits) because the tree
# shuffles candidate features at each split; without a seed the fitted tree,
# and therefore the report below, can differ between runs.
dt_classifier = DecisionTreeClassifier(random_state = 42)

# Train the model on the training split
dt_classifier.fit(x_train_c, y_train_c)

# Predict on the validation split
y_predictions = dt_classifier.predict(x_val_c)

# Evaluate: per-class precision / recall / F1 on the validation split
print(classification_report(y_val_c, y_predictions))

              precision    recall  f1-score   support

           0       0.96      0.93      0.95      1023
           1       0.50      0.62      0.56       109
           2       1.00      1.00      1.00       248

    accuracy                           0.92      1380
   macro avg       0.82      0.85      0.83      1380
weighted avg       0.93      0.92      0.93      1380



In [None]:
# Classification model: evaluation on the held-out test split
# Predict on the test features with the decision tree fitted in the previous cell
y_test_predictions = dt_classifier.predict(x_test_c)

# Evaluate: per-class precision / recall / F1 on the test split
print(classification_report(y_test_c, y_test_predictions))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1018
           1       0.63      0.63      0.63       141
           2       1.00      1.00      1.00       220

    accuracy                           0.92      1379
   macro avg       0.86      0.86      0.86      1379
weighted avg       0.92      0.92      0.92      1379

