# **Logistic Regression & Decision Tree Modelling**

Demonstrates the general lifecycle of machine learning modelling.

--- 

## 1. Import Libraries & Load Data

In [11]:
## EDA Standard Library

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats as ss

In [54]:
#ML Library

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [13]:
import warnings 
warnings.filterwarnings('ignore')

In [14]:
# Location of the bank-loan CSV. The original author's local path stays the
# default; set the BANKLOAN_CSV environment variable to point at a copy of the
# file elsewhere so the notebook can run on other machines without editing code.
DATA_PATH = os.environ.get(
    'BANKLOAN_CSV',
    '/Users/Dwika/My Projects/DATASETS/bankloan.csv',
)

# Load the dataset into a DataFrame
bank = pd.read_csv(DATA_PATH)

## 2. Data Cleaning

In [15]:
# Count the missing entries in every column (isnull is an alias of isna)
bank.isnull().sum()

age         0
ed          0
employ      0
address     0
income      0
debtinc     0
creddebt    0
othdebt     0
default     0
dtype: int64

OK - No missing values

In [16]:
# Preview the first five rows of the dataset
bank.head(n=5)

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


*Note*

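- default = 1 -> Galbay (Indonesian shorthand for *gagal bayar*, "failed to pay": the customer defaulted on the loan)
- default = 0 -> Not Galbay (the customer did not default)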

## 3. Train Test Split

In [50]:
# Separate the predictors from the target, then split into train and test sets
x = bank.drop(columns='default')  # every column except 'default' is a feature
y = bank['default']               # 'default' is the label to predict

xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    test_size=0.2,     # size of the test set; the training set takes the rest
    random_state=20,   # random seed
    stratify=y,        # keep the class proportions of y the same in both splits
)

## 4. Build Model

### 4.1 Logistic Regression Model

In [51]:
# Create a logistic regression with default settings and train it
# on the training split (xtrain, ytrain)

logreg = LogisticRegression()
logreg.fit(X=xtrain, y=ytrain)

In [52]:
# Predicted class labels for the held-out test features
logreg.predict(X=xtest)

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1])

In [53]:
# Score the logistic regression on the test split:
# per-class report first, then the overall accuracy
pred = logreg.predict(X=xtest)
report = classification_report(y_true=ytest, y_pred=pred)
print(report)
print(accuracy_score(y_true=ytest, y_pred=pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86       103
           1       0.63      0.51      0.57        37

    accuracy                           0.79       140
   macro avg       0.73      0.70      0.72       140
weighted avg       0.78      0.79      0.79       140

0.7928571428571428


### 4.2 Decision Tree Classifier Model

In [57]:
# Fit a decision tree on the training split and predict the held-out rows.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes the features when searching for splits; without a seed the
# fitted tree, and therefore its accuracy, can change from run to run.
dt = DecisionTreeClassifier(random_state=20)
dt.fit(xtrain, ytrain)  # train on xtrain, ytrain
pred_tree = dt.predict(xtest)


In [58]:
# Show the class labels the decision tree predicted for the test split
pred_tree

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1])

## **Comparing Accuracy Score**

In [61]:
# Compute both test-set accuracies first, then print them side by side
dt_accuracy = accuracy_score(ytest, pred_tree)
logreg_accuracy = accuracy_score(ytest, pred)

print(f'Accuracy score of DT        =  {dt_accuracy}')
print(f'Accuracy score of LogReg    =  {logreg_accuracy}')

Accuracy score of DT        =  0.7285714285714285
Accuracy score of LogReg    =  0.7928571428571428


With an accuracy score of 79.3%, higher than the Decision Tree's, the Logistic Regression model is the more appropriate model.