## BINARY CLASSIFICATION WITH LOGISTIC REGRESSION

### 1.1 Import libraries 

In [None]:
# Importing Pandas, Numpy, Seaborn, Matplotlib, sqlalchemy, pymysql and getpass to protect our password

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
import pymysql
import getpass
# Prompt for the database password interactively so it is never written into the notebook
password=getpass.getpass()

### 1.2 Connection to sql 

In [None]:
# Connection string:
# The password is URL-escaped first: characters such as '@', ':' or '/' would
# otherwise be parsed as part of the URL structure and break the connection.

from urllib.parse import quote_plus

connection_string='mysql+pymysql://root:'+quote_plus(password)+'@localhost/bank'
engine=create_engine(connection_string)

In [None]:
# Query as a df:
# Loads loans with status 'A' or 'B' whose disposition type is 'OWNER', grouped per loan.
# Columns returned:
#   - nooftrans: number of distinct transactions on the loan's account
#   - ageindays: DATEDIFF between 1998-12-31 and the account date
#   - 95unemp / 96unemp: district columns A12 / A13
#   - crime95 / crime96: district columns A15 / A16
#   - loanamount, birth_number, type
#   - ratiopaid: (amount - payments) / amount, rounded to 2 decimals
# NOTE(review): the WHERE filter on dp.type removes rows without a matching disp record,
# so the left join on disp effectively behaves like an inner join.

df= pd.read_sql_query('''
select l.loan_id, l.status, count(distinct t.trans_id) as nooftrans, DATEDIFF(19981231, convert(a.date,date)) as ageindays, 
d.A12 as 95unemp, d.A13 as 96unemp, dp.type,
l.amount as loanamount, c.birth_number, d.A15 as crime95, d.A16 as crime96,
round((l.amount-l.payments)/l.amount,2) as ratiopaid
from loan l
left join trans t
using(account_id)
left join account a
using(account_id)
left join district d
on a.district_id = d.A1
left join disp dp
on a.account_id= dp.account_id 
left join client c
using(client_id)
WHERE l.status in ('A','B') AND dp.type = 'OWNER'
group by loan_id, l.amount, status, d.A12, d.A13, c.birth_number, d.A15, d.A16, DATEDIFF(19981231, convert(a.date,date)),
dp.type, round((l.amount-l.payments)/l.amount,2)
''', engine)

In [None]:
# Preview the first rows to confirm the query returned the expected columns

df.head()

###  2. EDA - Exploratory Data Analysis - get to know the data 

In [None]:
# Check the number of records and columns, non-null counts and data types:

df.info()

In [None]:
# Run descriptive statistics on the numerical columns:

df.describe()

In [None]:
# Check the data type of every column in a more compact view than df.info()

df.dtypes

# Looks like we could drop "loan_id" and "birth_number"...

In [None]:
# After a closer look at the unique values of "birth_number", it looks like it could actually add value...
# We could potentially extract gender and actual birthdates in date format; we will revisit this during cleaning and wrangling

df.birth_number.unique()

### 2.1 Histograms or boxplots

In [None]:
# Histogram of the number of transactions per loan, coloured by loan "status":

sns.displot(data=df, x='nooftrans', hue='status');

In [None]:
# Visualize "total loan amount" based on "status" with a Boxplot:
# (a bar plot would only draw one aggregate bar per status; the boxplot shows
# the median, the spread and the outliers of each group)

sns.boxplot(x='loanamount', y='status', data=df);

In [None]:
# Quick check to understand how many records of each status we have in our DataFrame

df['status'].value_counts()

# We can already see that the data set is very likely to be imbalanced

In [None]:
# Visualize the "age of account" with another Histogram.
# The query exposes this value under the alias "ageindays".

sns.displot(df['ageindays']);

In [None]:
# Visualising 95' and 96' unemployment to decide whether we want to drop one of the two...
# `fill=True` replaces the deprecated `shade=True`; labels plus a legend make
# the two overlaid curves distinguishable.

fig = sns.kdeplot(df['95unemp'], fill=True, color="r", label='95unemp')
fig = sns.kdeplot(df['96unemp'], fill=True, color="b", label='96unemp')
plt.legend()
plt.show()

# We could alternatively define a new column with the average of both.

### 2.2 Check for multicollinearity 

In [None]:
# Quick glance at multicollinearity using a table "snapshot" view.
# Only numeric columns are kept: recent pandas versions raise an error when
# corr() meets object columns such as "status" or "type".

correlation=df.select_dtypes(include=np.number).corr()
correlation

In [None]:
# Seaborn heatmap to visualise correlation in a more eye-friendly way.
# Only numeric columns are kept: recent pandas versions raise an error when
# corr() meets object columns such as "status" or "type".

corr_matrix=df.select_dtypes(include=np.number).corr(method='pearson')
fig, ax=plt.subplots(figsize =(10,8))
# Mask the upper triangle (diagonal included) so each pair is shown only once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
ax=sns.heatmap(corr_matrix, mask=mask, annot=True)
plt.show()

### 2.3 Cleaning and wrangling steps 

### Potential steps to follow when iterating:

 - Bucket any suitable fields into categories
 - Should we drop any columns ? (iterative process) - we could drop loan_id!
 - Create avg of criminality / unemployment rate?
 - Crime - divide population?
 - Change duration to object type / categorical - 12,24,36,48,72  
 - Same for operation field 
 - Drop highly correlated fields
 - Bring in any missing fields
 - Change unemployment into High - Mid - Low
 - Split the data into num and cat --- > diff options cleaning / scaling 
 - Feature engineering - take an existing column and make it more useful
 - Check for multicollinearity
 - Extract gender

In [None]:
# Remove "loan_id" (an identifier) and "type" (always 'OWNER' after the query's filter):
# neither column adds any value for the model

df.drop(columns=['loan_id', 'type'], inplace=True)

In [None]:
# Check that the columns were actually dropped

df.head()

### 2.4 Pre-processing 

 - label / encode categorical columns 
 - scale numerical columns

In [None]:
# Collect the object-typed (categorical) columns, i.e. "status", in their own DataFrame and preview it:

cat = df.select_dtypes(include=['object'])
cat.head()

In [None]:
# One-hot encode "status". With drop_first=True the first category is dropped,
# leaving a single "status_B" indicator column that is used as the label later on:

categorical=pd.get_dummies(cat,columns=['status'],drop_first=True)
categorical.head()

In [None]:
# Numerical scaling: import Normalizer function:

from sklearn.preprocessing import Normalizer

In [None]:
# Keep only the numeric columns as the feature DataFrame "X" and preview it:

X = df.select_dtypes(include=[np.number])
X.head()

In [None]:
# Rescale the numerical features with Normalizer.
# NOTE: Normalizer rescales each ROW (sample) to unit norm; it does not
# standardise the columns or make them Gaussian. For per-column scaling,
# StandardScaler or MinMaxScaler would be the tools to try instead.
# Scaling is optional with logistic regression but might help the model

scaler=Normalizer().fit(X)
scaled=scaler.transform(X)

# The transform returns a NumPy array, so we convert it back to a DataFrame,
# keeping the original column names and index so the rows stay aligned with the label
scaled_X=pd.DataFrame(scaled, columns=X.columns, index=X.index)
scaled_X.head()

In [None]:
# NOTE: if we had more categorical features than just the label (status), we would concatenate them here:
# X = np.concatenate((scaled_X, categorical), axis=1) --- This would bring the categorical data back together with the numerical data

### 2.5 Split off the dependent variable (label)

In [None]:
# y = dependent variable: the "status_B" indicator derived from "status"

y = categorical['status_B']

# The independent variables are the scaled features in scaled_X -- let's redefine X:

X = scaled_X

In [None]:
# Display our dependent variable "y" (the "status_B" indicator):

y

### 2.6 Train test split, get LOG REG model

In [None]:
# Import Logistic Regression model from Scikit learn:

from sklearn.linear_model import LogisticRegression

In [None]:
# Choose what share of the data is used for training the model and what share for testing.
# 30% is held out for testing; random_state makes the split reproducible.
# stratify=y keeps the class proportions of the (imbalanced) label the same in
# both splits, so the minority class is represented in the test set.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=40, stratify=y)

## 3. Apply model and train model 

In [None]:
# Fit a logistic regression on the training split.
# `multi_class` is no longer passed: it is deprecated in recent scikit-learn
# releases and has no effect for a binary target with the liblinear solver.
classification=LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [None]:
# add to the parameters as wished
# check scikit learn website -- logistic regression

### 3.1 Evaluate accuracy and test 

In [None]:
# Evaluate the model with the area under the ROC curve (AUC), not plain accuracy:

# Predicted class probabilities for the test set; column 1 is the probability of the positive class
probabilities=classification.predict_proba(X_test)
preds=probabilities[:,1]
import sklearn.metrics as metrics
# False/true positive rates at every decision threshold, then the area under that curve
fpr, tpr, threshold=metrics.roc_curve(y_test, preds)
roc_auc=metrics.auc(fpr,tpr)
print(roc_auc)

#### 3.1.1 Next steps

+ Visualise the quality of the predictions in several ways 

+ Also think about: is there something I could do to improve my model's accuracy? 

### 3.2 Visualising accuracy - ROC / AUC 

In [None]:
# ROC curve plot visualisation:

plt.title('Receiver Operating Characteristic')
# The format specifier is required: 'AUC' % roc_auc raises a TypeError
plt.plot(fpr,tpr, label='AUC = %0.2f' % roc_auc)
# Diagonal reference line = performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.ylabel('true positive rate')
plt.xlabel('false positive rate')
# Without a legend the AUC label would never be displayed
plt.legend(loc='lower right')
plt.show()

### 3.3 Visualising accuracy - Confusion Matrix

##### definitions 
+ tpr = true positive rate 
+ fpr = false positive rate

In [None]:
# Import the accuracy score metric from scikit-learn and define the variable "predictions"
# (the predicted class labels for the test set):

from sklearn.metrics import accuracy_score

predictions = classification.predict(X_test)

In [None]:
# Compute the accuracy and the confusion matrix, print both, and plot the matrix.
# `plot_confusion_matrix` has been removed from scikit-learn; ConfusionMatrixDisplay
# is its replacement.

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Only the last expression of a cell is echoed, so print the values explicitly
print(accuracy_score(y_test,predictions))
cm = confusion_matrix(y_test, predictions, labels=classification.classes_)
print(cm)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classification.classes_).plot()
plt.show()

### 3.4 Data is highly imbalanced

This is affecting the accuracy of our predictions.
- What can be done to resolve that?


+ option 1 - SMOTE 

+ option 2 - TOMEK LINKS 



In [None]:
# SMOTE -- 

In [None]:
# TOMEK LINKS -- 