# IMPORT LIBRARIES

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import plotly.express as px

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler


import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

# READING DATA FROM SOURCE

In [None]:
# Load the customer table and key it by its CLIENT_ID column.
dfCustomers = pd.read_csv("customers.csv", sep = ",").set_index("CLIENT_ID")

In [None]:
dfCustomers.head()

In [None]:
# Load the transaction table and key it by its TRANS_ID column.
dfTransactions = pd.read_csv("transactions.csv", sep = ",").set_index("TRANS_ID")

In [None]:
dfTransactions.head()

In [None]:
# Load the district table and key it by its DISTRICT_ID column.
dfDistricts = pd.read_csv("districts.csv", sep = ",").set_index("DISTRICT_ID")

In [None]:
dfDistricts.head()

# DATA EXPLORATION

## Exploring Uniqueness

### dfCustomers

In [None]:
len(dfCustomers.index.unique()) == len(dfCustomers.index)

### dfDistricts

In [None]:
len(dfDistricts.index.unique()) == len(dfDistricts.index)

### dfTransactions

dfTransactions contains some duplicated TRANS_IDs.

In [None]:
len(dfTransactions.index.unique()) == len(dfTransactions.index)

There are 20000 rows which are duplicated.

In [None]:
dfTransactions[dfTransactions.index.duplicated(keep=False)]

In [None]:
dfTransactions.loc[3457056]

## Exploring Data Types

### dfCustomers

"LOAN" field should be either 1 or 0 by description. Such fields usually have an integer data type. However, it is stored as float64, which is a sign that this field may contain some unexpected data.

In [None]:
dfCustomers.info()

Confirming that each client has only 1 account id in dfCustomers dataset.

In [None]:
len(dfCustomers.index.unique()) == len(dfCustomers["ACCOUNT_ID"].unique())

### dfDistricts

"UNEMP_95" and "CRIME_95" fields are stored with the object data type. By description, "UNEMP_95" represents an unemployment ratio, which should be a float, and "CRIME_95" represents the number of committed crimes, which should be an integer. Both being stored as object is a sign that "UNEMP_95" and "CRIME_95" contain some unexpected data.

In [None]:
dfDistricts.info()

### dfTransactions

"ACCOUNT_ID" field's data type should be integer.

In [None]:
dfTransactions.info()

## Exploring Missing Data

### dfCustomers

Only field 'LOAN' includes NaN values in dfCustomers dataframe.

In [None]:
dfCustomers.columns[dfCustomers.isna().any()].tolist()

Checking the unique values of each field to see whether missing data is encoded in any way other than NaN. The unique values are sorted so that unusual placeholder values show up at the boundaries of the sorted array.

In [None]:
np.sort(dfCustomers.index.unique())

In [None]:
np.sort(dfCustomers["ACCOUNT_ID"].unique())

In [None]:
np.sort(dfCustomers["GENDER"].unique())

In [None]:
np.sort(dfCustomers["BIRTH_DT"].unique())

Confirming that each date value includes 8 characters.

In [None]:
dfCustomers["BIRTH_DT"].astype(str).str.len().unique()

In [None]:
np.sort(dfCustomers["ACTIVE"].unique())

In [None]:
np.sort(dfCustomers["LOAN"].unique())

In [None]:
np.sort(dfCustomers["DISTRICT_ID"].unique())

In [None]:
np.sort(dfCustomers["SET_SPLIT"].unique())

### dfDistricts

Looks like there is no field that contains NaN value in dfDistricts.

In [None]:
dfDistricts.columns[dfDistricts.isna().any()].tolist()

Checking the unique values of each field to see whether missing data is encoded in any way other than NaN. The unique values are sorted so that unusual placeholder values show up at the boundaries of the sorted array.

In [None]:
np.sort(dfDistricts.index.unique())

In [None]:
np.sort(dfDistricts["N_INHAB"].unique())

In [None]:
np.sort(dfDistricts["N_CITIES"].unique())

In [None]:
np.sort(dfDistricts["URBAN_RATIO"].unique())

In [None]:
np.sort(dfDistricts["AVG_SALARY"].unique())

"UNEMP_95" field contains value of question mark character '?'.

In [None]:
np.sort(dfDistricts["UNEMP_95"].unique())

In [None]:
np.sort(dfDistricts["UNEMP_96"].unique())

In [None]:
np.sort(dfDistricts["N_ENTR"].unique())

"CRIME_95" field contains value of question mark character '?'.

In [None]:
np.sort(dfDistricts["CRIME_95"].unique())

In [None]:
np.sort(dfDistricts["CRIME_96"].unique())

### dfTransactions

"ACCOUNT_ID" and "OPERATION" fields contain NaN values.

In [None]:
dfTransactions.columns[dfTransactions.isna().any()].tolist()

In [None]:
np.sort(dfTransactions.index.unique())

"ACCOUNT_ID" field contains NaN values.

In [None]:
np.sort(dfTransactions["ACCOUNT_ID"].unique())

"DATE" field doesn't contain NaN value but date format is not in DDMMYYY format for some values.

In [None]:
np.sort(dfTransactions["DATE"].unique())

There are invalid values of DD such as 40, 50, 41 etc.

In [None]:
 dfTransactions["DATE"].astype(str).str[:2].unique()

There are invalid values of MM such as 71, 81, 91 etc.

In [None]:
dfTransactions["DATE"].astype(str).str[2:4].unique()

When some accounts are checked, it is observed that the dates are not stored in DDMMYYY but in DDMMYYYY format. In addition, when the day of month is less than 10, it is written with only 1 digit instead of 2.

In [None]:
 dfTransactions[dfTransactions["DATE"].astype(str).str[:2] == "50"]

In [None]:
dfTransactions[dfTransactions["ACCOUNT_ID"] == 652]

In [None]:
dfTransactions["DATE"].astype(str).str[4:].unique()

In [None]:
np.sort(dfTransactions["AMOUNT"].unique())

In [None]:
np.sort(dfTransactions["BALANCE"].unique())

In [None]:
np.sort(dfTransactions["TYPE"].unique())

"OPERATION" field contains NaN values.

In [None]:
dfTransactions["OPERATION"].unique()

# PREPROCESSING

Data preprocessing is applied to copies of the datasets, since some fields may be good enough for exploratory analysis but not suitable for a predictive model. That's why preprocessing is applied only to the datasets that will be used in the predictive models.

In [None]:
# Work on independent copies so the raw frames stay untouched.
dfCustomersModel, dfTransactionsModel, dfDistrictsModel = (
    dfRaw.copy() for dfRaw in (dfCustomers, dfTransactions, dfDistricts)
)

## Remove Duplicated Rows

### TRANS_ID

One of the duplicated rows is kept. The rest are dropped.

In [None]:
# drop_duplicates() compares column values only and ignores the index, so two
# different TRANS_IDs that happen to share all field values would be collapsed
# into one row. Including TRANS_ID in the comparison removes only rows that are
# duplicated in full; the first occurrence is kept.
dfTransactionsModel = (
    dfTransactionsModel.reset_index()
    .drop_duplicates(keep = "first")
    .set_index("TRANS_ID")
)

Still there are duplicated rows in terms of index but not for other values. The reason of this situation is that at least one of the duplicated rows contains NaN value.

In [None]:
len(dfTransactionsModel.index.unique()) == len(dfTransactionsModel.index)

In [None]:
# All rows whose TRANS_ID occurs more than once (every occurrence is kept).
dfDuplicatedTransactions = dfTransactionsModel.loc[
    dfTransactionsModel.index.duplicated(keep = False)
]

In [None]:
dfDuplicatedTransactions.sort_index()

We can fill missing values by copying from its duplicated index.

In [None]:
# For every TRANS_ID that still appears more than once, fill a column's missing
# entries from the sibling row(s). The fill happens only when exactly one
# non-missing value exists, so there is never a choice between candidates.
aDuplicatedTransIds = dfDuplicatedTransactions.index.unique()

for iTransId in aDuplicatedTransIds:
    for sColumnName in dfTransactionsModel.columns:
        # A duplicated label makes .loc return a Series with one entry per row.
        aColumnValues = dfTransactionsModel.loc[iTransId, sColumnName]
        aNonMissingValues = aColumnValues.dropna()

        bHasMissingValue = len(aNonMissingValues) < len(aColumnValues)
        if bHasMissingValue and len(aNonMissingValues) == 1:
            # Assign the scalar rather than the one-element Series so the fill
            # does not rely on index alignment against a duplicated label.
            dfTransactionsModel.loc[iTransId, sColumnName] = aNonMissingValues.iloc[0]
            
        
        


After filling missing values from their duplicated rows, dropping duplicates is applied once again. It is observed that there are no more duplicated rows.

In [None]:
# drop_duplicates() ignores the index, so TRANS_ID is moved into the columns
# for the comparison: only rows equal in every field *and* in TRANS_ID are
# removed, keeping the first occurrence.
dfTransactionsModel = (
    dfTransactionsModel.reset_index()
    .drop_duplicates(keep = "first")
    .set_index("TRANS_ID")
)

In [None]:
len(dfTransactionsModel.index.unique()) == len(dfTransactionsModel.index)

## Converting Date Formats

### BIRTH_DT

In [None]:
# Parse BIRTH_DT, written as year-month-day digits, into datetimes.
aBirthDates = dfCustomersModel["BIRTH_DT"]
dfCustomersModel["BIRTH_DT"] = pd.to_datetime(aBirthDates, format = "%Y%m%d")

In [None]:
np.sort(dfCustomersModel["BIRTH_DT"].unique())

In [None]:
dfCustomersModel.info()

### DATE

"DATE" field should be in DDMMYYY based on description however, it is in DMMYYYY format. For the days that are less than 10, day of month is represented just 1 digit. 

 "0" character is added at the beginning of the values where day of month is represented with single value. 

In [None]:
# Values with only 7 characters have a single-digit day; prefix them with "0"
# so every date string has the same 8-character layout.
dfTransactionsModel["DATE"] = dfTransactionsModel["DATE"].astype(str)
adfTransactionsWith7DigitsDates = dfTransactionsModel.loc[dfTransactionsModel["DATE"].str.len() == 7]
aShortDateIds = adfTransactionsWith7DigitsDates.index
aPaddedDates = "0" + dfTransactionsModel.loc[aShortDateIds, "DATE"]
dfTransactionsModel.loc[aShortDateIds, "DATE"] = aPaddedDates

In [None]:
# Parse the day-month-year strings into datetimes.
aDateStrings = dfTransactionsModel["DATE"]
dfTransactionsModel["DATE"] = pd.to_datetime(aDateStrings, format = "%d%m%Y")

In [None]:
np.sort(dfTransactionsModel["DATE"].unique())

In [None]:
dfTransactionsModel.info()

## Missing Data Handling

### LOAN

There are 50 customers which have NaN value on their "LOAN" field. Empty "LOAN" data can't be used for training or testing purposes. That's why, the customers which don't have "LOAN" information are dropped from dfCustomersModel and dfTransactionsModel datasets.

In [None]:
# Customers without a LOAN label cannot be used for training or testing.
dfCustomersMissingLoan = dfCustomersModel[dfCustomersModel["LOAN"].isna()]

# Select their transactions directly on the working copy by ACCOUNT_ID. The
# previous version looked the TRANS_IDs up through a merge on the raw table,
# and drop() raises KeyError as soon as one of those labels is absent from
# the frame being modified.
aMaskMissingLoan = dfTransactionsModel["ACCOUNT_ID"].isin(dfCustomersMissingLoan["ACCOUNT_ID"])
dfTransactionsMissingLoan = dfTransactionsModel[aMaskMissingLoan]

dfTransactionsModel.drop(dfTransactionsMissingLoan.index, inplace = True)
dfCustomersModel.drop(dfCustomersMissingLoan.index, inplace = True)

In [None]:
# With the NaN rows removed, LOAN can be stored as a 64-bit integer.
aLoanFlags = dfCustomersModel["LOAN"]
dfCustomersModel["LOAN"] = aLoanFlags.astype("int64")

### UNEMP_95 & CRIME_95

There is 1 district (DISTRICT_ID=69) whose "UNEMP_95" and "CRIME_95" fields are empty.

In [None]:
dfDistricts[dfDistricts["UNEMP_95"] == "?"]

In [None]:
dfDistricts[dfDistricts["CRIME_95"] == "?"]

A predictive model is used to produce a value in place of the question mark. As mentioned above, "UNEMP_95" and "CRIME_95" are stored as object rather than as numeric types. The reason is the question mark character in the row with DISTRICT_ID = 69.
This row is temporarily dropped from the dataset so that "UNEMP_95" and "CRIME_95" can be converted to a numeric data type and used in numerical analysis. After the missing values are predicted, DISTRICT_ID = 69 will be appended back to dfDistrictsModel.

In [None]:
# Set aside the districts whose 1995 figures are "?" so the remaining values
# can be converted to numbers.
aMaskUnknown95 = dfDistrictsModel["UNEMP_95"].eq("?") | dfDistrictsModel["CRIME_95"].eq("?")
dfMissingRows = dfDistrictsModel[aMaskUnknown95]
dfDistrictsModel.drop(dfMissingRows.index, inplace = True)

for sNumericColumn in ("UNEMP_95", "CRIME_95"):
    dfDistrictsModel[sNumericColumn] = pd.to_numeric(dfDistrictsModel[sNumericColumn])

dfDistrictsModel.info()

A heatmap is created to understand if there are any strong relationships between features.

In [None]:
# Absolute pairwise correlations of the district features. Numeric columns are
# selected explicitly: how corr() treats non-numeric columns differs between
# pandas versions (silently dropped in older releases, an error in 2.x).
plt.figure(figsize=(20,10))
dfDistrictCorrelations = dfDistrictsModel.select_dtypes(include = "number").corr().abs()
sns.heatmap(dfDistrictCorrelations, vmin = 0, vmax = 1, cmap = "Greens", linewidths=0.5, annot=True)

It is observed that linear relationship between "UNEMP_95" and "UNEMP_96" are strong. That's why, a simple linear regression model can be used to predict missing value.

In [None]:
# Fit UNEMP_95 as a linear function of UNEMP_96 on the complete districts.
aX = dfDistrictsModel[["UNEMP_96"]].to_numpy()
aY = dfDistrictsModel[["UNEMP_95"]].to_numpy()

oLinRegModel = LinearRegression().fit(aX, aY)

# Predict UNEMP_95 for the raw rows where it is "?", from their UNEMP_96.
aMaskUnknownUnemp95 = dfDistricts["UNEMP_95"] == "?"
aUnemp95ToPredict = dfDistricts.loc[aMaskUnknownUnemp95, ["UNEMP_96"]].to_numpy()

aUnemp95Predicted = oLinRegModel.predict(aUnemp95ToPredict)

There is a linear relationship between "CRIME_95", "CRIME_96" and "N_INHAB" fields. That's why, a simple linear regression model can be used to predict missing value.

In [None]:
# Fit CRIME_95 as a linear function of CRIME_96 and N_INHAB on the complete districts.
aCrimeFeatures = ["CRIME_96", "N_INHAB"]
aX = dfDistrictsModel[aCrimeFeatures].to_numpy()
aY = dfDistrictsModel[["CRIME_95"]].to_numpy()

oLinRegModel = LinearRegression().fit(aX, aY)

# Predict CRIME_95 for the raw rows where it is "?".
aMaskUnknownCrime95 = dfDistricts["CRIME_95"] == "?"
aCrime95ToPredict = dfDistricts.loc[aMaskUnknownCrime95, aCrimeFeatures].to_numpy()

aCrime95Predicted = oLinRegModel.predict(aCrime95ToPredict)

In order to replace the question marks with predicted values, the missing rows are added back to dfDistrictsModel.

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and keeps the DISTRICT_ID index of both frames.
dfDistrictsModel = pd.concat([dfDistrictsModel, dfMissingRows])
dfDistrictsModel.sort_index(inplace=True)

Predicted values are added to the missing rows.

In [None]:
# Overwrite the "?" placeholders of UNEMP_95 with the regression output.
dfMaskMissingUnemp95s = dfDistrictsModel["UNEMP_95"].eq("?")
dfDistrictsModel.loc[dfMaskMissingUnemp95s, ["UNEMP_95"]] = aUnemp95Predicted

In [None]:
# Overwrite the "?" placeholders of CRIME_95 with the regression output.
dfMaskMissingCrime95s = dfDistrictsModel["CRIME_95"].eq("?")
dfDistrictsModel.loc[dfMaskMissingCrime95s, ["CRIME_95"]] = aCrime95Predicted

Now those columns can be converted to numerical type.

In [None]:
dfDistrictsModel["UNEMP_95"] = dfDistrictsModel["UNEMP_95"].astype(np.float64)
# The regression output is fractional; casting it straight to int64 truncates
# toward zero, so round to the nearest whole count before the integer cast.
dfDistrictsModel["CRIME_95"] = dfDistrictsModel["CRIME_95"].astype(np.float64).round().astype(np.int64)

dfDistrictsModel.info()

### ACCOUNT_ID

There are 4915 rows that don't have value in "ACCOUNT_ID" field. This is not a big amount for this database. That's why, those rows are dropped from dfTransactionsModel.

In [None]:
# Transactions that carry no ACCOUNT_ID.
aMaskNoAccount = dfTransactionsModel["ACCOUNT_ID"].isna()
dfMissingAccountIds = dfTransactionsModel.loc[aMaskNoAccount]

In [None]:
dfMissingAccountIds

In [None]:
# Remove the transactions without an ACCOUNT_ID from the working copy.
aTransIdsToDrop = dfMissingAccountIds.index
dfTransactionsModel.drop(index = aTransIdsToDrop, inplace = True)

"ACCOUNT_ID" field is converted to integer format to be aligned with dfCustomers's "ACCOUNT_ID" field.

In [None]:
# No NaN is left in ACCOUNT_ID, so it can be stored as a 64-bit integer.
aAccountIds = dfTransactionsModel["ACCOUNT_ID"]
dfTransactionsModel["ACCOUNT_ID"] = aAccountIds.astype("int64")

### OPERATION

In [None]:
# Transactions whose OPERATION is missing.
aMaskNoOperation = dfTransactionsModel["OPERATION"].isna()
dfTransactionsMissingOperation = dfTransactionsModel.loc[aMaskNoOperation]

In [None]:
dfTransactionsMissingOperation

"TYPE" field of all of the missing values are "CREDIT". When all data is checked, "CREDIT" type can have 2 possible "OPERATION":

1. COLLECTION_FROM_OTHER_BANK
or
2. CREDIT_IN_CASH

In [None]:
dfTransactionsMissingOperation["TYPE"].unique()

In [None]:
dfTransactionsModel[["TYPE", "OPERATION", "AMOUNT"]].groupby(["TYPE", "OPERATION"]).agg(["count", "min", "max"])

A decision tree classifier can be used to predict the missing "OPERATION" values. In order to train the classifier, it is enough to use only the transactions that have the "CREDIT" type. The "TYPE" field doesn't need to be an input feature since it is the same for all of the missing data.

In [None]:
# CREDIT transactions whose OPERATION is known.
dfTransactionsNonMissingOperation = (
    dfTransactionsModel
    .drop(dfTransactionsMissingOperation.index)
    .loc[lambda dfRows: dfRows["TYPE"] == "CREDIT"]
)

In [None]:
# Feature frame for all CREDIT transactions: TYPE, OPERATION and DATE are
# removed and the date is replaced by its year / month / day parts.
dfCreditRows = dfTransactionsModel[dfTransactionsModel["TYPE"] == "CREDIT"]
dfX = dfCreditRows.drop(columns = ["TYPE", "OPERATION", "DATE"])

aCreditDates = dfCreditRows["DATE"].dt
dfX["TRANSACTION_YEAR"] = aCreditDates.year
dfX["TRANSACTION_MONTH"] = aCreditDates.month
dfX["TRANSACTION_DAY"] = aCreditDates.day

In [None]:
dfX

In [None]:
# One indicator column per OPERATION value, for the CREDIT transactions.
aCreditOperations = dfTransactionsModel.loc[dfTransactionsModel["TYPE"] == "CREDIT", "OPERATION"]
dfY = pd.get_dummies(aCreditOperations)

In [None]:
dfY

In [None]:
# 70/30 split of the CREDIT rows with a known OPERATION (seeded shuffle).
aLabelledTransIds = dfTransactionsNonMissingOperation.index
X_train, X_test, y_train, y_test = train_test_split(
    dfX.loc[aLabelledTransIds],
    dfY.loc[aLabelledTransIds],
    test_size = 0.3,
    shuffle = True,
    random_state = 1)

In [None]:
# Train the tree on the labelled rows, then predict the hold-out rows and
# give the predictions the same index and columns as y_test.
oDecTreeModel = DecisionTreeClassifier().fit(X_train, y_train)

y_pred = pd.DataFrame(
    oDecTreeModel.predict(X_test),
    index = y_test.index,
    columns = y_test.columns)

Both classes have good accuracy and f1 scores, meaning that the decision tree model may be able to predict both true positives and true negatives.

In [None]:
# Overall report first, then the accuracy of each OPERATION column (2 decimals).
print(classification_report(y_test, y_pred, target_names=dfY.columns))

for sClass in dfY.columns:
    fClassAccuracy = round(metrics.accuracy_score(y_test[sClass], y_pred[sClass]), 2)
    print(f"Accuracy of : {sClass}: {fClassAccuracy}")

All of the missing values are predicted as "CREDIT_IN_CASH" value.

In [None]:
# Predict the indicator columns for the rows without an OPERATION, take the
# column with the largest value per row as the label, and write it back.
aMissingOperationIds = dfTransactionsMissingOperation.index
aPredictionsForMissingOperations = oDecTreeModel.predict(dfX.loc[aMissingOperationIds])
dfPredictionsForMissingOperations = pd.DataFrame(
    data = aPredictionsForMissingOperations,
    index = aMissingOperationIds,
    columns = dfY.columns)

aPredictedOperations = dfPredictionsForMissingOperations.idxmax(axis=1)

dfTransactionsModel.loc[aMissingOperationIds, "OPERATION"] = aPredictedOperations

In [None]:
aPredictedOperations.unique()

# DATA ANALYSIS

## Time Series Analysis

In [None]:
# Aggregate AMOUNT per calendar date: "count" is the number of transactions on
# that date and "mean" is their average amount. Only the AMOUNT aggregates are
# kept before the index is reset.
dfAnalysis = dfTransactionsModel[["DATE", "AMOUNT"]].groupby(["DATE"], as_index = False).agg(["count", "mean"])["AMOUNT"].reset_index()

In [None]:
dfAnalysis

1. There is a seasonality in average amount of payment. 
2. Behavior of average payment doesn't change much after 1994. But there is a high deviation on average payment in 1993's 1st half. 
3. On every January, average payment goes down.

In [None]:
plt.figure(figsize=(20,10))
sns.lineplot(data = dfAnalysis, 
             x = "DATE", 
             y = "mean")

In some dates, average "AMOUNT" is less than 1k. When year, 1998 is checked, it is observed that those are the end dates of each month.

In [None]:
dfAnalysis[(dfAnalysis["mean"] < 1000) & (dfAnalysis["DATE"].dt.year == 1998)]

1. There is an upwards-trend for the number of the payments. 
2. At the last day of each month, number of payment increases.
3. During 5th-15th of each month, number of payments are higher than other days (except latest day of month).

In [None]:
plt.figure(figsize=(20,10))
sns.lineplot(data = dfAnalysis, 
             x = "DATE", 
             y = "count")

In [None]:
plt.figure(figsize=(20,10))
sns.lineplot(data = dfAnalysis[(dfAnalysis["DATE"] > "1996-03-30") & (dfAnalysis["DATE"] < "1996-07-01")], 
             x = "DATE", 
             y = "count")

## Customer Profile Analysis

In [None]:
# Row masks for the year 1996 and for the two transaction types.
dfMaskTransaction1996 = dfTransactionsModel["DATE"].dt.year.eq(1996)
dfMaskTransctionsCredit = dfTransactionsModel["TYPE"].eq("CREDIT")
dfMaskTransctionsWithdrawal = dfTransactionsModel["TYPE"].eq("WITHDRAWAL")

# 1996 transactions: all of them, credits only, withdrawals only.
dfTransactions1996 = dfTransactionsModel.loc[dfMaskTransaction1996]
dfTransactionsCredit1996 = dfTransactionsModel.loc[dfMaskTransaction1996 & dfMaskTransctionsCredit]
dfTransactionsWithdrawal1996 = dfTransactionsModel.loc[dfMaskTransaction1996 & dfMaskTransctionsWithdrawal]

In [None]:
# Average BALANCE per account over the 1996 transactions.
dfBalances1996 = dfTransactions1996.groupby("ACCOUNT_ID")[["BALANCE"]].agg(["mean"])

In [None]:
# Per account: number, total and average of the 1996 CREDIT amounts.
dfCredits1996 = dfTransactionsCredit1996[["ACCOUNT_ID", "AMOUNT"]].groupby(["ACCOUNT_ID"]).agg(["count", "sum", "mean"])

# Relabel the top column level. The flattened column names built from it are
# referenced later (e.g. "CREDIT AMAOUNT_sum" in the age scatter plot), so the
# spelling of the label must stay in sync with those references.
dfCredits1996.columns = dfCredits1996.columns.set_levels(["CREDIT AMAOUNT"], level = 0)

In [None]:
# Per account: number, total and average of the 1996 WITHDRAWAL amounts.
dfWithdrawals1996 = dfTransactionsWithdrawal1996[["ACCOUNT_ID", "AMOUNT"]].groupby(["ACCOUNT_ID"]).agg(["count","sum","mean"])

# Relabel the top column level so these columns stay distinguishable from the
# credit aggregates once the frames are joined.
dfWithdrawals1996.columns = dfWithdrawals1996.columns.set_levels(["WITHDRAWAL AMAOUNT"], level = 0)

In [None]:
# Attach the credit and withdrawal aggregates to the balance aggregates
# (accounts without credits/withdrawals are kept), then flatten the two-level
# column labels into "<top>_<statistic>".
dfTransactionSummary1996 = (
    dfBalances1996
    .join(dfCredits1996, how = "left")
    .join(dfWithdrawals1996, how = "left")
)
dfTransactionSummary1996.columns = [
    f"{sTopLabel}_{sStatistic}" for sTopLabel, sStatistic in dfTransactionSummary1996.columns
]
dfTransactionSummary1996

In [None]:
# Customers joined with their 1996 transaction summary and district features.
dfAnalysis = (
    dfCustomersModel.reset_index()
    .merge(dfTransactionSummary1996, how = "inner", on = "ACCOUNT_ID")
    .merge(dfDistrictsModel, how = "inner", on = "DISTRICT_ID")
)

# Age in 1996, derived from the birth year.
dfAnalysis["CUSTOMER_AGE"] = 1996 - dfAnalysis["BIRTH_DT"].dt.year

# Identifier columns and columns not needed for the profile plots are removed.
aColumnsToDrop = ["CLIENT_ID", "DISTRICT_ID", "ACCOUNT_ID", "BIRTH_DT", "SET_SPLIT", "CRIME_95", "UNEMP_95", "ACTIVE"]
dfAnalysis = dfAnalysis.drop(columns = aColumnsToDrop)

In [None]:
dfAnalysis.info()

There is no linear relationship between district features v.s. average&sum transaction quantities. That's why, district related features are excluded from dfAnalysis.

In [None]:
# Absolute pairwise correlations rounded to one decimal. Numeric columns are
# selected explicitly: dfAnalysis still contains GENDER at this point, and how
# corr() treats non-numeric columns differs between pandas versions (silently
# dropped in older releases, an error in 2.x).
plt.figure(figsize=(20,10))
dfProfileCorrelations = dfAnalysis.select_dtypes(include = "number").corr().abs().round(1)
sns.heatmap(dfProfileCorrelations, vmin=0, vmax=1,  cmap = "Greens", linewidths=0.5, annot=True)

In [None]:
# Remove the district-level features from the profile frame.
aDistrictFeatures = ['N_INHAB','N_CITIES', 'URBAN_RATIO', 'AVG_SALARY', 'UNEMP_96', 'N_ENTR','CRIME_96']
dfAnalysis = dfAnalysis.drop(columns = aDistrictFeatures)

BALANCE_mean distribution plot: 
1. Average balances of the customers who have loan is similar to the ones who didn't have loan. 
2. Number of customers who have loan are less than the the ones who don't have loan.

BALANCE_mean vs. CREDIT_AMOUNT_sum: 
1. The customers who took loan has similar CREDIT_AMOUNT_sum to their BALANCE_mean.
2. The customers that didn't take loan has relatively smaller CREDIT_AMOUNT_sum.

BALANCE_mean vs. CUSTOMER_AGE: 
1. The customers that are under 20: 
    1. They don't have any loan almost. 
2. The customers that are between 20-40:
    1. mainly the customers that have over 60k balance have loan. 
3. The customers between 40-60 ages:
    1. loan is distributed more homogeneously than customers that are in age 20-40.
4. The customers over 60:
    1. They don't have loan. 
    2. They have almost half balance than other customer profiles. 

CREDIT_AMOUNT_count distribution plot: 
1. Plot has 3 hills. It is a sign that there are clusters based on CREDIT_AMOUNT_count. 
2. CREDIT_AMOUNT_count can be categorized: 
    1. less than 20 transactions
    2. 20-40 transactions
    3. 40+ transactions

CREDIT_AMOUNT_count vs. CUSTOMER_AGE: 
1. Customers over 60, have half number of credits compared to other customers.

CREDIT_AMOUNT_sum distribution plot: 
1. Customers that have less than 500k,
    1. number of the customers that don't have loan is more than the other ones. 
2. Customers that have more than 500k, 
    1. CREDIT_AMOUNT_sum have almost same amount of customers that are loaned and not loaned. 
3. Plot has 3 hills. We can categorize them as:
    1. less than 200k
    2. 200k-500k
    3. 500k+

CREDIT_AMOUNT_sum vs. WITHDRAWAL_AMOUNT_sum:
1. There is a strong linear relationship between them regardless loan status.

CREDIT_AMOUNT_sum vs. CUSTOMER_AGE: 
1. The customers over 60
    1. have 4 times less CREDIT_AMOUNT_sum than the other customers. 
    2. This ratio was 2 about CREDIT_AMOUNT_count and BALANCE_mean. 
2. Customers that are less than 60 years old
    1. customers that have CREDIT_AMOUNT_sum  more than 200k, 
        1. mostly have loan 
    2. customers that have CREDIT_AMOUNT_sum less than 200k,
        1. mostly don't have loan.

WITHDRAWAL_AMOUNT_count distribution plot: 
1. Customers that don't have loan withdraw more times than the ones that have loan. 
2. There are 2 hills on distribution plot. WITHDRAWAL_AMOUNT_count can be categorized as:
    1. less than 25 transactions
    2. 25+ transactions
3. Customers that have less than 500k transactions,
    1. Number of non-loaned customers are less than loaned customers.
4. Customers that have more than 500k transactions,
    1. Number of loaned and non-loaned customers are close to each other.

CUSTOMER_AGE distribution plot:
1. Based on the hills of plot, CUSTOMER_AGE can be categorized as follows:
    1. Less than 20 age
    2. 20-40 age
    3. 40-60 age
    4. 60+ age.

In [None]:
# sns.pairplot is a figure-level function: it creates and manages its own
# figure, so a preceding plt.figure(...) only leaves an empty figure behind.
# Pairwise relationships of the profile features, coloured by LOAN.
sns.pairplot(
    data=dfAnalysis,
    hue = "LOAN"
)

Gender doesn't matter on customer behavior. For example, CREDIT_AMOUNT_sum has similar pattern for both male and female customers.

In [None]:
# sns.relplot is a figure-level function: it creates its own figure (one panel
# per GENDER here), so a preceding plt.figure(...) only leaves an empty figure.
sns.relplot(
    data=dfAnalysis, 
    x = "CUSTOMER_AGE", 
    y = "CREDIT AMAOUNT_sum", 
    hue = "LOAN",
    col="GENDER", 
    kind="scatter"
)

## Geographical District Analysis

In [None]:
# Keep only the 1996 transactions ...
dfAnalysis = dfTransactionsModel[dfTransactionsModel["DATE"].dt.year == 1996]
# ... and join customers onto *that* selection. The merge previously started
# again from dfTransactionsModel, which silently discarded the 1996 filter.
dfAnalysis = dfAnalysis.reset_index().merge(dfCustomersModel.reset_index(), on = "ACCOUNT_ID", how = "inner").set_index("TRANS_ID")
# Add the district features of each customer's district.
dfAnalysis = dfAnalysis.reset_index().merge(dfDistrictsModel.reset_index(), on = "DISTRICT_ID", how = "inner").set_index("TRANS_ID")
# Age in 1996, derived from the birth year.
dfAnalysis["CUSTOMER_AGE"] = 1996 - dfAnalysis["BIRTH_DT"].dt.year

In [None]:
# Remove the columns that the district plots do not use.
aUnusedColumns = ["DATE", "GENDER", "BIRTH_DT", "ACTIVE", "SET_SPLIT", "UNEMP_95", "CRIME_95"]
dfAnalysis = dfAnalysis.drop(columns = aUnusedColumns)

In [None]:
plt.figure(figsize=(20,10))
sns.scatterplot(data = dfAnalysis[dfAnalysis["TYPE"] == "CREDIT"], x = "DISTRICT_ID", y = "AMOUNT", hue = "OPERATION")

In [None]:
plt.figure(figsize=(20,10))
sns.scatterplot(data = dfAnalysis[dfAnalysis["TYPE"] == "WITHDRAWAL"], x = "DISTRICT_ID", y = "AMOUNT", hue = "OPERATION")

# PREDICTIVE MODEL

In [None]:
# One row per account and one column per OPERATION holding the aggregated
# AMOUNT (pivot_table's default aggregation); combinations that never occur
# are set to 0.
dfAnalysis = dfTransactionsModel.pivot_table(
    index = "ACCOUNT_ID",
    columns = "OPERATION",
    values = "AMOUNT").fillna(0)

# Number of transactions and average balance per account.
dfTransacitonSummary = dfTransactionsModel.groupby("ACCOUNT_ID")["BALANCE"].agg(["count", "mean"])
dfTransacitonSummary.columns = ["NUMBER_OF_TRANSACTIONS", "BALANCE_AVERAGE"]

dfAnalysis = dfAnalysis.merge(dfTransacitonSummary, on = "ACCOUNT_ID", how = "inner")

In [None]:
# Attach the customer attributes of each account and derive the age in 1996.
dfAnalysis = dfAnalysis.merge(right = dfCustomersModel, on = "ACCOUNT_ID", how = "inner")
aBirthYears = dfAnalysis["BIRTH_DT"].dt.year
dfAnalysis["CUSTOMER_AGE"] = 1996 - aBirthYears

In [None]:
# Attach the features of each customer's district.
dfAnalysis = dfAnalysis.merge(right = dfDistrictsModel, on = "DISTRICT_ID", how = "inner")

In [None]:
# Remove identifiers and the columns that are not used as model inputs.
aNonFeatureColumns = ["ACCOUNT_ID", "GENDER", "BIRTH_DT", "DISTRICT_ID"]
dfAnalysis = dfAnalysis.drop(columns = aNonFeatureColumns)

## Split Dataset

In [None]:
# Split by the predefined SET_SPLIT marker; LOAN is the target and every
# other column except SET_SPLIT is a feature.
aIsTrainRow = dfAnalysis["SET_SPLIT"] == "TRAIN"
aIsTestRow = dfAnalysis["SET_SPLIT"] == "TEST"
aFeatureColumns = dfAnalysis.columns.drop(["SET_SPLIT", "LOAN"])

dfTrainX = dfAnalysis.loc[aIsTrainRow, aFeatureColumns]
dfTrainY = dfAnalysis.loc[aIsTrainRow, "LOAN"]
dfTestX = dfAnalysis.loc[aIsTestRow, aFeatureColumns]
dfTestY = dfAnalysis.loc[aIsTestRow, "LOAN"]

## Oversample Imbalance Dataset

In [None]:
# Balance the training classes: every class smaller than the largest one is
# topped up with rows sampled (with replacement) from that same class.
dfCombinations = dfTrainY
dfCombinationsStats = dfCombinations.value_counts()
iMaxAmount = dfCombinationsStats.max()

aBalancedX = [dfTrainX]
aBalancedY = [dfTrainY]

for sCombination, iAmount in dfCombinationsStats.items():
    iSamplesNeeded = iMaxAmount - iAmount
    if iSamplesNeeded == 0:
        continue

    # Sample from the original labels and look the rows up in the original
    # frames. Appending inside the loop with ignore_index=True re-numbered the
    # rows, so with more than two classes later lookups used stale labels.
    # random_state makes the resampling reproducible.
    dfSampledIndex = dfCombinations[dfCombinations == sCombination].sample(
        iSamplesNeeded, replace = True, random_state = 1).index

    aBalancedX.append(dfTrainX.loc[dfSampledIndex])
    aBalancedY.append(dfTrainY.loc[dfSampledIndex])

# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat is
# the supported equivalent.
dfTrainX = pd.concat(aBalancedX, ignore_index = True)
dfTrainY = pd.concat(aBalancedY, ignore_index = True)

dfTrainX,dfTrainY = shuffle(dfTrainX,dfTrainY,random_state=1 )

In [None]:
dfTrainY.value_counts()

## Logistic Regression

### Train

In [None]:
# Logistic regression on the balanced training set; the iteration cap is
# raised to 10000.
oLogRegModel = LogisticRegression(max_iter = 10000)
oLogRegModel.fit(X = dfTrainX, y = dfTrainY)

In [None]:
# Report on the training set.
aPredictedLogReg = oLogRegModel.predict(dfTrainX)
dfPredictedLogReg = pd.DataFrame({"LOAN": aPredictedLogReg}, index = dfTrainX.index)
sTrainReport = classification_report(dfTrainY, aPredictedLogReg)
print(sTrainReport)

### Test

In [None]:
# Class probabilities and hard predictions on the test set, then the report.
aPredictedLogRegProbability = oLogRegModel.predict_proba(dfTestX)
aPredictedLogReg = oLogRegModel.predict(dfTestX)
dfPredictedLogReg = pd.DataFrame({"LOAN": aPredictedLogReg}, index = dfTestX.index)
sTestReport = classification_report(dfTestY, aPredictedLogReg)
print(sTestReport)

## Decision Tree Classifier

### Train

In [None]:
# Decision tree with default settings on the balanced training set.
oDecTreeModel = DecisionTreeClassifier()
oDecTreeModel.fit(X = dfTrainX, y = dfTrainY)

In [None]:
# Report on the training set.
aPredictedDecTree = oDecTreeModel.predict(dfTrainX)
dfPredictedDecTree = pd.DataFrame({"LOAN": aPredictedDecTree}, index = dfTrainX.index)
sTrainReport = classification_report(dfTrainY, dfPredictedDecTree)
print(sTrainReport)

### Test

In [None]:
# Class probabilities and hard predictions on the test set, then the report.
aPredictedDecTreeProbability = oDecTreeModel.predict_proba(dfTestX)
aPredictedDecTree = oDecTreeModel.predict(dfTestX)
dfPredictedDecTree = pd.DataFrame({"LOAN": aPredictedDecTree}, index = dfTestX.index)
sTestReport = classification_report(dfTestY, dfPredictedDecTree)
print(sTestReport)

## Random Forest Classification

### Train

In [None]:
# Random forest with default settings; the target is passed as a flat array.
oRandForestModel = RandomForestClassifier()
aTrainTargets = dfTrainY.values.ravel()
oRandForestModel.fit(dfTrainX, aTrainTargets)

In [None]:
# Report on the training set.
aPredictedRandForest = oRandForestModel.predict(dfTrainX)
dfPredictedRandForest = pd.DataFrame({"LOAN": aPredictedRandForest}, index = dfTrainX.index)
sTrainReport = classification_report(dfTrainY, dfPredictedRandForest)
print(sTrainReport)

### Test

In [None]:
# Class probabilities and hard predictions on the test set, then the report.
aPredictedRandForestProbability = oRandForestModel.predict_proba(dfTestX)
aPredictedRandForest = oRandForestModel.predict(dfTestX)
dfPredictedRandForest = pd.DataFrame({"LOAN": aPredictedRandForest}, index = dfTestX.index)
sTestReport = classification_report(dfTestY, dfPredictedRandForest)
print(sTestReport)

## Preprocessing for Deep Learning Models

In [None]:
# Hold out 30% of the training rows as a validation set (seeded shuffle).
# NOTE(review): the training set was oversampled with replacement before this
# split, so copies of the same row can end up in both parts - confirm that
# this is acceptable for early stopping.
dfTrainX, dfValidationX, dfTrainY, dfValidationY = train_test_split(
    dfTrainX, dfTrainY, test_size = 0.30, random_state = 1, shuffle = True)

In [None]:
# Fit the scaler on the training features only, then apply the same scaling
# to the training, validation and test features.
oScaler = StandardScaler().fit(dfTrainX)
dfTrainX, dfValidationX, dfTestX = (
    oScaler.transform(dfFeatures) for dfFeatures in (dfTrainX, dfValidationX, dfTestX)
)

In [None]:
# Hyper-parameters shared by the neural-network models.
c_f_LEARNING_RATE, c_f_MOMENTUM_RATE = 0.001, 0.9   # Adam learning rate and beta_1
c_i_PATIENCE = 10                                   # early-stopping patience (epochs)
c_i_BATCH_SIZE, c_i_EPOCH_SIZE = 16, 1000           # batch size and maximum epochs
c_f_L2_FACTOR = 0.01                                # L2 penalty factor for kernels

oKernelRegulizer = regularizers.l2(c_f_L2_FACTOR)

oOptimizer = tf.keras.optimizers.Adam(
    learning_rate = c_f_LEARNING_RATE,
    beta_1 = c_f_MOMENTUM_RATE)

# Stop once val_loss has not improved for c_i_PATIENCE epochs and restore the
# weights of the best epoch.
oEarlyStop = EarlyStopping(
    monitor = 'val_loss',
    mode = 'min',
    verbose = 0,
    patience = c_i_PATIENCE,
    restore_best_weights = True)

## Multi Layer Perceptron

### Train

In [None]:
# Two hidden layers of 100 ReLU units (L2-regularised, each followed by 50%
# dropout) and a single sigmoid output unit.
oMlpModel = tf.keras.Sequential([
    Dense(100, activation='relu', kernel_regularizer=oKernelRegulizer, input_shape=(dfTrainX.shape[1],)),
    Dropout(0.5),
    Dense(100, activation='relu', kernel_regularizer=oKernelRegulizer),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

oMlpModel.compile(optimizer=oOptimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train with early stopping on the validation loss.
oMlpModel.fit(
    dfTrainX,
    dfTrainY,
    validation_data = (dfValidationX, dfValidationY),
    epochs = c_i_EPOCH_SIZE,
    batch_size = c_i_BATCH_SIZE,
    callbacks = [oEarlyStop],
    verbose = 1,
)

In [None]:
# Training vs. validation loss per epoch. DataFrame.plot() draws on a figure
# of its own, so the figure size is passed to it directly; a separate
# plt.figure(...) call would only leave an empty figure behind.
dfMlpHistory = pd.DataFrame(oMlpModel.history.history)

dfMlpHistory[["loss", "val_loss"]].plot(figsize=(20,10))

In [None]:
# Threshold the sigmoid outputs at 0.5 and report on the training set.
aPredictedMlpProbability = oMlpModel.predict(dfTrainX)

aPredictedMlp = (aPredictedMlpProbability >= 0.5).astype(np.float64)
print(classification_report(dfTrainY, aPredictedMlp, zero_division = 0))

### Test

In [None]:
# Threshold the sigmoid outputs at 0.5 and report on the test set.
aPredictedMlpProbability = oMlpModel.predict(dfTestX)

aPredictedMlp = (aPredictedMlpProbability >= 0.5).astype(np.float64)
print(classification_report(dfTestY, aPredictedMlp, zero_division = 0))

## Long-Short Term Memory

### Train

In [None]:
# One LSTM layer (100 ReLU units, L2-regularised) with 50% dropout and a
# sigmoid output unit. Each sample is fed as a sequence of length 1
# (see the expand_dims calls below).
oLstmModel = tf.keras.Sequential()

oLstmModel.add(LSTM(100, activation = "relu", kernel_regularizer=oKernelRegulizer))
oLstmModel.add(Dropout(0.5))
oLstmModel.add(Dense(1, activation='sigmoid'))

# oOptimizer has already been used to train the MLP above. An optimizer
# instance keeps per-variable state for the model it was used with, so this
# model gets its own optimizer and callback with the same hyper-parameters
# instead of reusing those instances.
oLstmOptimizer = tf.keras.optimizers.Adam(learning_rate= c_f_LEARNING_RATE, beta_1=c_f_MOMENTUM_RATE)
oLstmEarlyStop = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 0 , patience = c_i_PATIENCE, restore_best_weights=True)

oLstmModel.compile(optimizer=oLstmOptimizer, loss='binary_crossentropy', metrics=['accuracy'])

oLstmModel.fit(tf.expand_dims(dfTrainX, 1), 
                dfTrainY, 
                epochs=c_i_EPOCH_SIZE, 
                batch_size=c_i_BATCH_SIZE, 
                verbose=1, 
                validation_data= (tf.expand_dims(dfValidationX, 1), dfValidationY),
                callbacks=[oLstmEarlyStop]
             )

In [None]:
# Training vs. validation loss per epoch. DataFrame.plot() draws on a figure
# of its own, so the figure size is passed to it directly; a separate
# plt.figure(...) call would only leave an empty figure behind.
dfLstmHistory = pd.DataFrame(oLstmModel.history.history)
dfLstmHistory[["loss", "val_loss"]].plot(figsize=(20,10))

In [None]:
# Threshold the sigmoid outputs at 0.5 and report on the training set.
aLstmTrainInput = tf.expand_dims(dfTrainX, 1)
aPredictedLstmProbability = oLstmModel.predict(aLstmTrainInput)

aPredictedLstm = (aPredictedLstmProbability >= 0.5).astype(np.float64)
print(classification_report(dfTrainY, aPredictedLstm, zero_division = 0))

### Test

In [None]:
# Threshold the sigmoid outputs at 0.5 and report on the test set.
aLstmTestInput = tf.expand_dims(dfTestX, 1)
aPredictedLstmProbability = oLstmModel.predict(aLstmTestInput)

aPredictedLstm = (aPredictedLstmProbability >= 0.5).astype(np.float64)
print(classification_report(dfTestY, aPredictedLstm, zero_division = 0))

## ROC Curves

ROC curves typically feature true positive rate on the Y axis, and false positive rate on the X axis. This means that the top left corner of the plot is the “ideal” point - a false positive rate of zero, and a true positive rate of one. This is not very realistic, but it does mean that a larger area under the curve (AUC) is usually better.

The “steepness” of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate.

Source:

1. https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5

1. https://www.analyticsvidhya.com/blog/2020/06/auc-roc-curve-machine-learning/

In [None]:
# Constant score for every test row: the "no skill" baseline.
aNoSkillProbability = [0 for _ in range(len(dfTestY))]

# roc_auc_score / roc_curve expect the score of the POSITIVE class (LOAN = 1).
# predict_proba of the scikit-learn models returns one column per class in
# classes_ order, so with labels 0/1 that score is column 1; column 0 is
# P(LOAN = 0) and would mirror the curves (AUC becomes 1 - AUC).
# The Keras models have a single sigmoid output, so their only column (0)
# is already the positive-class score.
aScoreLogReg = aPredictedLogRegProbability[:,1]
aScoreRandForest = aPredictedRandForestProbability[:,1]
aScoreDecTree = aPredictedDecTreeProbability[:,1]
aScoreMlp = aPredictedMlpProbability[:,0]
aScoreLstm = aPredictedLstmProbability[:,0]

fAucNoSkill = metrics.roc_auc_score(dfTestY, aNoSkillProbability)
fAucLogReg = metrics.roc_auc_score(dfTestY, aScoreLogReg)
fAucRandForest = metrics.roc_auc_score(dfTestY, aScoreRandForest)
fAucDecTree = metrics.roc_auc_score(dfTestY, aScoreDecTree)
fAucMlp = metrics.roc_auc_score(dfTestY, aScoreMlp)
fAucLstm = metrics.roc_auc_score(dfTestY, aScoreLstm)

# False/true positive rates per model; the thresholds are not needed.
aFprNoSkill, aTprNoSkill, _ = metrics.roc_curve(dfTestY, aNoSkillProbability)
aFprLogReg, aTprLogReg, _ = metrics.roc_curve(dfTestY, aScoreLogReg)
aFprRandForest, aTprRandForest, _ = metrics.roc_curve(dfTestY, aScoreRandForest)
aFprDecTree, aTprDecTree, _ = metrics.roc_curve(dfTestY, aScoreDecTree)
aFprMlp, aTprMlp, _ = metrics.roc_curve(dfTestY, aScoreMlp)
aFprLstm, aTprLstm, _ = metrics.roc_curve(dfTestY, aScoreLstm)

plt.figure(figsize=(20,10))
plt.plot(aFprNoSkill, aTprNoSkill, linestyle='--', label='No Skill ROC AUC=%.3f' % (fAucNoSkill))
plt.plot(aFprLogReg, aTprLogReg, marker='.', label='Logistic Regression ROC AUC=%.3f' % (fAucLogReg))
plt.plot(aFprRandForest, aTprRandForest, marker='.', label='Random Forest Classifier ROC AUC=%.3f' % (fAucRandForest))
plt.plot(aFprDecTree, aTprDecTree, marker='.', label='Decision Tree Classifier ROC AUC=%.3f' % (fAucDecTree))
plt.plot(aFprMlp, aTprMlp, marker='.', label='Multi Layer Perceptron ROC AUC=%.3f' % (fAucMlp))
plt.plot(aFprLstm, aTprLstm, marker='.', label='Long Short Term Memory ROC AUC=%.3f' % (fAucLstm))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()