# IMPORT LIBRARIES

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

# READING DATA FROM SOURCE

In [None]:
# Load the customers table and key it by CLIENT_ID.
dfCustomers = pd.read_csv("customers.csv", sep = ",").set_index("CLIENT_ID")

In [None]:
dfCustomers.head()

In [None]:
# Load the transactions table and key it by TRANS_ID.
dfTransactions = pd.read_csv("transactions.csv", sep = ",").set_index("TRANS_ID")

In [None]:
dfTransactions.head()

In [None]:
# Load the districts table and key it by DISTRICT_ID.
dfDistricts = pd.read_csv("districts.csv", sep = ",").set_index("DISTRICT_ID")

In [None]:
dfDistricts.head()

# DATA EXPLORATION

## Exploring Uniqueness

### dfCustomers

In [None]:
# True when every CLIENT_ID appears exactly once.
dfCustomers.index.is_unique

### dfDistricts

In [None]:
# True when every DISTRICT_ID appears exactly once.
dfDistricts.index.is_unique

### dfTransactions

dfTransactions contains some duplicated TRANS_IDs.

In [None]:
# True when every TRANS_ID appears exactly once.
dfTransactions.index.is_unique

There are 20000 rows which are duplicated.

In [None]:
dfTransactions[dfTransactions.index.duplicated(keep=False)]

In [None]:
dfTransactions.loc[3457056]

## Exploring Data Types

### dfCustomers

The "LOAN" field should be either 1 or 0 according to its description. Such fields usually have an integer data type. However, it is stored as float64, which is a sign that this field may contain unexpected data.

In [None]:
dfCustomers.info()

Confirming that each client has only 1 account id in dfCustomers dataset.

In [None]:
# Compare the number of distinct clients with the number of distinct accounts.
# dropna=False keeps NaN as a distinct value, matching what unique() counts.
iNrOfClients = dfCustomers.index.nunique(dropna = False)
iNrOfAccounts = dfCustomers["ACCOUNT_ID"].nunique(dropna = False)
iNrOfClients == iNrOfAccounts

### dfDistricts

The "UNEMP_95" and "CRIME_95" fields have the object data type. According to the field descriptions, "UNEMP_95" is an unemployment ratio and should be a float, and "CRIME_95" is the number of committed crimes and should be an integer. Both being stored as object is a sign that "UNEMP_95" and "CRIME_95" contain some unexpected data.

In [None]:
dfDistricts.info()

### dfTransactions

"ACCOUNT_ID" field's data type should be integer.

In [None]:
dfTransactions.info()

## Exploring Missing Data

### dfCustomers

Only field 'LOAN' includes NaN values in dfCustomers dataframe.

In [None]:
dfCustomers.columns[dfCustomers.isna().any()].tolist()

Checking unique fields to be sure if any other missing values except NaN. Unique values are sorted to detect missing values on the boundaries of the sorted array.

In [None]:
np.sort(dfCustomers.index.unique())

In [None]:
np.sort(dfCustomers["ACCOUNT_ID"].unique())

In [None]:
np.sort(dfCustomers["GENDER"].unique())

In [None]:
np.sort(dfCustomers["BIRTH_DT"].unique())

Confirming that each date value includes 8 characters.

In [None]:
dfCustomers["BIRTH_DT"].astype(str).str.len().unique()

In [None]:
np.sort(dfCustomers["ACTIVE"].unique())

In [None]:
np.sort(dfCustomers["LOAN"].unique())

In [None]:
np.sort(dfCustomers["DISTRICT_ID"].unique())

In [None]:
np.sort(dfCustomers["SET_SPLIT"].unique())

### dfDistricts

Looks like there is no field that contains NaN value in dfDistricts.

In [None]:
dfDistricts.columns[dfDistricts.isna().any()].tolist()

Checking unique fields to be sure if any other missing values except NaN. Unique values are sorted to detect missing values on the boundaries of the sorted array.

In [None]:
np.sort(dfDistricts.index.unique())

In [None]:
np.sort(dfDistricts["N_INHAB"].unique())

In [None]:
np.sort(dfDistricts["N_CITIES"].unique())

In [None]:
np.sort(dfDistricts["URBAN_RATIO"].unique())

In [None]:
np.sort(dfDistricts["AVG_SALARY"].unique())

"UNEMP_95" field contains value of question mark character '?'.

In [None]:
np.sort(dfDistricts["UNEMP_95"].unique())

In [None]:
np.sort(dfDistricts["UNEMP_96"].unique())

In [None]:
np.sort(dfDistricts["N_ENTR"].unique())

"CRIME_95" field contains value of question mark character '?'.

In [None]:
np.sort(dfDistricts["CRIME_95"].unique())

In [None]:
np.sort(dfDistricts["CRIME_96"].unique())

### dfTransactions

"ACCOUNT_ID" and "OPERATION" operation fields contain NaN values.

In [None]:
dfTransactions.columns[dfTransactions.isna().any()].tolist()

In [None]:
np.sort(dfTransactions.index.unique())

"ACCOUNT_ID" field contains NaN values.

In [None]:
np.sort(dfTransactions["ACCOUNT_ID"].unique())

The "DATE" field doesn't contain NaN values, but some values are not in the expected DDMMYYYY format.

In [None]:
np.sort(dfTransactions["DATE"].unique())

There are invalid DD values such as 40, 50, 41 etc.

In [None]:
 dfTransactions["DATE"].astype(str).str[:2].unique()

There are invalid MM values such as 71, 81, 91 etc.

In [None]:
dfTransactions["DATE"].astype(str).str[2:4].unique()

When some accounts are checked, it is observed that the dates are stored in DMMYYYY rather than DDMMYYYY format: when the day of the month is less than 10, it is written with only 1 digit instead of 2, which shifts the month and year positions by one character.

In [None]:
 dfTransactions[dfTransactions["DATE"].astype(str).str[:2] == "50"]

In [None]:
dfTransactions[dfTransactions["ACCOUNT_ID"] == 652]

In [None]:
dfTransactions["DATE"].astype(str).str[4:].unique()

In [None]:
np.sort(dfTransactions["AMOUNT"].unique())

In [None]:
np.sort(dfTransactions["BALANCE"].unique())

In [None]:
np.sort(dfTransactions["TYPE"].unique())

"OPERATION" field contains NaN values.

In [None]:
dfTransactions["OPERATION"].unique()

# PREPROCESSING

Data preprocessing is applied to copies of the datasets, since some fields may be good enough for exploratory analysis but not suitable for a predictive model. That's why preprocessing is applied only to the datasets that will be used in the predictive models.

In [None]:
# Work on independent copies so the raw frames stay available for exploration.
dfCustomersModel, dfTransactionsModel, dfDistrictsModel = (
    dfRaw.copy() for dfRaw in (dfCustomers, dfTransactions, dfDistricts)
)

## Remove Duplicated Rows

### TRANS_ID

One of each set of duplicated rows is kept. The rest are dropped.

In [None]:
# DataFrame.drop_duplicates compares column values only and ignores the index.
# Two *different* TRANS_IDs whose column values coincide would therefore be
# collapsed into one row. Move TRANS_ID into the columns for the comparison so
# only true duplicates (same id AND same values) are removed, then restore it
# as the index.
dfTransactionsModel = (
    dfTransactionsModel
    .reset_index()
    .drop_duplicates(keep = "first")
    .set_index("TRANS_ID")
)

Still there are duplicated rows in terms of index but not for other values. The reason of this situation is that at least one of the duplicated rows contains NaN value.

In [None]:
# True when no TRANS_ID is repeated in the working copy.
dfTransactionsModel.index.is_unique

In [None]:
# keep=False flags every occurrence of a repeated TRANS_ID, not just the later ones.
dfMaskDuplicatedTransIds = dfTransactionsModel.index.duplicated(keep=False)
dfDuplicatedTransactions = dfTransactionsModel.loc[dfMaskDuplicatedTransIds]

In [None]:
dfDuplicatedTransactions.sort_index()

We can fill missing values by copying from its duplicated index.

In [None]:
# TRANS_IDs that still occur on more than one row.
aDuplicatedTransIds = dfDuplicatedTransactions.index.unique()

for iTransId in aDuplicatedTransIds:
    for sColumnName in dfTransactionsModel.columns:
        # Values this TRANS_ID has in the current column, one per duplicated row.
        aColumnValues = dfTransactionsModel.loc[iTransId, sColumnName]
        aNonMissingValues = aColumnValues[aColumnValues.notna()]

        iNrOfMissingValues = len(aColumnValues) - len(aNonMissingValues)

        # Fill only when something is missing and exactly one known value exists,
        # so the replacement is unambiguous. The single-element Series is
        # broadcast to every row carrying this TRANS_ID via index alignment.
        if iNrOfMissingValues > 0 and len(aNonMissingValues) == 1:
            dfTransactionsModel.loc[iTransId, sColumnName] = aNonMissingValues
            
        
        


After filling missing values from the duplicated rows, duplicates are dropped once again. It is observed that there are no more duplicated rows.

In [None]:
# DataFrame.drop_duplicates compares column values only and ignores the index,
# so distinct TRANS_IDs with identical column values would be merged. Include
# TRANS_ID in the comparison so that only rows duplicated in both id and
# values are removed, then restore it as the index.
dfTransactionsModel = (
    dfTransactionsModel
    .reset_index()
    .drop_duplicates(keep = "first")
    .set_index("TRANS_ID")
)

In [None]:
# True once every TRANS_ID is present exactly once.
dfTransactionsModel.index.is_unique

## Converting Date Formats

### BIRTH_DT

In [None]:
dfCustomersModel["BIRTH_DT"] = pd.to_datetime(dfCustomersModel["BIRTH_DT"], format="%Y%m%d")

In [None]:
np.sort(dfCustomersModel["BIRTH_DT"].unique())

In [None]:
dfCustomersModel.info()

### DATE

The "DATE" field should be in DDMMYYYY format according to its description; however, it is in DMMYYYY format for days that are less than 10, where the day of the month is represented with just 1 digit.

 "0" character is added at the beginning of the values where day of month is represented with single value. 

In [None]:
dfTransactionsModel["DATE"] = dfTransactionsModel["DATE"].astype(str)

# Select the short dates with a boolean mask instead of going back through
# index labels: a label lookup would also hit any other row sharing the same
# TRANS_ID, while the mask targets exactly the rows whose DATE has 7 characters.
dfMask7DigitsDates = dfTransactionsModel["DATE"].str.len() == 7
adfTransactionsWith7DigitsDates = dfTransactionsModel[dfMask7DigitsDates]

# Prefix the missing leading zero of the day so every value is DDMMYYYY.
dfTransactionsModel.loc[dfMask7DigitsDates, "DATE"] = "0" + dfTransactionsModel.loc[dfMask7DigitsDates, "DATE"]

In [None]:
dfTransactionsModel["DATE"] = pd.to_datetime(dfTransactionsModel["DATE"], format="%d%m%Y")

In [None]:
np.sort(dfTransactionsModel["DATE"].unique())

In [None]:
dfTransactionsModel.info()

## Missing Data Handling

### LOAN

There are 50 customers which have NaN value on their "LOAN" field. Empty "LOAN" data can't be used for training or testing purposes. That's why, the customers which don't have "LOAN" information are dropped from dfCustomersModel and dfTransactionsModel datasets.

In [None]:
# Customers without a LOAN label cannot be used for training or testing.
dfCustomersMissingLoan = dfCustomers[dfCustomers["LOAN"].isna()]

# Their transactions, looked up in the raw table through ACCOUNT_ID.
dfTransactionsMissingLoan = dfTransactions.reset_index().merge(
    dfCustomersMissingLoan, 
    how = "inner", 
    on = "ACCOUNT_ID").set_index("TRANS_ID")

# Filter with isin() rather than drop(labels, inplace=True): drop() raises
# KeyError as soon as one label is no longer present, which made this cell fail
# when re-run and whenever an earlier step had already removed one of the rows.
# .copy() detaches the result so later column assignments don't warn.
dfMaskTransactionsToKeep = ~dfTransactionsModel.index.isin(dfTransactionsMissingLoan.index)
dfTransactionsModel = dfTransactionsModel[dfMaskTransactionsToKeep].copy()

dfMaskCustomersToKeep = ~dfCustomersModel.index.isin(dfCustomersMissingLoan.index)
dfCustomersModel = dfCustomersModel[dfMaskCustomersToKeep].copy()

### UNEMP_95 & CRIME_95

There is 1 district (DISTRICT_ID=69) whose "UNEMP_95" and "CRIME_95" fields are empty.

In [None]:
dfDistricts[dfDistricts["UNEMP_95"] == "?"]

In [None]:
dfDistricts[dfDistricts["CRIME_95"] == "?"]

A predictive model is used to produce a value in place of the question mark. As mentioned above, "UNEMP_95" and "CRIME_95" are stored as object rather than numeric types. The reason is the question mark character in the row with DISTRICT_ID=69. 
This row is temporarily dropped from the dataset so that "UNEMP_95" and "CRIME_95" can be converted to numeric data types and some numerical analysis can be performed. After the missing values are predicted, DISTRICT_ID=69 will be appended back to dfDistrictsModel.

In [None]:
# Rows where either 1995 figure is the "?" placeholder.
dfMaskUnknown95 = dfDistrictsModel["UNEMP_95"].eq("?") | dfDistrictsModel["CRIME_95"].eq("?")
dfMissingRows = dfDistrictsModel[dfMaskUnknown95]

# Set those rows aside so the remaining values can be parsed as numbers.
dfDistrictsModel.drop(index = dfMissingRows.index, inplace = True)

for sColumnName in ("UNEMP_95", "CRIME_95"):
    dfDistrictsModel[sColumnName] = pd.to_numeric(dfDistrictsModel[sColumnName])

dfDistrictsModel.info()

A heatmap is created to understand if there are any strong relationships between features.

In [None]:
# Absolute correlations: only the strength of the relationship matters here, not its sign.
dfAbsCorrelations = dfDistrictsModel.corr().abs()
sns.heatmap(
    dfAbsCorrelations,
    vmin = 0,
    vmax = 1,
    cmap = "Greens",
    linewidths=0.5,
    annot=True,
)

It is observed that linear relationship between "UNEMP_95" and "UNEMP_96" are strong. That's why, a simple linear regression model can be used to predict missing value.

In [None]:
# Fit UNEMP_95 as a linear function of UNEMP_96 on the districts where both are known.
aX = dfDistrictsModel["UNEMP_96"].to_numpy().reshape(-1, 1)
aY = dfDistrictsModel["UNEMP_95"].to_numpy().reshape(-1, 1)

oLinRegModel = LinearRegression()
oLinRegModel.fit(aX, aY)

# UNEMP_96 of the district(s) whose UNEMP_95 is the "?" placeholder in the raw table.
dfMaskUnknownUnemp95 = dfDistricts["UNEMP_95"] == "?"
aUnemp95ToPredict = dfDistricts.loc[dfMaskUnknownUnemp95, "UNEMP_96"].to_numpy().reshape(-1, 1)

aUnemp95Predicted = oLinRegModel.predict(aUnemp95ToPredict)

There is a linear relationship between "CRIME_95", "CRIME_96" and "N_INHAB" fields. That's why, a simple linear regression model can be used to predict missing value.

In [None]:
# Fit CRIME_95 as a linear function of CRIME_96 and N_INHAB on the districts where it is known.
aCrimeFeatures = ["CRIME_96", "N_INHAB"]
aX = dfDistrictsModel[aCrimeFeatures].to_numpy()
aY = dfDistrictsModel["CRIME_95"].to_numpy().reshape(-1, 1)

oLinRegModel = LinearRegression()
oLinRegModel.fit(aX, aY)

# Feature values of the district(s) whose CRIME_95 is the "?" placeholder in the raw table.
dfMaskUnknownCrime95 = dfDistricts["CRIME_95"] == "?"
aCrime95ToPredict = dfDistricts.loc[dfMaskUnknownCrime95, aCrimeFeatures].to_numpy()

aCrime95Predicted = oLinRegModel.predict(aCrime95ToPredict)

In order to replace the question marks with the predicted values, the missing rows are added back to dfDistrictsModel.

In [None]:
# DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0;
# pd.concat is the supported way to stack the set-aside rows back on.
dfDistrictsModel = pd.concat([dfDistrictsModel, dfMissingRows])
dfDistrictsModel.sort_index(inplace=True)

The predicted values are written into the missing rows.

In [None]:
# Overwrite the "?" placeholders in UNEMP_95 with the regression output.
dfMaskMissingUnemp95s = dfDistrictsModel["UNEMP_95"].eq("?")
dfDistrictsModel.loc[dfMaskMissingUnemp95s, "UNEMP_95"] = aUnemp95Predicted

In [None]:
# Overwrite the "?" placeholders in CRIME_95 with the regression output.
dfMaskMissingCrime95s = dfDistrictsModel["CRIME_95"].eq("?")
dfDistrictsModel.loc[dfMaskMissingCrime95s, "CRIME_95"] = aCrime95Predicted

Now those columns can be converted to numerical type.

In [None]:
dfDistrictsModel["UNEMP_95"] = dfDistrictsModel["UNEMP_95"].astype(np.float64)

# The imputed CRIME_95 comes from a regression and is generally fractional.
# A direct cast to int64 truncates toward zero, so round to the nearest count
# first and only then convert to integer.
dfDistrictsModel["CRIME_95"] = (
    dfDistrictsModel["CRIME_95"]
    .astype(np.float64)
    .round()
    .astype(np.int64)
)

dfDistrictsModel.info()

### ACCOUNT_ID

There are 4915 rows that don't have value in "ACCOUNT_ID" field. This is not a big amount for this database. That's why, those rows are dropped from dfTransactionsModel.

In [None]:
dfMissingAccountIds = dfTransactionsModel[dfTransactionsModel["ACCOUNT_ID"].isna()]

In [None]:
dfMissingAccountIds

In [None]:
dfTransactionsModel.drop(dfMissingAccountIds.index, inplace = True)

The "ACCOUNT_ID" field is converted to integer format to align with the "ACCOUNT_ID" field of dfCustomers.

In [None]:
dfTransactionsModel["ACCOUNT_ID"] = dfTransactionsModel["ACCOUNT_ID"].astype(np.int64)

### OPERATION

In [None]:
dfTransactionsMissingOperation = dfTransactionsModel[dfTransactionsModel["OPERATION"].isna()]

In [None]:
dfTransactionsMissingOperation

"TYPE" field of all of the missing values are "CREDIT". When all data is checked, "CREDIT" type can have 2 possible "OPERATION":

1. COLLECTION_FROM_OTHER_BANK
or
2. CREDIT_IN_CASH

In [None]:
dfTransactionsMissingOperation["TYPE"].unique()

In [None]:
dfTransactionsModel[["TYPE", "OPERATION", "AMOUNT"]].groupby(["TYPE", "OPERATION"]).agg(["count", "min", "max"])

A decision tree classifier can be used to predict the missing "OPERATION" values. In order to train the classifier, it is enough to use only the transactions that have the "CREDIT" type. The "TYPE" field doesn't need to be an input feature, since it is the same for all of the missing data.

In [None]:
# CREDIT transactions whose OPERATION is known: these are the labelled examples.
dfMaskHasOperation = ~dfTransactionsModel.index.isin(dfTransactionsMissingOperation.index)
dfMaskIsCredit = dfTransactionsModel["TYPE"] == "CREDIT"
dfTransactionsNonMissingOperation = dfTransactionsModel[dfMaskHasOperation & dfMaskIsCredit]

In [None]:
# Feature matrix for every CREDIT transaction (labelled and unlabelled alike).
dfCreditTransactions = dfTransactionsModel[dfTransactionsModel["TYPE"] == "CREDIT"]

# TYPE is constant, OPERATION is the target, and DATE is replaced by its numeric parts.
dfX = dfCreditTransactions.drop(columns = ["TYPE", "OPERATION", "DATE"]).assign(
    TRANSACTION_YEAR = dfCreditTransactions["DATE"].dt.year,
    TRANSACTION_MONTH = dfCreditTransactions["DATE"].dt.month,
    TRANSACTION_DAY = dfCreditTransactions["DATE"].dt.day,
)

In [None]:
dfX

In [None]:
# One indicator column per OPERATION value; rows with a missing OPERATION get all zeros.
dfMaskCredit = dfTransactionsModel["TYPE"] == "CREDIT"
dfY = pd.get_dummies(dfTransactionsModel.loc[dfMaskCredit, "OPERATION"])

In [None]:
dfY

In [None]:
# Split only the labelled CREDIT transactions; 30% is held out for evaluation.
aLabelledTransIds = dfTransactionsNonMissingOperation.index

X_train, X_test, y_train, y_test = train_test_split(
    dfX.loc[aLabelledTransIds],
    dfY.loc[aLabelledTransIds],
    test_size=0.3,
    random_state=1,
    shuffle=True,
)

In [None]:
# Fix the seed so the fitted tree (and every number reported below) is
# reproducible across runs; the same seed is already used for the train/test split.
oDecTreeModel = DecisionTreeClassifier(random_state=1)
oDecTreeModel.fit(X_train, y_train)

y_pred = oDecTreeModel.predict(X_test)

# Re-attach index and column labels so predictions can be compared per class with y_test.
y_pred = pd.DataFrame(y_pred, index = y_test.index, columns = y_test.columns )

Both classes have good accuracy and F1 scores, meaning that the decision tree model is able to predict both true positives and true negatives.

In [None]:
print(classification_report(y_test, y_pred, target_names=dfY.columns))

# Per-class accuracy. The original message used an f-string with no placeholders
# plus string concatenation, which printed a stray "of : "; interpolate instead.
for sClass in dfY.columns:
    fAccuracy = metrics.accuracy_score(y_test[sClass], y_pred[sClass])
    print(f"Accuracy of {sClass}: {round(fAccuracy, 2)}")

All of the missing values are predicted as "CREDIT_IN_CASH" value.

In [None]:
# Predict indicator columns for the transactions whose OPERATION is unknown.
aMissingOperationTransIds = dfTransactionsMissingOperation.index
aPredictionsForMissingOperations = oDecTreeModel.predict(dfX.loc[aMissingOperationTransIds])

dfPredictionsForMissingOperations = pd.DataFrame(
    data = aPredictionsForMissingOperations,
    columns = dfY.columns,
    index = aMissingOperationTransIds,
)

# Turn the indicator columns back into a single label per transaction.
aPredictedOperations = dfPredictionsForMissingOperations.idxmax(axis=1)

dfTransactionsModel.loc[aMissingOperationTransIds, "OPERATION"] = aPredictedOperations

In [None]:
aPredictedOperations.unique()

## Merging Datasets

In [None]:
# Attach customer attributes to each transaction; the inner join keeps only
# transactions whose ACCOUNT_ID belongs to a customer in dfCustomersModel.
dfTransactionsFlat = dfTransactionsModel.reset_index()
dfCustomersFlat = dfCustomersModel.reset_index()

dfDataAnalysis = dfTransactionsFlat.merge(
    dfCustomersFlat,
    on = "ACCOUNT_ID",
    how = "inner",
).set_index("TRANS_ID")

There are transactions whose "ACCOUNT_ID" is not in dfCustomers.

In [None]:
len(dfTransactionsModel) - len(dfDataAnalysis)

There is no customer without any transactions.

In [None]:
# 0 means every customer in dfCustomersModel appears in the merged data.
iNrOfClientsWithTransactions = dfDataAnalysis["CLIENT_ID"].nunique(dropna = False)
iNrOfClientsWithTransactions - len(dfCustomersModel)

In [None]:
# Attach district attributes to each transaction through the customer's DISTRICT_ID.
dfDistrictsFlat = dfDistrictsModel.reset_index()

dfDataAnalysis = dfDataAnalysis.reset_index().merge(
    dfDistrictsFlat,
    on = "DISTRICT_ID",
    how = "inner",
).set_index("TRANS_ID")

There are customers from each district.

In [None]:
# 0 means the customers cover as many distinct districts as dfDistrictsModel has rows.
iNrOfDistrictsWithCustomers = dfCustomersModel["DISTRICT_ID"].nunique(dropna = False)
iNrOfDistrictsWithCustomers - len(dfDistrictsModel)

In [None]:
dfDataAnalysis.head()

# DATA ANALYSIS