### Imports

In [None]:
# Utilities
import pandas as pd
import numpy as np
from datetime import datetime, date, time, timedelta
from collections import Counter

# Plotting
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Modelling
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

RSEED = 42

In [None]:
# Load the cleaned data set exported by the FraudDetectionEDA notebook.
# The path is relative to the notebook's working directory; the file name suggests
# the categorical columns are already dummy-encoded (TODO confirm against the EDA notebook).
df = pd.read_csv("data/data_train_clean_withdummies.csv")

In [None]:
#df.columns.values

---

### Splitting the data

In [None]:
# target variable: the fraud label
y = df['FraudResult']

# predictor variables: every column except 'Value' and the label itself
X = df.drop(columns=['Value', 'FraudResult'])


In [None]:
# Hold out 20 % of the rows as the test set (seeded for reproducibility).
# stratify=y keeps the class ratio of the label identical in both partitions;
# the minority class is oversampled further below, so a purely random split could
# otherwise leave the test set with too few positive cases for stable metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED, stratify=y
)

### Data Balance - Oversampling

*Useful links*:
- [SMOTE](https://imbalanced-learn.org/stable/over_sampling.html)
- [Handling Imbalanced Data Sets, Medium](https://medium.com/coinmonks/handling-imbalanced-datasets-predicting-credit-card-fraud-544f5e74e0fd)
- [Dealing with imbalanced Data, towardsdatascience](https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18)

In [None]:
# apply resampling ONLY to train data (the test set keeps its original class ratio)
# SMOTE generates its synthetic samples randomly, so it is seeded with RSEED to make
# the resampled training set - and everything fitted on it - reproducible across runs
X_train_res, y_train_res = SMOTE(random_state=RSEED).fit_resample(X_train, y_train)

After resampling the data with SMOTE, we apply a scaler to `X_train_res` and `X_test`: the scaler is fitted on the resampled training data only and then reused to transform the test data. After scaling, we convert both arrays back to a `DataFrame` for easier handling.

In [None]:
scaler = StandardScaler()

# --- train: learn the scaling statistics on the resampled training data and apply them ---
X_train_res_stand = scaler.fit_transform(X_train_res)
# wrap the resulting array in a DataFrame that carries the original column names
scaled_df_train_resampled = pd.DataFrame(X_train_res_stand, columns=X_train.columns)

# --- test: reuse the statistics learned on train (transform only, no re-fit) ---
X_test_stand = scaler.transform(X_test)
# wrap the resulting array in a DataFrame that carries the original column names
scaled_df_test = pd.DataFrame(X_test_stand, columns=X_test.columns)

Now we are set to feed our models with the *cleaned* and *resampled* data. For easier handling, we save the finished data sets and read them back in as `X_train`, `y_train`, `X_test` and `y_test`.

In [None]:
# Persist all four partitions as CSV files without the index column.
# Written in the order: TRAIN sets first, then TEST sets.
_exports = [
    (scaled_df_train_resampled, 'data/x-train.csv'),
    (y_train_res, 'data/y-train.csv'),
    (scaled_df_test, 'data/x-test.csv'),
    (y_test, 'data/y-test.csv'),
]
for _frame, _path in _exports:
    _frame.to_csv(_path, index=False)

In [None]:
# reading in finished data
# NOTE: this rebinds X_train / X_test / y_train / y_test. From here on they refer to the
# saved (resampled and scaled) files, not to the raw output of train_test_split, so the
# resampling/scaling cells must not be re-run after this cell without re-running the split.
X_train = pd.read_csv('data/x-train.csv')
X_test = pd.read_csv('data/x-test.csv')

# The label files contain a single column. Squeeze them into a 1-D Series so that the
# estimators receive y with shape (n_samples,) instead of a column-vector DataFrame,
# which scikit-learn would otherwise warn about on fit.
y_train = pd.read_csv('data/y-train.csv').squeeze("columns")
y_test = pd.read_csv('data/y-test.csv').squeeze("columns")

---

## Base Line Model

The following code for the classifier comparison was taken from [Machine Learning Mastery](https://machinelearningmastery.com/naive-classifiers-imbalanced-classification-metrics/). It shows which type of `DummyClassifier` delivers the best result for the **F1-Score**. The website (see link) has further examples for **other metrics** as well.

In [None]:
# compare naive classifiers with f1-measure
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier
from matplotlib import pyplot

# evaluate a model
def evaluate_model(X, y, model):
    """Cross-validate ``model`` on ``(X, y)`` and return the F1 score of every fold.

    The evaluation uses repeated stratified k-fold cross-validation
    (10 splits, 3 repeats, fixed seed) and runs with ``n_jobs=-1``.

    Parameters:
        X: feature matrix.
        y: target labels.
        model: the (unfitted) estimator to evaluate.

    Returns:
        The array of scores produced by ``cross_val_score`` with ``scoring='f1'``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='f1', cv=splitter, n_jobs=-1)

# define models to test
def get_models():
    """Build the naive baseline classifiers that are compared against each other.

    The two strategies that draw random predictions ('uniform' and 'stratified')
    are seeded with RSEED so that repeated runs of the notebook give the same scores.

    Returns:
        (models, names): two parallel lists - the ``DummyClassifier`` instances and
        the display name used for each of them in the printout and the box plot.
    """
    candidates = [
        # Uniformly Random Guess
        ('Uniform', DummyClassifier(strategy='uniform', random_state=RSEED)),
        # Prior Random Guess
        ('Stratified', DummyClassifier(strategy='stratified', random_state=RSEED)),
        # Majority Class: Predict 0
        ('Majority', DummyClassifier(strategy='most_frequent')),
        # Minority Class: Predict 1
        ('Minority', DummyClassifier(strategy='constant', constant=1)),
        # Class Prior
        ('Prior', DummyClassifier(strategy='prior')),
    ]
    names = [label for label, _ in candidates]
    models = [clf for _, clf in candidates]
    return models, names

# define dataset
# (the string below is the disabled synthetic data set from the tutorial;
#  the notebook's own X_train / y_train are evaluated instead)
"""
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
	n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)
"""
# define models
models, names = get_models()
# evaluate each model and keep its fold scores for the box plot
results = []
for clf_name, clf in zip(names, models):
    scores = evaluate_model(X_train, y_train, clf)
    results.append(scores)
    # summary line: mean score, standard deviation in parentheses
    print('>%s %.3f (%.3f)' % (clf_name, mean(scores), std(scores)))
# plot the results
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

### Results:

The 'Minority' `DummyClassifier` delivers the best result (0.667) for a naive classifier with regard to the **F1-Score**, which is our target metric. The values in parentheses show the standard deviation of the scores across the cross-validation folds. For 'Minority' it is 0.

Therefore, we will proceed with this `DummyClassifier` as our baseline model and compare our own models against it.

In [None]:
# Baseline: a classifier that always predicts the constant label 1, whatever the input
dummy_clf = DummyClassifier(strategy="constant", constant=1).fit(X_train, y_train)

# Baseline predictions for the training rows
y_dummy_pred = dummy_clf.predict(X_train)

In [None]:
separator = "------"*10

# Baseline F1-score, computed on the training data the dummy was fitted on
# NOTE(review): the models further below are scored on X_test / y_test, so this
# training-set baseline is not directly comparable to their test-set F1 - confirm intent.
f1_baseline = f1_score(y_train, y_dummy_pred, average='binary')

print(separator)
print(classification_report(y_train, y_dummy_pred))
print(separator)
print("F1-score:", f1_baseline)
print(separator)

## Modelling

### Basic Models

1. SVM ==> Daniela
2. LogReg ==> Kai-Yang
3. DecisionTree ==> Fabio

### Advanced Models

1. Random Forest ==> Fabio
2. AdaBoost ==> Kai-Yang
3. Stacking ==> Kai-Yang

### Results

All basic models showed weak performance in comparison to our advanced models. Hence, we decided to focus on them.

---

### DecisionTree

In [None]:
# Decision Tree =================================

# Build a depth-unrestricted decision tree (fixed seed) and train it on the training data
tree = DecisionTreeClassifier(max_depth=None, random_state=RSEED).fit(X_train, y_train)

# Class predictions for the test rows
y_tree_pred = tree.predict(X_test)

In [None]:
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')
# Accuracy is reported for both partitions: the score on the training data alone says
# little about generalisation (the tree is grown without a depth limit), and the F1
# below is measured on the test set, so the test accuracy is the comparable number.
print(f'Model Accuracy (train): {tree.score(X_train, y_train)}')
print(f'Model Accuracy (test): {tree.score(X_test, y_test)}')
print(f'Model F1-Score: {f1_score(y_test, y_tree_pred, average="binary")}')


---

### Random Forest

In [None]:
# optimal params: n_estimators=1000, criterion=entropy, max_features=sqrt
# NOTE(review): the grid search in this notebook only covers n_estimators up to 500,
# so the value 1000 must come from a different experiment - confirm its origin.
model = RandomForestClassifier(n_estimators=1000,
                               random_state=RSEED,
                               criterion='entropy',
                               max_features='sqrt',
                               max_depth=None,
                               n_jobs=-1, verbose=1)

# Fit on training data.
# np.ravel flattens the labels to shape (n_samples,), matching the grid-search cell;
# a one-column DataFrame would make the forest emit a column-vector warning.
model.fit(X_train, np.ravel(y_train))
y_rf_pred = model.predict(X_test)

In [None]:
rule = "------"*10

# Test-set metrics of the random forest, computed up front and printed below
rf_f1 = f1_score(y_test, y_rf_pred, average="binary")
rf_confusion = confusion_matrix(y_test, y_rf_pred)

print(rule)
print(classification_report(y_test, y_rf_pred))
print(rule)
print(f'Model F1-Score: {rf_f1}')
print(rule)
print("Confusion Matrix: \n", rf_confusion)

---

#### Finding optimal parameters

In [None]:
# GRID SEARCH ===========================

# Hyper-parameters to search over.
# The random seed is deliberately NOT part of the grid: it is not a property of the
# model, so "tuning" it only selects the luckiest seed for the CV folds and triples
# the number of fits. The seed is fixed on the estimator instead.
param_grid = {'max_features': ['sqrt'],
              'criterion': ['gini', 'entropy'],
              'n_estimators': [100, 200, 300, 400, 500],
             }

grid = GridSearchCV(RandomForestClassifier(random_state=RSEED), param_grid,
                    verbose=True, n_jobs=-1, scoring='f1')
# fit() returns the fitted search object itself, so model_rf and grid are the same object;
# np.ravel passes the labels as a flat 1-D array
model_rf = grid.fit(X_train, np.ravel(y_train))
y_rf_grid_pred = model_rf.predict(X_test)

In [None]:
divider = "------"*10

# Test-set metrics of the grid-searched model, computed up front and printed below
grid_f1 = f1_score(y_test, y_rf_grid_pred, average="binary")
grid_mcc = matthews_corrcoef(y_test, y_rf_grid_pred)
grid_confusion = confusion_matrix(y_test, y_rf_grid_pred)

print(divider)
print(classification_report(y_test, y_rf_grid_pred))
print(divider)
print(f'Model F1-Score: {grid_f1}')
print(divider)
print(f'Model Matthew Corr: {grid_mcc}')
print(divider)
print("Confusion Matrix: \n", grid_confusion)

In [None]:
# Parameter combination selected by the grid search (the search was scored with 'f1')
model_rf.best_params_

---

### Feature Importance (Random Forest)

For this purpose we use the built-in attribute `feature_importances_` of the random forest model.

In [None]:
# Importance value of every feature, as stored on the fitted random forest `model`
x = np.asarray(model.feature_importances_)

# Print the values rounded to two decimals, followed by the matching feature names
print(np.array_str(x, precision=2, suppress_small=True))
print(model.feature_names_in_)

# One row per feature (indexed by feature name), single column 'Importance'
df_feat = pd.DataFrame({'Importance': x}, index=model.feature_names_in_)

In [None]:
# Display the importance table (one row per feature)
df_feat

In [None]:
# Keep only the features with an importance of at least 0.03, most important first.
# reset_index() moves the feature names into a regular column called 'index'.
df_feat_plot = (
    df_feat.query('Importance >= 0.03')
    .reset_index()
    .sort_values(by=['Importance'], ascending=False, ignore_index=True)
)

# Plot straight from the frame's columns instead of rebinding the notebook-level
# names `x` and `y` (`y` holds the target variable from the splitting section).
sns.scatterplot(data=df_feat_plot, x='index', y='Importance', marker='x', color='green')

# The tick labels come from the data, so the axis stays correct however many
# features pass the threshold; only their appearance is adjusted here.
plt.xticks(rotation=30, fontsize=10, ha='right')
# the plotted values are fractions (cf. the 0.03 threshold above), not percentages
plt.ylabel('Relative Importance (fraction)', fontsize=14)
plt.xlabel('Feature', fontsize=14)
plt.grid(visible=True, axis='y', linestyle='--')
plt.suptitle('Relative Feature Importance in Model', fontsize=16);


In [None]:
# Display the filtered and sorted importance table that backs the plot
df_feat_plot