# GRADIENT BOOSTING MODEL NOTEBOOK
Felix A. Westphal
DLMDWME01

### Import

In [105]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score

### Parameters

In [106]:
# Locations of the preprocessed input CSV files (relative paths).
# Presumably the class-balanced variant of the dataset; it is not read in the cells shown here.
FILE_PATH_BALANCED_DATA = "../data/processed/Balanced_Input_Data.csv"
# Normalized variant; this is the file passed to pd.read_csv in the load cell.
FILE_PATH_NORMALIZED_DATA = "../data/processed/Normalized_Input_Data.csv"

### Load Data

In [107]:
# Read the normalized dataset; the first column (index 0) is parsed as datetimes.
input_data = pd.read_csv(
    FILE_PATH_NORMALIZED_DATA,
    parse_dates=[0],
)
# NOTE(review): the message below says "Excel" although the file is read with read_csv.
# Show the first rows as a quick sanity check of the loaded frame.
print(f"Data loaded from Excel: \n{input_data.head()}")

Data loaded from Excel: 
                 tmsp    amount  success  3D_secured  Austria  Germany  \
0 2019-01-01 00:01:11  0.133013    False       False    False     True   
1 2019-01-01 00:01:17  0.133013     True       False    False     True   
2 2019-01-01 00:02:49  0.371795    False        True    False     True   
3 2019-01-01 00:03:13  0.371795     True        True    False     True   
4 2019-01-01 00:04:33  0.189103    False       False     True    False   

   Switzerland  Goldcard  Moneycard  Simplecard  UK_Card  Diners  Master  \
0        False     False      False       False     True   False   False   
1        False     False      False       False     True   False   False   
2        False     False      False       False     True    True   False   
3        False     False      False       False     True    True   False   
4        False     False      False        True    False    True   False   

    Visa  num_tries  order_id  
0   True          1         1  
1   True 

### Train and Test Dataset

In [108]:
# Build the modelling frame from input_data by removing columns that are not used
# as features: the timestamp, the order id, and the 'Austria', 'Goldcard' and
# 'Diners' indicator columns (presumably one reference column per one-hot group
# -- TODO confirm the intent).
model_data = input_data.drop(
    columns=['tmsp', 'order_id', 'Austria', 'Goldcard', 'Diners']
)

X = model_data.drop(columns='success')      # Features: everything except the target
y = model_data['success']                   # Target variable

# Shuffled 80/20 train/test split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

### Gradient Boosting Model

In [None]:
# Gradient boosting classifier: 2000 boosting stages of depth-2 trees, fixed seed.
gradientboosting = GradientBoostingClassifier(
    n_estimators=2000,
    max_depth=2,
    random_state=42,
)

# 3-fold cross-validation on the training split only, scored by recall.
cv_scores = cross_val_score(
    gradientboosting, X_train, y_train, scoring="recall", cv=3
)
print(cv_scores)
print(f"Average 3-Fold CV recall score: {np.mean(cv_scores)}")

# Fit the estimator on the full training split, then predict on the test split.
gradientboosting.fit(X_train, y_train)
y_pred = gradientboosting.predict(X_test)
# Second column of predict_proba, i.e. the probability of the second entry in classes_.
y_pred_proba = gradientboosting.predict_proba(X_test)[:, 1]

### Model Evaluation

In [None]:
# --- Calculate the accuracy of the Gradient Boosting model
# accuracy_score compares the true test labels (y_test) with the predicted labels (y_pred).
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
# Accuracy figures noted from earlier runs; the exact setup behind each figure is
# not recorded in this cell -- TODO confirm before relying on them.
# 81%
# 62% (Balancing)
# 62% (Balancing and Normalized)
# 81% (No Balancing and Normalized)
# 78% (n_estimators=2000 (1000), max_depth=6 (3))
# NOTE(review): the bracketed values in the last line are presumably the previous
# settings; they do not match the max_depth=2 used in the model cell -- verify.