# GRADIENT BOOSTING MODEL NOTEBOOK
Felix A. Westphal
DLMDWME01

### Imports

In [226]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score
from sklearn.model_selection import GridSearchCV

### Parameters

In [227]:
# Relative paths to the balanced and the normalized input CSV files.
PROCESSED_DATA_DIR = "../data/processed"
FILE_PATH_BALANCED_DATA = PROCESSED_DATA_DIR + "/Balanced_Input_Data.csv"
FILE_PATH_NORMALIZED_DATA = PROCESSED_DATA_DIR + "/Normalized_Input_Data.csv"

### Load Data

In [228]:
# Read the normalized input file; the first column is parsed as datetime.
input_data = pd.read_csv(
    FILE_PATH_NORMALIZED_DATA,
    parse_dates=[0],
)

# Show the first rows as a quick sanity check.
# NOTE(review): the message says "Excel", but the file read above is a CSV.
head_preview = input_data.head()
print(f"Data loaded from Excel: \n{head_preview}")

Data loaded from Excel: 
                 tmsp    amount  success  3D_secured  Austria  Germany  \
0 2019-01-01 00:01:11  0.133013    False       False    False     True   
1 2019-01-01 00:01:17  0.133013     True       False    False     True   
2 2019-01-01 00:02:49  0.371795    False        True    False     True   
3 2019-01-01 00:03:13  0.371795     True        True    False     True   
4 2019-01-01 00:04:33  0.189103    False       False     True    False   

   Switzerland  Goldcard  Moneycard  Simplecard  UK_Card  Diners  Master  \
0        False     False      False       False     True   False   False   
1        False     False      False       False     True   False   False   
2        False     False      False       False     True    True   False   
3        False     False      False       False     True    True   False   
4        False     False      False        True    False    True   False   

    Visa  num_tries  order_id      hour  is_weekend  
0   True          1

### Train and Test Dataset

In [229]:
# Only consider first tries: keep the rows whose num_tries equals 1.
first_try_mask = input_data['num_tries'] == 1
input_data = input_data[first_try_mask]

# Selected features and target variable.
feature_columns = ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card']
X = input_data[feature_columns]
y = input_data['success']

# Hold out 20 % of the rows for testing; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# --- Check Dataset Distribution
# Count how many training targets are False (failed) and True (succeeded).
failed_transaction = y_train[y_train.eq(False)]
succeeded_transaction = y_train[y_train.eq(True)]
num_failed = len(failed_transaction)
num_succeeded = len(succeeded_transaction)
print(f"Number of failed transactions: {num_failed}")
print(f"Number of succeeded transactions: {num_succeeded}")

Number of failed transactions: 24073
Number of succeeded transactions: 6223


### Gradient Boosting Model

In [230]:
# Hyperparameter search space for the (currently disabled) grid search below.
# The learning rates are 0.1, 0.01 and 0.001. Writing the first one with a decimal comma
# ("0,1") would instead add the two candidates 0 and 1; with a learning rate of 0 the
# trees contribute nothing to the prediction.
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 6, 8],
    'n_estimators': [100, 1000, 2000]
}

# Model with fixed hyperparameters. random_state pins the estimator's internal randomness
# so that re-running the notebook reproduces the same fitted model.
gradientboosting = GradientBoostingClassifier(learning_rate=0.001, max_depth=3, n_estimators=2000, random_state=42)

# --- Perform grid search cross-validation
# Disabled: 27 parameter combinations x 3 folds. Uncomment this block and the two
# best_model prediction lines at the end of the cell to use the tuned model instead.
#grid_search = GridSearchCV(estimator=gradientboosting, param_grid=param_grid, cv=3, scoring='accuracy')
#grid_search.fit(X_train, y_train)

# --- Get the best hyperparameters and the corresponding model
#best_params = grid_search.best_params_
#best_model = grid_search.best_estimator_
#print(f"Best Hyperparameters for GradientBoostingClassifier: {best_params}")

# --- Optional: 3-fold cross-validated recall of the fixed-parameter model
#cv_scores = cross_val_score(gradientboosting, X_train, y_train, cv=3, scoring="recall")
#print(cv_scores)
#print(f"Average 3-Fold CV recall score: {np.mean(cv_scores)}")

# --- Fit on the training split and predict on the held-out test split
gradientboosting.fit(X_train, y_train)
y_pred = gradientboosting.predict(X_test)                       # predicted class labels
# Column 1 of predict_proba is the probability of the second entry of classes_
# (presumably True = success for this boolean target; verify via gradientboosting.classes_).
y_pred_proba = gradientboosting.predict_proba(X_test)[:,1]
#y_pred = best_model.predict(X_test)
#y_pred_proba = best_model.predict_proba(X_test)[:,1]

### Model Evaluation

In [231]:
# --- Calculate the accuracy of the Gradient Boosting model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

# Accuracy alone is not informative here: the classes are imbalanced (24073 failed vs.
# 6223 succeeded transactions in the training split, i.e. about 79 % failed), so a model
# that always predicts the majority class already reaches a similar score.
# Report that baseline and the per-class metrics next to the accuracy.
majority_baseline = y_test.value_counts(normalize=True).max()
print("Majority-class baseline accuracy:", majority_baseline)
print("Confusion matrix (rows = actual, columns = predicted; label order: False, True):")
print(confusion_matrix(y_test, y_pred, labels=[False, True]))
# zero_division=0 avoids warnings if one class is never predicted.
print(classification_report(y_test, y_pred, zero_division=0))

# Experiment log: accuracies noted by the author for earlier configurations.
# 81%
# 62% (Balancing)
# 62% (Balancing and Normalized)
# 81% (No Balancing and Normalized)
# 78% (n_estimators=2000 (1000), max_depth=6 (3))
# 81% (max_depth=2)
# 81% (only n_estimators=2000 remaining)
# 81-% (n_estimators=4000)  -- presumably "slightly below 81%"; confirm
# 81+% (n_estimators=1000)  -- presumably "slightly above 81%"; confirm
# 80.5% (Feature Selection, Hyperparameter Tuning)

Model Accuracy: 0.8047266965936097
