# BASELINE MODEL NOTEBOOK
Felix A. Westphal
DLMDWME01

### Import

In [587]:
import pandas as pd
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

### Parameter

In [588]:
FILE_PATH_BALANCED_DATA = r"../data/processed/Balanced_Input_Data.csv"
FILE_PATH_NORMALIZED_DATA = r"../data/processed/Normalized_Input_Data.csv"

### Load Data

In [589]:
input_data = pd.read_csv(FILE_PATH_NORMALIZED_DATA, parse_dates=[0])                        # Load input data file
print(f"Data loaded from Excel: \n{input_data.head()}")

Data loaded from Excel: 
                 tmsp    amount  success  3D_secured  Austria  Germany  \
0 2019-01-01 00:01:11  0.133013    False       False    False     True   
1 2019-01-01 00:01:17  0.133013     True       False    False     True   
2 2019-01-01 00:02:49  0.371795    False        True    False     True   
3 2019-01-01 00:03:13  0.371795     True        True    False     True   
4 2019-01-01 00:04:33  0.189103    False       False     True    False   

   Switzerland  Goldcard  Moneycard  Simplecard  UK_Card  Diners  Master  \
0        False     False      False       False     True   False   False   
1        False     False      False       False     True   False   False   
2        False     False      False       False     True    True   False   
3        False     False      False       False     True    True   False   
4        False     False      False        True    False    True   False   

    Visa  num_tries  order_id      hour  is_weekend  
0   True          1

### Train and Test Dataset

In [590]:
input_data = input_data[input_data['num_tries'] == 1]                                                   # Only consider first tries

X = input_data[['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card']]     # Selected Features
y = input_data['success']                                                                               # Target Variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
#training,test = train_test_split(model_data, train_size = 0.7, test_size = 0.3, shuffle=True)          # Performance Verschlechterung: 79% (Train and Testset Settings)
#training, valid = train_test_split(training, train_size = 0.7, test_size =0.3, shuffle=True)
print(f"Train Dataset: \n{X_train.head()}")

# --- Check Dataset Distribution
failed_transaction = y_train[y_train == False]
succeeded_transaction = y_train[y_train == True]
num_failed = len(failed_transaction.index)
num_succeeded = len(succeeded_transaction.index)
print("Number of failed transactions: " + str(num_failed))
print("Number of succeeded transactions: " + str(num_succeeded))

Train Dataset: 
           hour    amount  3D_secured  is_weekend  Goldcard  Simplecard  \
42485  0.182692  0.182692       False       False     False        True   
18229  0.126603  0.126603        True       False     False       False   
14138  0.333333  0.333333        True       False     False       False   
47317  0.198718  0.198718        True       False     False        True   
35141  0.139423  0.139423        True        True     False       False   

       UK_Card  
42485    False  
18229     True  
14138     True  
47317    False  
35141     True  
Number of failed transactions: 24073
Number of succeeded transactions: 6223


### Baseline Model - Logistic Regression

In [591]:
logReg_model = LogisticRegression(max_iter=200, random_state=0, solver='lbfgs', multi_class='multinomial')                     # Create a Logistic Regression model
logReg_model.fit(X_train, y_train)

y_pred_logReg = logReg_model.predict(X_test)            # Make predictions on the test set
proba_pred_logReg = logReg_model.predict_proba(X_test)  # Predict probabilities for the test data

# --- Extract the probabilities for the positive class (success)
success_prob_logReg = proba_pred_logReg[:, 1]
print(success_prob_logReg)

[0.28635276 0.23962683 0.24375885 ... 0.20554243 0.12375475 0.14714824]


### Baseline Model - Decision Tree

In [592]:
decTree_model = DecisionTreeClassifier(max_depth=4, criterion='entropy')    # Create a Decision Tree classifier
decTree_model.fit(X_train, y_train)
y_pred_decTree = decTree_model.predict(X_test)                              # Make predictions on the test set

### Baseline Model - Random Forest

In [593]:
ranForest_model = RandomForestClassifier(max_depth=4)       # Create a Random Forest classifier
ranForest_model.fit(X_train, y_train)
y_pred_ranForest = ranForest_model.predict(X_test)          # Make predictions on the test set

### Baseline Model - XGBoost

In [594]:
param_grid = {
    'learning_rate': [0.01],
    'max_depth': [3, 6],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'n_estimators': [100, 1000]
}

xgBoost_model = XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=1000,
    subsample=0.8
)

# --- Perform grid search cross-validation
#grid_search = GridSearchCV(estimator=xgBoost_model, param_grid=param_grid, cv=3, scoring='accuracy')
#grid_search.fit(X_train, y_train)

# --- Get the best hyperparameters and the corresponding model
#best_params = grid_search.best_params_
#best_model = grid_search.best_estimator_

#best_model = grid_search.best_estimator_
#print(f"Best Hyperparameters for XGBoost: {best_params}")

xgBoost_model.fit(X_train, y_train)
y_pred_xgBoost = xgBoost_model.predict(X_test)

### Baseline Model - Naive Bayes

In [595]:
nBay_model = GaussianNB()
nBay_model.fit(X_train, y_train)
y_pred_nBay = nBay_model.predict(X_test)

### Baseline Model - SVC

In [596]:
svc_model = SVC(kernel='rbf', C=1, gamma='auto')
svc_model.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)

### Baseline Model - kNeighbors Classifier

In [597]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

### Model Evaluation

In [598]:
# --- Calculate the accuracy of the Logistic Regression model
accuracy_logReg = accuracy_score(y_test, y_pred_logReg)
print("Logistic Regression Accuracy:", accuracy_logReg)
# 81%
# 81% (Entfernung collreationsspalten wg. OneHot & max_iter definiert)
# 57% (Balancing)
# 57% (Balancing and Normalized)
# 81% (No Balancing and Normalized)
# 79% (StandardScaler instead of MinMaxScaler and removed duplicates from raw dataset)
# 79% (Feature Selection and MinMaxScaler)
# 80% (Only First Tries and ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card'])

# --- Calculate the accuracy of the Decision Tree model
accuracy_decTree = accuracy_score(y_test, y_pred_decTree)
print("Decision Tree Accuracy:", accuracy_decTree)
# 71%
# 75% (Entfernung collreationsspalten wg. OneHot)
# 57% (Balancing)
# 57% (Balancing and Normalized)
# 75% (No Balancing and Normalized)
# 73% (StandardScaler instead of MinMaxScaler and removed duplicates from raw dataset)
# 79% (Feature Selection and MinMaxScaler)
# 80% (Only First Tries and ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card'])


# --- Calculate the accuracy of the Random Forest model
accuracy_ranForest = accuracy_score(y_test, y_pred_ranForest)
print("Random Forest Accuracy:", accuracy_ranForest)
# 77%
# 75% (Entfernung collreationsspalten wg. OneHot)
# 57% (Balancing)
# 57% (Balancing and Normalized)
# 75% (No Balancing and Normalized)
# 73% (StandardScaler instead of MinMaxScaler and removed duplicates from raw dataset)
# 79% (Feature Selection and MinMaxScaler)
# 80% (Only First Tries and ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card'])

# --- Calculate the accuracy of the XGBoost model
accuracy_xgBoost = accuracy_score(y_test, y_pred_xgBoost)
print("XGBoost Accuracy:", accuracy_xgBoost)
# 81% (No Balancing and Normalized)
# 79% (StandardScaler instead of MinMaxScaler and removed duplicates from raw dataset)
# 79% (Feature Selection and MinMaxScaler)
# 80% (Only First Tries and ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card'])

# --- Calculate the accuracy of the Naive Bayes model
accuracy_nBay = accuracy_score(y_test, y_pred_nBay)
print("Naive Bayes Accuracy:", accuracy_nBay)
# 81% (No Balancing and Normalized)
# 79% (StandardScaler instead of MinMaxScaler and removed duplicates from raw dataset)
# 78% (Feature Selection and MinMaxScaler)
# 79% (Only First Tries and ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card'])


# --- Calculate the accuracy of the SVC model
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print("SVC Accuracy:", accuracy_svc)
# 81% (No Balancing and Normalized)
# 79% (StandardScaler instead of MinMaxScaler and removed duplicates from raw dataset)
# 79% (Feature Selection and MinMaxScaler)
# 80% (Only First Tries and ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card'])

# --- Calculate the accuracy of the kNeighbors Classifier model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("kNeighbors Classifier Accuracy:", accuracy_knn)
# 76% (No Balancing and Normalized)
# 74% (StandardScaler instead of MinMaxScaler and removed duplicates from raw dataset)
# 79% (Feature Selection and MinMaxScaler)
# 74% (Only First Tries and ['hour', 'amount', '3D_secured', 'is_weekend', 'Goldcard', 'Simplecard', 'UK_Card'])

Logistic Regression Accuracy: 0.8018220227092685
Decision Tree Accuracy: 0.8040665434380776
Random Forest Accuracy: 0.7993134407182466
XGBoost Accuracy: 0.8023501452336942
Naive Bayes Accuracy: 0.7896752046474782
SVC Accuracy: 0.8047266965936097
kNeighbors Classifier Accuracy: 0.7391074729337206
