# BASELINE MODEL NOTEBOOK
Felix A. Westphal
DLMDWME01

### Import

In [279]:
import pandas as pd
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Parameter

In [280]:
FILE_PATH_BALANCED_DATA = r"../data/processed/Balanced_Input_Data.csv"
FILE_PATH_NORMALIZED_DATA = r"../data/processed/Normalized_Input_Data.csv"

### Load Data

In [281]:
input_data = pd.read_csv(FILE_PATH_NORMALIZED_DATA, parse_dates=[0])                        # Load input data file
print(f"Data loaded from Excel: \n{input_data.head()}")

Data loaded from Excel: 
                 tmsp    amount  success  3D_secured  Austria  Germany  \
0 2019-01-01 00:01:11  0.133013    False       False    False     True   
1 2019-01-01 00:01:17  0.133013     True       False    False     True   
2 2019-01-01 00:02:49  0.371795    False        True    False     True   
3 2019-01-01 00:03:13  0.371795     True        True    False     True   
4 2019-01-01 00:04:33  0.189103    False       False     True    False   

   Switzerland  Goldcard  Moneycard  Simplecard  UK_Card  Diners  Master  \
0        False     False      False       False     True   False   False   
1        False     False      False       False     True   False   False   
2        False     False      False       False     True    True   False   
3        False     False      False       False     True    True   False   
4        False     False      False        True    False    True   False   

    Visa  num_tries  order_id  
0   True          1         1  
1   True 

### Train and Test Dataset

In [282]:
model_data = input_data.drop('tmsp', axis=1)                                                         # Remove timestamp column for training
model_data = model_data.drop('order_id', axis=1)
model_data = model_data.drop('Austria', axis=1)
model_data = model_data.drop('Goldcard', axis=1)
model_data = model_data.drop('Diners', axis=1)
X = model_data.drop('success', axis=1)                                                                  # Features
y = model_data['success']                                                                               # Target Variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
#training,test = train_test_split(model_data, train_size = 0.7, test_size = 0.3, shuffle=True)          # Performance Verschlechterung: 79% (Train and Testset Settings)
#training, valid = train_test_split(training, train_size = 0.7, test_size =0.3, shuffle=True)

print(model_data.head())

     amount  success  3D_secured  Germany  Switzerland  Moneycard  Simplecard  \
0  0.133013    False       False     True        False      False       False   
1  0.133013     True       False     True        False      False       False   
2  0.371795    False        True     True        False      False       False   
3  0.371795     True        True     True        False      False       False   
4  0.189103    False       False    False        False      False        True   

   UK_Card  Master   Visa  num_tries  
0     True   False   True          1  
1     True   False   True          2  
2     True   False  False          1  
3     True   False  False          1  
4    False   False  False          1  


### Baseline Model - Logistic Regression

In [283]:
logReg_model = LogisticRegression(max_iter=200, random_state=0, solver='lbfgs', multi_class='multinomial')                     # Create a Logistic Regression model
logReg_model.fit(X_train, y_train)

y_pred_logReg = logReg_model.predict(X_test)            # Make predictions on the test set
proba_pred_logReg = logReg_model.predict_proba(X_test)  # Predict probabilities for the test data

# --- Extract the probabilities for the positive class (success)
success_prob_logReg = proba_pred_logReg[:, 1]
print(success_prob_logReg)

[0.17213764 0.14943815 0.16212992 ... 0.1521076  0.40138925 0.21588472]


### Baseline Model - Decision Tree

In [284]:
decTree_model = DecisionTreeClassifier()            # Create a Decision Tree classifier
decTree_model.fit(X_train, y_train)
y_pred_decTree = decTree_model.predict(X_test)      # Make predictions on the test set

### Baseline Model - Random Forest

In [285]:
ranForest_model = RandomForestClassifier()          # Create a Random Forest classifier
ranForest_model.fit(X_train, y_train)
y_pred_ranForest = ranForest_model.predict(X_test)  # Make predictions on the test set

### Baseline Model - XGBoost

In [286]:
xgBoost_model = XGBClassifier()
xgBoost_model.fit(X_train, y_train)
y_pred_xgBoost = xgBoost_model.predict(X_test)

### Baseline Model - Naive Bayes

In [287]:
nBay_model = GaussianNB()
nBay_model.fit(X_train, y_train)
y_pred_nBay = nBay_model.predict(X_test)

### Baseline Model - SVC

In [288]:
svc_model = SVC(kernel='rbf', C=1, gamma='auto')
svc_model.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)

### Baseline Model - kNeighbors Classifier

In [289]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

### Model Evaluation

In [290]:
# --- Calculate the accuracy of the Logistic Regression model
accuracy_logReg = accuracy_score(y_test, y_pred_logReg)
print("Logistic Regression Accuracy:", accuracy_logReg)
# 81%
# 81% (Entfernung collreationsspalten wg. OneHot & max_iter definiert)
# 57% (Balancing)
# 57% (Balancing and Normalized)
# 81% (No Balancing and Normalized)

# --- Calculate the accuracy of the Decision Tree model
accuracy_decTree = accuracy_score(y_test, y_pred_decTree)
print("Decision Tree Accuracy:", accuracy_decTree)
# 71%
# 75% (Entfernung collreationsspalten wg. OneHot)
# 57% (Balancing)
# 57% (Balancing and Normalized)
# 75% (No Balancing and Normalized)

# --- Calculate the accuracy of the Random Forest model
accuracy_ranForest = accuracy_score(y_test, y_pred_ranForest)
print("Random Forest Accuracy:", accuracy_ranForest)
# 77%
# 75% (Entfernung collreationsspalten wg. OneHot)
# 57% (Balancing)
# 57% (Balancing and Normalized)
# 75% (No Balancing and Normalized)

# --- Calculate the accuracy of the XGBoost model
accuracy_xgBoost = accuracy_score(y_test, y_pred_xgBoost)
print("XGBoost Accuracy:", accuracy_xgBoost)
# 81% (No Balancing and Normalized)

# --- Calculate the accuracy of the Naive Bayes model
accuracy_nBay = accuracy_score(y_test, y_pred_nBay)
print("Naive Bayes Accuracy:", accuracy_nBay)
# 81% (No Balancing and Normalized)

# --- Calculate the accuracy of the SVC model
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print("SVC Accuracy:", accuracy_svc)
# 81% (No Balancing and Normalized)

# --- Calculate the accuracy of the kNeighbors Classifier model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("kNeighbors Classifier Accuracy:", accuracy_knn)
# 76% (No Balancing and Normalized)

Logistic Regression Accuracy: 0.8112477682999405
Decision Tree Accuracy: 0.7543146201150566
Random Forest Accuracy: 0.7481650466177345
XGBoost Accuracy: 0.8094624082523308
Naive Bayes Accuracy: 0.8094624082523308
SVC Accuracy: 0.81362824836342
kNeighbors Classifier Accuracy: 0.7568934735171593
