# BASELINE MODEL NOTEBOOK
Felix A. Westphal
DLMDWME01

### Import

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Parameter

In [66]:
FILE_PATH_EXTENDED_DATA = r"../data/processed/Extended_Input_Data.csv"

### Load Data

In [67]:
extended_data = pd.read_csv(FILE_PATH_EXTENDED_DATA, parse_dates=[0])                        # Load input data file
print(f"Extended Data loaded from Excel: \n{extended_data.head()}")

Extended Data loaded from Excel: 
                 tmsp  amount  success  3D_secured  Austria  Germany  \
0 2019-01-01 00:01:11    89.0    False       False    False     True   
1 2019-01-01 00:01:17    89.0     True       False    False     True   
2 2019-01-01 00:02:49   238.0    False        True    False     True   
3 2019-01-01 00:03:13   238.0     True        True    False     True   
4 2019-01-01 00:04:33   124.0    False       False     True    False   

   Switzerland  Goldcard  Moneycard  Simplecard  UK_Card  Diners  Master  \
0        False     False      False       False     True   False   False   
1        False     False      False       False     True   False   False   
2        False     False      False       False     True    True   False   
3        False     False      False       False     True    True   False   
4        False     False      False        True    False    True   False   

    Visa  num_tries  order_id  
0   True          1         1  
1   True    

### Train and Test Dataset

In [68]:
model_data = extended_data.drop('tmsp', axis=1)                                                         # Remove timestamp column for training
X = model_data.drop('success', axis=1)                                                                  # Features
y = model_data['success']                                                                               # Target Variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Baseline Model - Logistic Regression

In [69]:
logReg_model = LogisticRegression()                     # Create a Logistic Regression model
logReg_model.fit(X_train, y_train)

y_pred_logReg = logReg_model.predict(X_test)            # Make predictions on the test set
proba_pred_logReg = logReg_model.predict_proba(X_test)  # Predict probabilities for the test data

# --- Extract the probabilities for the positive class (success)
success_prob_logReg = proba_pred_logReg[:, 1]
print(success_prob_logReg)

[0.20364129 0.16173504 0.15127169 ... 0.16972879 0.21199467 0.24752795]


### Baseline Model - Decision Tree

In [70]:
decTree_model = DecisionTreeClassifier()            # Create a Decision Tree classifier
decTree_model.fit(X_train, y_train)
y_pred_decTree = decTree_model.predict(X_test)      # Make predictions on the test set

### Baseline Model - Random Forest

In [71]:
ranForest_model = RandomForestClassifier()          # Create a Random Forest classifier
ranForest_model.fit(X_train, y_train)
y_pred_ranForest = ranForest_model.predict(X_test)  # Make predictions on the test set

### Model Evaluation

In [72]:
# --- Calculate the accuracy of the Logistic Regression model
accuracy_logReg = accuracy_score(y_test, y_pred_logReg)
print("Logistic Regression Accuracy:", accuracy_logReg)

# --- Calculate the accuracy of the Decision Tree model
accuracy_decTree = accuracy_score(y_test, y_pred_decTree)
print("Decision Tree Accuracy:", accuracy_decTree)

# --- Calculate the accuracy of the Random Forest model
accuracy_ranForest = accuracy_score(y_test, y_pred_ranForest)
print("Random Forest Accuracy:", accuracy_ranForest)

Logistic Regression Accuracy: 0.8095615949216425
Decision Tree Accuracy: 0.7093830589168816
Random Forest Accuracy: 0.7674072604641936
