# Baseline model

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

# Read the prepared Excel workbook into a DataFrame.
# The path is relative, so the file has to be in the notebook's working directory.
file_path = 'PSP_Jan_Feb_2019_ready.xlsx'
df = pd.read_excel(io=file_path)

In [5]:
# Drop columns that won't be used in the model
drop_columns = ['index', 'tmsp', 'hour']
df_model = df.drop(columns=drop_columns)

# Encode categorical variables as integer codes.
# A fresh LabelEncoder is fitted for each column and stored in `label_encoders`,
# so the category <-> code mapping of every column stays available afterwards
# (e.g. for `inverse_transform` or for encoding new data). Re-fitting a single
# shared encoder would leave only the last column's mapping behind.
# The resulting codes are identical to fitting one shared encoder column by column.
# NOTE(review): the codes follow the sorted order of the category strings, which
# a linear model treats as an ordering — consider one-hot encoding for nominal
# features.
label_columns = ['country', 'PSP', 'card', 'weekday', 'time_of_day', 'country_PSP', 'PSP_3D_secured']
label_encoders = {}
for col in label_columns:
    label_encoder = LabelEncoder()
    # astype(str) also turns missing values into a regular string category
    df_model[col] = label_encoder.fit_transform(df_model[col].astype(str))
    label_encoders[col] = label_encoder

# Split the data into training and test sets (stratified on the target so both
# parts keep the same class ratio)
# NOTE(review): confirm that none of the remaining features (e.g. `fee`,
# `fee_to_amount_ratio`, `previous_successes`, `previous_failures`) is derived
# from the outcome of the transaction being predicted — that would be target
# leakage.
X = df_model.drop('success', axis=1)
y = df_model['success']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features: the scaler is fitted on the training part only and then
# applied unchanged to the test part
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the first few rows of the scaled training data
pd.DataFrame(X_train_scaled, columns=X_train.columns).head()

Unnamed: 0,country,amount,PSP,3D_secured,card,attempts,fee,weekday,time_of_day,month,quarter,fee_to_amount_ratio,log_amount,country_PSP,PSP_3D_secured,previous_successes,previous_failures,transaction_duration
0,-0.015475,-0.647278,-0.248014,-0.561389,1.481725,-0.413814,-0.691158,-1.540419,-0.444938,-0.957841,0.0,-0.33892,-0.317704,-0.101105,-0.366451,-0.26956,-0.329465,-0.278471
1,-0.015475,-0.470256,-0.248014,-0.561389,-0.057004,-0.413814,-0.691158,1.38203,0.44684,1.044015,0.0,-0.351136,-0.14215,-0.101105,-0.366451,-0.26956,-0.329465,-0.278471
2,1.564492,-0.626452,-2.373362,-0.561389,-0.057004,-0.413814,1.791962,0.894955,0.44684,-0.957841,0.0,0.66055,-0.295983,0.640776,-2.441463,-0.26956,-0.329465,-0.278471
3,1.564492,0.748067,-0.248014,-0.561389,-0.057004,-0.413814,-0.691158,1.38203,-0.444938,1.044015,0.0,-0.394094,0.712489,1.382657,-0.366451,-0.26956,-0.329465,-0.278471
4,1.564492,-0.980494,-0.248014,-0.561389,-1.595733,-0.413814,-0.691158,-0.566269,-1.336716,-0.957841,0.0,-0.305493,-0.714679,1.382657,-0.366451,-0.26956,-0.329465,-0.278471


In [6]:
# Fit the baseline logistic regression on the standardized training features
log_reg_model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict the class of every row in the held-out test set
y_pred = log_reg_model.predict(X_test_scaled)

# Score the predictions: per-class report, confusion matrix and overall accuracy
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# The cell's displayed result is the tuple of all three metrics
(accuracy, conf_matrix, class_report)

(0.9589367189049792,
 array([[8024,   12],
        [ 402, 1644]]),
 '              precision    recall  f1-score   support\n\n           0       0.95      1.00      0.97      8036\n           1       0.99      0.80      0.89      2046\n\n    accuracy                           0.96     10082\n   macro avg       0.97      0.90      0.93     10082\nweighted avg       0.96      0.96      0.96     10082\n')

# XGBoost classifier

In [7]:
# Build the gradient-boosted tree classifier and fit it on the same scaled
# training features that the logistic regression used
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Predict the class of every row in the held-out test set
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Score the predictions: per-class report, confusion matrix and overall accuracy
# NOTE(review): the recorded run scores exactly 1.0 with an error-free confusion
# matrix. A perfect result on real data usually points to target leakage —
# verify that no feature is computed from the outcome before trusting it.
class_report_xgb = classification_report(y_test, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# The cell's displayed result is the tuple of all three metrics
(accuracy_xgb, conf_matrix_xgb, class_report_xgb)

(1.0,
 array([[8036,    0],
        [   0, 2046]]),
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00      8036\n           1       1.00      1.00      1.00      2046\n\n    accuracy                           1.00     10082\n   macro avg       1.00      1.00      1.00     10082\nweighted avg       1.00      1.00      1.00     10082\n')

## Cross-validation

In [8]:
# Perform 5-fold cross-validation on the XGBoost model.
# The scaler lives inside the pipeline and the *unscaled* training features are
# passed in, so every fold fits its own StandardScaler on that fold's training
# part only. Cross-validating on `X_train_scaled` would reuse a scaler that has
# already seen the validation rows of every fold.
# cross_val_score clones the estimator for each fold, so the already fitted
# `xgb_model` is not modified; only its hyperparameters are reused.
xgb_cv_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', xgb_model)])
cv_scores = cross_val_score(xgb_cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')

# Calculate the mean and standard deviation of the cross-validation scores
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

cv_scores, cv_mean, cv_std

(array([1., 1., 1., 1., 1.]), 1.0, 0.0)

# SVM

In [None]:
# Drop columns that won't be used for modeling
# NOTE(review): unlike the baseline cell, 'hour' is kept as a feature here —
# confirm that this difference is intentional.
df_model = df.drop(['tmsp', 'index'], axis=1)

# Define features and target variable
X = df_model.drop('success', axis=1)
y = df_model['success']

# Identify numerical and categorical columns by dtype.
# Columns of any other dtype are matched by neither list and are therefore
# dropped by the ColumnTransformer below (its default for unlisted columns).
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines for both numerical and categorical data:
# numeric columns get median imputation followed by standardization ...
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# ... categorical columns get a constant 'missing' category followed by one-hot
# encoding; categories unseen during fit are ignored at predict time.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine both numerical and categorical transformations into one
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)])

# Create a pipeline with a SVM classifier (default SVC hyperparameters)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', SVC())])

# Split data into training and test sets.
# stratify=y keeps the class ratio equal in both parts and matches the settings
# of the baseline split (same test_size and random_state), so the SVM is
# evaluated on a partition comparable to the other models and the re-assigned
# y_train / y_test stay consistent with the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the model (preprocessing is fitted on the training part only)
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy, classification_rep
