# Baseline model (logistic regression)

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Read the processed PSP model data from the Excel workbook into a DataFrame
file_path = '../data/processed/PSP_model_data.xlsx'
df = pd.read_excel(io=file_path)

In [2]:
# Drop columns that won't be used in the model
drop_columns = ['index', 'tmsp', 'hour', 'attempts']
df_model = df.drop(columns=drop_columns)

# Encode categorical variables as integer codes.
# Every column gets its OWN fitted LabelEncoder, kept in `label_encoders`, so the
# string -> code mapping of each column can be re-applied to new data (or inverted)
# later. A single shared encoder would be refit on every column and only remember
# the mapping of the last one.
label_columns = ['country', 'PSP', 'card', 'weekday', 'time_of_day', 'country_PSP', 'PSP_3D_secured']
label_encoders = {}
for col in label_columns:
    label_encoder = LabelEncoder()
    df_model[col] = label_encoder.fit_transform(df_model[col].astype(str))
    label_encoders[col] = label_encoder

# Split the data into training and test sets (80/20, stratified on the target)
X = df_model.drop('success', axis=1)
y = df_model['success']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features; the scaler is fit on the training split only and then
# applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the first few rows of the scaled training data
pd.DataFrame(X_train_scaled, columns=X_train.columns).head()

Unnamed: 0,country,amount,PSP,3D_secured,card,fee,weekday,time_of_day,month,quarter,fee_to_amount_ratio,log_amount,country_PSP,PSP_3D_secured,previous_successes,previous_failures,transaction_duration
0,-0.015475,-0.647278,-0.248014,-0.561389,1.481725,-0.691158,-1.540419,-0.444938,-0.957841,0.0,-0.33892,-0.317704,-0.101105,-0.366451,-0.26956,-0.329465,-0.278471
1,-0.015475,-0.470256,-0.248014,-0.561389,-0.057004,-0.691158,1.38203,0.44684,1.044015,0.0,-0.351136,-0.14215,-0.101105,-0.366451,-0.26956,-0.329465,-0.278471
2,1.564492,-0.626452,-2.373362,-0.561389,-0.057004,1.791962,0.894955,0.44684,-0.957841,0.0,0.66055,-0.295983,0.640776,-2.441463,-0.26956,-0.329465,-0.278471
3,1.564492,0.748067,-0.248014,-0.561389,-0.057004,-0.691158,1.38203,-0.444938,1.044015,0.0,-0.394094,0.712489,1.382657,-0.366451,-0.26956,-0.329465,-0.278471
4,1.564492,-0.980494,-0.248014,-0.561389,-1.595733,-0.691158,-0.566269,-1.336716,-0.957841,0.0,-0.305493,-0.714679,1.382657,-0.366451,-0.26956,-0.329465,-0.278471


In [3]:
# Fit a logistic-regression baseline on the standardized training features
log_reg_model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict labels for the held-out test split
y_pred = log_reg_model.predict(X_test_scaled)

# Score the predictions: overall accuracy, confusion matrix and per-class report
accuracy, conf_matrix, class_report = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)

accuracy, conf_matrix, class_report

(0.9588375322356675,
 array([[8024,   12],
        [ 403, 1643]]),
 '              precision    recall  f1-score   support\n\n           0       0.95      1.00      0.97      8036\n           1       0.99      0.80      0.89      2046\n\n    accuracy                           0.96     10082\n   macro avg       0.97      0.90      0.93     10082\nweighted avg       0.96      0.96      0.96     10082\n')

# XGBoost classifier

In [4]:
# Build the XGBoost classifier (log-loss evaluation metric, fixed seed) and fit it
# on the standardized training features
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)

# Predict labels for the held-out test split
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Score the predictions with the same three metrics used for the baseline.
# NOTE(review): the recorded output shows a perfect score on the test split, which
# often points to target leakage in the features — confirm that every feature is
# actually known before the transaction outcome.
class_report_xgb = classification_report(y_test, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

accuracy_xgb, conf_matrix_xgb, class_report_xgb

(1.0,
 array([[8036,    0],
        [   0, 2046]]),
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00      8036\n           1       1.00      1.00      1.00      2046\n\n    accuracy                           1.00     10082\n   macro avg       1.00      1.00      1.00     10082\nweighted avg       1.00      1.00      1.00     10082\n')

## Cross-validation

In [5]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the XGBoost model on the scaled training data
cv_scores = cross_val_score(
    estimator=xgb_model,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Summarize the fold scores by their mean and standard deviation
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

cv_scores, cv_mean, cv_std

(array([1., 1., 1., 1., 1.]), 1.0, 0.0)

In [None]:
# Helper: success probability for incoming rows, given a model and its preprocessor
def predict_success_probability(model, new_data, preprocessor):
    """
    Return the predicted probability of success for each row of ``new_data``.

    ``new_data`` is first passed through ``preprocessor.transform`` and the result
    is handed to ``model.predict_proba``; the second probability column (index 1)
    is returned.
    """
    class_probabilities = model.predict_proba(preprocessor.transform(new_data))
    # Column 1 is taken as the "success" class (class label 1)
    return class_probabilities[:, 1]

# Function to select the best PSP based on success probability and transaction fee
def select_best_psp(success_probabilities, transaction_fees, w_success=0.7, w_cost=0.3):
    """
    Select the best PSP based on success probability and transaction fee.

    Each candidate is scored as ``w_success * probability + w_cost * cost_factor``,
    where ``cost_factor`` is 1 for the cheapest fee and 0 for the most expensive
    one (min-max normalized and inverted).

    Args:
        success_probabilities: Sequence of predicted success probabilities, one per candidate.
        transaction_fees: Sequence of fees, aligned with ``success_probabilities``.
        w_success: Weight of the success probability in the score.
        w_cost: Weight of the cost factor in the score.

    Returns:
        Tuple ``(best_psp_index, best_psp_score)``: the position of the highest
        scoring candidate and its score. If the two sequences differ in length,
        only the leading pairs are scored.

    Raises:
        ValueError: If no fees or no probabilities are supplied.
    """
    fees = list(transaction_fees)
    if not fees:
        raise ValueError("transaction_fees must contain at least one fee")

    # Normalize the transaction fee to get the cost factor
    min_fee = min(fees)
    fee_range = max(fees) - min_fee
    if fee_range == 0:
        # All fees are identical: cost cannot discriminate between candidates, so
        # give every candidate the same (best) cost factor instead of dividing by zero.
        cost_factors = [1.0] * len(fees)
    else:
        cost_factors = [1 - (fee - min_fee) / fee_range for fee in fees]

    # Calculate the score for each PSP
    scores = [w_success * prob + w_cost * cost for prob, cost in zip(success_probabilities, cost_factors)]
    if not scores:
        raise ValueError("success_probabilities must contain at least one probability")

    # Select the PSP with the highest score
    best_psp_index = np.argmax(scores)
    best_psp_score = scores[best_psp_index]

    return best_psp_index, best_psp_score

# Simulate new incoming data (for demonstration, 4 rows sampled from the feature frame X)
new_data = X.sample(4, random_state=1)

# Extract the transaction fees for the new data (assuming it's a known quantity at the time of transaction)
new_data_fees = new_data['fee'].values

# Predict the success probability for the new data.
# The RAW rows are passed in: predict_success_probability applies the preprocessor
# itself, so transforming here as well would preprocess the data twice.
# NOTE(review): `pipeline` is not defined in any cell visible in this notebook; it is
# expected to be a fitted Pipeline with 'preprocessor' and 'classifier' steps —
# confirm where it is created so this cell survives Restart & Run All.
new_data_prob = predict_success_probability(pipeline.named_steps['classifier'],
                                            new_data,
                                            pipeline.named_steps['preprocessor'])

# Select the best PSP based on the predicted success probabilities and transaction fees
best_psp_index, best_psp_score = select_best_psp(new_data_prob, new_data_fees)

# Renumber the sampled rows 0..n-1 (new frame instead of in-place mutation) and
# pick the winning row by position
new_data = new_data.reset_index(drop=True)
best_psp_data = new_data.iloc[best_psp_index]
best_psp_data, best_psp_score


In [None]:
# Use the entire pipeline (including preprocessor and classifier) for making predictions
def predict_success_probability_full_pipeline(pipeline, new_data):
    """
    Predict the success probability using the trained pipeline.

    Args:
        pipeline: Fitted estimator exposing ``predict_proba`` (e.g. a scikit-learn
            Pipeline whose final step is a probabilistic classifier).
        new_data: Raw rows to score; they are passed to the pipeline unchanged.

    Returns:
        The second column of ``pipeline.predict_proba(new_data)``, i.e. one
        probability per input row for class '1' / 'success'.
    """
    # predict_proba gives one probability per class for each row; column 1 is
    # taken as the probability of '1' or 'success'.
    success_probability = pipeline.predict_proba(new_data)[:, 1]
    return success_probability

# Demo input: 4 rows drawn from the feature frame X with a fixed seed
new_data = X.sample(n=4, random_state=1)

# Fees of the sampled rows (assumed to be known at transaction time)
new_data_fees = new_data['fee'].values

# Score the sampled rows with the complete pipeline (preprocessing + classifier)
# NOTE(review): `pipeline` is not defined in any cell visible in this notebook — confirm its origin.
new_data_prob = predict_success_probability_full_pipeline(pipeline, new_data)

# Combine success probability and fee into a score and pick the winner
best_psp_index, best_psp_score = select_best_psp(new_data_prob, new_data_fees)

# Renumber the rows 0..n-1, then look up the winning row by position
new_data = new_data.reset_index(drop=True)
best_psp_data = new_data.iloc[best_psp_index]
best_psp_data, best_psp_score


In [None]:
from sklearn.linear_model import LogisticRegression

# Assemble a second pipeline: the shared preprocessor followed by a default
# LogisticRegression classifier (which exposes predict_proba)
# NOTE(review): `preprocessor` is not defined in any cell visible in this notebook — confirm its origin.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Train the pipeline on the (unscaled) training split
pipeline_lr.fit(X_train, y_train)

# Success probabilities of the demo rows according to the logistic-regression pipeline
new_data_prob_lr = predict_success_probability_full_pipeline(pipeline_lr, new_data)

# Score the candidates on probability and fee, then pick the best one
best_psp_index_lr, best_psp_score_lr = select_best_psp(new_data_prob_lr, new_data_fees)

# Renumber the rows 0..n-1, then look up the winning row by position
new_data = new_data.reset_index(drop=True)
best_psp_data_lr = new_data.iloc[best_psp_index_lr]
best_psp_data_lr, best_psp_score_lr
