In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# Verzeichnis für gespeicherte Modelle
model_dir = '/Users/userlow/Documents/kreditkarte_aufgabe/models'
os.makedirs(model_dir, exist_ok=True)

# Modellpfad definieren
model_path = os.path.join(model_dir, 'basis_model.joblib')

# Daten einlesen (Beispieldaten, bitte durch Ihre Daten ersetzen)
file_path = '/Users/userlow/Documents/kreditkarte_aufgabe/data/bereinigte_daten.xlsx'
data = pd.read_excel(file_path)

# Umwandeln von kategorischen Merkmalen in numerische
data['country'] = data['country'].astype('category').cat.codes
data['PSP'] = data['PSP'].astype('category').cat.codes
data['card'] = data['card'].astype('category').cat.codes

# Features und Zielvariable definieren
X = data[['amount', 'country', 'PSP', '3D_secured', 'card']]
y = data['success']

# Datenaufteilung in Trainings- und Testset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prüfen, ob das Modell bereits existiert
if os.path.exists(model_path):
    print(f"Model already exists. Loading the model from {model_path}...")
    model = joblib.load(model_path)
else:
    # Basismodell: RandomForestClassifier
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42))
    ])
    
    # Modelltraining
    model.fit(X_train, y_train)
    
    # Modell speichern
    joblib.dump(model, model_path)
    print(f"Model saved at {model_path}")

# Vorhersagen
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Modellbewertung
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Annahmen für die Transaktionsgebühren
transaction_fees = {
    0: {'success': 2, 'failure': 5},  # PSP 0
    1: {'success': 1, 'failure': 3},  # PSP 1
    2: {'success': 0.5, 'failure': 1},  # PSP 2
    3: {'success': 1.5, 'failure': 4}   # PSP 3
}

# Berechnung der Gebühren vor der Prognose (basierend auf den tatsächlichen Erfolgen)
def calculate_fees(df, success_col):
    fees = 0
    for index, row in df.iterrows():
        psp = row['PSP']
        success = row[success_col]
        fees += transaction_fees[psp]['success'] if success == 1 else transaction_fees[psp]['failure']
    return fees

fees_before = calculate_fees(data, 'success')

# Berechnung der Gebühren nach der Prognose (basierend auf den vorhergesagten Erfolgen)
def calculate_predicted_fees(X_test, y_test, y_pred, transaction_fees):
    X_test_with_actual = X_test.copy()
    X_test_with_actual['success'] = y_test.values
    X_test_with_actual['predicted_success'] = y_pred
    
    fees = 0
    for index, row in X_test_with_actual.iterrows():
        psp = row['PSP']
        success = row['predicted_success']
        fees += transaction_fees[psp]['success'] if success == 1 else transaction_fees[psp]['failure']
    return fees

fees_after = calculate_predicted_fees(X_test, y_test, y_pred, transaction_fees)

print(f"Transaktionsgebühren vor der Prognose: {fees_before}")
print(f"Transaktionsgebühren nach der Prognose: {fees_after}")

Model saved at /Users/userlow/Documents/kreditkarte_aufgabe/models/basis_model.joblib
Accuracy: 0.7318946598390637
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      4329
           1       0.30      0.21      0.25      1139

    accuracy                           0.73      5468
   macro avg       0.55      0.54      0.54      5468
weighted avg       0.70      0.73      0.71      5468

Transaktionsgebühren vor der Prognose: 73291.5
Transaktionsgebühren nach der Prognose: 15496.0
