In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Directory in which fitted models are cached between runs.
model_dir = '/Users/userlow/Documents/kreditkarte_aufgabe/models'
os.makedirs(model_dir, exist_ok=True)

# Load the cleaned transaction data (sample location; replace with your own file).
file_path = '/Users/userlow/Documents/kreditkarte_aufgabe/data/bereinigte_daten.xlsx'
data = pd.read_excel(file_path)

# Replace each categorical column by its integer category code so the
# estimators below receive purely numeric features.
data = data.assign(
    country=data['country'].astype('category').cat.codes,
    PSP=data['PSP'].astype('category').cat.codes,
    card=data['card'].astype('category').cat.codes,
)

# Target (transaction outcome) and the feature matrix used by every model.
y = data['success']
X = data[['amount', 'country', 'PSP', '3D_secured', 'card']]

# Hold out 20 % of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Baseline model: a random forest with default hyperparameters. It is fitted
# only when no cached copy exists on disk; otherwise the cached copy is reused.
basis_model_path = os.path.join(model_dir, 'basis_model.joblib')
if not os.path.exists(basis_model_path):
    print("Training basis model...")
    basis_model = RandomForestClassifier(random_state=42)
    basis_model.fit(X_train, y_train)
    joblib.dump(basis_model, basis_model_path)
    print(f"Basis model saved at {basis_model_path}")
else:
    print(f"Basis model already exists. Loading the model from {basis_model_path}...")
    basis_model = joblib.load(basis_model_path)

# Score the baseline on the held-out test split.
y_pred_basis = basis_model.predict(X_test)
accuracy_basis = accuracy_score(y_true=y_test, y_pred=y_pred_basis)
print(f"Basis Model Accuracy: {accuracy_basis}")
print(classification_report(y_true=y_test, y_pred=y_pred_basis))

# Fees for a set of transactions, based on the outcome stored in one column
def calculate_fees(df, success_col, fee_table=None):
    """Return the total transaction fee over all rows of ``df``.

    For every row the fee is looked up by the row's ``'PSP'`` value: the
    ``'success'`` fee when the value in ``success_col`` equals 1, the
    ``'failure'`` fee otherwise.

    Args:
        df: DataFrame with a ``'PSP'`` column and the column named by
            ``success_col``.
        success_col: Name of the column holding the outcome flag.
        fee_table: Mapping ``psp -> {'success': fee, 'failure': fee}``. Its
            keys must match the values stored in ``df['PSP']``. When omitted,
            the module-level ``transaction_fees`` table is used, which keeps
            existing two-argument calls working.

    Returns:
        The summed fee; ``0`` for an empty frame.

    Raises:
        KeyError: If a PSP value of ``df`` is missing from the fee table.
    """
    if fee_table is None:
        # Backward-compatible default: fall back to the global fee table.
        fee_table = transaction_fees
    # Only two columns are needed, so iterate those directly instead of
    # materialising a Series per row with iterrows().
    return sum(
        fee_table[psp]['success'] if outcome == 1 else fee_table[psp]['failure']
        for psp, outcome in zip(df['PSP'], df[success_col])
    )

# Fees from the actual outcomes, restricted to the test split so that this
# figure covers the same transactions as the fees computed from predictions
# on X_test (summing over the whole dataset would not be comparable).
fees_before = calculate_fees(X_test.assign(success=y_test), 'success')

# Fees after prediction (based on the predicted outcomes)
def calculate_predicted_fees(X_test, y_test, y_pred, transaction_fees):
    """Return the total fee implied by the predicted outcomes.

    Each row of ``X_test`` is paired positionally with the corresponding
    entry of ``y_pred``. The row's ``'PSP'`` value selects the fee entry;
    the ``'success'`` fee is charged when the prediction equals 1, the
    ``'failure'`` fee otherwise.

    Args:
        X_test: DataFrame with a ``'PSP'`` column. It is not modified.
        y_test: Actual outcomes. Kept for interface compatibility; the
            computation does not read it.
        y_pred: Sequence of predicted outcomes, one per row of ``X_test``.
        transaction_fees: Mapping ``psp -> {'success': fee, 'failure': fee}``
            keyed by the values stored in ``X_test['PSP']``.

    Returns:
        The summed fee; ``0`` for an empty frame.

    Raises:
        ValueError: If ``y_pred`` and ``X_test`` differ in length.
        KeyError: If a PSP value is missing from ``transaction_fees``.
    """
    psp_values = X_test['PSP']
    if len(y_pred) != len(psp_values):
        # zip() would silently truncate; fail loudly instead.
        raise ValueError(
            f"Length of y_pred ({len(y_pred)}) does not match "
            f"number of rows in X_test ({len(psp_values)})"
        )
    # No frame copy or helper columns needed: only PSP and the prediction
    # take part in the fee lookup.
    return sum(
        transaction_fees[psp]['success'] if predicted == 1
        else transaction_fees[psp]['failure']
        for psp, predicted in zip(psp_values, y_pred)
    )

# Fees implied by the baseline model's predictions on the test split.
fees_after_basis = calculate_predicted_fees(
    X_test=X_test,
    y_test=y_test,
    y_pred=y_pred_basis,
    transaction_fees=transaction_fees,
)

# Candidate estimators for the hyperparameter search, keyed by the name that
# is also used for the parameter grids and the cached model files.
models = dict(
    RandomForest=RandomForestClassifier(random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    XGBoost=XGBClassifier(random_state=42),
    LightGBM=LGBMClassifier(random_state=42),
    CatBoost=CatBoostClassifier(random_state=42, verbose=0),
    SVM=SVC(probability=True, random_state=42),
)

# Search spaces for GridSearchCV, keyed by the same names as `models`
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 63, 127],
        'subsample': [0.8, 0.9, 1.0],
        # LightGBM only applies row subsampling (bagging) when
        # subsample_freq > 0; with the default of 0 the `subsample` values
        # above would be ignored and the grid would just repeat the same fit.
        'subsample_freq': [1]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'SVM': {  # search space for the support vector classifier
        'C': [0.1, 1, 10],
        'gamma': [1, 0.1, 0.01],
        'kernel': ['rbf', 'poly', 'sigmoid']
    }
}

# Train (or load from cache), evaluate and persist every candidate model.
# Test-set predictions and accuracies are stored per model so that the
# selection step below does not have to call predict() again.
best_models = {}
test_predictions = {}
test_accuracies = {}
for model_name, model in models.items():
    model_path = os.path.join(model_dir, f"{model_name}_model.joblib")
    if os.path.exists(model_path):
        print(f"Model {model_name} already exists. Loading the model from {model_path}...")
        best_models[model_name] = joblib.load(model_path)
    else:
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(model, param_grids[model_name], cv=5, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)
        best_models[model_name] = grid_search.best_estimator_

        # Persist the tuned estimator so later runs can skip the search
        joblib.dump(best_models[model_name], model_path)
        print(f"Model {model_name} saved at {model_path}")

    y_pred = best_models[model_name].predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    test_predictions[model_name] = y_pred
    test_accuracies[model_name] = accuracy
    print(f"{model_name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

# Select the model with the highest test accuracy; on ties max() keeps the
# first entry in insertion order.
# NOTE(review): selecting on the test split makes the reported accuracy of the
# winner optimistic; a separate validation split would avoid that.
best_model_name = max(test_accuracies, key=test_accuracies.get)
best_model = best_models[best_model_name]
best_accuracy = test_accuracies[best_model_name]

# Fees implied by the selected model's predictions on the test split
fees_after_best = calculate_predicted_fees(
    X_test, y_test, test_predictions[best_model_name], transaction_fees
)

# Fall back to the baseline model when it yields lower fees
if fees_after_basis < fees_after_best:
    best_model = basis_model
    best_model_name = 'BasisModel'
    best_accuracy = accuracy_basis
    fees_after_best = fees_after_basis

print(f"Best Model: {best_model_name} with Accuracy: {best_accuracy}")
print(f"Transaktionsgebühren vor der Prognose: {fees_before}")
print(f"Transaktionsgebühren nach der Prognose: {fees_after_best}")



Basis model already exists. Loading the model from /Users/userlow/Documents/kreditkarte_aufgabe/models/basis_model.joblib...
Basis Model Accuracy: 0.7318946598390637
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      4329
           1       0.30      0.21      0.25      1139

    accuracy                           0.73      5468
   macro avg       0.55      0.54      0.54      5468
weighted avg       0.70      0.73      0.71      5468

Model RandomForest already exists. Loading the model from /Users/userlow/Documents/kreditkarte_aufgabe/models/RandomForest_model.joblib...
RandomForest Accuracy: 0.7979151426481346
              precision    recall  f1-score   support

           0       0.80      0.99      0.89      4329
           1       0.64      0.07      0.12      1139

    accuracy                           0.80      5468
   macro avg       0.72      0.53      0.50      5468
weighted avg       0.77      0.80      0.73      5468


Unnamed: 0,tmsp,country,amount,success,PSP,3D_secured,card
0,2019-01-10 03:49:12,0,6,0,1,0,0
1,2019-01-27 14:01:11,0,6,1,2,0,1
2,2019-02-08 05:02:33,0,6,0,3,0,0
3,2019-02-16 08:24:40,0,6,1,3,1,1
4,2019-01-25 04:18:26,0,7,0,1,0,1
