In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import uniform, randint
import os
import joblib

# Directory where tuned models are persisted; created on first run.
model_dir = '/models'
os.makedirs(model_dir, exist_ok=True)

# Load the cleaned data (sample path -- replace with your own data).
file_path = '/data/bereinigte_daten.xlsx'
data = pd.read_excel(file_path)

# Replace every categorical feature with its integer category code.
for categorical_col in ('country', 'PSP', 'card'):
    data[categorical_col] = data[categorical_col].astype('category').cat.codes

# Feature matrix and target variable.
X = data[['amount', 'country', 'PSP', '3D_secured', 'card']]
y = data['success']

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate classifiers, keyed by the name that is also used for the
# hyperparameter search spaces and the persisted model files.
# Insertion order matters: it fixes the training and reporting order.
models = dict(
    RandomForest=RandomForestClassifier(random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    XGBoost=XGBClassifier(random_state=42),
    LightGBM=LGBMClassifier(random_state=42),
    CatBoost=CatBoostClassifier(random_state=42, verbose=0),
    # probability=True makes predict_proba available on the fitted SVC.
    SVM=SVC(probability=True, random_state=42),
)

# Hyperparameter search spaces for RandomizedSearchCV, keyed by model name.
#
# scipy conventions used below:
#   randint(low, high)  -> integers drawn from [low, high)
#   uniform(loc, scale) -> floats drawn from [loc, loc + scale]
#     e.g. uniform(0.8, 0.2) covers [0.8, 1.0]

# Settings shared by both RandomForest search spaces. The spaces differ only
# in bootstrap/oob_score, which must be sampled together because scikit-learn
# rejects oob_score=True when bootstrap=False.
_rf_shared_space = {
    'n_estimators': randint(100, 500),
    'max_depth': [None] + list(range(10, 51)),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    'max_leaf_nodes': [None] + list(range(10, 51)),
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'warm_start': [False, True],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}

param_grids = {
    'RandomForest': [
        # Search space 1: bootstrap sampling with out-of-bag scoring.
        {**_rf_shared_space, 'bootstrap': [True], 'oob_score': [True]},
        # Search space 2: no bootstrap, hence no out-of-bag scoring.
        {**_rf_shared_space, 'bootstrap': [False], 'oob_score': [False]},
    ],
    'GradientBoosting': {
        'n_estimators': randint(100, 500),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.8, 0.2),
    },
    'XGBoost': {
        'n_estimators': randint(100, 500),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.8, 0.2),
        'colsample_bytree': uniform(0.8, 0.2),
    },
    'LightGBM': {
        'n_estimators': randint(100, 500),
        'learning_rate': uniform(0.01, 0.3),
        'num_leaves': randint(31, 127),
        'subsample': uniform(0.8, 0.2),
        # LightGBM only applies row subsampling when bagging is enabled, i.e.
        # subsample_freq > 0. With the default of 0 the 'subsample' values
        # above would be sampled but have no effect on the fitted model.
        'subsample_freq': [1],
    },
    'CatBoost': {
        'iterations': randint(100, 500),
        'learning_rate': uniform(0.01, 0.3),
        'depth': randint(3, 10),
    },
    'SVM': {
        'C': uniform(0.1, 10),
        'gamma': uniform(0.01, 1),
        'kernel': ['rbf'],
    },
}

# Train (or load), evaluate and persist the best estimator of every model.


def _search_best_estimator(estimator, search_spaces, features, target):
    """Run a randomized search per search space and return the best estimator.

    Parameters:
        estimator: unfitted estimator handed to RandomizedSearchCV.
        search_spaces: a single parameter-distribution dict, or a list of
            such dicts. Every dict gets its own 100-iteration search.
        features: training feature matrix.
        target: training labels.

    Returns:
        The refitted ``best_estimator_`` of the search with the highest
        cross-validated accuracy. On ties the earlier search space wins.

    Raises:
        ValueError: if ``search_spaces`` is empty, so that a missing model
            is never silently persisted as ``None``.
    """
    if isinstance(search_spaces, dict):
        search_spaces = [search_spaces]

    best_estimator = None
    best_score = -np.inf
    for space in search_spaces:
        search = RandomizedSearchCV(
            estimator,
            space,
            n_iter=100,
            cv=5,
            n_jobs=-1,
            verbose=2,
            random_state=42,
            scoring='accuracy',
        )
        search.fit(features, target)
        # Always keep the first result, and replace a NaN score, so a usable
        # estimator is returned even when scores are not comparable.
        if (
            best_estimator is None
            or np.isnan(best_score)
            or search.best_score_ > best_score
        ):
            best_score = search.best_score_
            best_estimator = search.best_estimator_

    if best_estimator is None:
        raise ValueError("No search space was provided; nothing was trained.")
    return best_estimator


best_models = {}
for model_name, model in models.items():
    model_path = os.path.join(model_dir, f"{model_name}_model.joblib")
    if os.path.exists(model_path):
        print(f"Model {model_name} already exists. Loading the model from {model_path}...")
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code -- only load model files from a trusted location.
        best_models[model_name] = joblib.load(model_path)
    else:
        print(f"Training {model_name}...")
        # RandomForest provides a list of search spaces, every other model a
        # single dict; the helper handles both shapes.
        best_models[model_name] = _search_best_estimator(
            model, param_grids[model_name], X_train, y_train
        )

        # Persist the tuned model so later runs can skip the search.
        joblib.dump(best_models[model_name], model_path)
        print(f"Model {model_name} saved at {model_path}")

    # Evaluate on the held-out test set.
    y_pred = best_models[model_name].predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

# Verify that every model has a file on disk.
for model_name in best_models:
    model_path = os.path.join(model_dir, f"{model_name}_model.joblib")
    if not os.path.exists(model_path):
        print(f"Warning: Model {model_name} was not saved properly!")

# Assumed transaction fees per PSP, keyed by the integer value found in the
# 'PSP' column. Each entry holds the fee charged for a successful and for a
# failed transaction.
# NOTE(review): the keys must match the integer codes the 'PSP' column was
# encoded to -- confirm which provider each code stands for.
transaction_fees = {
    0: {'success': 2, 'failure': 5},  # PSP 0
    1: {'success': 1, 'failure': 3},  # PSP 1
    2: {'success': 0.5, 'failure': 1},  # PSP 2
    3: {'success': 1.5, 'failure': 4}   # PSP 3
}


def calculate_fees(df, success_col, fee_table=None):
    """Return the total transaction fees over all rows of ``df``.

    A row is charged the 'success' fee of its PSP when ``df[success_col]``
    equals 1, and the 'failure' fee otherwise.

    Parameters:
        df: DataFrame with a 'PSP' column and the column named by
            ``success_col``.
        success_col: name of the column holding the success flag.
        fee_table: optional mapping ``{psp: {'success': fee, 'failure': fee}}``;
            defaults to the module-level ``transaction_fees``.

    Returns:
        The summed fees; 0 for an empty frame.

    Raises:
        KeyError: if a PSP value in ``df`` has no entry in the fee table.
    """
    # Local import keeps the function self-contained.
    from collections import Counter

    if fee_table is None:
        fee_table = transaction_fees
    if df.empty:
        return 0

    # Count each (PSP, outcome) combination once instead of iterating with
    # iterrows(), which builds a Series per row and upcasts mixed dtypes.
    succeeded = (df[success_col] == 1).tolist()
    pair_counts = Counter(zip(df['PSP'].tolist(), succeeded))

    fees = 0
    for (psp, was_success), n_rows in pair_counts.items():
        outcome = 'success' if was_success else 'failure'
        fees += n_rows * fee_table[psp][outcome]
    return fees

# Build a soft-voting ensemble from the tuned models and refit it on the
# training data.
estimators = list(best_models.items())
ensemble_model = VotingClassifier(estimators=estimators, voting='soft')
ensemble_model.fit(X_train, y_train)

y_pred = ensemble_model.predict(X_test)

# Test features together with the actual and the predicted outcome.
X_test_with_actual = X_test.copy()
X_test_with_actual['success'] = y_test.values
X_test_with_actual['predicted_success'] = y_pred

# Baseline fees use the actual outcomes of the SAME held-out rows as the
# predictions. Computing the baseline on the full dataset would compare
# totals over different numbers of transactions.
fees_before = calculate_fees(X_test_with_actual, 'success')
fees_after = calculate_fees(X_test_with_actual, 'predicted_success')

print(f"Transaktionsgebühren vor der Prognose: {fees_before}")
print(f"Transaktionsgebühren nach der Prognose: {fees_after}")

# Fees based on the predictions of each individual model, on the same rows.
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    X_test_with_actual = X_test.copy()
    X_test_with_actual['success'] = y_test.values
    X_test_with_actual['predicted_success'] = y_pred

    fees_after = calculate_fees(X_test_with_actual, 'predicted_success')
    print(f"Transaktionsgebühren nach der Prognose für {model_name}: {fees_after}")

Model RandomForest already exists. Loading the model from /Users/userlow/Documents/kreditkarte_aufgabe/models/RandomForest_model.joblib...
RandomForest Accuracy: 0.7990124359912216
              precision    recall  f1-score   support

           0       0.80      0.99      0.89      4329
           1       0.70      0.06      0.11      1139

    accuracy                           0.80      5468
   macro avg       0.75      0.53      0.50      5468
weighted avg       0.78      0.80      0.73      5468

Model GradientBoosting already exists. Loading the model from /Users/userlow/Documents/kreditkarte_aufgabe/models/GradientBoosting_model.joblib...
GradientBoosting Accuracy: 0.7991953182150695
              precision    recall  f1-score   support

           0       0.80      0.99      0.89      4329
           1       0.70      0.06      0.11      1139

    accuracy                           0.80      5468
   macro avg       0.75      0.53      0.50      5468
weighted avg       0.78    