In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Daten einlesen (Beispieldaten, bitte durch Ihre Daten ersetzen)
file_path = '/Users/userlow/Documents/kreditkarte_aufgabe/data/bereinigte_daten.xlsx'
data = pd.read_excel(file_path)

# Umwandeln von kategorischen Merkmalen in numerische
data['country'] = data['country'].astype('category').cat.codes
data['PSP'] = data['PSP'].astype('category').cat.codes
data['card'] = data['card'].astype('category').cat.codes

# Features und Zielvariable definieren
X = data[['amount', 'country', 'PSP', '3D_secured', 'card']]
y = data['success']

# Datenaufteilung in Trainings- und Testset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline mit StandardScaler und RandomForestClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter für GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}

# GridSearchCV für Hyperparameteroptimierung
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Bestes Modell
best_model = grid_search.best_estimator_

# Vorhersagen mit dem besten Modell
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Modellbewertung
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Annahmen für die Transaktionsgebühren
transaction_fees = {
    0: {'success': 2, 'failure': 5},  # PSP 0
    1: {'success': 1, 'failure': 3},  # PSP 1
    2: {'success': 0.5, 'failure': 1},  # PSP 2
    3: {'success': 1.5, 'failure': 4}   # PSP 3
}

# Berechnung der Gebühren vor der Prognose (basierend auf den tatsächlichen Erfolgen)
def calculate_fees(df, success_col):
    fees = 0
    for index, row in df.iterrows():
        psp = row['PSP']
        success = row[success_col]
        fees += transaction_fees[psp]['success'] if success == 1 else transaction_fees[psp]['failure']
    return fees

fees_before = calculate_fees(data, 'success')

# Gebühren nach der Prognose (basierend auf den vorhergesagten Erfolgen)
X_test_with_actual = X_test.copy()
X_test_with_actual['success'] = y_test.values
X_test_with_actual['predicted_success'] = y_pred

fees_after = calculate_fees(X_test_with_actual, 'predicted_success')

print(f"Transaktionsgebühren vor der Prognose: {fees_before}")
print(f"Transaktionsgebühren nach der Prognose: {fees_after}")

# Anzeigen der ersten Zeilen nach dem Entfernen von Duplikaten
data.head()

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END classifier__bootstrap=True, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   1.4s
[CV] END classifier__bootstrap=True, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   1.4s
[CV] END classifier__bootstrap=True, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   1.4s
[CV] END classifier__bootstrap=True, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   1.4s
[CV] END classifier__bootstrap=True, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   1.5s
[CV] END classifier__bootstrap=True, clas



[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200; total time=   4.1s
[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200; total time=   4.0s
[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200; total time=   4.0s
[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200; total time=   4.0s
[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=200; total time=   4.0s
[CV] END classifier__bootstrap=False, classifier__max_depth=None, classifier__min_samples_leaf=1, cl

Unnamed: 0,tmsp,country,amount,success,PSP,3D_secured,card
0,2019-01-10 03:49:12,0,6,0,1,0,0
1,2019-01-27 14:01:11,0,6,1,2,0,1
2,2019-02-08 05:02:33,0,6,0,3,0,0
3,2019-02-16 08:24:40,0,6,1,3,1,1
4,2019-01-25 04:18:26,0,7,0,1,0,1


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import os
import joblib
# Daten einlesen (Beispieldaten, bitte durch Ihre Daten ersetzen)
model_dir = '/Users/userlow/Documents/kreditkarte_aufgabe/models'
os.makedirs(model_dir, exist_ok=True)

file_path = '/Users/userlow/Documents/kreditkarte_aufgabe/data/bereinigte_daten.xlsx'
data = pd.read_excel(file_path)


# Umwandeln von kategorischen Merkmalen in numerische
data['country'] = data['country'].astype('category').cat.codes
data['PSP'] = data['PSP'].astype('category').cat.codes
data['card'] = data['card'].astype('category').cat.codes

# Features und Zielvariable definieren
X = data[['amount', 'country', 'PSP', '3D_secured', 'card']]
y = data['success']

# Datenaufteilung in Trainings- und Testset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modelle definieren
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0),
    'SVM': SVC(probability=True, random_state=42) 
}

# Hyperparameter für GridSearchCV
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 63, 127],
        'subsample': [0.8, 0.9, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'SVM': {  
        'C': [0.1, 1, 10],
        'gamma': [1, 0.1, 0.01],
        'kernel': ['rbf']
    }
}
# Modelle trainieren, bewerten und speichern
best_models = {}
for model_name, model in models.items():
    model_path = os.path.join(model_dir, f"{model_name}_model.joblib")
    if os.path.exists(model_path):
        print(f"Model {model_name} already exists. Loading the model from {model_path}...")
        best_models[model_name] = joblib.load(model_path)
    else:
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(model, param_grids[model_name], cv=5, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)
        best_models[model_name] = grid_search.best_estimator_
        
        # Speichern des Modells
        joblib.dump(best_models[model_name], model_path)
        print(f"Model {model_name} saved at {model_path}")
    
    y_pred = best_models[model_name].predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

# Überprüfen der gespeicherten Modelle
for model_name in best_models.keys():
    model_path = os.path.join(model_dir, f"{model_name}_model.joblib")
    if not os.path.exists(model_path):
        print(f"Warning: Model {model_name} was not saved properly!")

# Annahmen für die Transaktionsgebühren
transaction_fees = {
    0: {'success': 2, 'failure': 5},  # PSP 0
    1: {'success': 1, 'failure': 3},  # PSP 1
    2: {'success': 0.5, 'failure': 1},  # PSP 2
    3: {'success': 1.5, 'failure': 4}   # PSP 3
}

# Gebührenberechnung
def calculate_fees(df, success_col):
    fees = 0
    for index, row in df.iterrows():
        psp = row['PSP']
        success = row[success_col]
        fees += transaction_fees[psp]['success'] if success == 1 else transaction_fees[psp]['failure']
    return fees

fees_before = calculate_fees(data, 'success')

# Bestes Modell laden und verwenden
best_model_name = max(best_models, key=lambda name: accuracy_score(y_test, best_models[name].predict(X_test)))
print(f"Best Model: {best_model_name}")

best_model_path = os.path.join(model_dir, f"{best_model_name}_model.joblib")
print(f"Loading best model from {best_model_path}")
best_model = joblib.load(best_model_path)
y_pred = best_model.predict(X_test)

X_test_with_actual = X_test.copy()
X_test_with_actual['success'] = y_test.values
X_test_with_actual['predicted_success'] = y_pred

fees_after = calculate_fees(X_test_with_actual, 'predicted_success')

print(f"Transaktionsgebühren vor der Prognose: {fees_before}")
print(f"Transaktionsgebühren nach der Prognose: {fees_after}")



Model RandomForest already exists. Loading the model from /Users/userlow/Documents/kreditkarte_aufgabe/models/RandomForest_model.joblib...
RandomForest Accuracy: 0.7979151426481346
              precision    recall  f1-score   support

           0       0.80      0.99      0.89      4329
           1       0.64      0.07      0.12      1139

    accuracy                           0.80      5468
   macro avg       0.72      0.53      0.50      5468
weighted avg       0.77      0.80      0.73      5468

Model GradientBoosting already exists. Loading the model from /Users/userlow/Documents/kreditkarte_aufgabe/models/GradientBoosting_model.joblib...
GradientBoosting Accuracy: 0.7991953182150695
              precision    recall  f1-score   support

           0       0.80      0.99      0.89      4329
           1       0.70      0.06      0.11      1139

    accuracy                           0.80      5468
   macro avg       0.75      0.53      0.50      5468
weighted avg       0.78    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Model: GradientBoosting
Loading best model from /Users/userlow/Documents/kreditkarte_aufgabe/models/GradientBoosting_model.joblib
Transaktionsgebühren vor der Prognose: 73291.5
Transaktionsgebühren nach der Prognose: 16934
