In [1]:
# 1. Bibliotheken importieren
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import requests

In [2]:
# 2. Kreditkartenbetrugsdaten laden
credit_data = pd.read_csv("creditcard.csv", sep=",", quotechar='"', on_bad_lines='skip')
credit_data2test = pd.read_csv("fraudTrain.csv")
credit_data2train = pd.read_csv("fraudTest.csv")

# 3. Feiertagsdaten einlesen
holidays = pd.read_csv('eu_holidays.csv')

# 4. Feiertagsdaten aufbereiten
# Beispiel: nur Datumsspalte und Feiertags-Flag
holidays['Date'] = pd.to_datetime(holidays['Date'])
holidays['is_holiday'] = 1

# 5. Kreditkarten-Datensatz vorbereiten (Datum hinzufügen)
# ECHTES Datum aus Sekunden ab fiktivem Startdatum generieren
start_date = pd.to_datetime('2013-09-09')  # Dein gewünschtes Startdatum
credit_data['trans_date_trans_time'] = start_date + pd.to_timedelta(credit_data['Time'], unit='s')



df_credittest_relevant = credit_data2test[['trans_date_trans_time', 'amt', 'is_fraud']]
df_credittrain_relevant = credit_data2train[['trans_date_trans_time', 'amt', 'is_fraud']]
df_creditdate_relevant = credit_data[['trans_date_trans_time', 'Amount', 'Class']]

In [3]:
# 6. Merge der beiden Datensätze über das Datum
df = pd.DataFrame(columns=['trans_date_trans_time', 'amt', 'is_fraud'])

df_creditdate_relevant = df_creditdate_relevant.rename(columns={
    'Amount': 'amt',
    'Class': 'is_fraud'
})

# Datentypen bei Datum ggf. angleichen
df_credittest_relevant['trans_date_trans_time'] = pd.to_datetime(df_credittest_relevant['trans_date_trans_time'])
df_credittrain_relevant['trans_date_trans_time'] = pd.to_datetime(df_credittrain_relevant['trans_date_trans_time'])
df_creditdate_relevant['trans_date_trans_time'] = pd.to_datetime(df_creditdate_relevant['trans_date_trans_time'])

# Alle drei DataFrames untereinander zusammenfügen
df_all = pd.concat([df_credittest_relevant, df_credittrain_relevant, df_creditdate_relevant], ignore_index=True)

df_all['date_only'] = pd.to_datetime(df_all['trans_date_trans_time']).dt.normalize()
df_all.drop(columns=['trans_date_trans_time'], inplace=True)
df_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_credittest_relevant['trans_date_trans_time'] = pd.to_datetime(df_credittest_relevant['trans_date_trans_time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_credittrain_relevant['trans_date_trans_time'] = pd.to_datetime(df_credittrain_relevant['trans_date_trans_time'])


Unnamed: 0,amt,is_fraud,date_only
0,4.97,0,2019-01-01
1,107.23,0,2019-01-01
2,220.11,0,2019-01-01
3,45.00,0,2019-01-01
4,41.96,0,2019-01-01
...,...,...,...
2137196,0.77,0,2013-09-10
2137197,24.79,0,2013-09-10
2137198,67.88,0,2013-09-10
2137199,10.00,0,2013-09-10


In [4]:
# Merge df_all mit holidays (links, damit alle df_all-Zeilen bleiben)
holidays_relevant = holidays[['Date', 'is_holiday']]
df_all = df_all.merge(holidays_relevant, how='left',
                      left_on='date_only', right_on='Date')

# Fehlende Werte in is_holiday durch 0 ersetzen (keine Feiertage)
df_all['is_holiday'] = df_all['is_holiday'].fillna(0).astype(int)

# Falls gewünscht, kannst du die zusätzliche 'Date' Spalte entfernen
df_all.drop(columns=['Date'], inplace=True)

# Ergebnis prüfen
df_all.head()

Unnamed: 0,amt,is_fraud,date_only,is_holiday
0,4.97,0,2019-01-01,1
1,107.23,0,2019-01-01,1
2,220.11,0,2019-01-01,1
3,45.0,0,2019-01-01,1
4,41.96,0,2019-01-01,1


In [5]:
# 7. Feature Engineering (z.B. nur einige Features verwenden)
features = [ 'is_holiday', 'amt']  # Beispielfeatures
X = df_all[features]
y = df_all['is_fraud']  # 0 = normal, 1 = Betrug

# 8. Daten aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# 9. Modelle trainieren
# Random Forest
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Logistic Regression
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [7]:
# 10. Modelle bewerten
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

print("\nLogistic Regression:")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

# 11. Bestes Modell auswählen und speichern (z.B. Random Forest)
import joblib
import os
joblib.dump(lr, 'best_model.joblib')
print("Gespeichert:", os.path.exists("best_model.joblib"))  # Muss True sein

Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97    425427
           1       0.03      0.31      0.05      2014

    accuracy                           0.94    427441
   macro avg       0.51      0.63      0.51    427441
weighted avg       0.99      0.94      0.97    427441

Accuracy: 0.9411053221380261

Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    425427
           1       0.06      0.75      0.11      2014

    accuracy                           0.95    427441
   macro avg       0.53      0.85      0.54    427441
weighted avg       0.99      0.95      0.97    427441

Accuracy: 0.9451690408734773
Gespeichert: True
