In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder


In [13]:
train_data = pd.read_csv("/kaggle/input/fraud-detection/fraudTrain.csv")
test_data = pd.read_csv("/kaggle/input/fraud-detection/fraudTest.csv")

In [14]:
TARGET_COL = 'is_fraud'

In [15]:
def preprocess_data(data):
    # Convert transaction date to datetime and extract time-related features
    data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
    data['trans_hour'] = data['trans_date_trans_time'].dt.hour
    data['trans_day'] = data['trans_date_trans_time'].dt.dayofweek  # 0 = Monday, 6 = Sunday
    data['is_weekend'] = data['trans_day'].apply(lambda x: 1 if x >= 5 else 0)

    # Drop columns that are unlikely to contribute to prediction
    drop_cols = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'street']
    data = data.drop(columns=drop_cols)
    
    # Label encode all non-numeric columns to avoid conversion errors
    for col in data.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

    # Normalize the amount column
    scaler = StandardScaler()
    data['amt'] = scaler.fit_transform(data[['amt']])

    return data

In [16]:
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [17]:
X_train = train_data.drop(columns=[TARGET_COL])
y_train = train_data[TARGET_COL]

In [18]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [19]:
log_reg = LogisticRegression(random_state=42)
tree = DecisionTreeClassifier(random_state=42)
forest = RandomForestClassifier(random_state=42)

In [20]:
models = {'Logistic Regression': log_reg, 'Decision Tree': tree, 'Random Forest': forest}
for model_name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_val_pred = model.predict(X_val_split)
    print(f"\n{model_name} Evaluation:")
    print(classification_report(y_val_split, y_val_pred))
    print(f"ROC AUC Score: {roc_auc_score(y_val_split, y_val_pred):.4f}")


Logistic Regression Evaluation:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00    257815
           1       0.00      0.00      0.00      1520

    accuracy                           0.99    259335
   macro avg       0.50      0.50      0.50    259335
weighted avg       0.99      0.99      0.99    259335

ROC AUC Score: 0.5000

Decision Tree Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.82      0.84      0.83      1520

    accuracy                           1.00    259335
   macro avg       0.91      0.92      0.91    259335
weighted avg       1.00      1.00      1.00    259335

ROC AUC Score: 0.9172

Random Forest Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.98      0.75      0.85      1520

    accuracy                           1.00    259335
   macro avg       0.99      0

In [21]:
best_model = forest

In [23]:
X_test = test_data.drop(columns=[TARGET_COL])
y_test = test_data[TARGET_COL]
y_test_pred = best_model.predict(X_test)

In [24]:
print("\nFinal Model Evaluation on Test Set:")
print(classification_report(y_test, y_test_pred))
print(f"ROC AUC Score on Test Set: {roc_auc_score(y_test, y_test_pred):.4f}")


Final Model Evaluation on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.93      0.70      0.80      2145

    accuracy                           1.00    555719
   macro avg       0.97      0.85      0.90    555719
weighted avg       1.00      1.00      1.00    555719

ROC AUC Score on Test Set: 0.8500
