XGBoost with No feature engineering

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)

# --- 1. Load Data ---
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Fraudulent Data (5)/Fraudulent_E-Commerce_Transaction_Data_MERGED.csv')

print("Starting SIMPLE XGBOOST MODEL (No Feature Engineering)")

# --- 2. Remove columns not usable directly ---
cols_to_drop = [
    'Transaction ID',
    'Customer ID',
    'Shipping Address',
    'Billing Address',
    'Customer Location',
    'IP Address',
    'Transaction Date'      # datetime not used
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors='ignore')

# --- 3. Handle Missing Values ---
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'Is Fraudulent']  # exclude target

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# numeric → median
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# categorical → "Unknown"
for col in categorical_cols:
    df[col] = df[col].fillna("Unknown")

# --- 4. Encode categorical variables (simple, no OHE) ---
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# --- 5. Prepare features ---
X = df.drop(columns=['Is Fraudulent'])
y = df['Is Fraudulent']

# --- 6. Train / Validation / Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# validation split from training
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=42, stratify=y_train
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

# --- 7. Train Simple XGBoost Model ---
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9
)

model.fit(X_train, y_train)

# --- 8. Validation Metrics ---
val_pred = model.predict(X_val)
val_proba = model.predict_proba(X_val)[:, 1]

print("\n--- VALID METRICS ---")
print("Accuracy:", accuracy_score(y_val, val_pred))
print("Precision:", precision_score(y_val, val_pred))
print("Recall:", recall_score(y_val, val_pred))
print("F1 Score:", f1_score(y_val, val_pred))
print("AUC:", roc_auc_score(y_val, val_proba))

# --- 9. Test Metrics ---
test_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)[:, 1]

print("\n--- TEST METRICS ---")
print("Accuracy:", accuracy_score(y_test, test_pred))
print("Precision:", precision_score(y_test, test_pred))
print("Recall:", recall_score(y_test, test_pred))
print("F1 Score:", f1_score(y_test, test_pred))
print("AUC:", roc_auc_score(y_test, test_proba))

print("\n--- CLASSIFICATION REPORT (Test) ---")
print(classification_report(y_test, test_pred))


Starting SIMPLE XGBOOST MODEL (No Feature Engineering)
Train: 838088, Validation: 209522, Test: 448976

--- VALID METRICS ---
Accuracy: 0.9558232548372009
Precision: 0.7936210131332082
Recall: 0.16102017510468214
F1 Score: 0.26772151898734176
AUC: 0.8177930205046777

--- TEST METRICS ---
Accuracy: 0.9554029614055094
Precision: 0.7828156880525958
Recall: 0.15334399147348787
F1 Score: 0.2564521519551413
AUC: 0.8191914196349335

--- CLASSIFICATION REPORT (Test) ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    426458
           1       0.78      0.15      0.26     22518

    accuracy                           0.96    448976
   macro avg       0.87      0.58      0.62    448976
weighted avg       0.95      0.96      0.94    448976



RandomForest with no feature engineering

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)

# --- 1. Load Data ---
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Fraudulent Data (5)/Fraudulent_E-Commerce_Transaction_Data_MERGED.csv')

print("Starting SIMPLE RANDOM FOREST MODEL (No Feature Engineering)")

# --- 2. Remove unusable columns ---
cols_to_drop = [
    'Transaction ID',
    'Customer ID',
    'Shipping Address',
    'Billing Address',
    'Customer Location',
    'IP Address',
    'Transaction Date'      # datetime kept out since no feature engineering
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors='ignore')

# --- 3. Handle Missing Values ---
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'Is Fraudulent']  # exclude target

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# numeric → median
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# categorical → "Unknown"
for col in categorical_cols:
    df[col] = df[col].fillna("Unknown")

# --- 4. Encode categorical variables (simple label encoding) ---
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# --- 5. Prepare features ---
X = df.drop(columns=['Is Fraudulent'])
y = df['Is Fraudulent']

# --- 6. Train / Validation / Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Secondary validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=42, stratify=y_train
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

# --- 7. Train Random Forest ---
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight="balanced",     # handles imbalance
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# --- 8. Validation Metrics ---
val_pred = model.predict(X_val)
val_proba = model.predict_proba(X_val)[:, 1]

print("\n--- VALID METRICS ---")
print("Accuracy:", accuracy_score(y_val, val_pred))
print("Precision:", precision_score(y_val, val_pred))
print("Recall:", recall_score(y_val, val_pred))
print("F1 Score:", f1_score(y_val, val_pred))
print("AUC:", roc_auc_score(y_val, val_proba))

# --- 9. Test Metrics ---
test_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)[:, 1]

print("\n--- TEST METRICS ---")
print("Accuracy:", accuracy_score(y_test, test_pred))
print("Precision:", precision_score(y_test, test_pred))
print("Recall:", recall_score(y_test, test_pred))
print("F1 Score:", f1_score(y_test, test_pred))
print("AUC:", roc_auc_score(y_test, test_proba))

print("\n--- CLASSIFICATION REPORT (Test) ---")
print(classification_report(y_test, test_pred))


Starting SIMPLE RANDOM FOREST MODEL (No Feature Engineering)
Train: 838088, Validation: 209522, Test: 448976

--- VALID METRICS ---
Accuracy: 0.9553698418304521
Precision: 0.7727487034417727
Recall: 0.15597639893414542
F1 Score: 0.2595613271042838
AUC: 0.7933515892466723

--- TEST METRICS ---
Accuracy: 0.9549218666476604
Precision: 0.7629817678282945
Recall: 0.14681588062883028
F1 Score: 0.2462478119995531
AUC: 0.7933679585451869

--- CLASSIFICATION REPORT (Test) ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    426458
           1       0.76      0.15      0.25     22518

    accuracy                           0.95    448976
   macro avg       0.86      0.57      0.61    448976
weighted avg       0.95      0.95      0.94    448976



Logistic regression with no feature engineering

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)

# --- 1. Load Data ---
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Fraudulent Data (5)/Fraudulent_E-Commerce_Transaction_Data_MERGED.csv')

print("Starting SIMPLE MODEL (No Feature Engineering)")

# --- 2. Remove columns that cannot be used directly ---

cols_to_drop = [
    'Transaction ID',
    'Customer ID',
    'Shipping Address',
    'Billing Address',
    'Customer Location',
    'IP Address',
    'Transaction Date'     # raw datetime cannot be used directly
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors='ignore')

# Keep Transaction Hour and Transaction Amount as-is
# Keep categorical variables but encode them later

# --- 3. Handle Missing Values ---

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'Is Fraudulent']

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

for col in categorical_cols:
    df[col] = df[col].fillna("Unknown")

# --- 4. Encode Categorical Columns (Very simple, no OHE) ---

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# --- 5. Prepare X and y ---

X = df.drop(columns=['Is Fraudulent'])
y = df['Is Fraudulent']

# --- 6. Train / Validation / Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Make a validation split from training data
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=42, stratify=y_train
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

# --- 7. Train a SIMPLE ML MODEL (Logistic Regression) ---

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# --- 8. Validation Metrics ---

val_pred = model.predict(X_val)
val_proba = model.predict_proba(X_val)[:, 1]

print("\n--- VALIDATION METRICS ---")
print("Accuracy:", accuracy_score(y_val, val_pred))
print("Precision:", precision_score(y_val, val_pred))
print("Recall:", recall_score(y_val, val_pred))
print("F1 Score:", f1_score(y_val, val_pred))
print("AUC:", roc_auc_score(y_val, val_proba))

# --- 9. Test Metrics ---

test_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)[:, 1]

print("\n--- TEST METRICS ---")
print("Accuracy:", accuracy_score(y_test, test_pred))
print("Precision:", precision_score(y_test, test_pred))
print("Recall:", recall_score(y_test, test_pred))
print("F1 Score:", f1_score(y_test, test_pred))
print("AUC:", roc_auc_score(y_test, test_proba))

print("\n--- CLASSIFICATION REPORT (Test) ---")
print(classification_report(y_test, test_pred))


Starting SIMPLE MODEL (No Feature Engineering)
Train: 838088, Validation: 209522, Test: 448976

--- VALIDATION METRICS ---
Accuracy: 0.9547684730004486
Precision: 0.8622628250175686
Recall: 0.11676817662733156
F1 Score: 0.20568267538345486
AUC: 0.7677108293295923

--- TEST METRICS ---
Accuracy: 0.9546456826200064
Precision: 0.8704022000687521
Recall: 0.11244337863042898
F1 Score: 0.1991583749557557
AUC: 0.7715598183251018

--- CLASSIFICATION REPORT (Test) ---
              precision    recall  f1-score   support

           0       0.96      1.00      0.98    426458
           1       0.87      0.11      0.20     22518

    accuracy                           0.95    448976
   macro avg       0.91      0.56      0.59    448976
weighted avg       0.95      0.95      0.94    448976



initial feature engineering with XGBoost (NOT FINAL MODEL)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report
)

# ============================================================
# 1. LOAD DATA
# ============================================================

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Fraudulent Data (5)/Fraudulent_E-Commerce_Transaction_Data_MERGED.csv')

print("--- Starting Feature Engineering ---")


# ============================================================
# 2. PREPROCESSING
# ============================================================

# Convert date + hour → timestamp
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
df['Transaction_DateTime'] = df['Transaction Date'] + pd.to_timedelta(df['Transaction Hour'], unit='H')

df = df.sort_values(by='Transaction_DateTime').reset_index(drop=True)

# Handle duplicates
initial_rows = len(df)
df = df.drop_duplicates(subset=['Transaction ID'], keep='first')
print(f"Dropped {initial_rows - len(df)} duplicate transaction records.")

# Handle missing values
numerical_cols = ['Transaction Amount', 'Customer Age']
categorical_cols = ['Payment Method', 'Product Category', 'Device Used']

for col in numerical_cols:
    median_val = df[col].median()
    df[col].fillna(median_val, inplace=True)
    print(f"Imputed NaN values in '{col}' with median: {median_val}")

for col in categorical_cols:
    df[col].fillna('Unknown', inplace=True)
    print(f"Imputed NaN values in '{col}' with 'Unknown'")


# ============================================================
# 3. ONE-HOT ENCODING
# ============================================================

df = pd.get_dummies(
    df,
    columns=['Payment Method', 'Device Used', 'Product Category'],
    prefix=['PM', 'Device', 'Product'],
    dtype=int
)


# ============================================================
# 4. TEMPORAL FEATURES
# ============================================================

df['Transaction_Year'] = df['Transaction_DateTime'].dt.year
df['Transaction_Month'] = df['Transaction_DateTime'].dt.month
df['Transaction_Day_Of_Week'] = df['Transaction_DateTime'].dt.dayofweek
df['Transaction_Hour_Of_Day'] = df['Transaction_DateTime'].dt.hour
df['Is_Weekend'] = df['Transaction_Day_Of_Week'].apply(lambda x: 1 if x >= 5 else 0)


# ============================================================
# 5. VELOCITY FEATURES
# ============================================================

def calculate_velocity_features(df_input, group_col, time_col, window, agg_col='Transaction Amount'):
    df_temp = df_input.copy()
    grouped = df_temp.groupby(group_col)

    count_series = grouped.rolling(window=window, min_periods=1, on=time_col)[agg_col] \
                        .count().shift(1).fillna(0)

    sum_series = grouped.rolling(window=window, min_periods=1, on=time_col)[agg_col] \
                        .sum().shift(1).fillna(0)

    mean_series = grouped.rolling(window=window, min_periods=1, on=time_col)[agg_col] \
                         .mean().shift(1).fillna(0)

    # Reset index for merging
    count_series = count_series.reset_index(level=0, drop=True)
    sum_series = sum_series.reset_index(level=0, drop=True)
    mean_series = mean_series.reset_index(level=0, drop=True)

    return pd.DataFrame({
        f'{group_col}_Tx_Count_Last_{window}': count_series,
        f'{group_col}_Amount_Sum_Last_{window}': sum_series,
        f'{group_col}_Amount_Mean_Last_{window}': mean_series
    })

velocity_1d = calculate_velocity_features(df, 'Customer ID', 'Transaction_DateTime', '1D')
velocity_7d = calculate_velocity_features(df, 'Customer ID', 'Transaction_DateTime', '7D')

df = pd.concat([df.reset_index(drop=True),
                velocity_1d.reset_index(drop=True),
                velocity_7d.reset_index(drop=True)], axis=1)


# ============================================================
# 6. IDENTITY + ACCOUNT FEATURES
# ============================================================

df['Address_Match'] = (df['Shipping Address'] == df['Billing Address']).astype(int)
df['Customer_Tx_Count_Total'] = df.groupby('Customer ID')['Transaction ID'].transform('count')
df['Account_Age_to_Tx_Ratio'] = df['Account Age Days'] / df['Customer_Tx_Count_Total']


# ============================================================
# 7. FEATURE SELECTION
# ============================================================

X = df.drop(columns=[
    'Transaction ID', 'Customer ID', 'Transaction Date', 'Transaction Hour',
    'Shipping Address', 'Billing Address', 'Is Fraudulent',
    'Transaction_Day_Of_Week', 'Transaction_DateTime',
    'Customer_Tx_Count_Total', 'Customer Location', 'IP Address'
])

y = df['Is Fraudulent']

print(f"\nTotal Features: {X.shape[1]}")
print(f"Transactions: {len(df)}")
print(f"Fraudulent Cases: {y.sum()}")
print(f"Non-Fraudulent Cases: {len(y)-y.sum()}")


# ============================================================
# 8. TRAIN/TEST SPLIT + VALIDATION SPLIT
# ============================================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, random_state=42, stratify=y_train
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")


# ============================================================
# 9. TRAIN XGBOOST
# ============================================================

model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1])
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)


# ============================================================
# 10. VALIDATION METRICS
# ============================================================

val_pred = model.predict(X_val)
val_proba = model.predict_proba(X_val)[:, 1]

print("\n--- VALIDATION METRICS ---")
print("Accuracy :", accuracy_score(y_val, val_pred))
print("Precision:", precision_score(y_val, val_pred))
print("Recall   :", recall_score(y_val, val_pred))
print("F1 Score :", f1_score(y_val, val_pred))
print("AUC      :", roc_auc_score(y_val, val_proba))


# ============================================================
# 11. TEST METRICS
# ============================================================

test_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)[:, 1]

print("\n--- TEST METRICS ---")
print("Accuracy :", accuracy_score(y_test, test_pred))
print("Precision:", precision_score(y_test, test_pred))
print("Recall   :", recall_score(y_test, test_pred))
print("F1 Score :", f1_score(y_test, test_pred))
print("AUC      :", roc_auc_score(y_test, test_proba))

print("\n--- CLASSIFICATION REPORT (Test) ---")
print(classification_report(y_test, test_pred))


--- Starting Feature Engineering ---


  df['Transaction_DateTime'] = df['Transaction Date'] + pd.to_timedelta(df['Transaction Hour'], unit='H')


Dropped 0 duplicate transaction records.
Imputed NaN values in 'Transaction Amount' with median: 151.76
Imputed NaN values in 'Customer Age' with median: 35.0
Imputed NaN values in 'Payment Method' with 'Unknown'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


Imputed NaN values in 'Product Category' with 'Unknown'
Imputed NaN values in 'Device Used' with 'Unknown'

Total Features: 28
Transactions: 1496586
Fraudulent Cases: 75060
Non-Fraudulent Cases: 1421526
Train: 838088, Validation: 209522, Test: 448976

--- VALIDATION METRICS ---
Accuracy : 0.878595087866668
Precision: 0.2399038294017213
Recall   : 0.6552150742291587
F1 Score : 0.3512127936337899
AUC      : 0.810318095992076

--- TEST METRICS ---
Accuracy : 0.8785102098998611
Precision: 0.24044539531265194
Recall   : 0.6588062883026912
F1 Score : 0.3523083499572528
AUC      : 0.811967828154596

--- CLASSIFICATION REPORT (Test) ---
              precision    recall  f1-score   support

           0       0.98      0.89      0.93    426458
           1       0.24      0.66      0.35     22518

    accuracy                           0.88    448976
   macro avg       0.61      0.77      0.64    448976
weighted avg       0.94      0.88      0.90    448976

