In [None]:
import os
from pathlib import Path

import pandas as pd

In [None]:
# Load the raw transaction and identity tables and left-join them on TransactionID.

# Directory holding the input CSVs. Set the FRAUD_DATA_DIR environment variable
# to point somewhere else; the fallback is the original local download folder.
DATA_DIR = Path(
    os.environ.get(
        "FRAUD_DATA_DIR",
        r"C:\Users\ozdil\Downloads\fraud\ieee-fraud-detection",
    )
)

# Kept as a separator-terminated string so code that builds paths with
# `DATA_PATH + "<file>"` keeps working.
DATA_PATH = f"{DATA_DIR}{os.sep}"

train_txn = pd.read_csv(DATA_DIR / "train_transaction.csv")
train_id = pd.read_csv(DATA_DIR / "train_identity.csv")

# Left join keeps every transaction; rows without an identity record get NaN
# in the identity columns. `validate` raises if TransactionID is duplicated in
# the identity table, which would otherwise silently duplicate transactions.
df = train_txn.merge(
    train_id,
    on="TransactionID",
    how="left",
    validate="many_to_one",
)

print(df.shape)
df.head()


In [None]:
import pandas as pd
import numpy as np

# Show up to 200 columns when a frame is displayed.
pd.set_option("display.max_columns", 200)

# Work on a reproducible random subset for speed (EDA + feature development).
SAMPLE_FRAC = 0.2   # fraction of rows to keep
RANDOM_SEED = 42    # fixes the draw so re-runs pick the same rows

# NOTE: sampling is by row, so each card keeps only part of its transactions;
# the per-card features built later are computed on that thinned history.
sampled_rows = df.sample(frac=SAMPLE_FRAC, random_state=RANDOM_SEED)
df_fe = sampled_rows.copy()  # independent copy so `df` stays untouched

print(df_fe.shape)


In [None]:
# Derive hour-of-day and day-index columns from TransactionDT.
# NOTE(review): the 3600 / 86400 divisors imply TransactionDT is an offset in
# seconds — confirm against the data dictionary.
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = SECONDS_PER_HOUR * 24

df_fe = df_fe.assign(
    # fractional hours, wrapped to a 24-hour cycle, truncated to an integer
    TransactionHour=lambda frame: (
        frame["TransactionDT"].div(SECONDS_PER_HOUR).mod(24).astype(int)
    ),
    # whole days elapsed since offset 0
    TransactionDay=lambda frame: (
        frame["TransactionDT"].floordiv(SECONDS_PER_DAY).astype(int)
    ),
)


### Transaction Amount Deviation

In [None]:
card_col = "card1"  # most stable card identifier

# Per-card mean and sample standard deviation of the transaction amount,
# indexed by card. The std is NaN for a card with a single row in the sample.
card_stats = (
    df_fe
    .groupby(card_col)["TransactionAmt"]
    .agg(card_amt_mean="mean", card_amt_std="std")
)

# Attach the stats to every row of the matching card. Stat columns left over
# from a previous run of this cell are dropped first; otherwise the join
# raises on overlapping column names and the cell cannot be re-run.
df_fe = (
    df_fe
    .drop(columns=card_stats.columns, errors="ignore")
    .join(card_stats, on=card_col)
)


In [None]:
# Z-score of each transaction amount against its card's mean and std.
# A card with only one row in the sample has an undefined (NaN) sample std,
# which would make the z-score NaN. Treat that std as 0 so the row scores 0
# (the amount equals its own mean) instead of NaN.
card_amt_std_filled = df_fe["card_amt_std"].fillna(0.0)

df_fe["amt_zscore"] = (
    (df_fe["TransactionAmt"] - df_fe["card_amt_mean"]) /
    (card_amt_std_filled + 1e-6)  # epsilon avoids division by zero when std == 0
)


### Transaction Velocity Features

In [None]:
# Number of transactions (non-null amounts) each card made on each day,
# broadcast back onto every row of that card/day group.
amounts_by_card_day = df_fe.groupby([card_col, "TransactionDay"])["TransactionAmt"]
df_fe["card_txn_count_day"] = amounts_by_card_day.transform("count")


In [None]:
# Total amount each card spent on each day, broadcast back onto every row of
# that card/day group.
daily_card_amounts = df_fe.groupby([card_col, "TransactionDay"])["TransactionAmt"]
df_fe["card_amt_sum_day"] = daily_card_amounts.transform("sum")


### Time Since Previous Transaction

In [None]:
# Order rows by card, then by time, so consecutive rows within a card are
# consecutive transactions.
df_fe = df_fe.sort_values(by=[card_col, "TransactionDT"])

# Gap to the card's previous transaction (same units as TransactionDT).
# The first row of each card has no predecessor; mark it with the sentinel -1.
df_fe["time_since_prev_txn"] = (
    df_fe.groupby(card_col)["TransactionDT"].diff().fillna(-1)
)


### Missingness Indicators

In [None]:
# Columns whose name begins with "id_".
identity_cols = df_fe.columns[df_fe.columns.str.startswith("id_")].tolist()
n_identity_cols = len(identity_cols)

# How many of those columns are null on each row.
df_fe["missing_identity_count"] = df_fe[identity_cols].isna().sum(axis=1)

# 1 when at least one identity column is populated, 0 when all are null.
df_fe["has_identity_info"] = (
    df_fe["missing_identity_count"].lt(n_identity_cols).astype(int)
)


### Simple Risk Flags

In [None]:
# Thresholds for the rule-based flags.
HIGH_AMOUNT_THRESHOLD = 500  # amounts strictly above this are flagged
NIGHT_END_HOUR = 5           # last hour (inclusive) counted as night
NIGHT_START_HOUR = 23        # first hour (inclusive) counted as night

# 1 when the amount exceeds the threshold, else 0.
df_fe["high_amount_flag"] = (
    df_fe["TransactionAmt"].gt(HIGH_AMOUNT_THRESHOLD).astype(int)
)

# 1 when the hour falls in the 23:00–05:59 window, else 0.
txn_hour = df_fe["TransactionHour"]
is_night = txn_hour.le(NIGHT_END_HOUR) | txn_hour.ge(NIGHT_START_HOUR)
df_fe["night_transaction_flag"] = is_night.astype(int)


In [None]:
# Engineered columns to inspect, grouped by theme.
feature_cols = [
    # amount deviation
    "amt_zscore",
    # velocity
    "card_txn_count_day",
    "card_amt_sum_day",
    "time_since_prev_txn",
    # missingness
    "missing_identity_count",
    # rule-based flags
    "high_amount_flag",
    "night_transaction_flag",
]

# Summary statistics for the features alongside the target column.
summary_cols = [*feature_cols, "isFraud"]
df_fe[summary_cols].describe()


In [None]:
# Mean of each engineered feature within each isFraud class, for a quick
# side-by-side comparison.
df_fe.groupby("isFraud")[feature_cols].agg("mean")


In [None]:
# Persist the sampled frame with all engineered columns; the row index is not
# written. The file lands in the current working directory.
features_file = "transactions_features.parquet"
df_fe.to_parquet(features_file, index=False)


### Feature Engineering Summary
Key fraud-relevant features were engineered to capture:
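- Abnormal spending relative to each card's typical amount (per-card z-score of the transaction amount)
- Transaction velocity and burst behavior (per-card daily transaction count and total amount, time since the card's previous transaction)
- Missing identity information as a potential evasion signal
- Time-based risk patterns (night-time transactions)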

These features balance predictive power with interpretability, enabling both high-performing models and LLM-based explanations.
