In [4]:
import pandas as pd

In [5]:
# Directory containing the two training CSVs read below.
# NOTE(review): machine-specific absolute path — point this at your own copy of the data.
DATA_PATH = "C:\\Users\\ozdil\\Downloads\\fraud\\ieee-fraud-detection\\"

train_txn = pd.read_csv(DATA_PATH + "train_transaction.csv")
train_id = pd.read_csv(DATA_PATH + "train_identity.csv")

# Left join keeps every transaction row; identity columns are NaN where no
# identity row shares the TransactionID.
# validate="many_to_one" makes pandas raise a MergeError if TransactionID is
# repeated in train_id, which would otherwise silently duplicate transactions.
df = train_txn.merge(
    train_id,
    on="TransactionID",
    how="left",
    validate="many_to_one",
)

print(df.shape)
df.head()


(590540, 434)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [6]:
import pandas as pd
import numpy as np

# Let displayed frames show up to 200 columns before pandas truncates them.
pd.options.display.max_columns = 200

# Feature development runs on a seeded 20% random sample of the merged frame
# (faster to iterate on); an explicit copy is taken so later column
# assignments operate on an independent frame.
df_fe = df.sample(random_state=42, frac=0.2).copy()

print(df_fe.shape)


(118108, 434)


In [7]:
# TransactionDT is treated here as a count of seconds:
#   TransactionHour = hours elapsed, modulo 24, truncated to an integer
#   TransactionDay  = number of whole 24-hour spans elapsed
df_fe = df_fe.assign(
    TransactionHour=lambda frame: frame["TransactionDT"].div(3600).mod(24).astype(int),
    TransactionDay=lambda frame: frame["TransactionDT"].floordiv(3600 * 24).astype(int),
)


Transaction Amount Deviation 

In [8]:
card_col = "card1"  # most stable card identifier

# Mean and standard deviation of TransactionAmt per card, computed over the
# rows currently in df_fe and indexed by the card identifier.
card_stats = (
    df_fe
    .groupby(card_col)["TransactionAmt"]
    .agg(["mean", "std"])
    .rename(columns={
        "mean": "card_amt_mean",
        "std": "card_amt_std"
    })
)

# Attach the per-card stats to every row of that card.
# Any stats columns left over from a previous run of this cell are dropped
# first: DataFrame.join raises on overlapping column names, so without this
# the cell could only be executed once per kernel session.
df_fe = (
    df_fe
    .drop(columns=card_stats.columns, errors="ignore")
    .join(card_stats, on=card_col)
)


In [9]:
# Standardized distance of each amount from its card's mean amount.
# The 1e-6 added to the denominator prevents division by zero when a card's
# std is exactly 0; rows whose card_amt_std is NaN yield NaN.
df_fe["amt_zscore"] = (
    df_fe["TransactionAmt"]
    .sub(df_fe["card_amt_mean"])
    .div(df_fe["card_amt_std"].add(1e-6))
)


Transaction Velocity Features

In [10]:
# For every row: how many non-null TransactionAmt values share its
# (card, TransactionDay) pair. transform() broadcasts the group count back
# onto each member row.
df_fe["card_txn_count_day"] = df_fe.groupby(
    by=[card_col, "TransactionDay"]
)["TransactionAmt"].transform("count")


In [11]:
# For every row: total TransactionAmt over all rows sharing its
# (card, TransactionDay) pair, broadcast back onto each member row.
df_fe["card_amt_sum_day"] = df_fe.groupby(
    by=[card_col, "TransactionDay"]
)["TransactionAmt"].transform("sum")


Time Since Previous Transaction

In [12]:
# Order rows by card, then by TransactionDT, so that within each card a row's
# predecessor is the row with the next-smaller TransactionDT.
df_fe = df_fe.sort_values(by=[card_col, "TransactionDT"])

# Difference in TransactionDT to the previous row of the same card. The first
# row of each card has no predecessor (diff is NaN) and gets the sentinel -1.
df_fe["time_since_prev_txn"] = (
    df_fe.groupby(card_col)["TransactionDT"].diff().fillna(-1)
)


Missingness Indicators

In [13]:
# Every column whose name begins with "id_".
identity_cols = [name for name in df_fe.columns if name.startswith("id_")]

# Per row: number of those columns that are null.
df_fe["missing_identity_count"] = df_fe[identity_cols].isna().sum(axis=1)

# 1 when at least one "id_" column is populated (i.e. not all of them are
# null), otherwise 0.
df_fe["has_identity_info"] = (
    df_fe["missing_identity_count"].lt(len(identity_cols)).astype(int)
)


Simple Risk Flags

In [14]:
# 1 when the amount is strictly greater than 500, else 0.
df_fe["high_amount_flag"] = df_fe["TransactionAmt"].gt(500).astype(int)

# 1 when TransactionHour is 5 or lower, or 23 or higher; else 0.
df_fe["night_transaction_flag"] = (
    df_fe["TransactionHour"].le(5) | df_fe["TransactionHour"].ge(23)
).astype(int)


In [15]:
# Engineered columns to inspect, in display order.
feature_cols = [
    "amt_zscore",
    "card_txn_count_day",
    "card_amt_sum_day",
    "time_since_prev_txn",
    "missing_identity_count",
    "high_amount_flag",
    "night_transaction_flag",
]

# Summary statistics for the engineered features alongside the isFraud label.
df_fe[[*feature_cols, "isFraud"]].describe()


Unnamed: 0,amt_zscore,card_txn_count_day,card_amt_sum_day,time_since_prev_txn,missing_identity_count,high_amount_flag,night_transaction_flag,isFraud
count,115001.0,118108.0,118108.0,118108.0,118108.0,118108.0,118108.0,118108.0
mean,-9.267869e-13,4.99238,629.828344,387817.0,32.220485,0.038465,0.310055,0.035916
std,0.9764888,10.54294,1363.029066,1151625.0,10.452118,0.192316,0.462518,0.186082
min,-2.51035,1.0,0.251,-1.0,0.0,0.0,0.0,0.0
25%,-0.4817046,1.0,82.0,4192.75,38.0,0.0,0.0,0.0
50%,-0.2860157,2.0,209.5,35535.0,38.0,0.0,0.0,0.0
75%,0.1419544,5.0,594.65,185638.8,38.0,0.0,1.0,0.0
max,25.95467,126.0,36186.681,15313740.0,38.0,1.0,1.0,1.0


In [16]:
# Mean of each engineered feature within each isFraud class.
df_fe.groupby("isFraud")[feature_cols].agg("mean")


Unnamed: 0_level_0,amt_zscore,card_txn_count_day,card_amt_sum_day,time_since_prev_txn,missing_identity_count,high_amount_flag,night_transaction_flag
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-0.00489,5.019312,633.626422,390336.059816,32.494739,0.03837,0.309249
1,0.130528,4.269448,527.878318,320197.99835,24.858793,0.041018,0.331683


In [18]:
# Persist the full feature frame (all columns of df_fe, not only feature_cols)
# to a relative path; index=False leaves the DataFrame index out of the file.
df_fe.to_parquet(path="transactions_features.parquet", index=False)


### Feature Engineering Summary
Key fraud-relevant features were engineered to capture:
- Abnormal spending relative to card history
- Transaction velocity and burst behavior
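- Identity-information missingness (note: in this sample, fraudulent transactions have *fewer* missing identity fields on average — about 24.9 vs 32.5 — so the presence of identity data, not its absence, is associated with fraud)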
- Time-based risk patterns

These features balance predictive power with interpretability, enabling both high-performing models and LLM-based explanations.
