Fraud detection is a highly imbalanced classification problem where false negatives are significantly more costly than false positives.
This exploratory data analysis (EDA) examines transaction-level patterns to understand fraud behavior before model development.

1. Business Context
2. Data Loading & Merge
3. Target Variable Analysis
4. Transaction Amount Analysis
5. Time-Based Fraud Patterns
6. Missing Value Analysis
7. Feature Correlation with Fraud
8. Key EDA Takeaways


In [None]:
%pip install pandas

In [None]:
%pip install seaborn

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Let wide frames print up to 200 columns before pandas elides the middle ones.
pd.options.display.max_columns = 200

# Use seaborn's white-grid look for every figure drawn afterwards.
sns.set(style="whitegrid")


In [None]:
import os

# Folder that holds the competition CSV files. The default is the original
# author's local download location; set the FRAUD_DATA_PATH environment
# variable to run the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    "C:\\Users\\ozdil\\Downloads\\fraud\\ieee-fraud-detection\\",
)

# os.path.join accepts the directory with or without a trailing separator,
# unlike plain string concatenation.
train_txn = pd.read_csv(os.path.join(DATA_PATH, "train_transaction.csv"))
train_id = pd.read_csv(os.path.join(DATA_PATH, "train_identity.csv"))

# Left join on TransactionID: every transaction row is kept, and the identity
# columns are NaN for transactions with no matching identity record.
df = train_txn.merge(train_id, on="TransactionID", how="left")

# Quick look at the merged frame: dimensions, then the first rows.
print(df.shape)
df.head()


In [None]:
# Work on a reproducible 20% random subset of the merged frame; the fixed seed
# makes every re-run draw the same rows.
df_sample = df.sample(
    frac=0.2,
    random_state=42,
)


In [None]:
# The mean of the isFraud column is the share of sampled transactions flagged as fraud.
fraud_labels = df_sample["isFraud"]
fraud_rate = fraud_labels.mean()
print("Fraud Rate: {:.4%}".format(fraud_rate))


In [None]:
# Bar chart of the relative frequency of each isFraud value in the sample.
class_share = df_sample["isFraud"].value_counts(normalize=True)

fig, ax = plt.subplots()
class_share.plot(kind="bar", ax=ax)
ax.set_title("Class Distribution (Fraud vs Non-Fraud)")
ax.set_ylabel("Proportion")
ax.set_xlabel("isFraud")
plt.show()


In [None]:
# Histogram of transaction amounts in the sample, split into 100 bins.
fig, ax = plt.subplots()
sns.histplot(df_sample["TransactionAmt"], bins=100, ax=ax)
ax.set_title("Transaction Amount Distribution")
plt.show()


In [None]:
# Box plot of transaction amount per isFraud value; the y-axis is logarithmic
# so both boxes stay readable on a shared scale.
fig, ax = plt.subplots()
sns.boxplot(data=df_sample, x="isFraud", y="TransactionAmt", ax=ax)
ax.set_yscale("log")
ax.set_title("Transaction Amount by Fraud Status (Log Scale)")
plt.show()


In [None]:
# Derive time features from the sample's own TransactionDT column. Reading the
# column from the full `df` (as before) gave the same values only through
# implicit index alignment inside assign(), and did the arithmetic on every
# row of the full frame instead of just the sampled ones.
# The arithmetic treats TransactionDT as a count of seconds:
#   TransactionHour - fractional hour within a 24-hour cycle, in [0, 24)
#   TransactionDay  - whole number of 24-hour periods elapsed
sample_dt = df_sample["TransactionDT"]

df_sample = df_sample.assign(
    TransactionHour=(sample_dt / 3600) % 24,
    TransactionDay=sample_dt // (3600 * 24),
)


In [None]:
# Count each isFraud value, including missing ones, so any NaN labels are visible.
target_labels = df_sample["isFraud"]
target_labels.value_counts(dropna=False)


In [None]:
# Cast the target and the hour feature to integers; for TransactionHour this
# truncates the fractional hour down to its whole-hour bucket.
for int_col in ("isFraud", "TransactionHour"):
    df_sample[int_col] = df_sample[int_col].astype(int)


In [None]:
# Mean of isFraud within each TransactionHour value = fraud rate per hour.
hourly_fraud = df_sample["isFraud"].groupby(df_sample["TransactionHour"]).mean()

fig, ax = plt.subplots()
hourly_fraud.plot(ax=ax)
ax.set_title("Fraud Rate by Hour of Day")
ax.set_xlabel("Hour")
ax.set_ylabel("Fraud Rate")
plt.show()


In [None]:
# Mean of isFraud within each TransactionDay value = fraud rate per day index.
daily_fraud = df_sample["isFraud"].groupby(df_sample["TransactionDay"]).mean()

fig, ax = plt.subplots()
daily_fraud.plot(ax=ax)
ax.set_title("Fraud Rate Over Time")
ax.set_xlabel("Day Index")
ax.set_ylabel("Fraud Rate")
plt.show()


In [None]:
# Share of missing values per column (a 0-1 fraction), highest first.
null_mask = df_sample.isnull()
missing_pct = null_mask.mean().sort_values(ascending=False)

# Show the 20 columns with the most missing data.
missing_pct.iloc[:20]


In [None]:
# Bar chart of the 20 columns with the highest missing rate.
# missing_pct holds 0-1 fractions, so scale by 100 here to make the plotted
# values agree with the "Missing Percentage" axis label.
top_missing_percent = missing_pct.head(20) * 100

plt.figure(figsize=(8, 4))
top_missing_percent.plot(kind="bar")
plt.title("Top 20 Features by Missing Rate")
plt.ylabel("Missing Percentage")
plt.show()


In [None]:
# Missing-value rate of every column, split by fraud label.
# The null mask and the grouping key both come from df_sample so their rows
# match one-to-one. Building the mask from the full `df` (as before) relied on
# index alignment to silently drop every row outside the sample, and turned
# the group labels into floats because of the NaNs that alignment introduced.
missing_fraud = (
    df_sample.isnull()
    .groupby(df_sample["isFraud"])
    .mean()
    .T  # one row per feature, one column per isFraud value
)

# Positive diff: the feature is missing more often among fraud rows (label 1)
# than among non-fraud rows (label 0).
missing_fraud["diff"] = missing_fraud[1] - missing_fraud[0]
missing_fraud.sort_values("diff", ascending=False).head(15)


In [None]:
import numpy as np

# Pick numeric columns from the frame actually being analysed, so features
# that exist only in df_sample are included as well.
numeric_cols = df_sample.select_dtypes(include=[np.number]).columns

# Correlate each numeric column with the target directly. corrwith avoids
# building the full column-by-column correlation matrix just to read one
# column of it. The result is renamed to "isFraud" to keep the Series name the
# previous `.corr()["isFraud"]` lookup produced; the "isFraud" entry itself
# (self-correlation) is still present in the index.
corr = (
    df_sample[numeric_cols]
    .corrwith(df_sample["isFraud"])
    .rename("isFraud")
    .sort_values(ascending=False)
)

# Features with the largest positive correlation to the fraud label.
corr.head(15)


In [None]:
# Horizontal bars for the first 10 entries of `corr` once the target's own
# entry has been removed.
top_features = corr.drop(labels="isFraud").iloc[:10]

fig, ax = plt.subplots(figsize=(6, 4))
top_features.plot(kind="barh", ax=ax)
ax.set_title("Top Correlated Features with Fraud")
ax.set_xlabel("Correlation")
plt.show()
