# E-Commerce Fraud Detection

This project models e-commerce transaction data to identify fraudulent activity, based on this [Kaggle Dataset](https://www.kaggle.com/datasets/umuttuygurr/e-commerce-fraud-detection-dataset).

## Setup
### Define parameters
The input/output parameters are defined in the next cell.

In [12]:
# Dataset parameters
# Kaggle dataset identifier ("<owner>/<dataset>") handed to the Kaggle download command
kaggle_source = "umuttuygurr/e-commerce-fraud-detection-dataset"
# Local directory the dataset is downloaded into and read from
data_dir = "./data"
# Name of the CSV file expected inside data_dir
csv_file = "transactions.csv"
# Name of the fraud-indicator (label) column in the dataset
target_col = "is_fraud"

### Import packages

In [13]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

### Define functions

In [14]:
def download_data_csv(kaggle_source, data_dir, csv_file):
    """Download a Kaggle dataset into data_dir unless csv_file is already there.

    Requires the `kaggle` command-line tool (installed with the kaggle Python
    package) and Kaggle API credentials set up in `~/.kaggle/kaggle.json`.
    Creates data directory, data_dir, if it doesn't exist.

    Args:
        kaggle_source: Kaggle dataset identifier, "<owner>/<dataset>".
        data_dir: Directory to download and unzip the dataset into.
        csv_file: Name of the CSV file expected in data_dir after the download.

    Raises:
        RuntimeError: If the kaggle command exits with a non-zero status.
        FileNotFoundError: If the kaggle executable cannot be found, or if
            csv_file is still missing from data_dir after the download.
    """
    # Imported locally so the function does not depend on the notebook's import cell.
    import subprocess

    Path(data_dir).mkdir(parents=True, exist_ok=True)
    csv_path = f"{data_dir}/{csv_file}"
    if os.path.exists(csv_path):
        print(f"Dataset already exists at {data_dir}/{csv_file}")
        return

    print("Downloading dataset from Kaggle...")
    # Arguments are passed as a list (no shell), so values containing spaces or
    # shell metacharacters cannot alter the command.
    result = subprocess.run(
        [
            "kaggle", "datasets", "download",
            "-d", str(kaggle_source),
            "-p", str(data_dir),
            "--unzip",
        ],
        capture_output=True,
        text=True,
    )
    # Relay the tool's own output so it still shows up in the notebook.
    if result.stdout:
        print(result.stdout, end="")
    if result.returncode != 0:
        raise RuntimeError(
            f"kaggle download failed with exit code {result.returncode}: "
            f"{result.stderr.strip()}"
        )
    # The archive may not contain the expected file name; fail here rather than
    # reporting success and breaking later at read time.
    if not os.path.exists(csv_path):
        raise FileNotFoundError(
            f"Download finished but {csv_path} was not found"
        )
    print("Download complete!")

def load_data(data_dir, csv_file, verbose=True):
    """Read csv_file from data_dir into a pandas DataFrame.

    Args:
        data_dir: Directory containing the CSV file.
        csv_file: Name of the CSV file to read.
        verbose: When True, print the frame's shape, column names, dtypes
            and deep memory usage in MB.

    Returns:
        The loaded DataFrame.
    """
    csv_path = "{}/{}".format(data_dir, csv_file)
    # low_memory=False parses the file in one pass instead of in chunks, which
    # avoids mixed-type inference within a column.
    frame = pd.read_csv(csv_path, low_memory=False)
    if not verbose:
        return frame

    n_rows, n_cols = frame.shape
    size_mb = frame.memory_usage(deep=True).sum() / 1024**2
    print(f"Dataset Shape: {n_rows} rows, {n_cols} columns")
    print(f"\nColumn Names:\n{frame.columns.tolist()}")
    print(f"\nData Types:\n{frame.dtypes}")
    print(f"\nMemory Usage:\n{size_mb:.2f} MB")
    return frame

## Load data

In [15]:
# Fetch the dataset if it is not cached locally, then load it.
download_data_csv(kaggle_source, data_dir, csv_file)
input_df = load_data(data_dir, csv_file, verbose=True)

# The analysis cells below refer to the loaded frame as `df`; bind that name
# here so the notebook runs top to bottom on a fresh kernel.
df = input_df

Dataset already exists at ./data/transactions.csv
Dataset Shape: 299695 rows, 17 columns

Column Names:
['transaction_id', 'user_id', 'account_age_days', 'total_transactions_user', 'avg_amount_user', 'amount', 'country', 'bin_country', 'channel', 'merchant_category', 'promo_used', 'avs_match', 'cvv_result', 'three_ds_flag', 'transaction_time', 'shipping_distance_km', 'is_fraud']

Data Types:
transaction_id               int64
user_id                      int64
account_age_days             int64
total_transactions_user      int64
avg_amount_user            float64
amount                     float64
country                     object
bin_country                 object
channel                     object
merchant_category           object
promo_used                   int64
avs_match                    int64
cvv_result                   int64
three_ds_flag                int64
transaction_time            object
shipping_distance_km       float64
is_fraud                     int64
dtype: obj

### Data Quality Checks
Perform initial exploratory analysis to understand the dataset quality and structure.

In [6]:
# Count rows that exactly repeat an earlier row, and report them as a share of all rows
duplicates = df.duplicated().sum()
print(
    f"Number of duplicate rows: {duplicates}\n"
    f"Percentage of duplicates: {duplicates / df.shape[0] * 100:.2f}%"
)

Number of duplicate rows: 0
Percentage of duplicates: 0.00%


In [7]:
# Analyze the class distribution of the target (fraud indicator) column.
# Prefer the `target_col` set in the parameters cell; only fall back to a
# name-based search when that column is absent, so the configured value is not
# silently replaced by whichever column name happens to match first.
if target_col in df.columns:
    fraud_cols = [target_col]
else:
    fraud_cols = [
        col for col in df.columns
        if any(keyword in col.lower() for keyword in ('fraud', 'class', 'label'))
    ]

if fraud_cols:
    target_col = fraud_cols[0]
    print(f"Target column identified: '{target_col}'")

    # Compute each distribution once and reuse it below.
    class_counts = df[target_col].value_counts()
    class_share = df[target_col].value_counts(normalize=True)
    print("\nClass Distribution:")
    print(class_counts)
    print("\nClass Distribution (%):")
    print(class_share * 100)

    # Check for class imbalance: the most frequent class outnumbers the least
    # frequent one by more than 10x. For a binary target this matches comparing
    # the two classes directly; with more classes it also catches a rare class
    # that is not among the top two.
    class_ratio = class_share.values
    if len(class_ratio) >= 2 and class_ratio.max() / class_ratio.min() > 10:
        print("\nWarning: Significant class imbalance detected!")
else:
    print("No obvious target column found. Please identify the fraud indicator column manually.")

Target column identified: 'is_fraud'

Class Distribution:
is_fraud
0    293083
1      6612
Name: count, dtype: int64

Class Distribution (%):
is_fraud
0    97.793757
1     2.206243
Name: proportion, dtype: float64



### Categorical Columns Summary
Summarize the number of unique values and the most frequent values of each categorical (object-typed) column.

In [8]:
# For every object-typed column, report its cardinality and its most frequent values
categorical_cols = df.select_dtypes(include='object').columns
if categorical_cols.size:
    print("Categorical Columns Summary:")
    for col in categorical_cols:
        print(
            f"\n{col}:\n"
            f"  Unique values: {df[col].nunique()}\n"
            f"  Top 5 values:\n{df[col].value_counts().head()}"
        )

Categorical Columns Summary:

country:
  Unique values: 10
  Top 5 values:
country
US    32430
GB    30602
FR    30343
NL    30220
TR    30074
Name: count, dtype: int64

bin_country:
  Unique values: 10
  Top 5 values:
bin_country
US    32295
GB    30563
FR    30261
NL    30256
TR    29972
Name: count, dtype: int64

channel:
  Unique values: 2
  Top 5 values:
channel
web    152226
app    147469
Name: count, dtype: int64

merchant_category:
  Unique values: 5
  Top 5 values:
merchant_category
electronics    60220
travel         59922
grocery        59913
gaming         59839
fashion        59801
Name: count, dtype: int64

transaction_time:
  Unique values: 297975
  Top 5 values:
transaction_time
2024-02-22T00:29:19Z    3
2024-10-17T01:27:59Z    3
2024-10-17T10:28:04Z    3
2024-08-03T03:21:43Z    3
2024-09-22T12:02:03Z    3
Name: count, dtype: int64


In [9]:
# Statistical summary for numerical columns: count, mean, std, min, quartiles and max.
# With default arguments, describe() on a mixed-dtype frame reports only the numeric
# columns, so the object-typed columns are left out of this table.
df.describe()

Unnamed: 0,transaction_id,user_id,account_age_days,total_transactions_user,avg_amount_user,amount,promo_used,avs_match,cvv_result,three_ds_flag,shipping_distance_km,is_fraud
count,299695.0,299695.0,299695.0,299695.0,299695.0,299695.0,299695.0,299695.0,299695.0,299695.0,299695.0,299695.0
mean,149848.0,3002.559432,973.397871,50.673321,148.142973,177.165279,0.15364,0.837999,0.87211,0.784588,357.049028,0.022062
std,86514.6388,1732.309663,525.241409,5.976391,200.364624,306.926507,0.360603,0.368453,0.333968,0.411109,427.672074,0.146887
min,1.0,1.0,1.0,40.0,3.52,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,74924.5,1501.0,516.0,46.0,46.19,42.1,0.0,1.0,1.0,1.0,136.6,0.0
50%,149848.0,3007.0,975.0,51.0,90.13,89.99,0.0,1.0,1.0,1.0,273.02,0.0
75%,224771.5,4504.0,1425.0,56.0,173.45,191.11,0.0,1.0,1.0,1.0,409.18,0.0
max,299695.0,6000.0,1890.0,60.0,4565.29,16994.74,1.0,1.0,1.0,1.0,3748.56,1.0


### Statistical Summary

### Missing Values

In [10]:
# Tally missing entries per column, both as a count and as a percentage of all rows
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100

# One row per column; keep only columns that actually have gaps, worst first
missing_df = (
    pd.DataFrame({
        'Column': missing_values.index,
        'Missing Count': missing_values.values,
        'Missing Percentage': missing_percent.values
    })
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print("Columns with missing values:")
    print(missing_df.to_string(index=False))

No missing values found in the dataset!


### Data Preview

In [11]:
# Display the first few rows (head() returns 5 by default) as a quick check
# that the columns and values were parsed as expected
df.head()

Unnamed: 0,transaction_id,user_id,account_age_days,total_transactions_user,avg_amount_user,amount,country,bin_country,channel,merchant_category,promo_used,avs_match,cvv_result,three_ds_flag,transaction_time,shipping_distance_km,is_fraud
0,1,1,141,47,147.93,84.75,FR,FR,web,travel,0,1,1,1,2024-01-06T04:09:39Z,370.95,0
1,2,1,141,47,147.93,107.9,FR,FR,web,travel,0,0,0,0,2024-01-09T20:13:47Z,149.62,0
2,3,1,141,47,147.93,92.36,FR,FR,app,travel,1,1,1,1,2024-01-12T06:20:11Z,164.08,0
3,4,1,141,47,147.93,112.47,FR,FR,web,fashion,0,1,1,1,2024-01-15T17:00:04Z,397.4,0
4,5,1,141,47,147.93,132.91,FR,US,web,electronics,0,1,1,1,2024-01-17T01:27:31Z,935.28,0
