# Exploratory data analysis

**Inputs:**

- Configuration file
- Raw data

**Steps:**

- Load configuration
- Load raw data
- Create summary statistics
- Create plots
- Log summary statistics and plots as a Weights & Biases artifact

**Outputs:**

- Summary statistics
- Plots

## Setup

In [1]:
# Reload imported project modules automatically so edits to them are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from src.utils import load_config

# Resolve the configuration file relative to the current working directory and load it.
# NOTE(review): `config` is not referenced again in the visible cells — confirm it is still needed.
config_path = Path.cwd().joinpath("config.yaml")
config = load_config(config_path)

In [3]:
from dotenv import load_dotenv

# NOTE(review): presumably loads environment variables (e.g. W&B credentials) from a local .env file — confirm.
load_dotenv()

True

## Step 1: Load data

In [4]:
# Directory with the CSV tables, one level above the current working directory.
data_dir = Path.cwd().parent.joinpath("data/processed")
# Quick sanity check before reading any file from it.
print(data_dir.exists())

True


In [5]:
import pandas as pd

# Locations of the three CSV tables inside the data directory.
customers_csv = data_dir / "customers.csv"
terminals_csv = data_dir / "terminals.csv"
transactions_csv = data_dir / "transactions.csv"

customers = pd.read_csv(customers_csv)
terminals = pd.read_csv(terminals_csv)
transactions = pd.read_csv(transactions_csv)

# Print the (rows, columns) shape of each table as a quick load check.
print(*(table.shape for table in (customers, terminals, transactions)))

(5000, 8) (10000, 3) (1754155, 9)


## Step 2: Perform exploratory data analysis

In [6]:
# Preview the first rows to check column names and value formats.
transactions.head()

Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario
0,0,2023-02-01 00:00:31,596,3156,57.16,31,0,0,0
1,1,2023-02-01 00:02:10,4961,3412,81.51,130,0,0,0
2,2,2023-02-01 00:07:56,2,1365,146.0,476,0,0,0
3,3,2023-02-01 00:09:29,4128,8737,64.49,569,0,0,0
4,4,2023-02-01 00:10:34,927,9906,50.99,634,0,0,0


### Create basic statistics

In [7]:
# Collects every summary statistic by name; converted into a one-row table in Step 3.
stats = dict()

In [8]:
# Dataset sizes and the time span covered by the transactions.
stats.update(
    {
        "number_of_transactions": len(transactions),
        "number_of_customers": len(customers),
        "number_of_terminals": len(terminals),
        # Smallest and largest value of the tx_datetime column as loaded from the CSV.
        "start_date": transactions["tx_datetime"].min(),
        "end_date": transactions["tx_datetime"].max(),
    }
)

In [9]:
# Count the transactions that fall on each day (grouped by tx_time_days).
tx_day_groups = transactions.groupby("tx_time_days")
tx_per_day = tx_day_groups["transaction_id"].count()

In [10]:
# Scatter plot of the daily transaction count, with the mean as a dotted line
# and a shaded band of one standard deviation around it.

import plotly.express as px

tx_per_day_mean = tx_per_day.mean()
tx_per_day_std = tx_per_day.std()

num_tx_fig = px.scatter(tx_per_day, x=tx_per_day.index, y="transaction_id")
num_tx_fig.update_xaxes(title_text="Day")
num_tx_fig.update_yaxes(title_text="Number of transactions")
num_tx_fig.add_hline(
    y=tx_per_day_mean,
    line_dash="dot",
    annotation_text="Mean",
    annotation_position="bottom right",
)
num_tx_fig.add_hrect(
    y0=tx_per_day_mean - tx_per_day_std,
    y1=tx_per_day_mean + tx_per_day_std,
    line_width=0,
    fillcolor="red",
    opacity=0.2,
    annotation_text="Std",
    annotation_position="bottom right",
)
num_tx_fig.show()

### Create transaction amount statistics

In [11]:
# Descriptive statistics of the transaction amount.
tx_stats = transactions["tx_amount"].describe()
stats.update(
    {
        "tx_amount_mean": tx_stats["mean"],
        "tx_amount_std": tx_stats["std"],
        "tx_amount_min": tx_stats["min"],
        "tx_amount_max": tx_stats["max"],
        # describe() reports the median under the "50%" label.
        "tx_amount_median": tx_stats["50%"],
    }
)

### Create transaction per customer statistics

In [12]:
# Distribution of the number of transactions per customer.
customer_tx_counts = transactions.groupby("customer_id")["transaction_id"].count()
tx_per_customer = customer_tx_counts.describe()
stats.update(
    {
        "customer_tx_min": tx_per_customer["min"],
        "customer_tx_max": tx_per_customer["max"],
        "customer_tx_mean": tx_per_customer["mean"],
        # describe() reports the median under the "50%" label.
        "customer_tx_median": tx_per_customer["50%"],
    }
)

### Create transaction per terminal statistics

In [13]:
# Distribution of the number of transactions per terminal.
terminal_tx_counts = transactions.groupby("terminal_id")["transaction_id"].count()
tx_per_terminal = terminal_tx_counts.describe()
stats.update(
    {
        "terminal_tx_min": tx_per_terminal["min"],
        "terminal_tx_max": tx_per_terminal["max"],
        "terminal_tx_mean": tx_per_terminal["mean"],
        # describe() reports the median under the "50%" label.
        "terminal_tx_median": tx_per_terminal["50%"],
    }
)

### Create fraud statistics

In [14]:
# Fraudulent transactions only (rows where tx_fraud == 1); reused by the fraud statistics below.
frauds = transactions[transactions["tx_fraud"] == 1]
# The key is named "percentage", so store the share on a 0-100 scale instead of a 0-1 fraction.
stats["percentage_of_frauds"] = 100 * len(frauds) / len(transactions)
stats["number_of_frauds"] = len(frauds)

In [15]:
# Number of fraudulent transactions in each fraud scenario.
frauds_by_scenario = frauds.groupby("tx_fraud_scenario")["transaction_id"].count()
# .get(..., 0) records 0 for a scenario without any fraud rows instead of raising KeyError.
stats["number_of_scenario1_frauds"] = frauds_by_scenario.get(1, 0)
stats["number_of_scenario2_frauds"] = frauds_by_scenario.get(2, 0)
stats["number_of_scenario3_frauds"] = frauds_by_scenario.get(3, 0)

In [16]:
# Number of fraudulent transactions per day: tx_fraud is summed within each tx_time_days group.
frauds_per_day = transactions.groupby("tx_time_days")["tx_fraud"].sum()

# Plot the number of frauds per day using plotly
frauds_per_day_fig = px.scatter(frauds_per_day, x=frauds_per_day.index, y=frauds_per_day.values, labels={"x": "Day", "y": "Number of frauds"})
# Set the axis titles explicitly, as the transactions-per-day figure does: x is passed as a
# named index, so the "x" key in `labels` may not match the column name plotly derives for it.
frauds_per_day_fig.update_xaxes(title_text="Day")
frauds_per_day_fig.update_yaxes(title_text="Number of frauds")
frauds_per_day_fig.add_hline(y=frauds_per_day.mean(), line_dash="dot", annotation_text="Average", annotation_position="bottom right")
frauds_per_day_fig.show()

### Create fraud per customer statistics

In [17]:
# Distribution of the number of frauds per customer.
# `frauds` holds only rows with tx_fraud == 1, so customers without any fraud are not counted.
customer_fraud_counts = frauds.groupby("customer_id")["transaction_id"].count()
frauds_per_customer = customer_fraud_counts.describe()
stats.update(
    {
        "customer_fraud_min": frauds_per_customer["min"],
        "customer_fraud_max": frauds_per_customer["max"],
        "customer_fraud_mean": frauds_per_customer["mean"],
        # describe() reports the median under the "50%" label.
        "customer_fraud_median": frauds_per_customer["50%"],
    }
)

### Create fraud per terminal statistics

In [18]:
# Distribution of the number of frauds per terminal.
# `frauds` holds only rows with tx_fraud == 1, so terminals without any fraud are not counted.
terminal_fraud_counts = frauds.groupby("terminal_id")["transaction_id"].count()
frauds_per_terminal = terminal_fraud_counts.describe()
stats.update(
    {
        "terminal_fraud_min": frauds_per_terminal["min"],
        "terminal_fraud_max": frauds_per_terminal["max"],
        "terminal_fraud_mean": frauds_per_terminal["mean"],
        # describe() reports the median under the "50%" label.
        "terminal_fraud_median": frauds_per_terminal["50%"],
    }
)

## Step 3: Log artifacts

In [19]:
# One-row table with one column per collected statistic.
stats_df = pd.DataFrame([stats])
stats_df.head()

Unnamed: 0,number_of_transactions,number_of_customers,number_of_terminals,start_date,end_date,tx_amount_mean,tx_amount_std,tx_amount_min,tx_amount_max,tx_amount_median,...,number_of_scenario2_frauds,number_of_scenario3_frauds,customer_fraud_min,customer_fraud_max,customer_fraud_mean,customer_fraud_median,terminal_fraud_min,terminal_fraud_max,terminal_fraud_mean,terminal_fraud_median
0,1754155,5000,10000,2023-02-01 00:00:31,2023-08-02 23:59:57,53.632302,42.326492,0.0,2628.0,44.64,...,9077,4631,1.0,45.0,4.228399,3.0,1.0,85.0,3.441397,1.0


In [20]:
import wandb

In [21]:
# Start a W&B run in the "fraud-detection" project with job type "data_analysis".
run = wandb.init(project="fraud-detection", job_type="data_analysis")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelixpeters[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [22]:
# Reference version v0 of the raw_data dataset artifact from this run.
# NOTE(review): the entity "felixpeters" and the version are hard-coded — confirm this works for other users.
# NOTE(review): the tables above are read from the local data/processed directory, not from this artifact — confirm the lineage is correct.
run.use_artifact('felixpeters/fraud-detection/raw_data:v0', type='dataset')

<Artifact QXJ0aWZhY3Q6NTMyMjM4NDY4>

In [23]:
# Artifact of type "stats" that receives the statistics table and the figure images below.
eda_results = wandb.Artifact("eda_results", type="stats")

In [24]:
# Wrap the one-row statistics DataFrame in a W&B table.
summary_stats = wandb.Table(dataframe=stats_df)

In [25]:
# Add the statistics table to the artifact under the name "summary_stats".
eda_results.add(summary_stats, "summary_stats")

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x292468c70>

In [26]:
# Folder for the exported figure images, inside the current working directory.
img_dir = Path.cwd().joinpath("images")
# Create it (and any missing parents); an already existing folder is fine.
img_dir.mkdir(exist_ok=True, parents=True)

In [27]:
# Write the transactions-per-day figure to a PNG file in the images directory.
num_tx_fig_path = img_dir / 'num_tx_fig.png'
num_tx_fig.write_image(str(num_tx_fig_path))

In [29]:
# Wrap the exported PNG as a W&B image so it can be added to the artifact.
num_tx_img = wandb.Image(str(num_tx_fig_path))

In [30]:
# Add the image to the artifact under the name "number_of_transactions_per_day".
eda_results.add(num_tx_img, "number_of_transactions_per_day")

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x2929c38e0>

In [31]:
# Write the frauds-per-day figure to a PNG file in the images directory.
frauds_per_day_fig_path = img_dir / 'frauds_per_day_fig.png'
frauds_per_day_fig.write_image(str(frauds_per_day_fig_path))

In [32]:
# Wrap the exported PNG as a W&B image so it can be added to the artifact.
frauds_per_day_img = wandb.Image(str(frauds_per_day_fig_path))

In [33]:
# Add the image to the artifact under the name "number_of_frauds_per_day".
eda_results.add(frauds_per_day_img, "number_of_frauds_per_day")

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x292b2e680>

In [34]:
# Log the artifact (statistics table plus both figure images) to the current run.
run.log_artifact(eda_results)

<Artifact eda_results>

In [35]:
# End the W&B run started above.
run.finish()

