# Data preprocessing

**Inputs:**

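- Configuration file (`config.yaml`)
- Processed transaction, customer, and terminal tables (`data/processed/*.csv`, derived from the raw data)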

**Steps:**

- Load configuration
- Load processed data
- Create date and time features
- Create customer features
- Create terminal features
- Save the feature dataset
- Track artifacts in the MLOps platform

**Outputs:**

- Feature dataset (`data/features/transactions.csv`)
- `features` dataset artifact logged to the MLOps platform


## Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [20]:
from pathlib import Path
from dotenv import load_dotenv
from src.utils import load_config

# The project configuration is read from `config.yaml` in the current
# working directory of the kernel.
config_path = Path.cwd().joinpath("config.yaml")
config = load_config(config_path)

# Populate the process environment from a `.env` file; the call is left as the
# last expression so the cell displays its boolean result.
load_dotenv()

True

## Step 1: Load data

In [3]:
import pandas as pd

# Data lives in a `data` folder next to the kernel's working directory;
# this notebook reads the tables stored under `data/processed`.
data_dir = Path.cwd().parent / "data"
proc_dir = data_dir / "processed"

# Read the three tables: transactions, customers, and terminals.
tx_df = pd.read_csv(proc_dir / "transactions.csv")
cust_df = pd.read_csv(proc_dir / "customers.csv")
term_df = pd.read_csv(proc_dir / "terminals.csv")

# Report (rows, columns) for each table, one per line, as a quick sanity check.
print(tx_df.shape, cust_df.shape, term_df.shape, sep="\n")

(1754155, 9)
(5000, 8)
(10000, 3)


In [14]:
# CSV stores timestamps as text; convert the column to datetime values so the
# date/time feature helpers below receive proper timestamps.
tx_df["tx_datetime"] = pd.to_datetime(tx_df["tx_datetime"])

## Step 2: Create date and time features

In [15]:
from src.data.features import is_weekend

# Apply the project's `is_weekend` helper to every transaction timestamp and
# store the result as a new column.
tx_df["tx_during_weekend"] = tx_df["tx_datetime"].apply(is_weekend)

In [16]:
from src.data.features import is_night

# Apply the project's `is_night` helper to every transaction timestamp and
# store the result as a new column.
tx_df["tx_during_night"] = tx_df["tx_datetime"].apply(is_night)

## Step 3: Create customer features

In [17]:
from src.data.features import get_customer_spending_features

# Window sizes for the aggregated features, read from the project config.
WINDOW_SIZES = config["data"]["features"]["window_sizes"]

# Compute the spending features one customer at a time; extra keyword
# arguments given to `apply` are forwarded to the feature function.
#
# group_keys=False keeps `customer_id` out of the result index. With the
# default, the shape of the index depends on the pandas version, and on
# pandas 2.x `customer_id` ends up as both an index level and a column, which
# makes later by-name operations (including re-running this cell) ambiguous.
# The frame is re-sorted and re-indexed in the next step either way.
tx_df = tx_df.groupby("customer_id", group_keys=False).apply(
    get_customer_spending_features,
    window_sizes=WINDOW_SIZES,
)

In [19]:
# The per-customer step returns rows grouped by customer; restore chronological
# order and replace the index with a fresh 0..n-1 range.
tx_df = tx_df.sort_values(by="tx_datetime", ignore_index=True)
tx_df.tail()

Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_during_weekend,tx_during_night,customer_id_nb_tx_1_day_window,customer_id_avg_amount_1_day_window,customer_id_nb_tx_7_day_window,customer_id_avg_amount_7_day_window,customer_id_nb_tx_30_day_window,customer_id_avg_amount_30_day_window
1754150,1754150,2023-08-02 23:56:36,161,655,54.24,15810996,182,0,0,0,0,2.0,75.28,12.0,67.0475,72.0,69.521111
1754151,1754151,2023-08-02 23:57:38,4342,6181,1.23,15811058,182,0,0,0,0,1.0,1.23,21.0,22.17381,93.0,24.780753
1754152,1754152,2023-08-02 23:58:21,618,1502,6.62,15811101,182,0,0,0,0,5.0,7.368,21.0,7.400476,65.0,7.864462
1754153,1754153,2023-08-02 23:59:52,4056,3067,55.4,15811192,182,0,0,0,0,3.0,100.696667,16.0,107.0525,51.0,102.919608
1754154,1754154,2023-08-02 23:59:57,3542,9849,23.59,15811197,182,0,0,0,0,5.0,41.304,24.0,35.308333,119.0,37.251513


## Step 4: Create terminal features

In [21]:
from src.data.features import get_terminal_risk_features

# Delay period passed to the terminal risk helper, read from the project config.
DELAY_PERIOD = config["data"]["features"]["delay_period"]

# Compute the risk features one terminal at a time (keyword arguments are
# forwarded to the helper), then restore chronological order with a fresh
# 0..n-1 index.
tx_df = (
    tx_df.groupby("terminal_id")
    .apply(
        get_terminal_risk_features,
        delay_period=DELAY_PERIOD,
        window_sizes=WINDOW_SIZES,
    )
    .sort_values(by="tx_datetime", ignore_index=True)
)

In [22]:
# Show the last five rows to check that the terminal feature columns were added.
tx_df.tail(n=5)

Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_during_weekend,...,customer_id_nb_tx_7_day_window,customer_id_avg_amount_7_day_window,customer_id_nb_tx_30_day_window,customer_id_avg_amount_30_day_window,terminal_id_nb_tx_1_day_window,terminal_id_risk_1_day_window,terminal_id_nb_tx_7_day_window,terminal_id_risk_7_day_window,terminal_id_nb_tx_30_day_window,terminal_id_risk_30_day_window
1754150,1754150,2023-08-02 23:56:36,161,655,54.24,15810996,182,0,0,0,...,12.0,67.0475,72.0,69.521111,1.0,0.0,4.0,0.0,28.0,0.0
1754151,1754151,2023-08-02 23:57:38,4342,6181,1.23,15811058,182,0,0,0,...,21.0,22.17381,93.0,24.780753,1.0,0.0,9.0,0.0,39.0,0.0
1754152,1754152,2023-08-02 23:58:21,618,1502,6.62,15811101,182,0,0,0,...,21.0,7.400476,65.0,7.864462,1.0,0.0,5.0,0.0,33.0,0.0
1754153,1754153,2023-08-02 23:59:52,4056,3067,55.4,15811192,182,0,0,0,...,16.0,107.0525,51.0,102.919608,1.0,0.0,6.0,0.0,28.0,0.0
1754154,1754154,2023-08-02 23:59:57,3542,9849,23.59,15811197,182,0,0,0,...,24.0,35.308333,119.0,37.251513,1.0,0.0,12.0,0.0,41.0,0.02439


## Step 5: Store dataset

In [24]:
# Engineered features go under `<data_dir>/features`; create the folder
# (and any missing parents) if it does not exist yet.
feature_dir = data_dir.joinpath("features")
feature_dir.mkdir(exist_ok=True, parents=True)

ds_path = feature_dir.joinpath("transactions.csv")

# index=False leaves the row index out of the written file.
tx_df.to_csv(ds_path, index=False)

## Step 6: Track artifacts in MLOps platform

In [25]:
import wandb

# Start a run in the "fraud-detection" project, tagged as a data
# preprocessing job; the cells below attach artifacts to this run.
run = wandb.init(
    job_type="data_preprocessing",
    project="fraud-detection",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelixpeters[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [26]:
# Declare version 0 of the raw-data artifact as an input of this run.
run.use_artifact(
    artifact_or_name="felixpeters/fraud-detection/raw_data:v0",
    type="dataset",
)

<Artifact QXJ0aWZhY3Q6NTMyMjM4NDY4>

In [27]:
# Create the (still empty) dataset artifact that will hold the feature files.
features = wandb.Artifact(name="features", type="dataset")

In [28]:
# Attach the contents of the features directory to the artifact.
features.add_dir(local_path=str(feature_dir))

[34m[1mwandb[0m: Adding directory to artifact (/Users/fpe/code/ml/fraud-detection/data/features)... Done. 0.8s


In [29]:
# Log the features artifact as an output of this run.
run.log_artifact(artifact_or_path=features)

<Artifact features>

In [30]:
# Finish the run. Let this cell run to completion: interrupting it with
# Control-C leaves the run data unsynced.
run.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced
