# Model training

**Inputs:**

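- Configuration file (`config.yaml`)
- Training data: the training split of `data/features/transactions.csv`
- Validation data: the test split of the same file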

**Steps:**

- Load configuration
- Load historical transaction data
- Split it into training and validation (test) sets
- Create model based on configuration
- Train model on training data
- Evaluate model on validation data

**Outputs:**

- Trained model
- Validation metrics

## Setup

In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from dotenv import load_dotenv

from src.utils import load_config

# The configuration file is looked up in the current working directory.
config_path = Path.cwd().joinpath("config.yaml")
config = load_config(config_path)

# Kept as the last expression so the cell still displays load_dotenv()'s result.
load_dotenv()

True

## Load historical data

In [3]:
# Data is expected in a "data" directory that sits beside the current working
# directory, with the engineered features in its "features" sub-directory.
data_dir = Path.cwd().parent.joinpath("data")
feature_dir = data_dir.joinpath("features")

# Stop right away if either directory is missing.
for required_dir in (data_dir, feature_dir):
    assert required_dir.exists()

In [4]:
import pandas as pd

# Read the engineered transaction features from the feature directory.
data = pd.read_csv(feature_dir / "transactions.csv")

# Convert the timestamp column to datetimes. Item assignment is used instead of
# attribute assignment (`data.tx_datetime = ...`): it always targets a column,
# whereas attribute assignment can silently set a plain Python attribute.
data["tx_datetime"] = pd.to_datetime(data["tx_datetime"])

# Show the (rows, columns) shape, then preview the first rows.
print(data.shape)
data.head()

(1754155, 23)


Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_during_weekend,...,customer_id_nb_tx_7_day_window,customer_id_avg_amount_7_day_window,customer_id_nb_tx_30_day_window,customer_id_avg_amount_30_day_window,terminal_id_nb_tx_1_day_window,terminal_id_risk_1_day_window,terminal_id_nb_tx_7_day_window,terminal_id_risk_7_day_window,terminal_id_nb_tx_30_day_window,terminal_id_risk_30_day_window
0,0,2023-02-01 00:00:31,596,3156,57.16,31,0,0,0,0,...,1.0,57.16,1.0,57.16,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2023-02-01 00:02:10,4961,3412,81.51,130,0,0,0,0,...,1.0,81.51,1.0,81.51,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2023-02-01 00:07:56,2,1365,146.0,476,0,0,0,0,...,1.0,146.0,1.0,146.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,2023-02-01 00:09:29,4128,8737,64.49,569,0,0,0,0,...,1.0,64.49,1.0,64.49,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2023-02-01 00:10:34,927,9906,50.99,634,0,0,0,0,...,1.0,50.99,1.0,50.99,0.0,0.0,0.0,0.0,0.0,0.0


## Create train and test splits

In [5]:
import datetime

from src.data.split import get_train_test_set

# Every split parameter is read from the "data" -> "split" configuration section.
split_cfg = config["data"]["split"]
train_start_date_str = split_cfg["start_date_training"]
delta_train = split_cfg["delta_train"]
delta_delay = split_cfg["delta_delay"]
delta_test = split_cfg["delta_test"]

# The configured value is passed through str() before being parsed as YYYY-MM-DD.
train_start_date = datetime.datetime.strptime(str(train_start_date_str), "%Y-%m-%d")

train_df, test_df = get_train_test_set(
    data,
    train_start_date,
    delta_train=delta_train,
    delta_delay=delta_delay,
    delta_test=delta_test,
)

# Report the size of each split and how many of its rows are labelled as fraud.
print("total transactions in training set:", len(train_df))
print("total transactions in test set:", len(test_df))
n_fraud_train = int((train_df.tx_fraud == 1).sum())
n_fraud_test = int((test_df.tx_fraud == 1).sum())
print("fraudulent transactions in training set:", n_fraud_train)
print("fraudulent transactions in test set:", n_fraud_test)

total transactions in training set: 67203
total transactions in test set: 57447
fraudulent transactions in training set: 627
fraudulent transactions in test set: 307


## Extract features

In [6]:
# Model inputs and the target column are configured under "data" -> "features".
feature_cfg = config["data"]["features"]
input_features = feature_cfg["input_features"]
target_feature = feature_cfg["output_feature"]

# One value per line: the list of input features, then the target name.
print(input_features, target_feature, sep="\n")

['tx_amount', 'tx_during_weekend', 'tx_during_night', 'customer_id_nb_tx_1_day_window', 'customer_id_avg_amount_1_day_window', 'customer_id_nb_tx_7_day_window', 'customer_id_avg_amount_7_day_window', 'customer_id_nb_tx_30_day_window', 'customer_id_avg_amount_30_day_window', 'terminal_id_nb_tx_1_day_window', 'terminal_id_risk_1_day_window', 'terminal_id_nb_tx_7_day_window', 'terminal_id_risk_7_day_window', 'terminal_id_nb_tx_30_day_window', 'terminal_id_risk_30_day_window']
tx_fraud


## Train classifier

In [7]:
from sklearn.tree import DecisionTreeClassifier

from src.model import fit_model

# A depth-2 decision tree with a fixed random_state.
tree_params = {"max_depth": 2, "random_state": 0}
classifier = DecisionTreeClassifier(**tree_params)

# Fit on the training split with feature scaling switched off; `results` is
# read by later cells ("predictions_test", "classifier").
results = fit_model(
    classifier,
    train_df,
    test_df,
    input_features,
    target_feature,
    scale=False,
)

In [8]:
# Attach the test-set predictions from `results` as a new column, then preview.
predictions_test = results["predictions_test"]
test_df["tx_fraud_predicted"] = predictions_test
test_df.head()

Unnamed: 0,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,tx_during_weekend,...,customer_id_avg_amount_7_day_window,customer_id_nb_tx_30_day_window,customer_id_avg_amount_30_day_window,terminal_id_nb_tx_1_day_window,terminal_id_risk_1_day_window,terminal_id_nb_tx_7_day_window,terminal_id_risk_7_day_window,terminal_id_nb_tx_30_day_window,terminal_id_risk_30_day_window,tx_fraud_predicted
1284710,1284710,2023-06-15 00:00:33,4643,391,26.28,11577633,134,0,0,0,...,24.118182,43.0,26.433721,1.0,0.0,4.0,0.0,16.0,0.0,0.003749
1284711,1284711,2023-06-15 00:01:00,3383,3948,57.5,11577660,134,0,0,0,...,65.233684,81.0,74.607407,1.0,0.0,9.0,0.0,40.0,0.0,0.003749
1284712,1284712,2023-06-15 00:01:03,1766,6503,47.52,11577663,134,0,0,0,...,48.628333,49.0,44.877347,1.0,0.0,6.0,0.0,27.0,0.0,0.003749
1284713,1284713,2023-06-15 00:01:30,1218,8150,20.96,11577690,134,0,0,0,...,21.417143,94.0,19.482234,0.0,0.0,4.0,0.0,22.0,0.0,0.003749
1284714,1284714,2023-06-15 00:03:59,2423,4018,4.65,11577839,134,0,0,0,...,7.397692,66.0,8.934091,0.0,0.0,14.0,0.0,62.0,0.0,0.003749


## Calculate metrics

In [9]:
from src.metrics import evaluate_predictions

# Build the evaluation frame as a copy. A plain `predictions_df = test_df` only
# creates a second name for the same DataFrame, so adding the "predictions"
# column there would also add it to test_df and to any file test_df is later
# written to. `assign` returns a new frame and leaves test_df untouched.
predictions_df = test_df.assign(predictions=results["predictions_test"])

# Score the "predictions" column against the target column; top_k_list=[100]
# requests the top-100 variant of the card-precision metric.
evals = evaluate_predictions(predictions_df, target_feature, "predictions", top_k_list=[100])
print(evals)

{'auc_roc': 0.771, 'average_precision': 0.415, 'card_precision@100': 0.199}


## Store artifacts

In [10]:
# Artifacts go into data/models; create it (and any missing parents) if needed.
model_dir = data_dir.joinpath("models")
model_dir.mkdir(parents=True, exist_ok=True)

In [11]:
# Destination paths for the two data splits and the fitted model.
train_set_fp, test_set_fp, model_fp = (
    model_dir / file_name
    for file_name in ("train_set.csv", "test_set.csv", "model.joblib")
)

In [12]:
import joblib

# Write both splits as CSV without the DataFrame index.
for frame, destination in ((train_df, train_set_fp), (test_df, test_set_fp)):
    frame.to_csv(destination, index=False)

# Serialize the fitted classifier; left as the last expression so the cell
# still displays joblib.dump's return value.
joblib.dump(results["classifier"], model_fp)

['/Users/fpe/code/personal/fraud-detection/data/models/model.joblib']

## Track artifacts in MLOps platform

In [13]:
import wandb

In [14]:
# Start a Weights & Biases run in the "fraud-detection" project, tagged as a
# model-training job; `run` is used by the following cells.
run = wandb.init(job_type="model_training", project="fraud-detection")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelixpeters[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [15]:
# Declare the features dataset (version v0) as an input of this run.
features_artifact_name = "felixpeters/fraud-detection/features:v0"
run.use_artifact(features_artifact_name, type="dataset")

<Artifact QXJ0aWZhY3Q6NTQ5OTY2MDQx>

In [16]:
# Log the stored training split as a "dataset" artifact named "training_set".
train_set_artifact = wandb.Artifact(name="training_set", type="dataset")
train_set_artifact.add_file(local_path=str(train_set_fp))
run.log_artifact(train_set_artifact)

<Artifact training_set>

In [17]:
# Log the stored test split as a "dataset" artifact named "test_set".
test_set_artifact = wandb.Artifact(name="test_set", type="dataset")
test_set_artifact.add_file(local_path=str(test_set_fp))
run.log_artifact(test_set_artifact)

<Artifact test_set>

In [18]:
# Log the serialized classifier file as a "model" artifact named "model".
model_artifact = wandb.Artifact(name="model", type="model")
model_artifact.add_file(local_path=str(model_fp))
run.log_artifact(model_artifact)

<Artifact model>

In [19]:
# Send the three evaluation metrics to the run under their original keys.
logged_metrics = ("auc_roc", "average_precision", "card_precision@100")
run.log({metric: evals[metric] for metric in logged_metrics})

In [20]:
# Close the tracking run opened with wandb.init; the metric summary shown
# below this cell appears when this call runs.
run.finish()



0,1
auc_roc,▁
average_precision,▁
card_precision@100,▁

0,1
auc_roc,0.771
average_precision,0.415
card_precision@100,0.199
