In [1]:
import glob
import os

import polars as pl

from logger import setup_logger
from retrain.train import process_data, select_features, select_target, temporal_split, train_model, make_predictions, evaluate_predictions
from utils import get_config, read_data

# Initialise the project logger through the local `logger` module.
# NOTE(review): `logger` is not referenced again in the visible cells; presumably
# setup_logger() also configures the log output emitted by the imported helpers
# (e.g. the INFO lines from temporal_split) — confirm.
logger = setup_logger()

In [2]:
# Location of the YAML configuration. The default is the original developer
# path, but it can be overridden with the SPECIAL_BROCCOLI_CONFIG environment
# variable so the notebook also runs from a different checkout or machine.
config_path = os.environ.get(
    "SPECIAL_BROCCOLI_CONFIG",
    "/home/christian/special-broccoli/config/config.yaml",
)
config = get_config(config_path)

In [3]:
# Unpack the settings used by the rest of the notebook from the loaded config.

# Top-level data locations.
train_data = config["train_data_path"]
test_data = config["test_data_path"]

# Training settings.
training_cfg = config["training"]
validation_days = training_cfg["validation_days"]
features = training_cfg["features"]
target = training_cfg["target"]

# Drift-monitor settings.
drift_cfg = config["drift_monitor"]
drift_evaluation_days = drift_cfg["evaluation_days"]
standard_deviation_threshold = drift_cfg["standard_deviation_threshold"]
prediction_thresholds = drift_cfg["prediction_thresholds"]
columns_to_evaluate = drift_cfg["columns_to_evaluate"]


In [4]:
# Load each dataset from its configured path and run it through the shared
# processing step, so train and test are prepared identically.
train = process_data(data=read_data(path=train_data))
test = process_data(data=read_data(path=test_data))

In [5]:
# Fit the model on the training split.
X_train = select_features(train, features)
y_train = select_target(train, target)
trained_model = train_model(X=X_train, y=y_train)

# Score the test split with the fitted model.
X_test = select_features(test, features)
predictions = make_predictions(X=X_test, trained_model=trained_model)

[LightGBM] [Info] Number of positive: 1729, number of negative: 6414
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set: 8143, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212330 -> initscore=-1.310940
[LightGBM] [Info] Start training from score -1.310940


In [6]:
# Compare the test-set predictions with the true target values; the metrics
# (accuracy, precision, recall, F1) are reported in the cell output.
y_true = select_target(test, target)
evaluate_predictions(y_test=y_true, y_pred=predictions)

Accuracy: 0.98
Precision: 0.95
Recall: 1.0
F1 Score: 0.97


In [7]:
# Split the training data into a baseline window (`before`) and the most recent
# window (`latest`). The window length comes from the drift-monitor config
# rather than a hard-coded 2, so changing `drift_monitor.evaluation_days` in the
# YAML actually takes effect here.
before, latest = temporal_split(data=train, validation_days=drift_evaluation_days)

2024-01-12 21:14:08 - INFO - 2 validation days yield a 0.42 test fraction


In [9]:
# Display the columns the drift monitor is configured to evaluate.
columns_to_evaluate

['Occupancy', 'Temperature', 'Light']

In [11]:
# Per-column standard deviation of the baseline window, restricted to the
# monitored columns.
baseline_window = before.select(columns_to_evaluate)
baseline_window.std()

Occupancy,Temperature,Light
f64,f64,f64
0.429142,0.937042,209.189269


In [16]:
# Acceptance band per monitored column: baseline mean ± (threshold × baseline std).
# The baseline selection is computed once and reused for both bounds.
baseline_stats = before.select(columns_to_evaluate)
band_center = baseline_stats.mean()
band_margin = baseline_stats.std() * standard_deviation_threshold

lower_bound = band_center - band_margin
upper_bound = band_center + band_margin

In [65]:
# Reduce the most recent window to one row of per-column means.
# NOTE: this rebinds `latest` from the raw window to its column means; the
# bound comparison in the following cell relies on that.
latest_window = latest.select(columns_to_evaluate)
latest = latest_window.mean()

Occupancy,Temperature,Light
f64,f64,f64
0.170237,20.051028,87.132523


In [79]:
# Flag drift when any monitored column's recent mean falls outside its band.
below_band = (
    (latest < lower_bound)
    .select(any=pl.any_horizontal(columns_to_evaluate))
    .item()
)
above_band = (
    (latest > upper_bound)
    .select(any=pl.any_horizontal(columns_to_evaluate))
    .item()
)

if below_band or above_band:
    print("Retraining Needed")

Retraining Needed


In [82]:
# Data-drift check on the training data.
#
# The data is split into a baseline window and the most recent window. Drift is
# flagged when the mean of any monitored column in the recent window falls
# outside baseline mean ± (standard_deviation_threshold × baseline std).
#
# The window length comes from `drift_evaluation_days` (the drift-monitor
# config) rather than a hard-coded 2, so the configured value is honoured.
evaluation_columns = columns_to_evaluate
baseline_data, newest_data = temporal_split(
    data=train, validation_days=drift_evaluation_days
)

baseline_data = baseline_data.select(evaluation_columns)
newest_data = newest_data.select(evaluation_columns)

# Compute each statistic once and reuse it for both bounds and comparisons.
baseline_mean = baseline_data.mean()
baseline_margin = baseline_data.std() * standard_deviation_threshold
newest_mean = newest_data.mean()

lower_bounds = baseline_mean - baseline_margin
upper_bounds = baseline_mean + baseline_margin

# Each comparison yields a one-row boolean frame; any_horizontal collapses it
# to a single flag that is True when at least one column is out of bounds.
below_lower = (newest_mean < lower_bounds).select(
    any=pl.any_horizontal("*")
).item()
above_upper = (newest_mean > upper_bounds).select(
    any=pl.any_horizontal("*")
).item()

data_drift = below_lower or above_upper

2024-01-12 21:48:49 - INFO - 2 validation days yield a 0.42 test fraction


In [86]:
# Summarise the `date` column of the test set (count, nulls, min/max) to see
# which period it covers.
test.select(pl.col("date")).describe()

describe,date
str,str
"""count""","""2665"""
"""null_count""","""0"""
"""mean""",
"""std""",
"""min""","""2015-02-02"""
"""25%""",
"""50%""",
"""75%""",
"""max""","""2015-02-04"""
