In [2]:
import sys
from pathlib import Path

# Make the project root importable so later cells can import project-local
# packages from it.
# NOTE(review): assumes the notebook's working directory sits exactly one level
# below the project root — confirm if the notebook is ever moved.
# `.parent` equals `.parents[0]` for any normal directory, but does not raise
# IndexError when the working directory is the filesystem root.
PROJECT_ROOT = Path.cwd().parent

# Re-running this cell must not stack duplicate entries on sys.path: drop any
# existing copy first, then put the root at the front so it keeps import priority.
_root = str(PROJECT_ROOT)
sys.path[:] = [_root] + [entry for entry in sys.path if entry != _root]

print("Added to sys.path:", PROJECT_ROOT)

Added to sys.path: /Users/croberts/practice-telemetry-ml


In [3]:
import pandas as pd
from src.config import PROCESSED_DIR

# Location of the engineered-feature table inside the processed-data directory.
FEATURES_PATH = PROCESSED_DIR / "telemetry_features.parquet"

# Read the table and coerce `timestamp` to a datetime dtype in one chain, so
# the time-based comparisons in later cells operate on real datetimes.
df = pd.read_parquet(FEATURES_PATH).assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

# Display the time span covered by the data as (earliest, latest).
(
    df["timestamp"].min(),
    df["timestamp"].max(),
)

(Timestamp('2025-12-12 21:17:00'), Timestamp('2025-12-25 21:17:00'))

In [4]:
# Keep the earliest and latest timestamps as named values for later cells.
t_min, t_max = df["timestamp"].min(), df["timestamp"].max()

# Display both bounds.
(t_min, t_max)

(Timestamp('2025-12-12 21:17:00'), Timestamp('2025-12-25 21:17:00'))

In [5]:
# Chronological split boundaries: rows up to the 70th-percentile timestamp are
# used for training, and rows between the 70th and 85th percentiles for
# threshold calibration. A single quantile call returns both cut points in
# the order requested.
train_end, calib_end = df["timestamp"].quantile([0.70, 0.85])

# Display the two cut-off timestamps.
(train_end, calib_end)

(Timestamp('2025-12-22 02:02:00'), Timestamp('2025-12-24 00:47:00'))

In [6]:
# Partition rows by time into three windows:
#   train:  timestamp <= train_end
#   calib:  train_end < timestamp <= calib_end
#   score:  timestamp > calib_end
# Each slice is copied so that columns added later do not write into `df`.
train_df = df.loc[df["timestamp"].le(train_end)].copy()
calib_df = df.loc[
    df["timestamp"].gt(train_end) & df["timestamp"].le(calib_end)
].copy()
score_df = df.loc[df["timestamp"].gt(calib_end)].copy()

# Display the row count of each split.
tuple(map(len, (train_df, calib_df, score_df)))

(19856, 4261, 4240)

In [7]:
# Identifier / label columns that must never be fed to the model.
non_feature_cols = {"device_id", "timestamp", "event_tag"}

# Model inputs: every numeric column of `df` that is not in the exclusion set,
# kept in the frame's original column order.
feature_cols = [
    name
    for name, dtype in df.dtypes.items()
    if name not in non_feature_cols and pd.api.types.is_numeric_dtype(dtype)
]

# Safety net: remove rows with a missing value in any feature column from each
# split, then renumber the rows from zero.
train_df, calib_df, score_df = (
    split.dropna(subset=feature_cols).reset_index(drop=True)
    for split in (train_df, calib_df, score_df)
)

# Display the feature count and the resulting shape of each split.
(len(feature_cols), train_df.shape, calib_df.shape, score_df.shape)

(65, (19833, 68), (4257, 68), (4239, 68))

In [8]:
from sklearn.ensemble import IsolationForest

# Fit an isolation forest on the training window only. The seed is fixed so
# the fitted model is reproducible; n_jobs=-1 uses all available processors.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
iso = IsolationForest(
    n_estimators=200,
    contamination=0.02,
    random_state=42,
    n_jobs=-1,
).fit(train_df.loc[:, feature_cols])

# Display the fitted estimator and its parameters.
iso

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",200
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",0.02
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


In [9]:
# Fraction of calibration rows that should fall at or below the alert threshold.
alert_rate = 0.01

# Score the calibration window with the fitted forest and keep the scores both
# as a standalone array and as a column on the calibration frame.
calib_scores = iso.decision_function(calib_df.loc[:, feature_cols])
calib_df["anomaly_score"] = calib_scores

# The threshold is the `alert_rate` quantile of the calibration scores; it is
# compared against scores with `<=`, so lower scores are the ones flagged.
threshold = calib_df["anomaly_score"].quantile(alert_rate)

# Display the chosen threshold.
threshold

np.float64(-0.04158118594991586)

In [10]:
# Score the held-out window with the same fitted model.
score_df["anomaly_score"] = iso.decision_function(score_df.loc[:, feature_cols])

# Flag every row whose score is at or below the calibrated threshold.
score_df["is_alert"] = score_df["anomaly_score"].le(threshold)

# Display the realised alert rate on the scoring window next to the threshold.
(score_df["is_alert"].mean(), threshold)

(np.float64(0.01863647086577023), np.float64(-0.04158118594991586))

In [11]:
# Share of scoring-window rows flagged as alerts within each event_tag group,
# listed from the highest rate to the lowest.
(
    score_df["is_alert"]
    .groupby(score_df["event_tag"])
    .mean()
    .sort_values(ascending=False)
)

event_tag
normal    0.018672
spike     0.000000
Name: is_alert, dtype: float64

In [12]:
# Columns to show when inspecting the most anomalous rows.
cols = [
    "device_id",
    "timestamp",
    "sensor_value",
    "temp_c",
    "rssi",
    "event_tag",
    "anomaly_score",
    "is_alert",
]

# The 20 rows with the lowest anomaly score in the scoring window, restricted
# to the inspection columns.
score_df.loc[:, cols].sort_values("anomaly_score").iloc[:20]

Unnamed: 0,device_id,timestamp,sensor_value,temp_c,rssi,event_tag,anomaly_score,is_alert
3822,D022,2025-12-25 06:17:00,17.139303,23.700418,-55.304555,normal,-0.088008,True
3823,D022,2025-12-25 06:32:00,15.815582,24.237214,-55.504298,normal,-0.086308,True
3821,D022,2025-12-25 06:02:00,17.367777,24.023208,-56.466679,normal,-0.086177,True
3415,D019,2025-12-25 12:32:00,18.716333,16.468494,-54.634335,normal,-0.083196,True
3290,D019,2025-12-24 05:17:00,19.87722,22.399129,-49.720468,normal,-0.07803,True
3414,D019,2025-12-25 12:17:00,18.941277,16.057194,-55.11713,normal,-0.077714,True
3416,D019,2025-12-25 12:47:00,19.174603,16.220046,-55.737207,normal,-0.073853,True
3291,D019,2025-12-24 05:32:00,20.945846,23.30554,-57.439782,normal,-0.073656,True
3288,D019,2025-12-24 04:47:00,21.366715,26.548128,-58.707343,normal,-0.073319,True
3824,D022,2025-12-25 06:47:00,16.60343,23.467816,-55.843309,normal,-0.07308,True
