In [None]:
Below is a **single clean Jupyter Notebook (baseline model)** you can copy directly into an `.ipynb` (or paste cell-by-cell).
It is **production-clean**: it uses consistent scaling, trains per-patient Isolation Forest models, saves them, and includes a **terminal-style live scoring simulation** (1 score per second).

No API. No “top anomalies”. Only continuous scoring.

---

# ✅ Baseline Isolation Forest Notebook (Time-Series Per-Second Scoring)

---

## Cell 1 — Imports

```python
import glob
import os
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
```

---

## Cell 2 — Configuration (edit this)

```python
# ---- Inputs ----------------------------------------------------------
# Directory of training data: one CSV file per patient.
TRAIN_DATA_DIR = "./data/train_patients"

# Feature columns every CSV must provide. The order matters: rows are
# converted to arrays in exactly this column order.
FEATURE_COLS = ["heart_rate", "spo2", "blood_pressure", "motion"]

# Patients with fewer usable rows than this are skipped during training.
MIN_TRAIN_ROWS = 50

# ---- Model settings --------------------------------------------------
# Keyword arguments passed to IsolationForest for the baseline model.
ISO_PARAMS = dict(
    n_estimators=300,
    contamination="auto",
    random_state=42,
    n_jobs=-1,
)

# ---- Outputs ---------------------------------------------------------
# Root folder for saved artifacts, with a sub-folder for per-patient models.
MODEL_DIR = "./models"
PATIENT_MODEL_DIR = os.path.join(MODEL_DIR, "patient_models")

# Create both output folders up front; existing folders are left untouched.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(PATIENT_MODEL_DIR, exist_ok=True)
```

---

## Cell 3 — Utility functions (clean + reusable)

```python
def list_patient_csvs(folder):
    """
    Returns the sorted paths of all ``*.csv`` files directly inside ``folder``.

    The folder part of the pattern is escaped before globbing, so a
    directory whose name contains glob metacharacters (``[``, ``]``,
    ``*``, ``?``) is matched literally instead of being read as a pattern.

    Raises ValueError if no CSV file is found.
    """
    # os.fspath keeps str and path-like folders working; glob.escape
    # itself expects a plain str/bytes path.
    pattern = os.path.join(glob.escape(os.fspath(folder)), "*.csv")
    files = sorted(glob.glob(pattern))
    if not files:
        raise ValueError(f"No CSV files found in: {folder}")
    return files


def load_patient_df(csv_path, feature_cols):
    """
    Reads a single patient CSV and returns only its feature columns.

    The result is an independent copy restricted to ``feature_cols``
    (in that order) with every row containing a NaN removed. The
    original row index is kept, so it may have gaps.

    Raises ValueError if any requested column is absent from the file.
    """
    raw = pd.read_csv(csv_path)

    # Fail early, naming every absent column rather than only the first.
    absent = [col for col in feature_cols if col not in raw.columns]
    if absent:
        raise ValueError(f"Missing columns in {csv_path}: {absent}")

    return raw[feature_cols].copy().dropna()


def patient_id_from_path(path):
    """
    Derives the patient ID from a file path: the file name with its
    directory and final extension removed.
    """
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem
```

---

## Cell 4 — Discover patient files

```python
# Enumerate the training CSVs once; the scaler-fitting, training and
# streaming cells all read this list. list_patient_csvs raises if the
# folder has no CSVs, so indexing [0] below is safe.
patient_files = list_patient_csvs(TRAIN_DATA_DIR)
# Quick sanity check: number of patients found and one sample path.
print("Patients found:", len(patient_files))
print("Example:", patient_files[0])
```

---

## Cell 5 — Fit ONE global scaler (consistent scaling)

This ensures the scale is consistent across all patients.

```python
# Pool the cleaned feature rows of every patient so the scaler is fitted
# on the combined data rather than on any single patient.
patient_frames = []

for f in patient_files:
    df = load_patient_df(f, FEATURE_COLS)
    # Files left with no rows after NaN removal contribute nothing.
    if not df.empty:
        patient_frames.append(df)

# Without this guard pd.concat fails with the opaque message
# "No objects to concatenate"; raise the same exception type with a
# message that points at the actual problem.
if not patient_frames:
    raise ValueError(
        f"No usable rows in any CSV under {TRAIN_DATA_DIR}; "
        "cannot fit the global scaler."
    )

all_rows = pd.concat(patient_frames, axis=0)

# Fit on the raw array so the scaler stores no column names; scoring
# later passes plain arrays in FEATURE_COLS order.
scaler = StandardScaler()
scaler.fit(all_rows.values)

print("Global scaler fitted on shape:", all_rows.shape)

# Save scaler (joblib uses pickle: only load this file from a trusted source)
scaler_path = os.path.join(MODEL_DIR, "global_scaler.pkl")
joblib.dump(scaler, scaler_path)
print("Saved scaler ->", scaler_path)
```

---

## Cell 6 — Train per-patient Isolation Forest baseline models

```python
# IDs of the patients for which a model was actually fitted and saved.
trained_patients = []

for f in patient_files:
    pid = patient_id_from_path(f)
    df = load_patient_df(f, FEATURE_COLS)

    # Not enough usable rows for this patient: report it and move on.
    if len(df) < MIN_TRAIN_ROWS:
        print(f"[SKIP] {pid} | rows={len(df)} (< {MIN_TRAIN_ROWS})")
        continue

    # Same global scaler for every patient, then a forest per patient.
    X_scaled = scaler.transform(df.values)
    model = IsolationForest(**ISO_PARAMS).fit(X_scaled)

    # One pickle per patient, named after the patient ID.
    model_path = os.path.join(PATIENT_MODEL_DIR, f"{pid}_iforest.pkl")
    joblib.dump(model, model_path)
    trained_patients.append(pid)

    print(f"[OK] trained={pid} | rows={len(df)} | saved={model_path}")

print("\nTotal trained patients:", len(trained_patients))
```

---

## Cell 7 — Scoring logic (important)

Isolation Forest output:

* `decision_function`: higher means more normal.

We convert it into:

* **anomaly_score = -decision_function**, so **higher score = more anomalous**.

```python
def anomaly_scores_from_scaled(model, X_scaled):
    """
    Scores already-scaled rows, oriented so larger means more anomalous.

    X_scaled: np.ndarray of shape (n, features), already passed through the scaler
    returns: the model's decision_function output with its sign flipped
             (decision_function is higher for more normal rows)
    """
    return -model.decision_function(X_scaled)


def score_one_frame(model, scaler, row_values):
    """
    Scores a single observation.

    row_values: feature values in the same order as FEATURE_COLS
    returns: float anomaly score (higher = more anomalous)
    """
    # Wrap the row in a list so the scaler and model see a 2-D batch of one.
    single_row = np.array([row_values], dtype=float)
    batch_scores = anomaly_scores_from_scaled(model, scaler.transform(single_row))
    return float(batch_scores[0])
```

---

## Cell 8 — Load a trained patient model (for testing)

```python
# Nothing to test against if the training cell skipped every patient.
if not trained_patients:
    raise RuntimeError("No patients trained. Check your data.")

# Use the first successfully trained patient as the test subject.
TEST_PATIENT_ID = trained_patients[0]
test_model_path = os.path.join(
    PATIENT_MODEL_DIR,
    f"{TEST_PATIENT_ID}_iforest.pkl",
)

# joblib.load unpickles the file: only load models this notebook saved.
model = joblib.load(test_model_path)
print("Loaded patient model:", TEST_PATIENT_ID)
```

---

## Cell 9 — Simulate “terminal live scoring” from a CSV (1 row per second)

This is exactly the behavior you want:
**every second → print score**

```python
# Simulated stream: replay the test patient's own CSV one row at a time.
# Pick the file whose derived patient ID matches the loaded model.
stream_file = [
    path
    for path in patient_files
    if patient_id_from_path(path) == TEST_PATIENT_ID
][0]

# Renumber rows 0..n-1 after NaN removal so the row position can serve
# as the time step.
stream_df = load_patient_df(stream_file, FEATURE_COLS)
stream_df = stream_df.reset_index(drop=True)

print("Streaming rows:", len(stream_df))
print(stream_df.head())
```

---

## Cell 10 — Live scoring loop (prints per second)

```python
# Anomaly score of every streamed row, in arrival order (plotted later).
scores = []

# Walk the rows by position instead of looking each one up by label with
# .loc[t]: this does not depend on the index being exactly 0..n-1 and
# avoids building a pandas Series per row.
for t, row in enumerate(stream_df.to_numpy().tolist()):
    s = score_one_frame(model, scaler, row)
    scores.append(s)

    # This print is terminal-style output
    print(f"[t={t:05d}] anomaly_score={s:.5f}")

    time.sleep(1)  # 1 score per second
```

---

## Cell 11 — Plot the anomaly score over time (optional)

```python
# One wide figure with a single axes showing the per-frame scores
# collected by the live scoring loop.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(scores)
ax.set_title(f"Anomaly Score Over Time (Patient: {TEST_PATIENT_ID})")
ax.set_xlabel("Time (seconds / frames)")
ax.set_ylabel("Anomaly Score (higher = more anomalous)")
plt.show()
```

---

## Cell 12 — Optional: Add alert levels (not top anomalies)

This is still continuous scoring — it just gives meaning.

```python
def alert_level(score, warn=0.1, alert=0.3):
    """
    Maps a continuous anomaly score to a coarse label.

    score < warn           -> "NORMAL"
    warn <= score < alert  -> "WARNING"
    anything else          -> "ALERT"
    """
    if score < warn:
        return "NORMAL"
    if score < alert:
        return "WARNING"
    return "ALERT"


# Preview the labels for at most the first 30 rows. Rows are taken by
# position (head + enumerate) rather than by label with .loc[t], so this
# works whatever the DataFrame's index looks like.
for t, row in enumerate(stream_df.head(30).to_numpy().tolist()):
    s = score_one_frame(model, scaler, row)

    print(f"[t={t:05d}] score={s:.5f} | {alert_level(s)}")
```

---

# Notes (important, practical)

### 1) This baseline model scores each second correctly

It outputs a score per row (per second).

### 2) Isolation Forest is row-based

It does not “understand sequences”.
Your second model will handle patterns.

### 3) The scaler is global

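One scaler is fitted on all patients' rows, so every per-patient model is trained and scored on the same feature scale. For scores to stay consistent, the same saved scaler must be applied at inference time.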

---

If you want, I can also give you the **same notebook but with clean saving of metadata** like:

* feature order
* scaler version
* training rows per patient
  (so inference never breaks later).
