In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from copy import deepcopy
import random
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import json
import os
import pickle

### Overview

**Baseline config** initial state:  (used in production)   
3 PIDs (engine load, rpm, speed) set to monitoring, with 300 s Min Period (distance, angle, speed delta are not used), min saved records = 1, send period = 120.

**Worst config**  initial state (from my own experience)  
3 PIDs (engine load, rpm, speed) set to on_change, with 1 s Min Period (distance, angle, speed delta are not used), min saved records = 1, send period = 1.

**Test conditions**
1. "Vehicle trip" (12 minutes vehicle trip with values increasing monotonically)
2.  Sleep, battery, GPS precision and network mode (home, roaming, unkown) are assumed optimal. 

**State representation**  
1. $p_{i_0}$----PID (integer identifier for PID, e.g. 12 = RPM)
2. $p_{i_1}$----change data acquisition strategy for PID $i$  *(off, monitoring, on_change, hysteresis, delta_change)*  
3. $d_0$------increase/decrease min_time  (time the device lets pass before querying PID values)     *(seconds, 0 disables it)*  
4. $d_1$------increase/decrease min_saved (number of records the device will accumulate before sending) *(int, 0 disables it)*  
5. $d_2$------increase/decrease send_time  (time the device lets pass before attempting to send a new record)  *(seconds, 0 disables it)*  
 

For example, considering 3 different PIDs a state $S$ is defined like: 

$S = (p_{10}, p_{11}, p_{20}, p_{21}, p_{30}, p_{31}, d_0, d_1, d_2)$

So for 3 PIDs, the $S$ vector is of size $9$, and in general:  $2n + 3$

$S = (p_{10}, p_{11}, p_{20}, p_{21}, ..., p_{n0}, p_{n1}, d_0, d_1, d_2)$

### PID configuration

In [2]:
STRATEGY_MAP = {
    "off": 0,
    "monitoring": 1,
    "on_change": 2,
    "hysteresis": 3,
    "on_delta_change": 4 # on_enter, on_exit, on_both are omitted to simplify the model.
}

REVERSE_STRATEGY_MAP = {v: k for k, v in STRATEGY_MAP.items()}
STRATEGY_LIST = list(STRATEGY_MAP.keys())

ACTIVE_PIDS = [12, 13, 39]  # add all necessary PIDs
n_pids = len(ACTIVE_PIDS)

PID_PRECISION = {
    12: 100,
    13: 5,
    39: 5,
    28: 2,
    38: 3,
    31: 3
}

### State encoder / decoder

In [3]:
def encode_config(config: dict) -> list:
    """
    Converts a human-readable config dict into a state vector.
    Each PID is encoded as [pid, strategy_idx], where strategy_idx includes 'off'.
    """
    vector = []

    # Encode OBD parameters
    for param in config.get("obd_parameters", []):
        pid = param.get("pid", 0)
        strategy_idx = STRATEGY_MAP.get(param.get("strategy", "monitoring"), 1)
        vector.extend([pid, strategy_idx])

    # Global settings
    global_ = config.get("global_settings", {})
    vector.extend([
        global_.get("min_time", 10),
        global_.get("send_period", 10),
        global_.get("min_saved_records", 1)
    ])

    return vector


def decode_config(vector: list) -> dict:
    """
    Converts a state vector into a human-readable config dict.
    Assumes 2 elements per PID: [pid, strategy_idx], followed by 3 global settings.
    """
    config = {"obd_parameters": [], "global_settings": {}}

    # Number of PIDs can be inferred
    pid_section_length = len(vector) - 3  # last 3 are global
    assert pid_section_length % 2 == 0, "Invalid config vector length"

    n_pids = pid_section_length // 2

    # Decode OBD parameter block
    for i in range(n_pids):
        base = i * 2
        pid, strategy_idx = vector[base:base + 2]
        config["obd_parameters"].append({
            "pid": pid,
            "enabled": strategy_idx != 0,  # 'off' means not enabled
            "strategy": REVERSE_STRATEGY_MAP.get(strategy_idx, "monitoring")
        })

    # Decode global settings
    g_base = n_pids * 2
    config["global_settings"] = {
        "min_time": vector[g_base],
        "send_period": vector[g_base + 1],
        "min_saved_records": vector[g_base + 2]
    }

    return config


def load_config(num):
    with open(f'./device_config/config_{num}.json', "r") as json_file:
        config_ = json.load(json_file)
        c_vector = encode_config(config_)
        print(f"Config vector {num} loaded:", c_vector)
        return config_, c_vector

### Actions 

In [None]:
ACTIONS = []

# Strategy change for enabled PIDs
for pid in ACTIVE_PIDS:
    ACTIONS.append({"type": "cycle_strategy", "pid": pid}) 

# Adjust min_time in reasonable steps
for delta in [-30, -15, -5, -1, 1, 5, 15, 30]:
    ACTIONS.append({"type": "adjust", "param": "min_time", "delta": delta})

# Adjust send_period in similar steps
for delta in [-30, -15, -5, -1, 1, 5, 15, 30]:
    ACTIONS.append({"type": "adjust", "param": "send_period", "delta": delta})

# Cycle min_saved_records 1 → 2 → 3 → 4 → 1
ACTIONS.append({"type": "cycle", "param": "min_saved_records"})

In [5]:
n_pids = len(ACTIVE_PIDS)
g_base = n_pids * 2  # index where global settings start

def apply_action(state: list[int], action_id: int, 
                 trace: bool = True) -> tuple[list[int], dict]:
    """
    Applies a single action to the config vector and returns (new_state, trace_dict).
    The state layout is: [pid, strategy_idx] * N + [min_time, send_period, min_saved_records]
    """
    new_vector = deepcopy(state)
    trace_info = {}

    n_pids = len(ACTIVE_PIDS)
    g_base = n_pids * 2  # 2 elements per PID now

    def log(msg):
        if trace:
            print(f"[apply_action] {msg}")
        trace_info["change"] = msg

    # PID strategy cycle: action 0 to N-1
    if action_id < n_pids:
        idx = action_id
        base = idx * 2
        pid = new_vector[base]
        strategy = new_vector[base + 1]

        if strategy == 0:
            log(f"PID {pid} is off. Strategy unchanged.")
        else:
            next_strategy = (strategy - 1 + 1) % 4 + 1  # 1 → 2 → 3 → 4 → 1
            new_vector[base + 1] = next_strategy
            log(f"PID {pid} strategy: {STRATEGY_LIST[strategy]} → {STRATEGY_LIST[next_strategy]}")
        return new_vector, trace_info

    # Global actions
    global_action_id = action_id - n_pids

    # Adjust steps
    global_adjustments = {
        "min_time": {"index": g_base, "bounds": (0, 300)},
        "send_period": {"index": g_base + 1, "bounds": (0, 300)},
    }

    # Cycle min_saved_records separately
    if global_action_id >= len([-30, -15, -5, -1, 1, 5, 15, 30]) * 2:
        index = g_base + 2
        old_val = new_vector[index]
        new_vector[index] = (old_val % 4) + 1
        log(f"min_saved_records cycled from {old_val} → {new_vector[index]}")
        return new_vector, trace_info

    # For min_time and send_period
    deltas = [-30, -15, -5, -1, 1, 5, 15, 30]
    n_deltas = len(deltas)

    if global_action_id < n_deltas:
        label = "min_time"
        delta = deltas[global_action_id]
    else:
        label = "send_period"
        delta = deltas[global_action_id - n_deltas]

    index = global_adjustments[label]["index"]
    min_val, max_val = global_adjustments[label]["bounds"]
    old_val = new_vector[index]
    new_val = max(min(old_val + delta, max_val), min_val)
    new_vector[index] = new_val
    log(f"{label} changed from {old_val} → {new_val}")
    return new_vector, trace_info

### Reward function

reward = latency_score + sum(pid_scores)

For example:    
    latency_ms = 1850
```json
    pid_data_list = [
        {
            "pid": 12,
            "values": [1500, 1500, 1500],
            "strategy": "on_change",
            "precision": 100,
            "valid_range": (800, 6000)
        },
        // rest of the PIDs
]
```

**Precision**  
This defines how much a value must change to be considered “meaningful.”
Even if all values are valid, we may want to ignore tiny fluctuations (e.g., RPM changes by 1 unit) and log only useful variation (e.g., ±100 RPM).

In [6]:
def latency_score(latency_ms: float) -> float:
    """
    Reward stable and reasonable latency.
    - Under 5s: full reward
    - Between 5s–20s: moderate reward
    - 20s–30s: penalize
    - Over 30s: heavy penalty
    """
    if latency_ms <= 5000:
        return 1.0
    elif latency_ms <= 20000:
        return 0.6
    elif latency_ms <= 30000:
        return -0.3
    else:
        return -1.0


def global_config_penalty(send_period: int, min_time: int, min_saved: int) -> float:
    """
    Penalizes or rewards based on how close parameters are to ideal ranges.
    - Ideal:
        send_period ≈ 60–120
        min_time ≈ 60–300
        min_saved ≈ 2–4
    """
    score = 0.0

    # --- min_saved_records ---
    if min_saved < 2:
        score -= 0.4  # too aggressive
    elif min_saved > 4:
        score -= 0.5  # queues too much
    else:
        score += 0.1  # ideal

    # --- send_period ---
    if send_period < 10:
        score -= 0.6
    elif send_period < 30:
        score -= 0.3
    elif send_period > 150:
        score -= 0.4
    elif 60 <= send_period <= 120:
        score += 0.2  # optimal
    else:
        score += 0.05  # acceptable

    # --- min_time ---
    if min_time < 10:
        score -= 0.3
    elif min_time > 600:
        score -= 0.2
    elif 60 <= min_time <= 300:
        score += 0.2
    else:
        score += 0.05

    return round(score, 3)
    

def compute_pid_scores(pid_data_list: list[dict], send_period: int) -> list[dict]:
    scores = []
    for entry in pid_data_list:
        strategy = entry["strategy"]
        if strategy == "off":
            continue  # ← Skip unused PIDs

        score = data_quality_score(
            values=entry["values"],
            strategy=strategy,
            precision=entry["precision"],
            valid_range=entry["valid_range"],
            send_period=send_period
        )
        scores.append({
            "pid": entry["pid"],
            "score": round(score, 3)
        })
    return scores



def data_quality_score(
    values: list[float],
    strategy: str,
    precision: float,
    valid_range: tuple[float, float],
    send_period: int,
) -> float:
    """Scores PID quality: range check + smooth variation proportional to sampling interval."""
    if not values:
        return -1.0

    # 1. All values must be in valid range and non-zero
    if not all(valid_range[0] <= v <= valid_range[1] for v in values):
        return -1.0
    if all(v == 0 for v in values):
        return -1.0

    # 2. Variation scaled to time gap
    variation = max(values) - min(values)
    allowed_variation = precision * (send_period / 10)  # allow bigger jumps for longer intervals
    significant = variation >= allowed_variation

    # 3. Strategy reward logic
    match strategy:
        case "on_change":
            return 1.0 if not significant else 0.5
        case "on_delta_change":
            return 1.0 if significant else -0.5
        case "hysteresis":
            return 0.8 if significant else 0.2
        case "monitoring":
            return 0.6 if significant else -0.2

    return 0.0  # fallback


def compute_average_quality(pid_data_list: list[dict], send_period: int) -> float:
    scores = []
    for entry in pid_data_list:
        strategy = entry["strategy"]
        if strategy == "off":
            continue

        score = data_quality_score(
            values=entry["values"],
            strategy=strategy,
            precision=entry["precision"],
            valid_range=entry["valid_range"],
            send_period=send_period
        )
        scores.append(score)

    return round(sum(scores), 3) if scores else 0.0


def clip_reward(score, min_val=-1.0, max_val=3.0):
    """Optional clipping to stabilize training and limit outliers."""
    return max(min(score, max_val), min_val)


def compute_reward(latency_ms: int, pid_data_list: list[dict], send_period: int, min_time: int, min_saved: int) -> float:
    latency = latency_score(latency_ms)
    quality_sum = compute_average_quality(pid_data_list, send_period)
    penalty = global_config_penalty(send_period, min_time, min_saved)
    return round(latency + quality_sum + penalty, 3)


def compute_reward_with_details(latency_ms: int, pid_data_list: list[dict], send_period: int, 
                                min_time: int, min_saved: int) -> tuple[float, dict]:
    for entry in pid_data_list:
        entry["precision"] = PID_PRECISION.get(entry["pid"], 1)

    latency = latency_score(latency_ms)
    pid_scores = compute_pid_scores(pid_data_list, send_period)
    total_pid_score = sum(entry["score"] for entry in pid_scores)
    config_penalty = global_config_penalty(send_period, min_time, min_saved)

    total_reward = round(latency + total_pid_score + config_penalty, 3)

    breakdown = {
        "latency_ms": latency_ms,
        "latency_score": round(latency, 3),
        "pid_scores": pid_scores,
        "total_pid_score": round(total_pid_score, 3),
        "global_config_penalty": config_penalty,
    }

    return total_reward, breakdown


In [7]:
def flag_latency_outliers(freq_df: pd.DataFrame, threshold_ms: int = 7_200_000) -> pd.DataFrame:
    """
    Adds a boolean column 'is_latency_outlier' to the input DataFrame,
    where True indicates latency greater than the given threshold (default: 2 hours).
    """

    # Convert timestamps
    freq_df["ts_recorded"] = pd.to_datetime(freq_df["ts_recorded"])
    freq_df["ts_uploaded"] = pd.to_datetime(freq_df["ts_uploaded"])

    # Compute latency in milliseconds
    freq_df["latency_ms"] = (freq_df["ts_uploaded"] - freq_df["ts_recorded"]).dt.total_seconds() * 1000

    # Flag all rows with latency above the threshold
    freq_df["is_latency_outlier"] = freq_df["latency_ms"] > threshold_ms

    return freq_df


def get_latency(start_ts, end_ts, method='median'):
    freq_df = pd.read_csv('../../data_proc/csv_data/qa_device/frequencies.csv', low_memory=False)
    freq_df.drop('Unnamed: 0', inplace=True, axis=1)

    # Parse timestamps
    freq_df["ts_recorded"] = pd.to_datetime(freq_df["ts_recorded"])
    freq_df["ts_uploaded"] = pd.to_datetime(freq_df["ts_uploaded"])

    # Filter within time range
    mask = (freq_df["ts_recorded"] >= pd.to_datetime(start_ts)) & (freq_df["ts_recorded"] <= pd.to_datetime(end_ts))
    filtered_df = freq_df[mask].copy()

    if filtered_df.empty:
        return None  # or raise an exception if preferred

    # Compute latency in ms
    filtered_df["latency_ms"] = (filtered_df["ts_uploaded"] - filtered_df["ts_recorded"]).dt.total_seconds() * 1000

    if method == 'median':
        return filtered_df["latency_ms"].median()
    elif method == 'mean':
        return filtered_df["latency_ms"].mean()
    else:
        raise ValueError("method must be 'median' or 'mean'")


# Define a field-to-PID mapping
FIELD_TO_PID = {
    "obd.rpm.value": 12,
    "obd.speed.value": 13,
    "obd.fuel_level.value": 28,
    "obd.coolant_temp.value": 38,
    "obd.engine_load.value": 39,
    "obd.intake_temp.value": 20,
    "obd.maf.value": 21,
    "obd.throttle_pos.value": 41,
    "obd.ambient_air_temp.value": 131,
    "obd.distance_since_codes_clear.value": 31,
    "obd.time_since_codes_cleared.value": 47,
    # Add more as needed...
}


def extract_pid_statistics(obd_df: pd.DataFrame, start_ts: str, end_ts: str) -> list[dict]:
    """
    Extracts per-PID reward inputs (PID ID, values, precision, valid range)
    from an obd_export dataframe filtered by timestamp range.
    Returns a list of dictionaries ready for reward scoring.
    """
    # Filter by time window
    obd_df["@ts"] = pd.to_datetime(obd_df["@ts"])
    mask = (obd_df["@ts"] >= pd.to_datetime(start_ts)) & (obd_df["@ts"] <= pd.to_datetime(end_ts))
    obd_df = obd_df[mask].copy()

    results = []

    for field, pid in FIELD_TO_PID.items():
        if field not in obd_df.columns:
            continue

        values = obd_df[field].dropna().astype(float).tolist()
        if not values:
            continue

        vmin, vmax = min(values), max(values)
        vrange = vmax - vmin

        precision = round(vrange * 0.1, 3) if vrange > 0 else 1.0
        valid_range = (vmin - vrange * 0.1, vmax + vrange * 0.1)

        results.append({
            "pid": pid,
            "values": values,
            "precision": precision,
            "valid_range": valid_range
        })

    return results

In [8]:
def summarize_pid_statistics(pid_data_list: list[dict]) -> pd.DataFrame:
    """
    Returns a DataFrame summarizing each PID's stats (excluding raw values).
    """
    summary = []

    for entry in pid_data_list:
        values = entry["values"]
        pid_summary = {
            "PID": entry["pid"],
            "Count": len(values),
            "Min": min(values) if values else None,
            "Max": max(values) if values else None,
            "Precision": PID_PRECISION.get(entry["pid"], 1),
            "Valid Range": entry["valid_range"],
        }
        summary.append(pid_summary)

    return pd.DataFrame(summary).sort_values(by="PID")


### Q-Agent

In [9]:
class QAgent:
    def __init__(self, action_space_size, epsilon=0.2, alpha=0.5, gamma=0.9):
        self.q_table = defaultdict(self.default_q_values)
        self.epsilon = epsilon  # exploration rate NOTE tune better the parameters
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.action_space_size = action_space_size

    def default_q_values(self):
        return [0.0] * len(ACTIONS)

    def select_action(self, state_vector: list[int]) -> int:
        state_key = tuple(state_vector)

        if random.random() < self.epsilon:
            return random.randint(0, self.action_space_size - 1)  # explore
        else:
            q_values = self.q_table[state_key]
            return int(q_values.index(max(q_values)))  # exploit

    def update(self, state: list[int], action: int, reward: float, next_state: list[int]):
        state_key = tuple(state)
        next_state_key = tuple(next_state)

        old_value = self.q_table[state_key][action]
        next_max = max(self.q_table[next_state_key])

        # Q-learning update rule
        new_value = old_value + self.alpha * (reward + self.gamma * next_max - old_value)
        self.q_table[state_key][action] = new_value

    def decay_epsilon(self, decay_rate=0.99):
        self.epsilon *= decay_rate

    def save_q_table(self, path='q_table.json'):
        # Convert keys to strings for JSON serialization
        json_q = {str(k): v for k, v in self.q_table.items()}
        with open(path, 'w') as f:
            json.dump(json_q, f, indent=2)

    def load_q_table(self, path='q_table.json'):
        with open(path, 'r') as f:
            json_q = json.load(f)
        self.q_table = defaultdict(lambda: [0.0] * self.action_space_size)
        for k, v in json_q.items():
            self.q_table[tuple(eval(k))] = v

    def get_best_config_vector(self):
        """
        Returns the state (config vector) with the highest Q-value across all actions.
        """
        best_state = None
        best_value = float("-inf")

        for state, action_values in self.q_table.items():
            max_q = max(action_values)
            if max_q > best_value:
                best_value = max_q
                best_state = state

        if best_state is None:
            return []

        return list(best_state)


### Vehicle trip simulation / Q-Agent training

1. Start from a known config (state)
2. Select an action with ε-greedy
3. Applies it to get next_state
4. Run run_vehicle_sim() to compute reward
5. Update Q-table
6. Repeat for N episodes

In [10]:
def run_vehicle_sim(config_vector, trip_df, trace=False):
    config = decode_config(config_vector)
    pid_data_list = []

    # Parse timestamps
    trip_df["ts_recorded"] = pd.to_datetime(trip_df["ts_recorded"], utc=True, errors="coerce")
    trip_df["ts_uploaded"] = pd.to_datetime(trip_df["ts_uploaded"], utc=True, errors="coerce")
    trip_df = trip_df.dropna(subset=["ts_recorded", "ts_uploaded"])

    if trip_df.empty:
        print("Trip dataframe was empty after filtering.")
        if trace:
            print("[run_vehicle_sim] trip_df is EMPTY after filtering.")
        return -0.5, {
            "latency_ms": float("nan"),
            "latency_score": -0.5,
            "pid_scores": [],
            "total_pid_score": 0.0,
            "global_config_score": -1.0,
        }

    # Extract values from trip log for enabled PIDs
    for param in config["obd_parameters"]:
        if not param["enabled"]:
            continue

        pid = param["pid"]
        strategy = param["strategy"]

        if pid == 12:
            values = trip_df["obd_rpm"].tolist()
            valid_range = (0, 8000)
        elif pid == 13:
            values = trip_df["obd_speed"].tolist()
            valid_range = (0, 250)
        elif pid == 39:
            values = trip_df["obd_engine_load"].tolist()
            valid_range = (0, 100)
        else:
            continue

        pid_data_list.append({
            "pid": pid,
            "values": values,
            "strategy": strategy,
            "valid_range": valid_range
        })

    # Calculate average latency
    latency_ms = (trip_df["ts_uploaded"] - trip_df["ts_recorded"]).dt.total_seconds().mean() * 1000

    # Extract global config params
    send_period = config.get("send_period", 60)
    min_time = config.get("min_time", 60)
    min_saved = config.get("min_saved_records", 2)

    # Compute full reward with latency, PIDs, and global config
    reward, breakdown = compute_reward_with_details(
        latency_ms,
        pid_data_list,
        send_period=send_period,
        min_time=min_time,
        min_saved=min_saved
    )

    if trace:
        print(f"Reward: {reward:.2f}, Latency: {latency_ms:.1f} ms, Breakdown: {breakdown}")

    return reward, breakdown

In [11]:
def train_q_agent(
    agent,
    baseline_config_vector,
    start_ts,
    end_ts,
    obd_csv_path,
    episodes=50,
    trace=True
):

    # Load and filter OBD data
    df = pd.read_csv(obd_csv_path)
    df["ts_recorded"] = pd.to_datetime(df["ts_recorded"])
    start_dt, end_dt = pd.to_datetime(start_ts), pd.to_datetime(end_ts)
    df_trip = df[(df["ts_recorded"] >= start_dt) & (df["ts_recorded"] <= end_dt)].copy()

    state = baseline_config_vector.copy()
    rewards_per_episode = []

    for ep in range(episodes):
        action = agent.select_action(state)
        next_state, trace_info = apply_action(state, action, trace=False)

        try:
            reward, breakdown = run_vehicle_sim(
                config_vector=next_state,
                trip_df=df_trip,
                trace=False
            )
        except Exception as e:
            print(f"[Episode {ep}] Simulation failed: {e}")
            reward = -1.0
            breakdown = {
                "latency_ms": float("nan"),
                "latency_score": -0.5,
                "pid_scores": [],
                "total_pid_score": 0.0
            }

        agent.update(state, action, reward, next_state)
        rewards_per_episode.append(reward)
        state = next_state

        if trace:
            print(f"[Episode {ep}] Latency: {breakdown['latency_ms']:.0f} ms")
            print(f"[Episode {ep}] Action: {action}, Reward: {reward:.3f}")

        agent.decay_epsilon()

    return rewards_per_episode

In [19]:
def run_100episodes():
    for num in range(0, 5):

        # Load config JSON and its vector representation
        config_, c_vector = load_config(num)        

        start_time = config_["test_metadata"][0]["ts_start"]
        end_time = config_["test_metadata"][0]["ts_end"]
        obd_csv_path = './es_events/poc_and_v0_data.csv'

        init_epsilon=0.3
        init_alpha=0.7
        init_gamma=0.5

        # Instantiate QAgent
        agent = QAgent(
            action_space_size=len(ACTIONS),
            epsilon=init_epsilon,
            alpha=init_alpha,
            gamma=init_gamma
        )

        # Train agent
        print(f"Running training for config_{num} with epsilon={init_epsilon}, gamma={init_gamma}, alpha={init_alpha}")
        rewards = train_q_agent(
            agent=agent,
            baseline_config_vector=c_vector,
            start_ts=start_time,
            end_ts=end_time,
            obd_csv_path=obd_csv_path,
            episodes=100,
            trace=True
        )

        # === Save to results folder ===
        results_dir = f"./results/v0/test_{num}/"
        os.makedirs(results_dir, exist_ok=True)

        # Save reward plot
        plt.figure()
        plt.plot(rewards)
        plt.title("Q-Learning Reward", loc="left")
        plt.xlabel("Episode")
        plt.ylabel("Total Reward")
        plt.annotate(
            f"ε={init_epsilon:.2f}, γ={init_gamma:.2f}, α={init_alpha:.2f}",
            xy=(1.0, 1.01),
            xycoords='axes fraction',
            ha='right',
            va='bottom',
            fontsize=10
        )
        plt.grid(True)
        plt.savefig(os.path.join(results_dir, "reward_progress.png"))
        plt.close()

        # Save Q-table
        with open(os.path.join(results_dir, "q_table.pkl"), "wb") as f:
            pickle.dump(agent.q_table, f)

        # Save best config
        best_config = agent.get_best_config_vector()
        with open(os.path.join(results_dir, "best_config.txt"), "w") as f:
            f.write(",".join(map(str, best_config)))

        # Save test metadata
        with open(os.path.join(results_dir, "test_parameters.txt"), "w") as f:
            f.write(f"csv_file_used: {obd_csv_path}\n")
            f.write(f"config_file: config_{num}.json\n")
            f.write(f"epsilon: {agent.epsilon}\n")
            f.write(f"gamma: {agent.gamma}\n")
            f.write(f"alpha: {agent.alpha}\n")

In [21]:
run_100episodes()

Config vector 0 loaded: [12, 1, 13, 1, 39, 1, 300, 120, 1]
Running training for config_0 with epsilon=0.3, gamma=0.5, alpha=0.7
[Episode 0] Latency: 3510 ms
[Episode 0] Action: 0, Reward: 2.100
[Episode 1] Latency: 3510 ms
[Episode 1] Action: 0, Reward: 1.300
[Episode 2] Latency: 3510 ms
[Episode 2] Action: 0, Reward: 0.600
[Episode 3] Latency: 3510 ms
[Episode 3] Action: 15, Reward: 0.600
[Episode 4] Latency: 3510 ms
[Episode 4] Action: 13, Reward: 0.600
[Episode 5] Latency: 3510 ms
[Episode 5] Action: 10, Reward: 0.600
[Episode 6] Latency: 3510 ms
[Episode 6] Action: 10, Reward: 0.600
[Episode 7] Latency: 3510 ms
[Episode 7] Action: 10, Reward: 0.600
[Episode 8] Latency: 3510 ms
[Episode 8] Action: 9, Reward: 0.600
[Episode 9] Latency: 3510 ms
[Episode 9] Action: 10, Reward: 0.600
[Episode 10] Latency: 3510 ms
[Episode 10] Action: 10, Reward: 0.600
[Episode 11] Latency: 3510 ms
[Episode 11] Action: 10, Reward: 0.600
[Episode 12] Latency: 3510 ms
[Episode 12] Action: 10, Reward: 0.600

In [14]:
caca

NameError: name 'caca' is not defined

In [None]:
def run_experiment_series(
    param_name: str,
    values: list[float],
    config_vector: list[int],
    start_ts: str,
    end_ts: str,
    obd_csv_path: str,
    fixed_epsilon: float,
    fixed_gamma: float,
    fixed_alpha: float,
    episodes: int = 100
) -> dict:
    """
    Runs a sweep over one RL hyperparameter and logs average reward over the last 20 episodes.
    Returns a dict mapping each parameter value to its average reward.
    """
    results = {}

    for val in values:
        print(f"\n=== Running with {param_name} = {val:.2f} ===")

        # Set current value for the parameter being varied
        if param_name == "epsilon":
            epsilon, gamma, alpha = val, fixed_gamma, fixed_alpha
        elif param_name == "gamma":
            epsilon, gamma, alpha = fixed_epsilon, val, fixed_alpha
        elif param_name == "alpha":
            epsilon, gamma, alpha = fixed_epsilon, fixed_gamma, val
        else:
            raise ValueError(f"Unknown param_name: {param_name}")

        # Initialize agent
        agent = QAgent(
            action_space_size=len(ACTIONS),
            epsilon=epsilon,
            gamma=gamma,
            alpha=alpha
        )

        # Train agent
        rewards = train_q_agent(
            agent=agent,
            baseline_config_vector=config_vector,
            start_ts=start_ts,
            end_ts=end_ts,
            obd_csv_path=obd_csv_path,
            episodes=episodes,
            trace=False
        )

        # Compute average of last 20 episodes
        avg_final = round(sum(rewards[-20:]) / 20, 3)
        results[val] = avg_final
        print(f"→ avg_final_reward = {avg_final:.3f}")

    return results


#### $\epsilon$ sweep

In [None]:
epsilon_values = [0.05, 0.1, 0.2, 0.3]
config_, c_vector = load_config(0) # Config0 showed the most stable behavior during first three tests

results_epsilon = run_experiment_series(
    param_name="epsilon",
    values=epsilon_values,
    config_vector=c_vector,
    start_ts=config_["test_metadata"][0]["ts_start"],
    end_ts=config_["test_metadata"][0]["ts_end"],
    obd_csv_path="./es_events/poc_and_v0_data.csv",
    fixed_epsilon=0.0,  ### Not used in this sweep
    fixed_gamma=0.85,
    fixed_alpha=0.60,
    episodes=100
)

#### $\gamma$ sweep

In [None]:
gamma_values = [0.70, 0.85, 0.95]

results_gamma = run_experiment_series(
    param_name="gamma",
    values=gamma_values,
    config_vector=c_vector,
    start_ts=config_["test_metadata"][0]["ts_start"],
    end_ts=config_["test_metadata"][0]["ts_end"],
    obd_csv_path="./es_events/poc_and_v0_data.csv",
    fixed_epsilon=0.30,  # best from last sweep
    fixed_gamma=0.0,     # ignored in sweep
    fixed_alpha=0.60,
    episodes=100
)

#### $\alpha$ sweep

In [None]:
alpha_values = [0.3, 0.5, 0.7]

results_alpha = run_experiment_series(
    param_name="alpha",
    values=alpha_values,
    config_vector=c_vector,
    start_ts=config_["test_metadata"][0]["ts_start"],
    end_ts=config_["test_metadata"][0]["ts_end"],
    obd_csv_path="./es_events/poc_and_v0_data.csv",
    fixed_epsilon=0.30,
    fixed_gamma=0.70,
    fixed_alpha=0.0,  # not used in this sweep
    episodes=100
)

### Unit tests


In [None]:
### Helper functions

def generate_random_config() -> dict:
    config = {
        "obd_parameters": [],
        "global_settings": {}
    }

    for pid in ACTIVE_PIDS:
        enabled = random.choice([True, False])
        strategy = random.choice(STRATEGY_LIST)
        config["obd_parameters"].append({
            "pid": pid,
            "enabled": enabled,
            "strategy": strategy
        })

    config["global_settings"] = {
        "min_time": random.choice([5, 10, 30, 60, 120, 300]),
        "send_period": random.choice([10, 30, 60, 120, 300]),
        "min_saved_records": random.randint(1, 10)
    }

    return config

In [None]:
raise StopIteration("Manual testing block below. Execution stopped.")

#### Encode / decode

In [None]:
config = {
    "obd_parameters": [
        {"pid": 12, "enabled": True, "strategy": "on_change"},
        {"pid": 13, "enabled": True, "strategy": "monitoring"},
        {"pid": 39, "enabled": False, "strategy": "hysteresis"},
    ],
    "global_settings": {
        "min_time": 10,
        "send_period": 60,
        "min_saved_records": 1
    }
}

vec = encode_config(config)
decoded = decode_config(vec)
assert config == decoded  # Should pass


#### Apply action

In [None]:
def test_apply_action_on_all_ids():
    base_config = generate_random_config()
    baseline_vector = encode_config(base_config)
    print("Base Config:", decode_config(baseline_vector))

    n_pids = len(ACTIVE_PIDS)
    total_actions = len(ACTIONS)

    for action_id in range(total_actions):
        print(f"\n--- Testing action {action_id} ---")
        new_vector, trace = apply_action(baseline_vector, action_id, trace=True, baseline=baseline_vector)

        # Assert same length
        assert len(new_vector) == len(baseline_vector), f"Vector length changed for action {action_id}"

        # Assert valid strategy index
        for i in range(n_pids):
            strategy_idx = new_vector[i * 3 + 2]
            assert 0 <= strategy_idx < len(STRATEGY_LIST), f"Invalid strategy index {strategy_idx} after action {action_id}"

        # Global settings sanity check
        g_base = n_pids * 3
        assert new_vector[g_base] >= 1, "min_time below 1"
        assert new_vector[g_base + 1] >= 1, "send_period below 1"
        assert 1 <= new_vector[g_base + 2] <= 10, "min_saved_records out of bounds"

        print("Trace:", trace)
        print("✅ Passed all assertions for action", action_id)

In [None]:
test_apply_action_on_all_ids()

#### Q-Agent

In [None]:
def test_qagent_basic():
    agent = QAgent(action_space_size=13, epsilon=0.0)  # deterministic
    state = encode_config({
        "obd_parameters": [
            {"pid": 12, "enabled": True, "strategy": "on_change"},
            {"pid": 13, "enabled": True, "strategy": "on_change"},
            {"pid": 39, "enabled": True, "strategy": "on_change"}
        ],
        "global_settings": {"min_time": 10, "send_period": 60, "min_saved_records": 1}
    })

    action = agent.select_action(state)
    next_state, _ = apply_action(state, action, baseline=state)
    reward = 2.5
    agent.update(state, action, reward, next_state)

    # Assert Q-value updated
    state_key = tuple(state)
    assert action < len(agent.q_table[state_key]), "Q-table entry missing"
    q_value = agent.q_table[state_key][action]
    assert q_value != 0.0, "Q-value not updated"
    print(f"✅ Q-table updated: Q[state][{action}] = {q_value:.3f}")

In [None]:
test_qagent_basic()

In [None]:
def test_qagent_training():
    baseline_config = {
        "obd_parameters": [
            {"pid": 12, "enabled": True, "strategy": "monitoring"},
            {"pid": 13, "enabled": True, "strategy": "monitoring"},
            {"pid": 39, "enabled": True, "strategy": "monitoring"}
        ],
        "global_settings": {"min_time": 300, "send_period": 120, "min_saved_records": 1}
    }

    baseline_vector = encode_config(baseline_config)
    agent = QAgent(action_space_size=13, epsilon=0.3)

    rewards = train_q_agent(
        agent,
        baseline_vector,
        start_ts="2025-05-16T06:40:38Z",
        end_ts="2025-05-17T23:59:00Z"  ,
        freq_csv_path="../../data_proc/csv_data/qa_device/frequencies.csv",
        obd_csv_path="../../data_proc/csv_data/qa_device/obd_export.csv",
        episodes=5,
        trace=True
    )

    assert len(rewards) == 5, "Incorrect number of training episodes"
    assert all(isinstance(r, (float, int)) for r in rewards), "Non-numeric reward detected"
    print("✅ Training rewards:", rewards)

In [None]:
test_qagent_training()

In [None]:
### Training

baseline_config = {
    "obd_parameters": [
        {"pid": 12, "enabled": True, "strategy": "monitoring"},
        {"pid": 13, "enabled": True, "strategy": "monitoring"},
        {"pid": 39, "enabled": True, "strategy": "monitoring"}
    ],
    "global_settings": {"min_time": 300, "send_period": 120, "min_saved_records": 1}
}

baseline_vector = encode_config(baseline_config)

agent = QAgent(action_space_size=13, epsilon=0.3)
rewards = train_q_agent(
    agent,
    baseline_vector,
    start_ts="2025-05-23T13:55:00Z",
    end_ts="2025-05-23T14:07:00Z",
    freq_csv_path="frequencies.csv",
    obd_csv_path="obd_export.csv"
)

plot_rewards(rewards)

### References



**Implementation references**

- OpenAI Spinning Up: https://spinningup.openai.com
Although it focuses more on policy-gradient methods, it gives good context on where Q-learning fits in the broader RL ecosystem.

- RL Course by David Silver (DeepMind)
Lectures 4–6 cover model-free methods, including Q-Learning.

- Towards Data Science
https://towardsdatascience.com/reinforcement-learning-explained-visually-part-4-q-learning-step-by-step-b65efb731d3e/



**Academic references**

1. Watkins, C.J.C.H., & Dayan, P. (1992)
   Q-learning: https://link.springer.com/article/10.1007/BF00992698

   
2. Sutton, R. S., & Barto, A. G. (2018)
    Reinforcement Learning: An Introduction (2nd Edition)
    Chapter 6 covers Q-Learning in depth.
    http://incompleteideas.net/book/the-book-2nd.html 