# DEIA Analytics Explorer

This notebook provides interactive exploration of DEIA analytics data produced by the local ETL.

What you can do here:
- Load staging NDJSON tables (sessions, events, heartbeats, etc.)
- Inspect row counts and sample records
- Plot event distributions and time series
- Review heartbeats (last seen per bot)
- Summarize sessions, decisions, and action items

Note: This assumes the ETL has run (see `.deia/analytics/README.md`).

## Setup & Imports

Imports the core libraries. pandas and matplotlib are optional: if either is missing, tables are loaded as plain lists of dicts using the standard-library `json` module, and the charting cells are skipped.

In [None]:
import os, sys, json, glob, pathlib, datetime
from pathlib import Path

# Optional dependencies: pandas for tabular loading, matplotlib for charts.
# Both must import for the charting cells to run, so one flag covers the pair.
try:
    import pandas as pd
    import matplotlib.pyplot as plt
    HAVE_PANDAS = True
except Exception as e:
    # Deliberately broad: a broken install or backend can raise more than
    # ImportError, and the notebook should still degrade to the minimal loaders.
    HAVE_PANDAS = False
    print('[warn] pandas/matplotlib not available; using minimal loaders. Error:', e)

# Only touch rcParams when matplotlib actually imported. Referencing `plt`
# unconditionally would raise NameError right after the fallback message above.
if HAVE_PANDAS:
    plt.rcParams['figure.figsize'] = (10, 4)

## Configure Paths

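The helper below locates the project root (the folder containing `.deia`) by checking the current working directory and then up to five parent folders; if no `.deia` folder is found it falls back to the current working directory. Adjust `PROJECT_ROOT` manually if needed.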

In [None]:
def find_project_root(start: "Path | None" = None, max_depth: int = 6) -> Path:
    """Locate the project root: the nearest directory containing ``.deia``.

    Checks ``start`` itself and then successive parents, examining at most
    ``max_depth`` directories in total.

    Args:
        start: Directory to begin from. Defaults to the current working
            directory, looked up at call time so that a later ``os.chdir``
            is honoured (a ``Path.cwd()`` default would be frozen when the
            function is defined).
        max_depth: Maximum number of directories to examine, counting
            ``start`` itself.

    Returns:
        The first examined directory that contains a ``.deia`` folder, or the
        resolved ``start`` directory when none is found.
    """
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    # `parents` stops at the filesystem root, so no explicit root check is needed.
    for candidate in [origin, *origin.parents][:max_depth]:
        if (candidate / '.deia').is_dir():
            return candidate
    return origin

# Resolve the project root once; the staging tables live beneath its .deia folder.
PROJECT_ROOT = find_project_root()
STAGING_DIR = PROJECT_ROOT.joinpath('.deia', 'analytics', 'staging')

# Echo both paths so a wrong root is obvious before any data is loaded.
print('PROJECT_ROOT =', PROJECT_ROOT)
print('STAGING_DIR  =', STAGING_DIR)

## Load Staging Tables (NDJSON)

Loads all partitions under `dt=YYYY-MM-DD` for each table. If pandas is available, returns DataFrames. Otherwise, returns lists of dicts.

In [None]:
# Staging tables to load; each name is a sub-directory of the staging directory.
TABLES = [
    # Session-level tables
    'sessions', 'session_decisions', 'session_action_items', 'session_files_modified',
    # Activity streams
    'events', 'heartbeats',
]

def load_ndjson_table(table: str, staging_dir: "Path | None" = None):
    """Load every NDJSON partition of one staging table.

    Reads all ``dt=*/*.ndjson`` files under ``staging_dir / table`` in sorted
    path order and parses one JSON value per non-blank line.

    Args:
        table: Table name, i.e. the sub-directory of ``staging_dir`` to read.
        staging_dir: Root of the staging area. Defaults to the module-level
            ``STAGING_DIR``, looked up at call time so that reassigning
            ``STAGING_DIR`` takes effect without redefining this function.

    Returns:
        A ``pandas.DataFrame`` when ``HAVE_PANDAS`` is true, otherwise a list
        of the parsed rows. Both are empty when no partition files are found.
    """
    root = STAGING_DIR if staging_dir is None else Path(staging_dir)
    rows = []
    skipped = 0
    for fp in sorted((root / table).glob('dt=*/*.ndjson')):
        # errors='replace' keeps a partition readable even with stray bad bytes.
        with fp.open('r', encoding='utf-8', errors='replace') as f:
            for line in f:
                s = line.strip()
                if not s:
                    continue
                try:
                    rows.append(json.loads(s))
                except json.JSONDecodeError:
                    # Best-effort load: keep going, but count the bad line so
                    # the data loss is reported instead of silent.
                    skipped += 1
    if skipped:
        print(f'[warn] {table}: skipped {skipped} malformed NDJSON line(s)')
    if HAVE_PANDAS:
        return pd.DataFrame(rows)
    return rows

# Load every staging table up front, keyed by table name.
DATA = {t: load_ndjson_table(t) for t in TABLES}

# Row counts per table (shown as the cell output). len() works for both the
# DataFrame and the plain-list loader results, so no pandas check is needed.
{t: len(rows) for t, rows in DATA.items()}

## Quick Inventory

Row counts per table and a few sample rows.

In [None]:
# Row counts per table; len() works for both DataFrames and plain lists.
for t, df in DATA.items():
    print(f'{t}: {len(df)} rows')

# Sample rows are only rendered for non-empty DataFrames (pandas available).
# NOTE: `display` is not defined in this file; it is expected to come from the
# notebook environment.
if HAVE_PANDAS:
    for sample_table in ('sessions', 'events'):
        if len(DATA[sample_table]) > 0:
            display(DATA[sample_table].head(5))

## Event Types Distribution

Bar chart of `event_type` counts across all events.

In [None]:
# Horizontal bar chart of how often each event_type occurs.
if not (HAVE_PANDAS and len(DATA['events']) > 0):
    print('[info] pandas not available or events empty — skipping chart')
else:
    ev = DATA['events'].copy()
    if 'event_type' not in ev.columns:
        print('[info] events table has no event_type column')
    else:
        # Ascending sort puts the most frequent type at the top of a barh chart.
        type_counts = ev['event_type'].value_counts()
        type_counts = type_counts.sort_values(ascending=True)
        type_counts.plot(kind='barh', title='Event Types Distribution')
        plt.tight_layout()
        plt.show()

## Events Over Time

Time series of events per day. To bucket by hour instead, change the resample rule in the cell below from `'D'` to `'h'`.

In [None]:
# Line chart of the number of events per calendar day (UTC).
if HAVE_PANDAS and len(DATA['events']) > 0:
    ev = DATA['events'].copy()
    # Use the first timestamp-like column present; 'ts' takes precedence.
    ts_col = next((c for c in ('ts', 'timestamp') if c in ev.columns), None)
    if ts_col is None:
        print('[info] could not find timestamp column in events')
    else:
        # Unparseable values become NaT and are dropped before bucketing.
        ev['_ts'] = pd.to_datetime(ev[ts_col], errors='coerce', utc=True)
        timed = ev.dropna(subset=['_ts'])
        if timed.empty:
            # Nothing parsed, so there is no series to resample or plot.
            print(f'[info] no parseable timestamps in events.{ts_col} — skipping chart')
        else:
            daily = timed.set_index('_ts').resample('D').size()
            daily.plot(title='Events per Day')
            plt.tight_layout()
            plt.show()
else:
    print('[info] pandas not available or events empty — skipping chart')

## Heartbeats — Last Seen Per Bot

Shows the most recent heartbeat timestamp by `bot_id`.

In [None]:
# Horizontal bar chart of the most recent heartbeat timestamp for each bot_id.
if HAVE_PANDAS and len(DATA['heartbeats']) > 0:
    hb = DATA['heartbeats'].copy()
    # Parse the first timestamp-like column present; 'ts' takes precedence.
    for c in ['ts', 'timestamp']:
        if c in hb.columns:
            hb['_ts'] = pd.to_datetime(hb[c], errors='coerce', utc=True)
            break
    if '_ts' in hb.columns and 'bot_id' in hb.columns:
        # Latest parseable heartbeat per bot, ordered oldest to newest.
        last_seen = (
            hb.dropna(subset=['_ts'])
            .groupby('bot_id')['_ts']
            .max()
            .sort_values()
        )
        if last_seen.empty:
            # Every timestamp failed to parse (or bot_id was null): nothing to plot.
            print('[info] heartbeats have no parseable timestamps — skipping chart')
        else:
            last_seen.plot(kind='barh', title='Heartbeats — Last Seen per Bot')
            plt.tight_layout()
            plt.show()
    else:
        print('[info] heartbeats missing bot_id or timestamp columns')
else:
    print('[info] pandas not available or heartbeats empty — skipping chart')

## Sessions — Activity Summary

Number of sessions per day, plus summary statistics for the number of decisions and action items per session.

In [None]:
# Sessions-per-day chart, then summary statistics of per-session decision and
# action-item counts.
if HAVE_PANDAS and len(DATA['sessions']) > 0:
    ss = DATA['sessions'].copy()

    # --- Sessions per day ---
    if 'ts_start' in ss.columns:
        # Unparseable start times become NaT and are dropped before bucketing.
        ss['_start'] = pd.to_datetime(ss['ts_start'], errors='coerce', utc=True)
        started = ss.dropna(subset=['_start'])
        if started.empty:
            print('[info] no parseable ts_start values in sessions — skipping chart')
        else:
            per_day = started.set_index('_start').resample('D').size()
            per_day.plot(title='Sessions per Day')
            plt.tight_layout()
            plt.show()
    else:
        # Without this guard the resample below would run on a non-datetime index.
        print('[info] sessions table has no ts_start column — skipping chart')

    # --- Decisions / action items per session ---
    dec = DATA['session_decisions']
    act = DATA['session_action_items']
    # All three tables must carry session_id for the group-by and join to work.
    keyed = all('session_id' in frame.columns for frame in (ss, dec, act))
    if keyed and len(dec) > 0 and len(act) > 0:
        dec_c = dec.groupby('session_id').size().rename('decisions')
        act_c = act.groupby('session_id').size().rename('action_items')
        # Start from the session ids alone so fillna(0) only touches the two
        # count columns rather than every column of the sessions table.
        joined = (
            ss[['session_id']]
            .set_index('session_id')
            .join(dec_c, how='left')
            .join(act_c, how='left')
            .fillna(0)
        )
        display(joined[['decisions', 'action_items']].describe())
else:
    print('[info] pandas not available or sessions empty — skipping charts')

## Next Ideas

- Add Parquet export and DuckDB views (Phase 2)
- Create `agents` table (derived from events/heartbeats)
- Build dashboards for event taxonomy and session outcomes
- Add privacy redaction hooks and incremental `--since` loads