# DEIA Analytics Explorer (Plotly)

Interactive exploration of DEIA analytics data (no matplotlib required).
Run the ETL first (see `.deia/analytics/README.md`).

## Setup & Imports
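Imports pandas and plotly when they are available. Without pandas, tables load as plain Python lists and the charts and summaries are skipped; without plotly, only the charts are skipped.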

In [None]:
import json
from pathlib import Path
# Optional dependencies: each import is probed independently and the outcome
# is recorded in a HAVE_* flag that later cells check before using the library.
try:
    import pandas as pd
except Exception as exc:
    HAVE_PANDAS = False
    print('[warn] pandas not available:', exc)
else:
    HAVE_PANDAS = True

try:
    import plotly.express as px
except Exception as exc:
    HAVE_PLOTLY = False
    print('[info] plotly not available:', exc)
else:
    HAVE_PLOTLY = True


## Locate Project Root and Staging Directory

In [None]:
def find_project_root(start: "Path | None" = None, max_depth: int = 8) -> Path:
    """Return the nearest directory at or above *start* that contains `.deia`.

    Args:
        start: Directory to begin the search from. When omitted, the current
            working directory is looked up at call time (not at definition
            time, so a later directory change is honoured).
        max_depth: Maximum number of directories to inspect, counting *start*
            itself.

    Returns:
        The first directory found that has a `.deia` subdirectory, or the
        resolved *start* when none is found within *max_depth* levels or the
        filesystem root is reached.
    """
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    candidate = origin
    for _ in range(max_depth):
        if (candidate / '.deia').is_dir():
            return candidate
        # A path that is its own parent is the filesystem root: stop climbing.
        if candidate.parent == candidate:
            break
        candidate = candidate.parent
    return origin
# Resolve the project root once, then derive the analytics staging directory
# from it; both are printed so the notebook shows which paths are in use.
PROJECT_ROOT = find_project_root()
STAGING_DIR = PROJECT_ROOT.joinpath('.deia', 'analytics', 'staging')
print('PROJECT_ROOT =', PROJECT_ROOT)
print('STAGING_DIR  =', STAGING_DIR)


## Load Staging NDJSON Tables

In [None]:
# Names of the staging tables; each is a subdirectory of STAGING_DIR.
TABLES = [
    'sessions',
    'session_decisions',
    'session_action_items',
    'session_files_modified',
    'events',
    'heartbeats',
]


def load_ndjson_table(table: str):
    """Load every NDJSON record of one staging table.

    Reads all files matching `dt=*/*.ndjson` under `STAGING_DIR / table` in
    sorted path order. Blank lines are ignored. Lines that are not valid JSON
    are skipped (best-effort load) and a per-file warning with the skipped
    count is printed.

    Args:
        table: Name of the table subdirectory under STAGING_DIR.

    Returns:
        A pandas DataFrame of the parsed records when HAVE_PANDAS is true,
        otherwise a plain list of the parsed JSON values. Empty when no files
        match.
    """
    rows = []
    for fp in sorted((STAGING_DIR / table).glob('dt=*/*.ndjson')):
        skipped = 0
        # Iterate the file handle instead of str.splitlines(): splitlines()
        # also breaks on U+2028/U+2029/U+0085, which may legally appear
        # unescaped inside a JSON string and would cut a record in half.
        with fp.open(encoding='utf-8', errors='replace') as fh:
            for line in fh:
                record = line.strip()
                if not record:
                    continue
                try:
                    rows.append(json.loads(record))
                except ValueError:  # json.JSONDecodeError is a ValueError
                    skipped += 1
        if skipped:
            print(f'[warn] {fp}: skipped {skipped} malformed line(s)')
    return pd.DataFrame(rows) if HAVE_PANDAS else rows
# Load every staging table up front, keyed by table name.
DATA = {name: load_ndjson_table(name) for name in TABLES}
# Row count per table; as the cell's last expression it is shown as the output.
{name: len(loaded) for name, loaded in DATA.items()}


## Event Types Distribution (Plotly)

In [None]:
# Horizontal bar chart of how many events there are of each event type.
if not (HAVE_PANDAS and HAVE_PLOTLY and len(DATA['events']) > 0):
    print('[info] plotly/pandas not available or events empty')
else:
    ev = DATA['events']
    if 'event_type' not in ev.columns:
        print('[info] no event_type column')
    else:
        # Count occurrences per type, order by ascending count, and flatten
        # into a two-column frame for plotting.
        per_type = ev['event_type'].value_counts()
        counts = per_type.sort_values(ascending=True).reset_index()
        counts.columns = ['event_type', 'count']
        fig = px.bar(
            counts,
            x='count',
            y='event_type',
            orientation='h',
            title='Event Types Distribution',
        )
        fig.show()


## Events Over Time (Daily)

In [None]:
# Line chart of the number of events per calendar day.
if not (HAVE_PANDAS and HAVE_PLOTLY and len(DATA['events']) > 0):
    print('[info] plotly/pandas not available or events empty')
else:
    ev = DATA['events']
    # Prefer the 'ts' column, fall back to 'timestamp'; None when neither exists.
    ts_col = next((name for name in ('ts', 'timestamp') if name in ev.columns), None)
    if ts_col is None:
        print('[info] no timestamp found')
    else:
        # Adds a parsed '_ts' column to the events table in place; values that
        # cannot be parsed become NaT and are dropped before counting.
        ev['_ts'] = pd.to_datetime(ev[ts_col], errors='coerce', utc=True)
        timed = ev.dropna(subset=['_ts']).set_index('_ts')
        daily = timed.resample('D').size().to_frame('count').reset_index()
        fig = px.line(daily, x='_ts', y='count', title='Events per Day')
        fig.show()


## Heartbeats — Last Seen per Bot

In [None]:
# Horizontal bar chart of the most recent heartbeat timestamp for each bot.
if not (HAVE_PANDAS and HAVE_PLOTLY and len(DATA['heartbeats']) > 0):
    print('[info] plotly/pandas not available or heartbeats empty')
else:
    hb = DATA['heartbeats']
    # Prefer the 'ts' column, fall back to 'timestamp'; None when neither exists.
    ts_col = next((name for name in ('ts', 'timestamp') if name in hb.columns), None)
    if ts_col is None or 'bot_id' not in hb.columns:
        print('[info] missing bot_id or timestamp')
    else:
        # Adds a parsed '_ts' column to the heartbeats table in place; rows
        # whose timestamp cannot be parsed are dropped.
        hb['_ts'] = pd.to_datetime(hb[ts_col], errors='coerce', utc=True)
        ordered = hb.dropna(subset=['_ts']).sort_values('_ts')
        # After the chronological sort, the last row of each bot_id group is
        # that bot's latest heartbeat.
        newest = ordered.groupby('bot_id').tail(1)[['bot_id', '_ts']]
        hb2 = newest.drop_duplicates('bot_id').sort_values('_ts')
        fig = px.bar(
            hb2,
            x='_ts',
            y='bot_id',
            orientation='h',
            title='Heartbeats — Last Seen per Bot',
        )
        fig.show()


## Sessions — Counts and Summary

In [None]:
# Sessions per day (line chart), a preview of the sessions table, and summary
# statistics of decisions / action items per session.
if HAVE_PANDAS and len(DATA['sessions']) > 0:
    ss = DATA['sessions']
    if 'ts_start' in ss.columns:
        # Adds a parsed '_start' column to the sessions table in place;
        # unparseable values become NaT and are dropped before counting.
        ss['_start'] = pd.to_datetime(ss['ts_start'], errors='coerce', utc=True)
        daily = ss.dropna(subset=['_start']).set_index('_start').resample('D').size().to_frame('count').reset_index()
        if HAVE_PLOTLY:
            fig = px.line(daily, x='_start', y='count', title='Sessions per Day')
            fig.show()
    else:
        # Without this guard '_start' would be an all-None object column and
        # resample() would raise TypeError on the non-datetime index.
        print('[info] no ts_start column')
    display(ss.head(5))
    # Decisions/action items summary if present
    dec = DATA['session_decisions']
    act = DATA['session_action_items']
    both_loaded = len(dec) > 0 and len(act) > 0
    # All three tables need 'session_id' for the groupby and the join below.
    if both_loaded and all('session_id' in tbl.columns for tbl in (ss, dec, act)):
        dec_c = dec.groupby('session_id').size().rename('decisions')
        act_c = act.groupby('session_id').size().rename('action_items')
        joined = ss.set_index('session_id').join(dec_c, how='left').join(act_c, how='left')
        # Sessions with no matching rows count as zero; only the two count
        # columns are filled so unrelated session columns are left untouched.
        count_cols = ['decisions', 'action_items']
        joined[count_cols] = joined[count_cols].fillna(0)
        display(joined[count_cols].describe())
else:
    print('[info] pandas not available or sessions empty')


## Next Ideas
- Add Parquet export and DuckDB views (Phase 2)
- Build dashboards for event taxonomy and session outcomes
- Add privacy redaction hooks and incremental loads