# Read Edge AI Data (All Formats)
*Generated 2025-08-19T04:24:17.353547+00:00*

This notebook demonstrates how to read **JSONL**, **Parquet**, **Avro**, and **Protobuf** records from this repository.

## Setup
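Install dependencies if needed (`pandas`, `pyarrow` or `fastparquet`, `fastavro`, `duckdb`, and optionally the `protoc` compiler for the Protobuf section), then set the base paths.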

In [None]:
import os, json, glob, sys, struct
from pathlib import Path
import pandas as pd

def repo_root(cwd: Path) -> Path:
    """Return the repository root for the given working directory.

    When the notebook is started from the ``notebooks`` folder the root is
    that folder's parent; otherwise the working directory itself is taken
    to be the root.
    """
    # The previous expression applied `.parent` to the already-resolved '..',
    # which pointed one level above the repository.
    return cwd.parent if cwd.name == 'notebooks' else cwd


# Resolve once so every derived path is absolute.
BASE = repo_root(Path.cwd()).resolve()
DATA = BASE / 'data' / 'samples'
SCHEMA = BASE / 'schema'
PROTO = BASE / 'proto'
print('BASE:', BASE)
print('DATA:', DATA)
print('SCHEMA:', SCHEMA)
print('PROTO:', PROTO)

## JSONL (hot logs)

In [None]:
# Gather every hot-path temperature log (recursively), in sorted order.
jsonl_root = DATA / 'hot' / 'temperature'
jsonl_files = sorted(jsonl_root.rglob('*.jsonl'))
jsonl_files[:3]

In [None]:
# Parse every non-blank line of every JSONL file into one list of records.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
rows = []
for jsonl_path in jsonl_files:
    with open(jsonl_path, encoding='utf-8') as fh:
        # Blank lines are skipped; json.loads tolerates surrounding whitespace.
        rows.extend(json.loads(line) for line in fh if line.strip())
# Build the frame once from the collected records.
df_jsonl = pd.DataFrame(rows)
df_jsonl.head()

## Parquet (batch analytics)

In [None]:
# Collect the batch Parquet files (recursively), in sorted order.
# When reading, pyarrow is tried first, with fastparquet as the fallback.
batch_root = DATA / 'batch'
pq_files = sorted(batch_root.rglob('*.parquet'))
pq_files[:3]

In [None]:
# Load the first Parquet file with pandas' default engine, retrying with
# fastparquet if that fails. `parquet_status` is the message displayed when
# no frame could be loaded; it distinguishes "nothing to read" from
# "reading failed".
df_parquet = None
parquet_status = 'No parquet files found.'
if pq_files:
    first_parquet = pq_files[0]
    try:
        df_parquet = pd.read_parquet(first_parquet)
    except Exception as e:
        try:
            df_parquet = pd.read_parquet(first_parquet, engine='fastparquet')
        except Exception as e2:
            # Both engines failed: report both errors and keep the notebook running.
            print('Parquet read failed:', e, e2)
            parquet_status = f'Could not read {first_parquet} (install pyarrow or fastparquet).'
df_parquet.head() if df_parquet is not None else parquet_status


## Avro (data contracts + records)

In [None]:
from fastavro import parse_schema, writer, reader

# Load and parse the Avro schema; the context manager closes the file handle
# and the encoding is pinned so decoding does not depend on the locale.
with open(SCHEMA / 'temperature.avsc', encoding='utf-8') as schema_fh:
    avsc = json.load(schema_fh)
parsed = parse_schema(avsc)

# Write a small demo Avro file to disk, then read it back to demonstrate a round-trip
records = [
    {"device_id":"D-1","site":"A","ts": 1724054400000, "celsius": 70.1, "status": None},
    {"device_id":"D-2","site":"A","ts": 1724058000000, "celsius": 83.3, "status": "ALERT"}
]
tmp_avro = BASE / 'data' / 'samples' / 'avro-demo'
tmp_avro.mkdir(parents=True, exist_ok=True)
avro_path = tmp_avro / 'temperature-demo.avro'
with open(avro_path, 'wb') as out:
    writer(out, parsed, records)
print('Wrote', avro_path)

# Read the records back and show them as a DataFrame.
with open(avro_path, 'rb') as inp:
    recs = list(reader(inp))
pd.DataFrame(recs).head()

## Protobuf (binary records)

In [None]:
import subprocess, tempfile, sys, os
from pathlib import Path

# Path to the protobuf definition; abort early when it is absent.
proto = PROTO.joinpath('temperature.proto')
if not proto.exists():
    raise FileNotFoundError(proto)

# Generated Python modules are written to notebooks/__pb__ (created on demand),
# which is also the directory later used for importing them.
py_out = module_dir = BASE.joinpath('notebooks', '__pb__')
module_dir.mkdir(parents=True, exist_ok=True)

def have_protoc():
    """Return True when a `protoc` executable can be found on PATH."""
    import shutil

    located = shutil.which('protoc')
    return located is not None

# Compile temperature.proto into a Python module when protoc is available.
compiled = False
if have_protoc():
    cmd = ['protoc', f'--proto_path={PROTO}', f'--python_out={py_out}', str(proto)]
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode == 0:
        compiled = True
    else:
        print('protoc failed:', r.stderr)
else:
    print('protoc not found; skipping runtime compile. You can precompile with `protoc`.')

if compiled:
    # Avoid stacking duplicate sys.path entries when the cell is re-run.
    if str(module_dir) not in sys.path:
        sys.path.append(str(module_dir))
    import temperature_pb2 as pb

    # Build a message, serialize it, and parse the bytes back into a new message.
    m = pb.TemperatureReading()
    m.device_id = 'D-123'
    m.site = 'A'
    m.ts_ms = 1724054400000
    m.celsius = 81.2
    m.status = 'ALERT'
    b = m.SerializeToString()
    print('Encoded bytes len:', len(b))
    m2 = pb.TemperatureReading()
    m2.ParseFromString(b)
    # A bare expression nested inside an `if` is not rendered by Jupyter,
    # so the decoded message is printed explicitly.
    print(m2)
else:
    print('Install protoc to compile and parse protobuf messages in this notebook.')


## DuckDB quick SQL on Parquet/JSONL
Optional, but handy for ad-hoc exploration.

In [None]:
import duckdb
# In-memory DuckDB connection; left open so it can be reused for further
# ad-hoc queries after this cell.
con = duckdb.connect()
con.execute("PRAGMA threads=4;")
parquet_glob = str((DATA / 'batch').resolve() / '**/*.parquet')
jsonl_glob = str((DATA / 'hot' / 'temperature').resolve() / '**/*.jsonl')


def sql_quote(text: str) -> str:
    """Return `text` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a path such as
    ``/home/o'brien/data`` cannot terminate the literal early.
    """
    return "'" + text.replace("'", "''") + "'"


# Preview five rows from each format; a failure in one does not stop the other.
for label, table_fn, pattern in (
    ('JSONL', 'read_json_auto', jsonl_glob),
    ('Parquet', 'read_parquet', parquet_glob),
):
    try:
        print(f'{label} sample:')
        print(con.execute(f"SELECT * FROM {table_fn}({sql_quote(pattern)}) LIMIT 5").fetchdf())
    except Exception as e:
        print(f'DuckDB {label} read error:', e)