# Recovered Notebook
The original file was not valid JSON. Its raw contents are included below for reference.

```
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7fb27b941602401d91542211134fc71a",
   "metadata": {},
   "source": [
    "# Read Edge AI Data (All Formats)\n",
    "*Generated 2025-08-19T04:24:17.353547+00:00*\n\n",
    "This notebook demonstrates how to read **JSONL**, **Parquet**, **Avro**, and **Protobuf** records from this repository.\n",
    "\n",
    "## Setup\n",
    "Install dependencies (if needed) and set base paths."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acae54e37e7d407bbb7b55eff062a284",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import sys\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "\n",
    "# Locate the repository root. When the kernel starts inside <repo>/notebooks the root is\n",
    "# the parent of the working directory; otherwise the working directory is assumed to be\n",
    "# the root. (Later cells rely on BASE / 'notebooks', so BASE must be the repo root.)\n",
    "CWD = Path.cwd().resolve()\n",
    "BASE = CWD.parent if CWD.name == 'notebooks' else CWD\n",
    "\n",
    "# Directories used by every section below\n",
    "DATA = BASE / 'data' / 'samples'\n",
    "SCHEMA = BASE / 'schema'\n",
    "PROTO = BASE / 'proto'\n",
    "print('BASE:', BASE)\n",
    "print('DATA:', DATA)\n",
    "print('SCHEMA:', SCHEMA)\n",
    "print('PROTO:', PROTO)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a63283cbaf04dbcab1f6479b197f3a8",
   "metadata": {},
   "source": [
    "## JSONL (hot logs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dd0d8092fe74a7c96281538738b07e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather every hot-path temperature log (recursively), sorted for a stable order,\n",
    "# and preview the first few paths.\n",
    "hot_temperature_dir = DATA / 'hot' / 'temperature'\n",
    "jsonl_files = sorted(hot_temperature_dir.rglob('*.jsonl'))\n",
    "jsonl_files[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72eea5119410473aa328ad9291626812",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse every non-blank line of every JSONL file into one list of records, then build\n",
    "# the DataFrame once (cheaper than growing a frame row by row).\n",
    "rows = []\n",
    "for f in jsonl_files:\n",
    "    # JSON text is UTF-8; be explicit so the read does not depend on the platform locale\n",
    "    with open(f, encoding='utf-8') as fh:\n",
    "        for line in fh:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue  # tolerate blank separator lines\n",
    "            rows.append(json.loads(line))\n",
    "df_jsonl = pd.DataFrame(rows)\n",
    "df_jsonl.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8edb47106e1a46a883d545849b8ab81b",
   "metadata": {},
   "source": [
    "## Parquet (batch analytics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10185d26023b46108eb7d9f57d49d2b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discover the batch Parquet files (recursively, sorted) and preview the first few paths.\n",
    "# The next cell reads them with pyarrow or fastparquet.\n",
    "batch_dir = DATA / 'batch'\n",
    "pq_files = sorted(batch_dir.rglob('*.parquet'))\n",
    "pq_files[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8763a12b2bbd4a93a75aff182afb95dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the first Parquet file with pandas' default engine, retrying with fastparquet if\n",
    "# that fails. 'Nothing to read' and 'read failed' are tracked as separate outcomes so the\n",
    "# message shown at the end matches what actually happened.\n",
    "df_parquet = None\n",
    "parquet_status = 'No parquet files found.'\n",
    "if pq_files:\n",
    "    try:\n",
    "        df_parquet = pd.read_parquet(pq_files[0])\n",
    "    except Exception as e:\n",
    "        try:\n",
    "            df_parquet = pd.read_parquet(pq_files[0], engine='fastparquet')\n",
    "        except Exception as e2:\n",
    "            # Report both failures: the default-engine error and the fastparquet error\n",
    "            print('Parquet read failed:', e, e2)\n",
    "            parquet_status = 'Parquet read failed (install pyarrow or fastparquet).'\n",
    "df_parquet.head() if df_parquet is not None else parquet_status"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7623eae2785240b9bd12b16a66d81610",
   "metadata": {},
   "source": [
    "## Avro (data contracts + records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cdc8c89c7104fffa095e18ddfef8986",
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastavro import parse_schema, writer, reader\n",
    "\n",
    "# Load the Avro schema (the data contract); the context manager closes the file promptly\n",
    "with open(SCHEMA / 'temperature.avsc', encoding='utf-8') as schema_fh:\n",
    "    avsc = json.load(schema_fh)\n",
    "parsed = parse_schema(avsc)\n",
    "\n",
    "# Write a small demo Avro file to disk and read it back to demonstrate a round trip\n",
    "records = [\n",
    "    {\"device_id\":\"D-1\",\"site\":\"A\",\"ts\": 1724054400000, \"celsius\": 70.1, \"status\": None},\n",
    "    {\"device_id\":\"D-2\",\"site\":\"A\",\"ts\": 1724058000000, \"celsius\": 83.3, \"status\": \"ALERT\"}\n",
    "]\n",
    "tmp_avro = BASE / 'data' / 'samples' / 'avro-demo'\n",
    "tmp_avro.mkdir(parents=True, exist_ok=True)\n",
    "avro_path = tmp_avro / 'temperature-demo.avro'\n",
    "with open(avro_path, 'wb') as out:\n",
    "    writer(out, parsed, records)\n",
    "print('Wrote', avro_path)\n",
    "\n",
    "# Read the records back; the reader takes the schema from the file itself\n",
    "with open(avro_path, 'rb') as inp:\n",
    "    recs = list(reader(inp))\n",
    "pd.DataFrame(recs).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b118ea5561624da68c537baed56e602f",
   "metadata": {},
   "source": [
    "## Protobuf (binary records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "938c804e27f84196a10c8828c723f798",
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "from pathlib import Path\n",
    "\n",
    "proto = PROTO / 'temperature.proto'\n",
    "if not proto.exists():\n",
    "    raise FileNotFoundError(proto)\n",
    "\n",
    "# Attempt to compile .proto -> Python module at runtime if protoc is available\n",
    "module_dir = BASE / 'notebooks' / '__pb__'\n",
    "module_dir.mkdir(parents=True, exist_ok=True)\n",
    "py_out = module_dir\n",
    "\n",
    "def have_protoc():\n",
    "    \"\"\"Return True when a `protoc` executable is found on PATH.\"\"\"\n",
    "    from shutil import which\n",
    "    return which('protoc') is not None\n",
    "\n",
    "compiled = False\n",
    "if have_protoc():\n",
    "    cmd = ['protoc', f'--proto_path={PROTO}', f'--python_out={py_out}', str(proto)]\n",
    "    r = subprocess.run(cmd, capture_output=True, text=True)\n",
    "    if r.returncode == 0:\n",
    "        compiled = True\n",
    "    else:\n",
    "        print('protoc failed:', r.stderr)\n",
    "else:\n",
    "    print('protoc not found; skipping runtime compile. You can precompile with `protoc`.')\n",
    "\n",
    "if compiled:\n",
    "    # Make the generated module importable without stacking duplicate entries on re-run\n",
    "    if str(module_dir) not in sys.path:\n",
    "        sys.path.append(str(module_dir))\n",
    "    import temperature_pb2 as pb\n",
    "    m = pb.TemperatureReading()\n",
    "    m.device_id = 'D-123'\n",
    "    m.site = 'A'\n",
    "    m.ts_ms = 1724054400000\n",
    "    m.celsius = 81.2\n",
    "    m.status = 'ALERT'\n",
    "    b = m.SerializeToString()\n",
    "    print('Encoded bytes len:', len(b))\n",
    "    # Decode into a fresh message to show the round trip\n",
    "    m2 = pb.TemperatureReading()\n",
    "    m2.ParseFromString(b)\n",
    "    # A bare expression inside an if/else is not echoed by the notebook, so print explicitly\n",
    "    print(m2)\n",
    "else:\n",
    "    print('Install protoc to compile and parse protobuf messages in this notebook.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "504fb2a444614c0babb325280ed9130a",
   "metadata": {},
   "source": [
    "## DuckDB quick SQL on Parquet/JSON\n",
    "Optional, but handy for ad-hoc exploration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59bbdb311c014d738909a11f9e486628",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "\n",
    "# In-memory DuckDB connection, capped at four worker threads\n",
    "con = duckdb.connect()\n",
    "con.execute(\"PRAGMA threads=4;\")\n",
    "\n",
    "# Recursive glob patterns handed to DuckDB's file readers\n",
    "jsonl_glob = str((DATA / 'hot' / 'temperature').resolve() / '**/*.jsonl')\n",
    "parquet_glob = str((DATA / 'batch').resolve() / '**/*.parquet')\n",
    "\n",
    "# (heading, query, error prefix) per format; each sample is attempted independently so a\n",
    "# failure in one format does not prevent the other from being shown.\n",
    "samples = [\n",
    "    ('JSONL sample:', f\"SELECT * FROM read_json_auto('{jsonl_glob}') LIMIT 5\", 'DuckDB JSONL read error:'),\n",
    "    ('Parquet sample:', f\"SELECT * FROM read_parquet('{parquet_glob}') LIMIT 5\", 'DuckDB Parquet read error:'),\n",
    "]\n",
    "for heading, query, error_prefix in samples:\n",
    "    try:\n",
    "        print(heading)\n",
    "        print(con.execute(query).fetchdf())\n",
    "    except Exception as err:\n",
    "        print(error_prefix, err)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.x"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

```