In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 01: Data Collection for Chronic Condition Coach\n",
    "\n",
    "## Goal\n",
    "Load and explore health knowledge documents for our RAG system."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 1: Imports (standard library first, then third-party)\n",
    "import json\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "print(\"✅ Libraries imported\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 2: Setup directories\n",
    "# Create the data folders if they do not exist yet. exist_ok=True makes this\n",
    "# cell safe to re-run; parents=True also creates ../data when it is missing.\n",
    "for data_dir in (\"../data/raw\", \"../data/cleaned\"):\n",
    "    Path(data_dir).mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "print(\"📁 Directories ready\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 3: Load all documents\n",
    "# Reads every .json file in ../data/raw into raw_docs (parsed content) and\n",
    "# raw_files (matching file names, same order).\n",
    "raw_docs = []\n",
    "raw_files = []\n",
    "\n",
    "# sorted(): os.listdir returns entries in arbitrary order, so sort to get the\n",
    "# same document order on every run and platform.\n",
    "for file in sorted(os.listdir(\"../data/raw\")):\n",
    "    # summary.json is written into this same folder by the last cell of this\n",
    "    # notebook. It is not a knowledge document, so skip it on re-runs.\n",
    "    if not file.endswith(\".json\") or file == \"summary.json\":\n",
    "        continue\n",
    "    # Read as UTF-8 explicitly instead of relying on the locale default.\n",
    "    with open(f\"../data/raw/{file}\", \"r\", encoding=\"utf-8\") as f:\n",
    "        raw_docs.append(json.load(f))\n",
    "    raw_files.append(file)\n",
    "\n",
    "print(f\"📚 Loaded {len(raw_docs)} documents from data/raw/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 4: Create a DataFrame for analysis\n",
    "# Fail early with an actionable message: data/raw may be empty (Cell 2\n",
    "# creates it when missing), which would otherwise surface as an opaque\n",
    "# KeyError on df['text'] below.\n",
    "if not raw_docs:\n",
    "    raise ValueError(\"No documents loaded - add .json files to ../data/raw and re-run Cell 3\")\n",
    "\n",
    "df = pd.DataFrame(raw_docs)\n",
    "\n",
    "# This cell reads the 'text' and 'source' fields of every document.\n",
    "missing_cols = {\"text\", \"source\"} - set(df.columns)\n",
    "if missing_cols:\n",
    "    raise ValueError(f\"Loaded documents lack required field(s): {sorted(missing_cols)}\")\n",
    "\n",
    "df['text_length'] = df['text'].apply(len)  # length in characters\n",
    "df['word_count'] = df['text'].apply(lambda x: len(x.split()))  # whitespace-separated tokens\n",
    "\n",
    "print(\"\\n📊 Document Statistics:\")\n",
    "print(f\"Total documents: {len(df)}\")\n",
    "print(f\"Unique sources: {df['source'].nunique()}\")\n",
    "print(\"\\nSources breakdown:\")\n",
    "print(df['source'].value_counts())\n",
    "print(f\"\\nAverage word count: {df['word_count'].mean():.0f} words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 5: Preview documents\n",
    "# Shows title, source and the first 150 characters of the first three rows.\n",
    "# Number the previews by position (1, 2, 3) rather than by index label, so\n",
    "# the numbering stays correct even if df does not have a default 0..n-1 index.\n",
    "print(\"\\n🔍 Sample Documents:\")\n",
    "for position, (_label, row) in enumerate(df.head(3).iterrows(), start=1):\n",
    "    print(f\"\\n{position}. {row['title']} ({row['source']})\")\n",
    "    print(f\"   Text preview: {row['text'][:150]}...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cell 6: Save summary\n",
    "# Every field is derived from df so all figures describe the same set of\n",
    "# documents. .tolist() converts NumPy scalars to plain Python values, which\n",
    "# json.dump requires.\n",
    "summary = {\n",
    "    \"total_documents\": len(df),\n",
    "    \"sources\": df['source'].unique().tolist(),\n",
    "    \"total_words\": int(df['word_count'].sum()),\n",
    "    \"sample_titles\": df['title'].head(5).tolist()\n",
    "}\n",
    "\n",
    "# Note: this file lands in the same folder Cell 3 reads its .json inputs from.\n",
    "with open(\"../data/raw/summary.json\", \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(summary, f, indent=2)\n",
    "\n",
    "print(\"\\n✅ Data collection complete!\")\n",
    "print(\"Summary saved to: data/raw/summary.json\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}