# Mandatory EDA: Document Corpus Exploration

This notebook performs exploratory data analysis on the document corpus for RAG comparison.

**Objectives**:
1. Inventory all documents in `data/raw/`
2. Analyze file statistics (size, format, distribution)
3. Check data quality (encoding, readability)
4. Assess GFS compatibility (file size limits, supported formats)
5. Estimate processing costs

In [None]:
%matplotlib inline

import sys
from pathlib import Path

# Add src to path for imports
project_root = Path.cwd().parent
sys.path.append(str(project_root / "src"))

import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import scan_documents, check_gfs_compatibility, compute_file_hash
from utils import format_bytes

# Plotting setup
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

print("Imports successful")

## 1. File Inventory and Statistics

In [None]:
# Build the file inventory for everything under data/raw/ and preview it.
data_dir = project_root.joinpath("data", "raw")
df = scan_documents(data_dir)

print("Total files found:", len(df))
print("\nDataFrame schema:")
print(df.schema)
print("\nFirst 5 files:")
# Last expression of the cell: rendered as the cell's rich output.
df.head()

In [None]:
# Corpus-level size summary: total size plus min / median / mean / max in MB.
if len(df) == 0:
    # Nothing was inventoried; tell the user how to proceed.
    print("No files found in data/raw/")
    print("Add your documents to continue exploration.")
else:
    print("File Statistics:")
    total_size = format_bytes(df["size_bytes"].sum())
    print(f"Total size: {total_size}")
    print("\nSize distribution:")
    size_mb = pl.col("size_mb")
    size_summary = df.select(
        min_mb=size_mb.min(),
        median_mb=size_mb.median(),
        mean_mb=size_mb.mean(),
        max_mb=size_mb.max(),
    )
    print(size_summary)

In [None]:
# File type distribution: number of files and total size (MB) per extension.
# `file_types` is reused by the visualization cell further down.
if len(df) > 0:
    file_types = (
        df.group_by("extension")
        .agg(
            # pl.len() is the row-count expression; pl.count() is deprecated.
            pl.len().alias("count"),
            pl.col("size_mb").sum().alias("total_size_mb"),
        )
        # group_by does not preserve input order, so break ties on the
        # extension name to keep the table (and the bar chart) stable across runs.
        .sort(["count", "extension"], descending=[True, False])
    )

    print("\nFile types:")
    print(file_types)

## 2. Visualizations

In [None]:
# Two-panel overview: file-size histogram (left) and file counts per
# extension (right). Relies on `file_types` from the file type cell above.
if len(df) > 0:
    fig, (size_ax, type_ax) = plt.subplots(ncols=2, figsize=(14, 5))

    # Left panel: histogram of file sizes with the 100 MB mark drawn in red.
    df_pd = df.to_pandas()
    size_ax.hist(df_pd["size_mb"], bins=30, edgecolor="black", alpha=0.7)
    size_ax.axvline(100, color="red", linestyle="--", label="GFS limit (100MB)")
    size_ax.set(
        xlabel="File Size (MB)",
        ylabel="Count",
        title="File Size Distribution",
    )
    size_ax.legend()

    # Right panel: horizontal bars, one per extension.
    file_types_pd = file_types.to_pandas()
    type_ax.barh(file_types_pd["extension"], file_types_pd["count"])
    type_ax.set(xlabel="Count", title="Files by Type")

    fig.tight_layout()
    plt.show()

## 3. GFS Compatibility Check

In [None]:
# Run the project's GFS compatibility check and report how many files pass.
if len(df) > 0:
    df_compat = check_gfs_compatibility(df)

    is_compatible = pl.col("gfs_compatible")
    compatible_count = df_compat.filter(is_compatible).height
    # Everything not counted as compatible is reported as incompatible.
    incompatible_count = len(df_compat) - compatible_count

    print("GFS Compatibility:")
    print(f"  Compatible: {compatible_count}")
    print(f"  Incompatible: {incompatible_count}")

    if incompatible_count > 0:
        # List the offending files with the columns needed to see why.
        report_columns = ["file_name", "extension", "size_mb", "gfs_compatible"]
        incompatible_files = df_compat.filter(~is_compatible).select(report_columns)
        print("\nIncompatible files:")
        print(incompatible_files)

## 4. Data Quality Checks

In [None]:
# Sample content inspection (for text files): print the beginning of the first
# .txt/.md file as a quick UTF-8 readability check.
SAMPLE_CHARS = 500  # number of characters shown from the sample file

if len(df) > 0:
    # NOTE(review): the match is case-sensitive; confirm that scan_documents
    # normalizes extensions, otherwise ".TXT"/".MD" files are skipped here.
    text_files = df.filter(pl.col("extension").is_in([".txt", ".md"]))

    if len(text_files) > 0:
        print("Sample text file content:")
        sample_file = Path(text_files[0, "file_path"])
        print(f"\nFile: {sample_file.name}")
        print("=" * 60)

        try:
            with open(sample_file, "r", encoding="utf-8") as f:
                content = f.read(SAMPLE_CHARS)
                # Peek one more character to learn whether anything was cut off.
                has_more = f.read(1) != ""
        except (OSError, UnicodeDecodeError) as e:
            # Missing/unreadable file or content that is not valid UTF-8.
            print(f"Error reading file: {e}")
        else:
            print(content)
            # Only claim truncation when the file really is longer than the sample.
            if has_more:
                print("\n[truncated...]")

## 5. Cost Estimation

In [None]:
# Back-of-envelope GFS indexing cost, derived from raw file sizes.
# Heuristic: roughly 1 token per 4 characters, rounded to ~250 tokens per KB.
BYTES_PER_KB = 1024
TOKENS_PER_KB = 250
# GFS pricing assumed here: $0.15 per 1M tokens for indexing.
USD_PER_MILLION_TOKENS = 0.15

if len(df) > 0:
    total_mb = df["size_mb"].sum()
    total_bytes = df["size_bytes"].sum()

    # Every byte is treated as text, whatever the file type.
    estimated_tokens = (total_bytes / BYTES_PER_KB) * TOKENS_PER_KB
    indexing_cost = (estimated_tokens / 1_000_000) * USD_PER_MILLION_TOKENS

    report_lines = [
        "Cost Estimation (rough):",
        f"  Total data size: {total_mb:.2f} MB",
        f"  Estimated tokens: {estimated_tokens:,.0f}",
        f"  Estimated GFS indexing cost: ${indexing_cost:.4f}",
        "\nNote: This is a rough estimate. Actual cost depends on file content.",
    ]
    print("\n".join(report_lines))

## Summary

This EDA provides:
- File inventory and statistics
- GFS compatibility assessment
- Data quality checks
- Cost projections

**Next Steps**:
1. Add documents to `data/raw/` if it is empty
2. Proceed to `02_gfs_setup.ipynb` to create the file search store
3. Implement custom RAG baseline for comparison