In [None]:
import pandas as pd
import os
import math
import json

# Path of the CSV to read; it must provide the columns
# file, course, source, start, end and text.
INPUT_CSV = ""
# Directory that receives one "<name>_chunks.json" per distinct `file` value.
# An empty string means the current working directory.
OUTPUT_DIR = ""
# Maximum number of whitespace-separated words per chunk.
CHUNK_SIZE = 40


def split_into_chunks(text, chunk_size=CHUNK_SIZE):
    """Split *text* into strings of at most *chunk_size* words each.

    Words are whatever ``str.split()`` yields; the words of a chunk are
    re-joined with single spaces.

    Args:
        text: The text to split. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell) yields no chunks.
        chunk_size: Maximum words per chunk; must be at least 1.

    Returns:
        A list of chunk strings, empty when *text* has no words.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not isinstance(text, str):
        return []
    words = text.split()
    return [
        " ".join(words[start : start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def build_chunks(group, chunk_size=CHUNK_SIZE):
    """Build the chunk records for all rows that belong to one file.

    Args:
        group: DataFrame holding the rows of a single file, with the columns
            file, course, source, start, end and text.
        chunk_size: Maximum words per chunk.

    Returns:
        A list of dicts, one per chunk, in row order. ``chunk_id`` counts
        chunks across the whole group, and ``position`` is that index scaled
        to the range [0.0, 1.0] over the group (0.0 when there is only one
        chunk).
    """
    records = []
    for _, row in group.iterrows():
        for piece in split_into_chunks(row["text"], chunk_size):
            records.append(
                {
                    "file": row["file"],
                    "course": row["course"],
                    "source": row["source"],
                    "start": row["start"],
                    "end": row["end"],
                    "chunk_id": len(records),
                    "text": piece,
                }
            )

    # chunk_id is a file-wide index, so it has to be normalised by the
    # file-wide chunk count. Dividing by the per-row count produced positions
    # greater than 1.0 as soon as a file had more than one row.
    last_index = len(records) - 1
    for record in records:
        record["position"] = (
            round(record["chunk_id"] / last_index, 4) if last_index > 0 else 0.0
        )
    return records


def chunk_output_name(filename):
    """Return the output file name for the chunks of *filename*.

    A trailing ".json" extension is replaced by "_chunks.json"; any other name
    gets "_chunks.json" appended, so the output never shares the input's name.
    """
    name = str(filename)
    stem, ext = os.path.splitext(name)
    if ext.lower() == ".json":
        return stem + "_chunks.json"
    return name + "_chunks.json"


# Jupyter and `python script.py` both run with __name__ == "__main__", so the
# pipeline still executes there (and still leaves `df` / `grouped` as globals),
# while importing this code for reuse no longer triggers any file I/O.
if __name__ == "__main__":
    # os.makedirs("") raises FileNotFoundError; an empty OUTPUT_DIR simply
    # means "write next to the current working directory".
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    df = pd.read_csv(INPUT_CSV)
    print(f"Loaded {len(df)} rows from {INPUT_CSV}")

    grouped = df.groupby("file")

    for filename, group in grouped:
        chunk_data = build_chunks(group, CHUNK_SIZE)

        out_path = os.path.join(OUTPUT_DIR, chunk_output_name(filename))
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, indent=2)

        print(f"Saved {len(chunk_data)} chunks to {out_path}")