In [None]:
# ---------- Essentials: copy existing DB docs, then import your JSON transcripts ----------

from pathlib import Path
from datetime import datetime
import json
from pymongo import MongoClient, InsertOne, UpdateOne

# --- 0) Config (adjust to your setup) ---
# MongoDB connection string plus the database / collection names used below.
MONGO_URI = "mongodb://localhost:27018/"
DB_NAME = "transcriptions"
SRC_COL = "transcripts"          # collection the existing documents are read from
DST_COL = "transcripts_denis"    # collection everything is written to

# Directory that is searched recursively for JSON transcript files
# (e.g. it contains results_<timestamp> folders).
JSON_ROOT = Path.home().joinpath("stanic_audio")

# Tag stored in the "model" field of every imported document, for traceability.
MODEL_TAG = "speechbrain_whisper_rescuespeech"

# --- 1) Connect to MongoDB ---
from pymongo.errors import OperationFailure

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
src = db[SRC_COL]
dst = db[DST_COL]

# A unique index on `filename` keeps the filename-keyed upserts fast and stops
# re-runs from creating duplicate documents.
# Re-creating an identical index is a no-op, so a failure here signals a real
# problem (e.g. the destination already holds duplicate filenames, or an index
# with conflicting options exists). Report it instead of hiding it, but keep
# going so the script stays best-effort as before. Connection-level errors are
# not caught here; they would fail at the first query below anyway.
try:
    dst.create_index("filename", unique=True)
except OperationFailure as exc:
    print(f"WARNING: could not create unique index on 'filename': {exc}")

# --- 2) Copy all existing docs from `transcripts` -> `transcripts_denis` (idempotent) ---
# The destination is not dropped. `_id` is projected away so the destination
# assigns fresh ids to the copies. Idempotency relies on the unique `filename`
# index created on the destination earlier in this script: documents whose
# filename is already present fail with a duplicate-key error, which is counted
# as "skipped" instead of aborting the run. Any other write error is re-raised.
from pymongo.errors import BulkWriteError

DUPLICATE_KEY_CODE = 11000  # MongoDB error code for a unique-index violation
BATCH_SIZE = 1000


def _flush_copy_batch(collection, operations):
    """Write one unordered batch to *collection*, tolerating duplicate keys.

    Returns a tuple ``(inserted, skipped)`` where ``skipped`` is the number of
    operations rejected because of a duplicate key.

    Raises:
        BulkWriteError: if the batch produced any error other than a
            duplicate-key violation (including write-concern errors).
    """
    try:
        result = collection.bulk_write(operations, ordered=False)
    except BulkWriteError as exc:
        details = exc.details
        write_errors = details.get("writeErrors", [])
        fatal = [err for err in write_errors if err.get("code") != DUPLICATE_KEY_CODE]
        if fatal or details.get("writeConcernErrors"):
            raise
        # ordered=False means every non-conflicting insert was still applied.
        return details.get("nInserted", 0), len(write_errors)
    return result.inserted_count, 0


batch = []
inserted_total = 0
skipped_total = 0

for doc in src.find({}, projection={"_id": False}):
    batch.append(InsertOne(doc))
    if len(batch) >= BATCH_SIZE:
        inserted, skipped = _flush_copy_batch(dst, batch)
        inserted_total += inserted
        skipped_total += skipped
        batch.clear()

# Flush the final, partially filled batch.
if batch:
    inserted, skipped = _flush_copy_batch(dst, batch)
    inserted_total += inserted
    skipped_total += skipped

print("Copied all existing documents from 'transcripts' to 'transcripts_denis'.")
print(f"  inserted: {inserted_total}, skipped as duplicates: {skipped_total}")

# --- 3) Load your JSON transcript files and upsert into `transcripts_denis` ---
# We search recursively for *.json under JSON_ROOT (e.g., .../results_<timestamp>/*.json).
# NOTE: documents are keyed by the bare file name, so two files with the same
# name in different folders map to the same document; because the paths are
# processed in sorted order, the one that sorts last wins.
from datetime import timezone

json_files = sorted(JSON_ROOT.rglob("*.json"))
imported_count = 0
skipped_paths = []  # files that could not be decoded/parsed

for jpath in json_files:
    # Skip non-files (e.g. a directory whose name ends in .json)
    if not jpath.is_file():
        continue

    # Read JSON: the file is expected to contain a list of segments
    # [{start, end, text, words}, ...]; the parsed value is stored as-is.
    try:
        with jpath.open("r", encoding="utf-8") as f:
            segments = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Skip files that are not valid UTF-8 JSON, but remember them so the
        # final summary does not claim they were imported.
        skipped_paths.append(jpath)
        continue

    # Prepare the document we store
    doc = {
        "filename": jpath.name,
        "filepath": str(jpath),
        "model": MODEL_TAG,
        # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated).
        "ingested_at": datetime.now(timezone.utc),
        "segments": segments,
    }

    # Upsert by filename so repeated runs update the document instead of
    # inserting a second copy.
    dst.update_one(
        {"filename": jpath.name},
        {"$set": doc},
        upsert=True,
    )
    imported_count += 1

print(f"Imported/updated {imported_count} JSON files into 'transcripts_denis'.")
if skipped_paths:
    print(f"Skipped {len(skipped_paths)} unreadable JSON files:")
    for skipped in skipped_paths:
        print(f"  {skipped}")
print("✅ Done.")


In [None]:
# DONE