In [None]:
!pip install openai dotenv tiktoken pandas tqdm qdrant_client


In [None]:
import pandas as pd
import os
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from openai import OpenAI
import tiktoken



CHANNEL_NAME = "bluepigeon0810"  # example channel name
JSON_PATH = f"{CHANNEL_NAME}.jsonl"          # input file, named after the channel
COLLECTION_NAME = f"{CHANNEL_NAME}_videos"   # Qdrant collection, named after the channel

# ==== Configuration ====
# Credentials are read from environment variables so real keys never have to be
# written into (or saved with) the notebook. When a variable is not set, the
# original placeholder value is used, so the placeholders can still be edited
# by hand as before.
QDRANT_URL = os.environ.get("QDRANT_URL", "your_qdrant_url")              # e.g. "https://xxxx-xxxxx.eu-central.aws.cloud.qdrant.io"
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "your_qdrant_api_key")  # from your Qdrant Cloud dashboard
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")

EMBED_MODEL = "text-embedding-3-small"
MAX_TOKENS = 8150  # upper bound on the number of tokens sent per embedding request
encoding = tiktoken.encoding_for_model(EMBED_MODEL)

# Output dimensionality of each supported embedding model. The vector size of
# the Qdrant collection must match the model's output size exactly.
EMBEDDING_DIMENSIONS = {
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
    "text-embedding-ada-002": 1536,
}
# Models not listed above fall back to 3072, matching the previous behaviour.
vector_size = EMBEDDING_DIMENSIONS.get(EMBED_MODEL, 3072)
# ==== Helper function ====
def clean_srt(transcript: str) -> str:
    """Flatten an SRT transcript into a single line of spoken text.

    Each line is stripped of surrounding whitespace and then dropped when it
    is empty, consists only of digits (cue numbers), or contains the ``-->``
    timestamp arrow. The surviving lines are joined with single spaces.

    Args:
        transcript: Raw transcript text in SRT format.

    Returns:
        The remaining text joined by single spaces; ``""`` when nothing is left.
    """
    kept = []
    for raw_line in transcript.splitlines():
        content = raw_line.strip()
        if not content:
            continue  # blank separator line
        if content.isdigit() or "-->" in content:
            continue  # cue number or timestamp line
        kept.append(content)
    return " ".join(kept)

# ==== Initialize clients ====
# OpenAI client, used further down to compute the embeddings.
client = OpenAI(api_key=OPENAI_API_KEY)

# --- Qdrant Cloud connection ---
qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# ==== Load JSON ====
# JSON Lines input: every line of the file is parsed as one record.
df = pd.read_json(JSON_PATH, lines=True)

# ==== Create or reset collection ====
# The collection always ends up freshly created: when one with the same name
# is already present it is deleted first, so previously stored points are lost.
existing = [c.name for c in qdrant.get_collections().collections]
had_collection = COLLECTION_NAME in existing

if had_collection:
    print(f"ℹ️ Collection '{COLLECTION_NAME}' already exists. Clearing existing data...")
    qdrant.delete_collection(collection_name=COLLECTION_NAME)

qdrant.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

if had_collection:
    print(f"✅ Re-created collection '{COLLECTION_NAME}'")
else:
    print(f"✅ Created collection '{COLLECTION_NAME}'")

# ==== Embed & upload ====
# Build one Qdrant point per row: the vector is the embedding of the cleaned
# (and, if necessary, truncated) transcript; the payload keeps the video
# metadata together with the original, uncleaned transcript.
points = []
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Embedding & uploading"):
    text = row["transcript"]
    video_id = row["video_id"]
    upload_date = row["upload_date"]
    video_title = row["title"]

    # A missing transcript is not a string and would make clean_srt raise,
    # aborting the whole run. Skip the row and report it instead; tqdm.write
    # prints without breaking the progress bar.
    if not isinstance(text, str):
        tqdm.write(f"⚠️ Skipping video '{video_id}': transcript is missing")
        continue

    embedding_input = clean_srt(text)

    # The embeddings endpoint does not accept an empty input string, so rows
    # whose transcript contains no text after cleaning are skipped as well.
    if not embedding_input:
        tqdm.write(f"⚠️ Skipping video '{video_id}': transcript is empty after cleaning")
        continue

    # Truncate over-long transcripts to MAX_TOKENS tokens and turn the kept
    # tokens back into text for the request.
    tokens = encoding.encode(embedding_input)
    if len(tokens) > MAX_TOKENS:
        tokens = tokens[:MAX_TOKENS]
        embedding_input = encoding.decode(tokens)

    emb = client.embeddings.create(
        model=EMBED_MODEL,
        input=embedding_input
    ).data[0].embedding

    points.append(PointStruct(
        id=idx,  # the DataFrame index of the row doubles as the point id
        vector=emb,
        payload={
            "video_id": video_id,
            "title": video_title,
            "upload_date": upload_date,
            "transcript": text
        }
    ))




In [None]:
# Upload in batches to avoid timeout
BATCH_SIZE = 50
# Ceiling division gives the real number of batches. The previous
# `len(points) // BATCH_SIZE + 1` reported one batch too many whenever the
# number of points was an exact multiple of BATCH_SIZE.
total_batches = (len(points) + BATCH_SIZE - 1) // BATCH_SIZE
for i in range(0, len(points), BATCH_SIZE):
    batch = points[i:i + BATCH_SIZE]
    qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
    print(f"✅ Uploaded batch {i // BATCH_SIZE + 1}/{total_batches}")