# UdaPlay RAG Pipeline (Starter Style)

This notebook loads local JSON files from `games/`, normalizes each record, embeds the resulting documents with an OpenAI embedding model, stores them in ChromaDB, and demonstrates semantic search by querying ChromaDB directly.

In [None]:
import json
import os
import uuid
from pathlib import Path

import chromadb
from dotenv import load_dotenv
from openai import OpenAI

# Pull settings from a local .env file (if present) into the environment.
load_dotenv()

# Treat the working directory as the project root when it contains games/;
# otherwise fall back to its parent directory.
project_root = Path.cwd()
project_root = project_root if (project_root / "games").exists() else project_root.parent

# Paths and names shared by the cells below.
COLLECTION_NAME = "udaplay_games"
DATA_DIR = project_root.joinpath("games")
CHROMA_PATH = str(project_root.joinpath("chroma_db"))
EMBED_MODEL = os.environ.get("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")

# An empty key is allowed here; the ingest and search cells check `api_key`
# before making any API call.
api_key = os.environ.get("OPENAI_API_KEY", "")
client = OpenAI(api_key=api_key)


def embed_texts(texts, batch_size=256):
    """Embed strings with the configured OpenAI embedding model.

    Inputs are sent in batches because the embeddings endpoint limits how many
    inputs a single request may carry; an empty input makes no request at all.

    Args:
        texts: A string or an iterable of strings to embed. A bare string is
            treated as a single input.
        batch_size: Maximum number of inputs per API request. Must be >= 1.

    Returns:
        A list of embedding vectors, one per input, in input order. Returns
        an empty list when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Keep a lone string as one input instead of splitting it into characters.
    inputs = [texts] if isinstance(texts, str) else list(texts)

    vectors = []
    for start in range(0, len(inputs), batch_size):
        response = client.embeddings.create(
            model=EMBED_MODEL,
            input=inputs[start:start + batch_size],
        )
        vectors.extend(item.embedding for item in response.data)
    return vectors

In [None]:
def normalize_game(raw):
    """Map a raw game record onto the notebook's canonical schema.

    Each canonical field is read from the first alias key that holds a truthy
    value. Text fields are returned as stripped strings; list-valued text
    fields (e.g. several developers) are joined with ", " rather than being
    rendered as a Python list repr. ``platforms`` and ``genre`` are always
    returned as lists of non-empty, stripped strings.

    Args:
        raw: Mapping (``.get``-capable) holding one game's raw attributes.

    Returns:
        dict with keys ``title``, ``release_date``, ``platforms``,
        ``developer``, ``publisher``, ``genre`` and ``description``.
    """

    def first_present(*keys, default=""):
        # First truthy value among the alias keys, else the default.
        for key in keys:
            value = raw.get(key)
            if value:
                return value
        return default

    def as_list(value):
        # Wrap scalars (including a single string) so they are not iterated
        # character by character, then drop empty entries.
        items = value if isinstance(value, (list, tuple)) else [value]
        cleaned = [str(item).strip() for item in items if item]
        return [item for item in cleaned if item]

    def as_text(value):
        # Join multi-valued fields instead of stringifying the list itself.
        if isinstance(value, (list, tuple)):
            return ", ".join(as_list(value))
        return str(value).strip()

    return {
        "title": as_text(first_present("title", "name", "game_title")),
        "release_date": as_text(first_present("release_date", "released", "release")),
        "platforms": as_list(first_present("platforms", "platform", default=[])),
        "developer": as_text(first_present("developer", "developers")),
        "publisher": as_text(first_present("publisher", "publishers")),
        "genre": as_list(first_present("genre", "genres", default=[])),
        "description": as_text(first_present("description", "summary", "about")),
    }


def build_document_text(game):
    """Render a normalized game as the text document that gets embedded.

    One ``Label: value`` line is emitted per field, in a fixed order. Fields
    whose value is missing or empty are omitted, so a record with no content
    yields an empty string that callers can detect and skip.

    Args:
        game: Mapping with any of the keys ``title``, ``release_date``,
            ``platforms``, ``developer``, ``publisher``, ``genre`` and
            ``description``. ``platforms`` and ``genre`` may be a list or a
            single string.

    Returns:
        The newline-joined document text, or ``""`` when every field is empty.
    """

    def scalar(key):
        value = game.get(key)
        return str(value).strip() if value else ""

    def joined(key):
        values = game.get(key) or []
        if isinstance(values, str):
            # A bare string is one value, not a sequence of characters.
            return values.strip()
        return ", ".join(str(item) for item in values if item)

    fields = [
        ("Title", scalar("title")),
        ("Release date", scalar("release_date")),
        ("Platforms", joined("platforms")),
        ("Developer", scalar("developer")),
        ("Publisher", scalar("publisher")),
        ("Genre", joined("genre")),
        ("Description", scalar("description")),
    ]
    # Filter on the value, not the labelled line: a labelled line is never blank.
    return "\n".join(f"{label}: {text}" for label, text in fields if text)


def _load_json_records(data_dir):
    """Read every ``*.json`` file directly inside ``data_dir``.

    A file may hold a single record or a list of records. Files that cannot
    be read or parsed are reported and skipped so one bad file does not abort
    the whole load; entries that are not JSON objects are dropped because the
    normalization step needs mapping-style access.

    Args:
        data_dir: ``pathlib.Path`` of the directory to scan.

    Returns:
        ``(json_paths, loaded)``: the sorted list of JSON file paths found and
        the list of dict records read from them.
    """
    if not data_dir.exists():
        return [], []

    # Sort so the record order (and therefore the sample) is reproducible.
    json_paths = sorted(data_dir.glob("*.json"))
    loaded = []
    for path in json_paths:
        try:
            with open(path, "r", encoding="utf-8") as handle:
                payload = json.load(handle)
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Skipping {path.name}: {exc}")
            continue

        entries = payload if isinstance(payload, list) else [payload]
        usable = [entry for entry in entries if isinstance(entry, dict)]
        if len(usable) != len(entries):
            print(f"Ignored {len(entries) - len(usable)} non-object entries in {path.name}")
        loaded.extend(usable)
    return json_paths, loaded


json_files, records = _load_json_records(DATA_DIR)

print(f"JSON files: {len(json_files)}")
print(f"Total records: {len(records)}")

games = [normalize_game(r) for r in records]
if games:
    sample = games[0]
    print("Sample normalized record:", sample)
    print("Sample document:\n", build_document_text(sample))
elif json_files:
    print("JSON files were found but contained no usable records.")
else:
    print("No JSON files found. Add datasets to games/.")

Raw files: 0
No JSON files found. Add datasets to data/raw/.


In [None]:
# Number of documents embedded / written per request. Both the embeddings
# endpoint and Chroma's `add` limit how much one call may carry.
INGEST_BATCH_SIZE = 256

chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

documents = []
metadatas = []
ids = []

for game in games:
    doc = build_document_text(game)
    if not doc.strip():
        continue
    documents.append(doc)
    metadatas.append(
        {
            "title": game.get("title"),
            "release_date": game.get("release_date"),
            # Stored as one comma-separated string: Chroma documents metadata
            # values as scalars (str/int/float/bool), so a list here can be
            # rejected depending on the installed version.
            "platforms": ", ".join(str(p) for p in (game.get("platforms") or [])),
            "developer": game.get("developer"),
            "publisher": game.get("publisher"),
        }
    )
    ids.append(str(uuid.uuid4()))

if api_key and documents:
    # Embed first: if the API call fails (e.g. bad key) the existing
    # collection is still intact.
    embeddings = []
    for start in range(0, len(documents), INGEST_BATCH_SIZE):
        embeddings.extend(embed_texts(documents[start:start + INGEST_BATCH_SIZE]))

    # Rebuild the collection from scratch so re-running the cell does not
    # accumulate duplicates under fresh random ids.
    try:
        chroma_client.delete_collection(name=COLLECTION_NAME)
    except Exception:
        # Best effort: nothing to delete on the first run. The exception type
        # raised for a missing collection differs between chromadb versions.
        pass
    collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

    for start in range(0, len(documents), INGEST_BATCH_SIZE):
        stop = start + INGEST_BATCH_SIZE
        collection.add(
            ids=ids[start:stop],
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            embeddings=embeddings[start:stop],
        )
    print(f"Ingested {len(documents)} documents into ChromaDB.")
else:
    # Nothing to ingest: keep whatever is already stored instead of wiping it.
    collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)
    print("Skipping ingest (missing OPENAI_API_KEY or no data).")

Skipping ingest (missing OPENAI_API_KEY or no data).


In [None]:
# Demo settings: how many neighbours to fetch and how much of each document to show.
TOP_K = 3
PREVIEW_CHARS = 200

queries = [
    "FIFA 21'i kim geliştirdi?",
    "God of War Ragnarok ne zaman çıktı?",
    "Pokemon Red hangi platformda çıktı?",
]

# Only search when a key is configured, documents were prepared, and the
# collection actually holds entries (the count is checked last so the earlier
# conditions short-circuit it).
can_search = bool(api_key and documents) and collection.count() > 0

if can_search:
    # Never request more neighbours than the collection contains.
    n_results = min(TOP_K, collection.count())
    for q in queries:
        print("\nQuery:", q)
        query_embedding = embed_texts([q])
        results = collection.query(
            query_embeddings=query_embedding,
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )
        # One query was sent, so index 0 holds its result lists.
        hits = zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
        for doc, meta, score in hits:
            # NOTE: `score` is the distance returned by Chroma, so smaller means closer.
            print("-", meta.get("title"), "| score:", score)
            print(doc[:PREVIEW_CHARS], "...\n")
else:
    print("Skipping search (missing OPENAI_API_KEY or no data).")


Query: FIFA 21'i kim geliştirdi?


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}