In [1]:
!pip install openai faiss-cpu pandas tiktoken


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [23]:
import os
import openai
import pandas as pd
import faiss
import numpy as np
import tiktoken

openai.api_key = os.getenv("OPENAI_API_KEY")


In [22]:
df = pd.read_csv("/content/wiki_movie_plots_deduped.csv")

df = df[['Title', 'Plot']].head(200)
df.head(10)


Unnamed: 0,Title,Plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...
5,Alice in Wonderland,"Alice follows a large white rabbit down a ""Rab..."
6,The Great Train Robbery,The film opens with two bandits breaking into ...
7,The Suburbanite,The film is about a family who move to the sub...
8,The Little Train Robbery,The opening scene shows the interior of the ro...
9,The Night Before Christmas,Scenes are introduced using lines of the poem....


In [4]:
def chunk_text(text, max_words=300):
    words = text.split()
    return [" ".join(words[i:i+max_words]) for i in range(0, len(words), max_words)]

chunks = []
for _, row in df.iterrows():
    title = row['Title']
    plot = row['Plot']
    for c in chunk_text(plot):
        chunks.append({"title": title, "chunk": c})


[{'title': 'Kansas Saloon Smashers',
  'chunk': "A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"},
 {'title': 'Love by the Light of the Moon',
  'chunk': "The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perch

In [5]:
from tqdm import tqdm

def get_embedding(text):
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding

embeddings = [get_embedding(c['chunk']) for c in tqdm(chunks)]
embeddings = np.array(embeddings).astype("float32")


100%|██████████| 240/240 [01:10<00:00,  3.40it/s]


In [6]:
dim = len(embeddings[0])
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print("Total vectors stored:", index.ntotal)


Total vectors stored: 240


In [7]:
def search(query, k=3):
    query_emb = get_embedding(query)
    D, I = index.search(np.array([query_emb]).astype("float32"), k)
    results = [chunks[i] for i in I[0]]
    return results

search("Which movie has an evil computer?")


[{'title': 'Manhattan Madness',
  'chunk': 'do something so crude. He goes to see Melville and hires him to help with the play. He asks why Melville is so sure that Berlea is alive. Melville explains that he believes that Berlea has a compulsion to own beautiful things. Before he "died", he tried to buy three such objects, but was turned down. Now two of them have been stolen. Melville predicts that the third, the Starburst Diamond, will be purloined on Saturday at 11 pm. Bane has faith in him and keeps his staff after hours and notifies the authorities, who set up a stakeout at the bank where the jewel is stored. When the deadline passes without anything happening, Bane fires Melville. Melville agrees to go to Gregory\'s retreat for a rest. Gregory\'s men have dug a tunnel between the theater where the play, set on the front lines of World War I, will be performed and the bank. At the premiere, when mock explosions and firing are set off in a battle scene, they blast their way into th

In [20]:
import json
from openai import OpenAI

client = OpenAI()

def generate_answer(query, k=3):
    contexts = search(query, k)
    context_texts = [f"Title: {c['title']} | Chunk: {c['chunk']}" for c in contexts]

    user_prompt = f"""
You are a Retrieval-Augmented Generation (RAG) assistant.
Your job:
1. Use the provided movie plot chunks to answer the question.
2. Always mention the movie title(s) in the answer.
3. If no relevant info is found, say "I do not know".

Question: {query}

Context Chunks:
{chr(10).join(context_texts)}
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": user_prompt}
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "rag_answer_schema",
                "schema": {
                    "type": "object",
                    "properties": {
                        "answer": {
                            "type": "string",
                            "description": "Direct natural language answer including movie title(s)."
                        },
                        "contexts": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": "The top retrieved chunks used to form the answer."
                        },
                        "reasoning": {
                            "type": "string",
                            "description": "Explain why the chosen chunk(s) are reliable for this answer, referencing the movie context."
                        }
                    },
                    "required": ["answer", "contexts", "reasoning"],
                    "additionalProperties": False
                }
            }
        }
    )

    raw_output = response.choices[0].message.content.strip()
    try:
        structured_output = json.loads(raw_output)
    except json.JSONDecodeError:
        structured_output = {
            "answer": "I do not know",
            "contexts": context_texts,
            "reasoning": "Failed to parse response as JSON"
        }

    return structured_output


In [24]:
query = "Which movie involves a jewel heist planned during a theater performance with explosions as cover?"
result = generate_answer(query)
print(json.dumps(result, indent=2))


{
  "answer": "The movie involving a jewel heist planned during a theater performance with explosions as cover is \"Manhattan Madness.\"",
  "contexts": [
    "Title: Manhattan Madness | Chunk: ...mock explosions and firing are set off in a battle scene, they blast their way into the vault.",
    "Title: Manhattan Madness | Chunk: ...Melville predicts to reporters that Berlea will next steal the Sunburst Diamond and kill the only witness, a butler, to the ruby theft."
  ],
  "reasoning": "The context chunks from 'Manhattan Madness' clearly describe a jewel heist that occurs during a theater performance where fake explosions are used as a cover to break into a vault, suggesting that this film fits exactly with the scenario outlined in the question."
}
