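# IMDb examples

A small retrieval-augmented generation example: IMDb reviews are embedded, stored in a Qdrant collection, retrieved for a user request, and passed to a language model as context.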

In [2]:
import os
import uuid

import numpy as np

from pprint import pprint
from transformers import pipeline
from datasets import load_dataset
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Building embeddings

In [3]:
# Fetch the training split of the IMDb dataset and keep only the raw review texts.
imdb_train = load_dataset("stanfordnlp/imdb", split="train")
data = list(imdb_train["text"])

# Show one review as a sanity check of what the texts look like.
pprint(data[20])

('If the crew behind "Zombie Chronicles" ever read this, here\'s some advice '
 'guys: <br /><br />1. In a "Twist Ending"-type movie, it\'s not a good idea '
 'to insert close-ups of EVERY DEATH IN THE MOVIE in the opening credits. That '
 "tends to spoil the twists, y'know...? <br /><br />2. I know you produced "
 'this on a shoestring and - to be fair - you worked miracles with your budget '
 'but please, hire people who can actually act. Or at least, walk, talk and '
 "gesture at the same time. Joe Haggerty, I'm looking at you...<br /><br />3. "
 "If you're going to set a part of your movie in the past, only do this if you "
 'have the props and costumes of the time.<br /><br />4. Twist endings are '
 "supposed to be a surprise. Sure, we don't want twists that make no sense, "
 'but signposting the "reveal" as soon as you introduce a character? That\'s '
 'not a great idea.<br /><br />Kudos to the guys for trying, but in all '
 "honesty, I'd rather they hadn't...<br /><br />Only for

In [4]:
# Sentence encoder used both for the reviews and for user queries.
# NOTE(review): `dtype` is presumably forwarded to the underlying model loader
# to request half-precision weights — confirm against the installed versions.
embedding_model = SentenceTransformer(
    model_name_or_path="paraphrase-MiniLM-L3-v2",
    model_kwargs=dict(dtype="float16"),
)

In [5]:
# Encoding every review is expensive, so the result is cached on disk and
# reused on later runs.
embeddings_dir = "imdb_example_files"
embeddings_path = os.path.join(embeddings_dir, "embeddings.npy")

# Check for the cached file itself, not just its directory: the directory can
# exist without the file (e.g. an interrupted earlier run), in which case
# loading would fail.
if os.path.exists(embeddings_path):
    embeddings = np.load(embeddings_path)
else:
    embeddings = embedding_model.encode(data, normalize_embeddings=True)
    # exist_ok=True so a pre-existing (but empty) cache directory is not an error.
    os.makedirs(embeddings_dir, exist_ok=True)
    np.save(embeddings_path, embeddings)
    # Keep the large embeddings file out of version control.
    with open(os.path.join(embeddings_dir, ".gitignore"), "w") as f:
        f.write("embeddings.npy\n")

## Vector database

In [6]:
# In-memory Qdrant instance; nothing is persisted between kernel restarts.
client = QdrantClient(":memory:")

# The vector size of the collection must match the width of the embedding matrix.
embedding_size = embeddings.shape[1]

imdb_vectors_config = models.VectorParams(
    size=embedding_size,
    distance=models.Distance.COSINE,
    on_disk=True,
)

client.create_collection(
    collection_name="imdb",
    vectors_config=imdb_vectors_config,
    on_disk_payload=True,
)

True

In [7]:
# One point per review: a random UUID as id, the review embedding as vector,
# and the raw review text as payload so it can be returned from searches.
points = [
    models.PointStruct(
        id=str(uuid.uuid4()),
        vector=vector,
        payload={"text": data[idx]},
    )
    for idx, vector in enumerate(embeddings)
]
client.upsert(collection_name="imdb", points=points)

  client.upsert(collection_name="imdb", points=points)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [13]:
def load_relevant_reviews(query: str, limit: int = 5) -> list[str]:
    """Return the texts of the stored reviews closest to ``query``.

    The query is encoded with ``embedding_model`` (with normalized embeddings)
    and used to search the ``imdb`` collection.

    Args:
        query: Free-text search request.
        limit: Maximum number of reviews to return. Defaults to 5.

    Returns:
        The ``text`` payload of each matching point, in the order the
        search returned them.
    """
    # `encode` is given a one-element batch, so take the single resulting vector.
    query_embedding = embedding_model.encode(
        [query], normalize_embeddings=True
    )[0]
    response = client.query_points(
        collection_name="imdb",
        query=query_embedding,
        limit=limit,
        with_payload=True
    )
    # Read the matched points by attribute name instead of relying on the
    # iteration order of the response object's fields.
    return [point.payload["text"] for point in response.points]

In [19]:
# Example retrieval; the query text is what gets embedded, so spelling matters.
ans = load_relevant_reviews("What is the typical plot for a horror movie?")

In [20]:
# Pretty-print every retrieved review, separated by blank lines.
for review_text in ans:
    pprint(review_text)
    print("\n")

("How can you tell that a horror movie is terrible? when you can't stop "
 'laughing about it of course! The plot has been well covered by other '
 "reviewers, so I'll just add a few things on the hilarity of it all.<br /><br "
 '/>Some reviews have placed the location in South America, others in Africa, '
 'I thought it was in some random island in the Pacific. Where exactly does '
 'this take place, seems to be a mystery. The cannibal tribe is conformed by a '
 'couple of black women some black men, and a man who looks like a young Frank '
 'Zappa banging the drums... the Devil God is a large black man with a '
 'terrible case of pink eyes.<br /><br />One of the "freakiest" moments in the '
 'film is when, "Pablito" find his partner hanging from a tree covered in what '
 'seems to be an orange substance that I assume is blood, starts screaming for '
 "minutes on and on (that's actually funny), and then the head of his partner "
 'falls in the ground and "Pablito" kicks it a bit for w

## Generation

In [None]:
# System prompt skeleton; the retrieved reviews replace the {reviews} placeholder.
system_template = (
    "You are a movie expert. You are provided with reviews from the IMDb "
    "dataset that are relevant to the user's request.\n"
    "\n"
    "Reviews:\n"
    "\n"
    "{reviews}"
)


def generate_system_prompt(reviews: list[str]) -> str:
    """Fill the system prompt template with the given reviews.

    Args:
        reviews: Review texts to include in the prompt.

    Returns:
        The system prompt with the reviews joined by a blank line.
    """
    review_block = "\n\n".join(reviews)
    return system_template.format(reviews=review_block)

In [None]:
# Preview the system prompt built from the reviews retrieved for a sample query.
print(
    generate_system_prompt(
        load_relevant_reviews("what is the typical plot for a horror movie?")
    )
)

You are a movie expert. You are provided with reviews from the IMDb dataset that are relevant to the user's request.

Reviews:

How can you tell that a horror movie is terrible? when you can't stop laughing about it of course! The plot has been well covered by other reviewers, so I'll just add a few things on the hilarity of it all.<br /><br />Some reviews have placed the location in South America, others in Africa, I thought it was in some random island in the Pacific. Where exactly does this take place, seems to be a mystery. The cannibal tribe is conformed by a couple of black women some black men, and a man who looks like a young Frank Zappa banging the drums... the Devil God is a large black man with a terrible case of pink eyes.<br /><br />One of the "freakiest" moments in the film is when, "Pablito" find his partner hanging from a tree covered in what seems to be an orange substance that I assume is blood, starts screaming for minutes on and on (that's actually funny), and then 

In [31]:
# Text-generation pipeline used to answer requests from the retrieved reviews.
generation_pipeline = pipeline(
    task="text-generation",
    model="Qwen/Qwen2-1.5B-Instruct",
)

Device set to use cpu


In [None]:
def generate(
    request: str,
    max_new_tokens: int = 512,
    temperature: float = 0.1,
    top_p: float = 0.7,
) -> str:
    """Answer ``request`` using reviews retrieved for it as context.

    The reviews returned by ``load_relevant_reviews`` are placed in the system
    prompt, and the request itself is sent as the user message.

    Args:
        request: The user's question.
        max_new_tokens: Upper bound on generated tokens. Defaults to 512.
        temperature: Sampling temperature passed to the pipeline. Defaults to 0.1.
        top_p: Nucleus-sampling threshold passed to the pipeline. Defaults to 0.7.

    Returns:
        The content of the last message in the pipeline's generated conversation.
    """
    system_prompt = generate_system_prompt(
        load_relevant_reviews(request)
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": request}
    ]
    outputs = generation_pipeline(
        messages,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # NOTE(review): for chat-style input the pipeline is expected to return the
    # full message list, with the model's reply as the final entry — confirm
    # against the installed transformers version.
    return outputs[0]["generated_text"][-1]["content"]