In [None]:
import os
import uuid

import torch
import numpy as np

from pprint import pprint
from datasets import load_dataset
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Building embeddings

In [2]:
data = list(load_dataset("stanfordnlp/imdb", split="train")['text'])
pprint(data[20])

('If the crew behind "Zombie Chronicles" ever read this, here\'s some advice '
 'guys: <br /><br />1. In a "Twist Ending"-type movie, it\'s not a good idea '
 'to insert close-ups of EVERY DEATH IN THE MOVIE in the opening credits. That '
 "tends to spoil the twists, y'know...? <br /><br />2. I know you produced "
 'this on a shoestring and - to be fair - you worked miracles with your budget '
 'but please, hire people who can actually act. Or at least, walk, talk and '
 "gesture at the same time. Joe Haggerty, I'm looking at you...<br /><br />3. "
 "If you're going to set a part of your movie in the past, only do this if you "
 'have the props and costumes of the time.<br /><br />4. Twist endings are '
 "supposed to be a surprise. Sure, we don't want twists that make no sense, "
 'but signposting the "reveal" as soon as you introduce a character? That\'s '
 'not a great idea.<br /><br />Kudos to the guys for trying, but in all '
 "honesty, I'd rather they hadn't...<br /><br />Only for

In [3]:
embedding_model = SentenceTransformer(
    "paraphrase-MiniLM-L3-v2",
    model_kwargs={'torch_dtype': torch.float16}
)

In [35]:
if os.path.exists("imdb_example_files"):
    embeddings = np.load("imdb_example_files/embeddings.npy")
else:
    embeddings = embedding_model.encode(data, normalize_embeddings=True)
    os.mkdir("imdb_example_files")
    np.save("imdb_example_files/embeddings", embeddings)
    with open("imdb_example_files/.gitignore", "w") as f:
        f.write("embeddings.npy\n")

## Vector database

In [5]:
embedding_size = embeddings.shape[1]

In [6]:
client = QdrantClient(":memory:")

client.create_collection(
    collection_name="harrypotter",
    on_disk_payload=True,
    vectors_config=models.VectorParams(
        size=embedding_size,
        distance=models.Distance.COSINE,
        on_disk=True
    )
)

True

In [7]:
points = [
    models.PointStruct(
        id=str(uuid.uuid4()),
        vector=embeddings[i],
        payload={"text": chunks[i]}
    )
    for i in range(len(chunks))
]

In [8]:
client.upsert(collection_name="harrypotter", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [47]:
hits = client.query_points(
    collection_name="harrypotter",
    query=embedding_model.encode(
        ["Weasley's family"],
        normalize_embeddings=True
    )[0],
    limit=5
)

In [49]:
print(list(hits)[0][1][0].payload['text'])

departure. Mrs. Weasley was still glowering as she kissed Mr. 
Weasley on the cheek, though not nearly as much as the twins, who 
had each hoisted their rucksacks onto their backs and walked out 
without a word to her. 
“Well, have a lovely time,” said Mrs. Weasley, “and behave your-
selves,” she called after the twins’ retreating backs, but they did not 
look back or answer. “I’ll send Bill, Charlie, and Percy along 
around midday,” Mrs. Weasley said to Mr. Weasley, as he, Harry, 
Ron, Hermione, and Ginny set off across the dark yard after Fred 
and George. 
It was chilly and the moon was still out. Only a dull, greenish 
tinge along the horizon to their right showed that daybreak was 
drawing closer. Harry, having been thinking about thousands of 
wizards speeding toward the Quidditch World Cup, sped up to 
walk with Mr. Weasley. 
“So how does everyone get there without all the Muggles notic-
ing?” he asked. 
“It’s been a massive organizational problem,” sighed Mr.
