In [3]:
import sys

sys.path.append("..")

import json

import numpy as np
import pandas as pd
import torch

from qdrant_client import QdrantClient
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer

from tqdm.notebook import tqdm
import configparser

# Parse the project settings stored one directory above the notebook.
# ConfigParser.read() returns the list of files it successfully parsed; as the
# last expression of the cell, that list is what the notebook displays.
config = configparser.ConfigParser()
config.read("../config.cfg")

['../config.cfg']

In [4]:
# Qdrant connection settings, taken from the [QDRANT] section of the parsed
# configuration. configparser returns every value as a string (port included).
_qdrant_section = config["QDRANT"]
QDRANT_HOST = _qdrant_section["host"]
QDRANT_PORT = _qdrant_section["port"]
QDRANT_API_KEY = _qdrant_section["api_key"]

# Root directory of the data files and name of the Qdrant collection to fill.
DATA = "../data"
COLLECTION_NAME = "stoician_philosophy"

## Set relevant parameters


In [5]:
# Base name of the processed book. The loading cell uses it twice: as the
# sub-folder of DATA/processed and as the stem of the JSON file inside it.
BOOK_FILENAME = "Pensées_pour_moi-même"

## Connect to Qdrant and create collection


In [6]:
# Vector layout of the collection: 384-dimensional vectors compared by cosine
# distance. NOTE(review): 384 is presumably the embedding width of the
# sentence-transformer used in the vectorization cell — confirm they stay in sync.
vector_params = models.VectorParams(
    size=384,
    distance=models.Distance.COSINE,
)

# Open the connection using the settings read from the configuration file.
client = QdrantClient(
    url=QDRANT_HOST,
    port=QDRANT_PORT,
    api_key=QDRANT_API_KEY,
)

# Recreate the collection with the layout above. Kept as the last expression so
# the call's return value is shown as the cell output.
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=vector_params,
)

True

## Read sentences


In [7]:
# A sentence is kept only if it has MORE than this many whitespace-separated words.
MIN_SENTENCE_WORDS = 15

# Load the pre-processed book. The encoding is passed explicitly: without it
# open() falls back to the platform's locale encoding, which can garble (or
# fail on) the accented French text on systems whose default is not UTF-8.
book_path = f"{DATA}/processed/{BOOK_FILENAME}/{BOOK_FILENAME}.json"
with open(book_path, "r", encoding="utf-8") as file:
    meditations_json = json.load(file)

# Flatten the chapters into one (title, url, sentence) row per sentence.
# tqdm wraps the outer iterable, so the progress bar counts chapters.
rows = [
    (chapter["title"], chapter["url"], sentence)
    for chapter in tqdm(meditations_json["data"])
    for sentence in chapter["sentences"]
]

df = pd.DataFrame(data=rows, columns=["title", "url", "sentence"])

# Drop short sentences. The original index labels are kept (no reset), so they
# are not contiguous after this filter.
df = df[df["sentence"].str.split().str.len() > MIN_SENTENCE_WORDS]


  0%|          | 0/26 [00:00<?, ?it/s]

## Vectorize sentences


In [9]:
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model = SentenceTransformer(
    "paraphrase-multilingual-MiniLM-L12-v2",  # alternative noted by the author: msmarco-MiniLM-L-6-v3
    device=device,
)

sentences = df["sentence"].to_list()
if not sentences:
    # Fail early with a clear message instead of an opaque error further down.
    raise ValueError("No sentences to index: the filtered DataFrame is empty.")

# encode() already splits its input into mini-batches and can draw its own
# progress bar, so no manual chunking is needed. The default mini-batch size is
# kept, which is what each manual encode(batch) call used before.
vectors = model.encode(sentences, show_progress_bar=True)

book_name = meditations_json["book_title"]

# One payload per sentence, built in the same row order as `sentences`.
payloads = [
    {
        "text": sentence,
        "title": title + f", {book_name}",
        "url": url,
    }
    for title, url, sentence in zip(df["title"], df["url"], df["sentence"])
]

# Point ids are the row positions 0..n-1, aligned with payloads and vectors.
# Kept as the last expression so the UpdateResult is shown as the cell output.
client.upsert(
    collection_name=COLLECTION_NAME,
    points=models.Batch(
        ids=list(range(len(sentences))),
        payloads=payloads,
        vectors=vectors.tolist(),
    ),
)


  0%|          | 0/1275 [00:00<?, ?it/s]

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)