In [1]:
import sys
sys.path.append("..")

import json

import numpy as np
import pandas as pd
import torch

from qdrant_client import QdrantClient
from qdrant_client.http import models
from pymongo import MongoClient

from tqdm.notebook import tqdm

from config import QDRANT_HOST, QDRANT_PORT, QDRANT_API_KEY, OPENAI_API_KEY, DATA, COLLECTION_NAME


## Connect to Qdrant and Mongo and create collection


In [2]:

# MongoDB: the `scientific_articles` database is the source of the articles
# read in the data-loading cell.
mongo_client = MongoClient(host="localhost", port=27017)
db = mongo_client["scientific_articles"]

# Qdrant: connection settings come from the project's config module.
client = QdrantClient(
    host=QDRANT_HOST,
    port=QDRANT_PORT,
    api_key=QDRANT_API_KEY,
)
# Collection creation is left commented out in this cell:
# client.recreate_collection(
#     collection_name=COLLECTION_NAME,
#     vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
# )


## Load Data into DataFrame


In [3]:
# Fetch every document of the `articles` collection into memory.
articles_json = list(db.articles.find({}))

# Keep only the four text fields used by the rest of the notebook,
# one tuple per paper (a missing field raises KeyError).
rows = [
    (paper["title"], paper["link"], paper["abstract"], paper["body"])
    for paper in tqdm(articles_json)
]

df = pd.DataFrame(rows, columns=["title", "link", "abstract", "body"])


  0%|          | 0/12769 [00:00<?, ?it/s]

# Instantiate Model
Here we instantiate the sentence vectorizer and the word vectorizer, which encode the papers that are loaded into the retrieval system.
We compare whether using a word vectorizer or a sentence vectorizer improves the performance of the similarity search.

## Sentence Transformer

In [4]:
from sentence_transformers import SentenceTransformer

# Choose the device in order of preference: CUDA, then Apple MPS, then CPU.
# The MPS check only runs when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model = SentenceTransformer("msmarco-MiniLM-L-6-v3", device=device)

## Word Embeddings


## Vectorize docs


In [None]:
# Encode all article bodies with a single batched call instead of one
# `model.encode` call per document, so the model handles several texts per
# forward pass and shows its own progress bar.
# `vectors` remains a list holding one 1-D embedding per row of `df`, in row
# order, so downstream slicing and `vec.tolist()` keep working unchanged.
vectors = list(
    model.encode(
        df["body"].to_list(),
        batch_size=32,
        show_progress_bar=True,
    )
)

In [7]:
# Sanity check: number of embeddings produced so far. The upload cell indexes
# `vectors` and `df` with the same positions, so this should equal df.shape[0].
print(len(vectors))

1103


In [None]:
# Number of upload batches and the (floored) number of rows per batch.
# Because the division is floored, nsplit * batch_load_step can be smaller
# than the number of rows in `df`.
nsplit = 100
batch_load_step = len(df) // nsplit

# Debug dump: print the embeddings one batch-sized slice at a time.
for index in range(nsplit):
    start = batch_load_step * index
    print(vectors[start:start + batch_load_step])

In [5]:
# Show the collections that currently exist on the Qdrant server.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='papers_complete'), CollectionDescription(name='papers')])

In [72]:
# Drop the "papers" collection; the next cell calls recreate_collection on the
# same name. NOTE(review): the name is hard-coded here while the upload cell
# writes to COLLECTION_NAME — confirm both refer to the same collection.
client.delete_collection(collection_name="papers")

True

In [73]:
# (Re)create the "papers" collection with cosine distance.
# The vector size is read from the loaded sentence-transformer instead of being
# hard-coded, so the collection always matches the embeddings produced by
# `model.encode` even if the model is swapped.
# NOTE(review): the name is hard-coded here while the upload cell writes to
# COLLECTION_NAME — confirm both refer to the same collection.
client.recreate_collection(
    collection_name="papers",
    vectors_config=models.VectorParams(
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

True

## Load data into the Qdrant search engine

In [None]:
# Debug dump: print the paper titles, walking `df` in blocks of
# batch_load_step consecutive row positions.
for index in range(nsplit):
    positions = list(range(batch_load_step * index, batch_load_step * (index + 1)))
    for title in df["title"].iloc[positions]:
        print(title)

In [None]:
# Debug dump: print every embedding as a plain Python list of its components.
for vec in vectors:
    components = list(vec)
    print(components)

In [None]:
# Upload every document to Qdrant in batches.
#
# The point id of a document is its 0-based row position in `df`, which is
# also the position of its embedding in `vectors`; both must therefore have
# the same length, otherwise ids, payloads and vectors would be misaligned.
if len(vectors) != len(df):
    raise ValueError(
        f"Got {len(vectors)} vectors for {len(df)} documents; "
        "re-run the vectorization cell before uploading."
    )

payload_fields = ["body", "abstract", "title", "link"]

# batch_load_step is 0 when df has fewer rows than nsplit; fall back to
# batches of one row so that small datasets are still uploaded.
upload_batch_size = max(1, batch_load_step)

# Stepping through range(0, len(df), step) also uploads the trailing rows that
# do not fill a whole batch.
for start in tqdm(range(0, len(df), upload_batch_size)):
    stop = min(start + upload_batch_size, len(df))
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=models.Batch(
            ids=list(range(start, stop)),
            # One payload dict per row, keys in the order of payload_fields.
            payloads=df.iloc[start:stop][payload_fields].to_dict(orient="records"),
            vectors=[vec.tolist() for vec in vectors[start:stop]],
        ),
    )

In [12]:
# Free-text query used for the similarity search below.
query = "artificial intelligence medical applications?"
# Persist `query` with IPython's %store magic so it can be restored elsewhere.
%store query

Stored 'query' (str)


In [13]:
similar_docs = client.search(
        collection_name="papers_complete",
        query_vector=model.encode(query),
        limit=6,
        offset=0,
        append_payload=True,
    )

%store similar_docs

Stored 'similar_docs' (list)


In [14]:
# Print one line per hit: similarity score in brackets, then the paper title.
for doc in similar_docs:
    print(f"[{doc.score}] " + doc.payload["title"])

[0.5947627] Applications of Artificial Intelligence in the Radiology Roundtrip: Process Streamlining, Workflow Optimization, and Beyond
[0.58876115] Artificial Intelligence for Cybersecurity: Literature Review and Future Research Directions
[0.5600363] A systematic approach to enhance the explainability of artificial intelligence in healthcare with application to diagnosis of diabetes
[0.53150547] Artificial intelligence for visually impaired
[0.5301596] How artificial intelligence uses to achieve the agriculture sustainability: Systematic review
[0.52678156] Artificial intelligence for secondary prevention of myocardial infarction: A qualitative study of patient and health professional perspectives
