In [1]:
import sys
sys.path.append("..")
import json
import numpy as np
import pandas as pd
import torch

from qdrant_client import QdrantClient
from qdrant_client.http import models
from pymongo import MongoClient

from tqdm.notebook import tqdm

from server.config import QDRANT_HOST, QDRANT_PORT, QDRANT_API_KEY, OPENAI_API_KEY, DATA, COLLECTION_NAME


## Connect to Qdrant and Mongo and create collection


In [2]:

# The Mongo connection is commented out, so no `db` handle is created in this cell.
# mongo_client = MongoClient('localhost', 27017)
# db = mongo_client.scientific_articles

# Qdrant client built from the host, port and API key imported from server.config.
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY)
# Disabled: (re)creating COLLECTION_NAME with 384-dimensional cosine-distance vectors.
# client.recreate_collection(
#     collection_name=COLLECTION_NAME,
#     vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
# )


## Load Data into DataFrame


In [None]:
import requests

# NOTE(review): hard-coded plain-HTTP address; consider moving it to server.config.
HOST = "http://144.24.201.133:5000"
SKIP_STEP = 1000       # increment of the `skip` query parameter between requests
N_PAGES = 10           # maximum number of requests issued
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests.get can block indefinitely

# Each entry is a (title, link, abstract, summary) tuple.
# NOTE(review): later cells read a DataFrame named `df` that is not built from
# `rows` anywhere in this notebook; confirm where it is supposed to come from.
rows = []

for page in range(N_PAGES):
    response = requests.get(
        f"{HOST}/allPapers",
        params={"skip": page * SKIP_STEP},
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error body as JSON.
    response.raise_for_status()
    articles_json = response.json()

    # An empty page means there is nothing left to fetch; skip the remaining requests.
    if not articles_json:
        break

    rows.extend(
        (paper["title"], paper["link"], paper["abstract"], paper["summary"])
        for paper in articles_json
    )

## Instantiate Model

In [3]:
from transformers import AutoModel, AutoTokenizer

# Single source of truth for the checkpoint used by both tokenizer and model.
MODEL_NAME = 'sentence-transformers/msmarco-MiniLM-L-6-v3'

# Device preference carried over from the previous SentenceTransformer setup:
# CUDA first, then Apple MPS, then CPU.
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Move the weights to DEVICE so that inputs sent to the accelerator by later
# cells live on the same device as the model.
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)

## Vectorize docs


In [15]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighted by the attention mask.

    Args:
        model_output: indexable model result whose first element holds the
            token embeddings (assumed shape: batch x tokens x hidden; confirm).
        attention_mask: mask with one value per token; positions set to 0 do
            not contribute to the average.

    Returns:
        Tensor with dimension 1 reduced away: the masked sum of the token
        embeddings divided by the mask total, where the divisor is clamped to
        at least 1e-9 so an all-zero mask does not divide by zero.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask over the trailing embedding axis and switch to float
    # so it can be multiplied with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts


In [16]:
# Fix the number of batches and derive the (floored) batch size from the row count of `df`.
nsplit = 100
batch_load_step = int(len(df) / nsplit)
print(len(df))

# for index in list(range(nsplit)):
#     print(batch_load_step*(index+1))

998


In [None]:
# Embed a small sample of papers to sanity-check the tokenizer/model/pooling pipeline.
SAMPLE_SIZE = 100
sample_rows = rows[:SAMPLE_SIZE]

# Tuples in `rows` are built as (title, link, abstract, summary), so the valid
# indices are 0..3; index 4 does not exist and raised IndexError.
titles = [row[0] for row in sample_rows]
# NOTE(review): index 3 is the "summary" field and `rows` carries no separate body
# text, so `bodies` and `summaries` currently hold the same strings. Confirm
# which field should actually be embedded.
bodies = [row[3] for row in sample_rows]
summaries = [row[3] for row in sample_rows]

print(len(bodies))
print(len(titles))
print(len(summaries))
# Tokenize sentences and put the tensors on the same device as the model
# (a hard-coded "cuda" fails on CPU-only hosts and mismatches a CPU model).
encoded_input = tokenizer(bodies, padding=True, truncation=True, return_tensors='pt').to(model.device)

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Show the shape rather than dumping the whole tensor into the notebook output.
print("Sentence embeddings:")
print(sentence_embeddings.shape)

# vectors = []

# for doc in tqdm(df["body"].to_list()):
#     vectors.append(model.encode(doc))

In [25]:
# Fix the batch size and derive the number of batches from the row count of `df`.
# int() floors the ratio, so loops over range(nsplit) do not reach rows beyond
# nsplit * batch_load_step.
batch_load_step = 100
nsplit = int(len(df) / batch_load_step)


9


In [15]:
# Ask the Qdrant server which collections it currently holds.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='papers'), CollectionDescription(name='papers_complete')])

In [13]:
# Delete the collection named "papers" from the Qdrant server.
client.delete_collection(collection_name="papers")

True

In [14]:
# (Re)create the "papers" collection configured for 384-dimensional vectors
# compared with cosine distance.
# NOTE(review): 384 presumably matches the embedding width of the model loaded
# earlier in the notebook; confirm before changing the model.
client.recreate_collection(
    collection_name="papers",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)

True

## Load data into the Qdrant search engine

In [None]:
# Walk through `df` batch by batch and print every paper title (batching sanity check).
for batch_no in range(nsplit):
    first = batch_load_step * batch_no
    positions = list(range(first, first + batch_load_step))
    for title in df.iloc[positions]["title"]:
        print(title)

In [None]:
# Print each embedding as a plain Python list, one embedding per line.
for vec in vectors:
    components = list(vec)
    print(components)

In [None]:
# Upload every row of `df` together with its embedding, `batch_load_step` rows
# per request. Batches are derived from the row count itself so the final
# partial batch is uploaded too (looping over a floored batch count skipped it).
# NOTE(review): points are written to COLLECTION_NAME, while other cells
# recreate and search the literal "papers" collection; confirm they match.
n_rows = len(df)
if len(vectors) != n_rows:
    raise ValueError(
        f"Expected one vector per row, got {len(vectors)} vectors for {n_rows} rows"
    )

for start in tqdm(range(0, n_rows, batch_load_step)):
    # Clamp the upper bound so ids, payloads and vectors all have the same length.
    stop = min(start + batch_load_step, n_rows)
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=models.Batch(
            # Point ids are the positional row numbers of `df`.
            ids=list(range(start, stop)),
            payloads=[
                {
                    "body": row["body"],
                    "abstract": row["abstract"],
                    "title": row["title"],
                    "link": row["link"],
                }
                for _, row in df.iloc[start:stop].iterrows()
            ],
            vectors=[vec.tolist() for vec in vectors[start:stop]],
        ),
    )

In [None]:
# Example natural-language question used for the similarity search.
query = "how is artificial intelligence used in medical applications?"
# Persist `query` with IPython's %store magic.
%store query

Stored 'query' (str)


In [None]:
similar_docs = client.search(
        collection_name="papers",
        query_vector=model.encode(query),
        limit=6,
        offset=0,
        append_payload=True,
    )

%store similar_docs

Stored 'similar_docs' (list)


In [None]:
# One line per hit: similarity score in brackets followed by the paper title.
for hit in similar_docs:
    print(f"[{hit.score}] {hit.payload['title']}")

[0.6329287] Research article
Physicians’ attitudes and knowledge toward artificial intelligence in medicine: Benefits and drawbacks
[0.5812633] Knowledge, attitudes and practices towards artificial intelligence (AI) among radiologists in Saudi Arabia
[0.5647462] Patient views on the implementation of artificial intelligence in radiotherapy
[0.55050915] Case of the Season: Artificial Intelligence in Clinical Practice—Large Vessel Occlusion Triage in Stroke Imaging
[0.5367967] Applications of Artificial Intelligence in the Radiology Roundtrip: Process Streamlining, Workflow Optimization, and Beyond
[0.5233896] Research article
An architectural approach to modeling artificial general intelligence
