In [1]:
import sys
sys.path.append("..")
import pandas as pd

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from qdrant_client.http import models
from pymongo import MongoClient

from tqdm.notebook import tqdm

from server.config import QDRANT_HOST, QDRANT_PORT, QDRANT_API_KEY, OPENAI_API_KEY, DATA, COLLECTION_NAME


## Connect to Qdrant and Mongo and create collection


In [5]:
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, api_key=QDRANT_API_KEY)

# w2v_papers: documents indexed by title and abstract
# client.delete_collection(
#     collection_name="st_finetuned_papers"
# )


client.recreate_collection(
    collection_name="st_finetuned_papers_summaries",
    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
)


True

## Load Data into DataFrame


In [6]:
import requests
from tqdm import tqdm

HOST = "http://144.24.201.133:5000"
rows = []

for i in tqdm(range(10)):
    articles_json = requests.get(f"{HOST}/allPapers?skip={i*1000}").json()

    for paper in articles_json:
        if(len(paper["abstract"])>20):
            rows.append(
                (
                    paper["title"],
                    paper["link"],
                    paper["abstract"],
                    paper["summary"]
                )
            )

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [02:22<00:00, 14.24s/it]


In [7]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/allenai-specter').cuda()

In [None]:
paper_embeddings = []

for paper in tqdm(rows):
    paper_embeddings.append(model.encode(paper[3]))

  6%|▌         | 584/9348 [03:35<37:03,  3.94it/s]  

  8%|▊         | 707/9348 [05:01<1:23:30,  1.72it/s]

In [None]:

for index, point_paper in tqdm(enumerate(paper_embeddings)):
    # try:
        client.upsert(
            collection_name="st_finetuned_papers_summaries",
            points=[    
                PointStruct(
                    id=index,
                    vector=point_paper.tolist(),
                    payload={"link": rows[index][1], "title": rows[index][0]}
                )
            ]
        )
   

9348it [21:11,  7.35it/s]


In [None]:
collection_info = client.get_collection(collection_name='st_finetuned_paper_summaries')

print(f"current vector count = ", collection_info.vectors_count)

In [24]:
query = "iot wearable devices for respiratory diseases"

In [25]:
similar_docs = client.search(
        collection_name="st_finetuned_papers_abstract",
        query_vector=model.encode(query),
        limit=6,
        offset=0,
        append_payload=True,
    )

In [1]:
for doc in similar_docs:
    print(doc.payload)

NameError: name 'similar_docs' is not defined