## Load Data into DataFrame


In [1]:
import requests
from tqdm.notebook import tqdm
import pandas as pd

HOST = "http://144.24.201.133:5000"

# Column order shared by the row tuples and the DataFrame below.
COLUMNS = ["title", "link", "abstract", "body", "summary"]

# Fail fast on an unreachable server or an HTTP error status instead of
# hanging indefinitely or parsing an error payload as the paper list.
response = requests.get(f"{HOST}/allPapers", timeout=30)
response.raise_for_status()
articles_json = response.json()

# One (title, link, abstract, body, summary) tuple per paper; a missing key
# raises KeyError, as in a plain dict lookup.
rows = [
    tuple(paper[column] for column in COLUMNS)
    for paper in tqdm(articles_json)
]

df = pd.DataFrame(data=rows, columns=COLUMNS)

# Keep only papers whose body has more than 15 whitespace-separated tokens.
# NOTE(review): the embedding cell iterates over `rows`, not `df`, so this
# filter does not affect which papers get embedded - confirm that is intended.
df = df[df["body"].str.split().str.len() > 15]


  0%|          | 0/1000 [00:00<?, ?it/s]

# Instantiate Model

In [2]:
from transformers import AutoModel, AutoTokenizer

import torch

# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = 'sentence-transformers/msmarco-MiniLM-L-6-v3'

# Use the GPU when one is available; otherwise fall back to CPU so the
# notebook does not crash on machines without CUDA.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)

## Vectorize docs

In [7]:
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions the mask marks as real.

    Args:
        model_output: Indexable model output whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask tensor; it is given a trailing axis and expanded
            to the size of the token embeddings before use.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total over
        dimension 1 (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding axis so it can weight each value.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * mask).sum(1)
    # An all-zero mask row would otherwise divide by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_total / mask_total

def embed_sentences(sentences):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The input is tokenized with padding and truncation, run through the model
    with gradient tracking disabled, and the token embeddings are mean-pooled
    using the attention mask.

    Args:
        sentences: Text passed straight to ``tokenizer`` (the notebook calls
            this with a single string).

    Returns:
        Row 0 of the pooled embeddings, on the same device as ``model``. If
        several sentences are passed, only the first one's embedding is
        returned.
    """
    # Send the inputs to whichever device the model lives on rather than
    # assuming CUDA, so the function also works with a CPU-resident model.
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors='pt'
    ).to(model.device)

    # Compute token embeddings (inference only, so no gradients are needed).
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, mean pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings[0]

In [16]:
# Embed every paper title (field 0 of each row tuple) and keep it next to the
# title as a NumPy array on the CPU.
paper_vectors = [
    {"title": paper[0], "embedding": embed_sentences(paper[0]).cpu().numpy()}
    for paper in tqdm(rows)
]
    

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Collect the embedding arrays in the same order as `paper_vectors`.
data = [entry["embedding"] for entry in paper_vectors]
# Print the first embedding as a quick sanity check.
print(data[0])

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Brute-force nearest-neighbour index using cosine distance; queries return
# 5 neighbours by default.
nbrs = NearestNeighbors(
    n_neighbors=5,
    algorithm='brute',
    metric="cosine",
)
nbrs.fit(data)

In [77]:
# Display the title (field 0 of the row tuple) of the paper at index 120.
# NOTE(review): 120 appears to be the top neighbour index from the saved
# output of the query cell - confirm it still matches after a re-run.
rows[120][0]

'Knowledge, attitudes and practices towards artificial intelligence (AI) among radiologists in Saudi Arabia'

In [76]:
# Embed the free-text query the same way the paper titles were embedded.
query_text = "Biology and ai"
query = embed_sentences(query_text).cpu().numpy()

# kneighbors expects a 2-D batch, so pass the single query as one row.
result = nbrs.kneighbors(query.reshape(1, -1))
print(result)

(array([[0.5131613, 0.5715173, 0.5816949, 0.5861953, 0.5877244]],
      dtype=float32), array([[120, 217, 558, 219, 712]], dtype=int64))
