In [None]:
import requests
from pymongo import MongoClient
import random
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

Connect to the database and extract the articles that have a DOI, storing all unique DOIs in `doi_list`.

In [3]:
# Connect to the local MongoDB instance and select the arXiv paper collection.
client = MongoClient("mongodb://localhost:27017/")
db = client["Arxiv"]
collection = db["Arxiv Papers"]

# Match only documents whose "doi" field exists and is not null, and project
# just that field so the cursor carries nothing else.
query = {"doi": {"$exists": True, "$ne": None}}
documents = collection.find(query, {"doi": 1, "_id": 0})

# Each entry starts as a one-element list [doi]; a later cell attaches the
# fetched citation list to it. dict.fromkeys drops repeated DOIs while keeping
# the cursor order, so every DOI is looked up and paired only once.
unique_dois = dict.fromkeys(doc["doi"] for doc in documents)
doi_list = [[doi] for doi in unique_dois]
#doi_list = doi_list[0:10] #slicing to reduce computational expensiveness

Add the citings to the MongoDB database. Error checking is needed here: the citings could not be extracted for every DOI.

In [None]:
# For every DOI, ask the OpenCitations COCI API for its citation records, store
# the "citing" DOIs on the matching MongoDB document, and attach them to the
# in-memory entry so that it becomes [doi, citing_dois].
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection blocks the loop forever
failed_dois = []  # DOIs whose lookup failed, kept for inspection or a retry

for index, entry in enumerate(doi_list):
    doi = entry[0]
    query = {"doi": doi}
    url = f"https://opencitations.net/index/coci/api/v1/citations/{doi}"
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        # A payload that is not a list of {"citing": ...} records raises
        # TypeError/KeyError here and is treated like any other failed lookup.
        arr = [citing["citing"] for citing in response.json()]
    except (requests.RequestException, ValueError, KeyError, TypeError) as error:
        print(f"{index}: could not fetch citings for {doi}: {error}")
        failed_dois.append(doi)
        if len(entry) == 1:
            # Keep the [doi, citings] shape that the later cells index into,
            # without overwriting a list obtained by an earlier successful run.
            entry.append([])
        continue

    update = {"$set": {"citings": arr}}
    collection.update_one(query, update)
    # Replace rather than append, so re-running the cell does not stack a
    # second copy of the citation list onto the entry.
    entry[1:] = [arr]

print(f"fetched citings for {len(doi_list) - len(failed_dois)} of {len(doi_list)} DOIs")

In [None]:
# Adding the citings failed for part of the list; those entries are simply left
# out of the model. Keep the first 635 entries, then drop the last two of them.
doi_list = doi_list[:635][:-2]

Using mutual citations does not work because there are too few of them in the dataset.

In [None]:
# Set of every DOI that heads an entry in the dataset
all_dois = set(entry[0] for entry in doi_list)
citation_pairs = []
index = 0
# The counter runs one ahead of the entry position, so the printed progress
# value is index - 1 and the counter ends at len(doi_list).
for index, entry in enumerate(doi_list, start=1):
    print(index - 1)
    source_doi, cited_dois = entry[0], entry[1]
    # Keep a pair only when the other DOI is itself part of the dataset
    citation_pairs.extend(
        (source_doi, cited_doi) for cited_doi in cited_dois if cited_doi in all_dois
    )

In [54]:
# A set gives constant-time membership tests for the reverse-pair lookup
citation_set = set(citation_pairs)

# A pair is mutual when the reversed pair (cited, source) was recorded as well
mutual_citations = [
    (source, cited)
    for source, cited in citation_pairs
    if (cited, source) in citation_set
]

Relationships based on direct citations, co-citations, or shared references may work better.

In [74]:
def remove_duplicate_citations(doi_list):
    """Merge and de-duplicate the citation lists of every entry, in place.

    Each entry is a list whose first element is a DOI and whose remaining
    elements are lists of citation DOIs. After the call every entry has the
    form ``[doi, unique_citations]``; an entry that had no citation list at
    all ends up as ``[doi, []]``.

    Args:
        doi_list: list of ``[doi, citations, ...]`` entries. It is modified
            in place.

    Returns:
        The same ``doi_list`` object that was passed in.
    """
    for item in doi_list:
        # Flatten every citation list that follows the DOI into one stream.
        merged = (citation for sublist in item[1:] for citation in sublist)

        # dict.fromkeys keeps the first occurrence of each DOI in its original
        # order. A set would also remove duplicates, but its iteration order
        # for strings changes between interpreter runs, which makes the
        # resulting training data irreproducible.
        item[1:] = [list(dict.fromkeys(merged))]

    return doi_list

In [75]:
# Collapse each entry's citation lists into a single de-duplicated list.
# The function edits the entries in place and returns the list it was given,
# so cleaned_doi refers to the same list object as doi_list.
cleaned_doi = remove_duplicate_citations(doi_list)

In [None]:
# Pair every source DOI with each DOI in its citation list; each such pair is a
# positive training example.
positive_pairs = [
    (source, cited)
    for source, cited_dois in cleaned_doi
    for cited in cited_dois
]

# Show the size and a short preview instead of printing the whole list, which
# would flood the cell output and bloat the saved notebook.
print(f"{len(positive_pairs)} positive pairs")
print(positive_pairs[:5])

Train the model on these positive pairs.

First, we generate the negative pairs.

In [79]:
# Pool the citation lists of every entry into one flat list (duplicates kept)
all_citations = [
    citation
    for _source, citations in cleaned_doi
    for citation in citations
]


In [81]:
# Draw random negatives for every source DOI: DOIs from the pooled citation
# list that do not appear in the source's own citation list. The result keeps
# the shape [[source, [negative, ...]], ...].
NEGATIVES_PER_SOURCE = 10
NEGATIVE_SAMPLING_SEED = 42
rng = random.Random(NEGATIVE_SAMPLING_SEED)  # private RNG: same negatives on every run

negative_examples = []
for source, citation in cleaned_doi:
    own_citations = set(citation)  # set membership instead of scanning the list
    # Filtering up front, instead of re-drawing until a non-member turns up,
    # cannot spin forever when every pooled DOI belongs to this source, and it
    # keeps each candidate's weight equal to its frequency in the pool.
    candidates = [doi for doi in all_citations if doi not in own_citations]
    if candidates:
        choices = [rng.choice(candidates) for _ in range(NEGATIVES_PER_SOURCE)]
    else:
        # Nothing eligible (for example an empty pool): no negatives for this source.
        choices = []
    negative_examples.append([source, choices])


In [82]:
# Expand each [source, sampled negatives] record into one (source, negative)
# tuple per sampled DOI
negative_pairs = [
    (base, sampled)
    for base, sampled_group in negative_examples
    for sampled in sampled_group
]


In [84]:
# Build the labelled examples: positive pairs get target similarity 1.0 and
# negative pairs 0.0. The labels are floats because CosineSimilarityLoss
# regresses the cosine score against them; integer labels are batched into an
# integer tensor, which does not match the float similarity output.
# NOTE(review): the two texts of every example are DOI strings, not titles or
# abstracts; confirm that this is the intended model input.
training_data = [
    InputExample(texts=[first, second], label=1.0)
    for first, second in positive_pairs
]
training_data += [
    InputExample(texts=[first, second], label=0.0)
    for first, second in negative_pairs
]

In [None]:
# Load the pretrained checkpoint. With no explicit device the SentenceTransformer
# constructor uses a GPU when one is available and the CPU otherwise, so the
# notebook also runs on machines without CUDA, where a hard-coded .to('cuda')
# raises an error.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [86]:
# Serve the labelled pairs in shuffled batches of 32 and attach a
# CosineSimilarityLoss to the model being trained
train_dataloader = DataLoader(dataset=training_data, batch_size=32, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model=model)

Now we fine-tune the model accordingly

In [None]:
# Run the fine-tuning and write the resulting model to ./fine_tuned
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    warmup_steps=100,
    optimizer_params={"lr": 2e-5},
    output_path="./fine_tuned",
    show_progress_bar=True,
)

We can validate performance using `InformationRetrievalEvaluator`. After that, we use the model to recompute the embeddings of each arXiv article.

After updating the embeddings, we feed them into the recommendation pipeline (starting with something simple such as KNN).

In [88]:
# Reload the model from the directory that was passed as output_path to
# model.fit, replacing the in-memory model object.
model = SentenceTransformer("./fine_tuned")