# Personalization

> The personalization of arXiv papers is done using a vector search between the latest papers that appear in an arXiv category and the papers that the user is currently focusing on in their research. Those papers of interest all exist in a Zotero folder.

In [None]:
#| default_exp personalize

## Imports

In [None]:
#| exports
#| output: false
import arxiv
import chromadb
import cohere
import os
from nameparser import HumanName
from pyzotero import zotero
from readnext.arxiv_categories import exists
from readnext.embedding import pdf_to_text, get_embeddings, embedding_system
from rich import print
from rich.progress import Progress

## Get a Zotero collection ID from its name

When interacting with the Zotero API, a collection is always referenced by its collection ID. However, it is very hard to find the ID of a collection from the Zotero user interface. This utility function is used to get the ID of a collection from its name.

In [None]:
#| export

def get_collection_id_from_name(collection_name: str) -> str:
    """Return the ID (key) of the Zotero collection named `collection_name`.

    The Zotero library is selected with the `ZOTERO_LIBRARY_ID`,
    `ZOTERO_LIBRARY_TYPE` and `ZOTERO_API_KEY` environment variables.
    The comparison is case insensitive.
    Return an empty string if no collection has that name."""

    zot = zotero.Zotero(os.environ.get('ZOTERO_LIBRARY_ID'), os.environ.get('ZOTERO_LIBRARY_TYPE'), os.environ.get('ZOTERO_API_KEY'))
    wanted_name = collection_name.lower()

    # `collections()` alone only returns the first page of results, so the
    # collection could be missed in a library with many collections.
    # `everything()` keeps following the pagination until all are retrieved.
    for collection in zot.everything(zot.collections()):
        if collection['data']['name'].lower() == wanted_name:
            return collection['key']

    return ''

## Get all the items of a Zotero collection name

Gets all the items of a Zotero collection from its name. It reuses the function `get_collection_id_from_name` to get the collection ID from its name. The notion of an item is very broad: items are not just PDF papers, they can also be links to web pages, full text notes, etc.

In [None]:
#| export

def get_target_collection_items(collection_name: str) -> list:
    """Given the name of a Zotero collection, return all the items from that collection.

    The collection ID is resolved with `get_collection_id_from_name`.
    Return an empty list if no collection has that name."""
    collection = get_collection_id_from_name(collection_name)

    if not collection:
        # Same type as the "found" branch, so callers can always iterate a list.
        return []

    zot = zotero.Zotero(os.environ.get('ZOTERO_LIBRARY_ID'), os.environ.get('ZOTERO_LIBRARY_TYPE'), os.environ.get('ZOTERO_API_KEY'))

    # `collection_items()` alone only returns the first page of items;
    # `everything()` follows the pagination so large collections are complete.
    return zot.everything(zot.collection_items(collection))

## Create corpus of interests from Zotero collection

What we call a "corpus of interest" is a Zotero collection that contains all the papers that the user is currently focusing on in their research. This function creates a corpus of interest from a Zotero collection name.

This corpus of interest is used to create an "embedding of interest" that will be used to select the most relevant papers that are published every day.

In [None]:
#| export

def create_interests_corpus(collection_name: str) -> str:
    """Build the corpus of interests from a Zotero collection.

    Every item of the collection that is not an attachment contributes
    its title and its abstract (when present), followed by a newline.
    The resulting text is used to match related daily papers published
    on ArXiv."""
    fragments = []

    for entry in get_target_collection_items(collection_name):
        data = entry['data']

        # attachments (PDFs, snapshots, ...) carry no text of their own
        if data['itemType'] == 'attachment':
            continue

        for field in ('title', 'abstractNote'):
            if field in data:
                fragments.append(' ' + data[field])

        # one line per item
        fragments.append(' ' + '\n')

    return ''.join(fragments)

## Get personalized papers

Query the embedding space of the input category using the embedding of the corpus of interests. Returns the `nb_proposals` most relevant papers.

In [None]:
#| export

def get_personalized_papers(category: str, zotero_collection: str, nb_proposals=10) -> dict:
    """Given an ArXiv category and a Zotero personalization collection,
    return the `nb_proposals` papers closest to the corpus of interests.

    The returned dictionary maps each personalized ArXiv ID (the stored
    document ID without its `.pdf` suffix) to the distance, as a string,
    between that paper and the personalization embedding.
    An empty dictionary is returned if `category` does not exist."""

    chroma_client = chromadb.PersistentClient(path=os.environ.get('CHROMA_DB_PATH'))

    ids = {}

    if not exists(category):
        return ids

    # Spelled out as an if/else: in a one-line conditional expression the
    # `+` binds tighter than `if/else`, which made the naming rule hard to read.
    if category == 'all':
        collection_name = 'all' + embedding_system()
    else:
        collection_name = 'arxiv_' + category + '_' + embedding_system()

    papers_category_collection = chroma_client.get_or_create_collection(name=collection_name)

    interesting_papers = papers_category_collection.query(
        query_embeddings=get_embeddings(create_interests_corpus(zotero_collection)),
        n_results=int(nb_proposals)) # need to force int() to convert when from the command line.

    suffix = '.pdf'

    for pdf, distance in zip(interesting_papers['ids'][0], interesting_papers['distances'][0]):
        # Remove the exact `.pdf` suffix only. `str.rstrip('.pdf')` strips any
        # trailing '.', 'p', 'd' or 'f' character and could eat into the ID.
        arxiv_id = pdf[:-len(suffix)] if pdf.endswith(suffix) else pdf
        ids[arxiv_id] = str(distance)

    return ids

## Get the summary of a PDF file

In addition, the user may want to have a summary of the paper (other than the abstract written by the author). If that is the case, the paper's text is summarized by an external summarization service (currently Cohere), which returns the summary. That summary is then added as an attachment to the paper's item in Zotero.

In [None]:
#| export

def get_pdf_summary(pdf) -> str:
    """Return a summary of the text of the `pdf` file.

    The text is extracted with `pdf_to_text` and its first 100000
    characters are sent to the Cohere summarization service, asking for
    a summary of 'medium' length. The Cohere client is created with the
    `COHERE_API_KEY` environment variable."""
    document_text = pdf_to_text(pdf)

    client = cohere.Client(os.environ.get('COHERE_API_KEY'))

    # only the beginning of the document is submitted
    excerpt = document_text[:100000]

    return client.summarize(excerpt, length='medium').summary

## Check if a given paper is already in the collection of proposed papers

This is used to avoid duplicated papers in the Zotero collection. Otherwise, every time someone runs ReadNext, it would add the proposed papers again even if they were already proposed in the past.

In [None]:
#| export

def check_already_in_zotero_proposals(title: str, proposals_collection: str) -> bool:
    """Tell whether a paper titled exactly `title` is already an item
    (other than an attachment) of the `proposals_collection` Zotero collection."""
    return any(
        'title' in entry['data'] and entry['data']['title'] == title
        for entry in get_target_collection_items(proposals_collection)
        if entry['data']['itemType'] != 'attachment'
    )

## Save all personalized papers in Zotero

Save all the personalized papers in Zotero. By default, no artifacts are saved in Zotero. The reason is that users have 200 MB free with their account, and that space fills up rapidly if we save artifacts day in, day out. However, if the user is paying for more space, then they most likely want to have the artifacts saved in Zotero.

In [None]:
#| export

def save_personalized_papers_in_zotero(ids: dict, proposals_collection, with_artifacts: bool):
    """Get all personalized papers propositions and upload them to the
    `proposals_collection` Zotero collection.

    `ids` is a dictionary whose keys are ArXiv IDs; the metadata of each
    paper is fetched from ArXiv and saved as a Zotero `preprint` item.
    Papers whose title is already in `proposals_collection` are skipped.

    If `with_artifacts=True`, then all documents artifacts will be
    uploaded to Zotero as well (namely PDFs and summary documents),
    but it will take more space to the Zotero account and will be
    slower to process. The artifacts are first written to the folder
    given by the `RECOMMENDATIONS_PATH` environment variable."""

    # nothing was proposed: do not send an empty search to ArXiv
    if not ids:
        return

    zot = zotero.Zotero(os.environ.get('ZOTERO_LIBRARY_ID'), os.environ.get('ZOTERO_LIBRARY_TYPE'), os.environ.get('ZOTERO_API_KEY'))

    # the target collection is the same for every paper: resolve it only once
    proposals_collection_id = get_collection_id_from_name(proposals_collection)

    # get information for each matched articles directly from ArXiv.
    # The results are materialized once: each call to `results()` would
    # otherwise run the query against ArXiv again.
    results = list(arxiv.Search(id_list=list(ids)).results())

    with Progress() as progress:
        task = progress.add_task("[cyan]Uploading papers to Zotero...", total=len(results))

        for result in results:
            # skip if the paper is already in the proposals collection
            if not check_already_in_zotero_proposals(result.title, proposals_collection):
                template = _build_zotero_preprint(zot, result, proposals_collection_id)

                zot.check_items([template])

                resp = zot.create_items([template])

                if '0' in resp['success']:
                    if with_artifacts:
                        _upload_paper_artifacts(zot, result, resp['success']['0'])
                else:
                    print("Could not upload paper to Zotero")

            if not progress.finished:
                progress.update(task, advance=1)


def _build_zotero_preprint(zot, result, collection_id):
    """Return a Zotero `preprint` item template filled from the ArXiv `result`."""
    template = zot.item_template('preprint')

    creators = []
    for creator in result.authors:
        name = HumanName(creator.name)
        creators.append({'creatorType': 'author', 'firstName': name.first, 'lastName': name.last})

    template['title'] = result.title
    template['abstractNote'] = result.summary
    template['creators'] = creators
    template['url'] = result.entry_id
    template['DOI'] = result.doi
    template['repository'] = 'arXiv'
    template['archiveID'] = 'arxiv:' + result.get_short_id()
    template['libraryCatalog'] = 'arXiv.org'
    template['collections'] = [collection_id]

    return template


def _upload_paper_artifacts(zot, result, parent_id):
    """Download the PDF of `result`, summarize it, and attach both files to the Zotero item `parent_id`."""
    rec_path = os.environ.get('RECOMMENDATIONS_PATH').rstrip('/') + '/'
    os.makedirs(rec_path, exist_ok=True)

    short_id = result.get_short_id()
    pdf_name = short_id + '.pdf'
    pdf_path = rec_path + pdf_name
    summary_path = rec_path + short_id + '.txt'

    # `result` already is the ArXiv entry of the paper, there is no need
    # to search for it a second time before downloading its PDF
    result.download_pdf(dirpath=rec_path, filename=pdf_name)

    # explicit encoding: the summary may contain non-ASCII characters and
    # the platform default encoding is not always able to write them
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write(get_pdf_summary(pdf_path))

    zot.attachment_both([[pdf_name, pdf_path],
                         ['cohere_summary.txt', summary_path]], parent_id)