In [9]:
from tools import fetch_arxiv_papers

# Query arXiv through the project-local helper in tools.py.
# NOTE(review): the second argument (10) is presumably the maximum number of
# results to return — confirm against fetch_arxiv_papers' signature.
papers = fetch_arxiv_papers('language models', 10)

In [10]:
# Display only the titles as a quick sanity check of what the query returned.
[record["title"] for record in papers]

['Goldfish: Monolingual Language Models for 350 Languages',
 'Counterfactually Probing Language Identity in Multilingual Models',
 'Fence - An Efficient Parser with Ambiguity Support for Model-Driven Language Specification',
 'Continuous multilinguality with language vectors',
 'Comparing Fifty Natural Languages and Twelve Genetic Languages Using Word Embedding Language Divergence (WELD) as a Quantitative Measure of Language Distance',
 'The Geometry of Multilingual Language Model Representations',
 'Between Circuits and Chomsky: Pre-pretraining on Formal Languages Imparts Linguistic Biases',
 'Cedille: A large autoregressive French language model',
 "What's in a Name?",
 'Are All Languages Equally Hard to Language-Model?']

In [16]:
from llama_index.core import Document

def create_documents_from_papers(papers):
    """Convert paper records into LlamaIndex ``Document`` objects.

    Each paper is rendered as a plain-text block with one ``Label: value``
    line per field, and that block becomes the text of a new ``Document``.

    Args:
        papers: Iterable of mappings that provide the keys ``title``,
            ``authors``, ``summary``, ``journal_ref``, ``doi``,
            ``primary_category``, ``categories`` and ``pdf_url``.
            ``authors`` and ``categories`` must be iterables of strings
            because they are joined with ``", "``. Other values are
            interpolated with ``str()`` semantics, so a ``None`` value is
            rendered as the literal text ``None``.

    Returns:
        list: One ``Document`` per paper, in input order (empty list for
        empty input).

    Raises:
        KeyError: If a dict-like paper lacks one of the keys listed above.
    """
    documents = []
    for paper in papers:
        # Outer double quotes with single-quoted keys: reusing the enclosing
        # quote character inside a replacement field is a SyntaxError on
        # Python versions before 3.12, so avoid it for portability.
        content = (
            f"Title: {paper['title']}\n"
            f"Authors: {', '.join(paper['authors'])}\n"
            f"Summary: {paper['summary']}\n"
            f"Journal: {paper['journal_ref']}\n"
            f"Doi: {paper['doi']}\n"
            f"Primary category: {paper['primary_category']}\n"
            f"Categories: {', '.join(paper['categories'])}\n"
            f"PDF url: {paper['pdf_url']}\n"
        )
        documents.append(Document(text=content))
    return documents

In [17]:
# Build one Document per fetched paper.
documents = create_documents_from_papers(papers)

In [22]:
# Echo the Document list to inspect what was generated.
# NOTE(review): this prints the repr of every document, including full text;
# consider previewing a slice (e.g. the first item) to keep the output small.
documents

[Document(id_='09be6cbc-7f5b-40a2-bf06-02bbeaca497c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Title: Goldfish: Monolingual Language Models for 350 Languages\nAuthors: Tyler A. Chang, Catherine Arnett, Zhuowen Tu, Benjamin K. Bergen\nSummary: For many low-resource languages, the only available language models are large\nmultilingual models trained on many languages simultaneously. However, using\nFLORES perplexity as a metric, we find that these models perform worse than\nbigrams for many languages (e.g. 24% of languages in XGLM 4.5B; 43% in BLOOM\n7.1B). To facilitate research that focuses on low-resource languages, we\npre-train and release Goldfish, a suite of monolingual autoregressive\nTransformer language models up to 125M parameters for 350 languages. The\nGoldfish reach lower FLORES perpl

In [23]:
from llama_index.core import Settings, VectorStoreIndex
from constants import embed_model

# Global LlamaIndex settings, assigned before the index is built.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in tokens
# and applied when from_documents splits each document into nodes — confirm
# against the LlamaIndex Settings documentation.
Settings.chunk_size = 1024
Settings.chunk_overlap = 50

# Build a vector index over the paper documents, embedding them with the
# model object imported from the project's constants module.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [24]:
# Write the index's storage context to the relative "index/" directory.
index.storage_context.persist(persist_dir="index/")