In [69]:
import urllib
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from model import get_embedding

In [30]:
def fetch_papers(search_query='ti:llama', start=0, max_results=70, timeout=30):
    """Fetch papers from the arXiv API and return them as a list of strings.

    Called with no arguments this requests exactly the same URL as the
    original hard-coded version (title search for "llama", first 70 results).

    Args:
        search_query: Value of the arXiv API ``search_query`` parameter.
        start: Value of the arXiv API ``start`` parameter (result offset).
        max_results: Value of the arXiv API ``max_results`` parameter.
        timeout: Seconds to wait on the HTTP connection before giving up.

    Returns:
        One string per Atom ``<entry>`` in the response, made of a
        ``Title: ...`` line followed by a ``Summary: ...`` line. A missing
        or empty title/summary is rendered as an empty string.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        TimeoutError: If the server does not answer within ``timeout``.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    atom = {'atom': 'http://www.w3.org/2005/Atom'}

    # safe=':' keeps the field separator (e.g. "ti:") literal, so the default
    # arguments reproduce the original URL byte-for-byte.
    query = urllib.parse.urlencode(
        {'search_query': search_query, 'start': start, 'max_results': max_results},
        safe=':',
    )
    url = f'http://export.arxiv.org/api/query?{query}'

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        payload = response.read()

    # Parse the raw bytes so the XML declaration's own encoding is honoured.
    root = ET.fromstring(payload)

    papers_list = []
    for entry in root.findall('atom:entry', atom):
        # findtext() falls back to '' instead of raising AttributeError on a
        # missing element or formatting the literal text "None".
        title = entry.findtext('atom:title', default='', namespaces=atom)
        summary = entry.findtext('atom:summary', default='', namespaces=atom)
        papers_list.append(f"Title: {title}\nSummary: {summary}\n")

    return papers_list

In [39]:
# Download the paper texts and hold them in a single-column DataFrame.
papers_list = fetch_papers()
df = pd.DataFrame({'TitleAbstract': papers_list})
df.head()

Unnamed: 0,TitleAbstract
0,Title: Lawyer LLaMA Technical Report\nSummary:...
1,Title: Label Supervised LLaMA Finetuning\nSumm...
2,Title: LLAMA: Leveraging Learning to Automatic...
3,Title: Challenges and opportunities integratin...
4,Title: LLaMA: Open and Efficient Foundation La...


In [43]:
# Character count of the longest title+summary string.
df['TitleAbstract'].map(len).max()

2058

In [44]:
import re
def remove_linebreaks(input_string):
    """Return ``input_string`` with each newline or tab replaced by one space."""
    newline_or_tab = re.compile(r'[\n\t]')
    return newline_or_tab.sub(' ', input_string)
    
# Flatten every paper's text onto a single line.
df['TitleAbstract'] = df['TitleAbstract'].map(remove_linebreaks)

import spacy
# Loaded spaCy pipelines keyed by model name, so each model is read from disk
# at most once instead of once per extract_keywords() call.
_SPACY_PIPELINES = {}

# Part-of-speech tags that count as keywords.
_KEYWORD_POS_TAGS = ("NOUN", "ADJ")


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def extract_keywords(text, nlp=None):
    """Extract the nouns and adjectives of ``text`` as one space-joined string.

    Args:
        text: The text to analyse.
        nlp: Optional callable that turns ``text`` into an iterable of tokens
            exposing ``.text`` and ``.pos_`` (e.g. an already loaded spaCy
            pipeline). When omitted, the "en_core_web_sm" model is used and
            cached after its first load.

    Returns:
        The text of every token tagged NOUN or ADJ, in document order,
        joined by single spaces (an empty string if there are none).
    """
    if nlp is None:
        nlp = _get_spacy_pipeline()

    doc = nlp(text)
    return " ".join(token.text for token in doc if token.pos_ in _KEYWORD_POS_TAGS)

# Build the keyword string for every paper.
df['KeyWords'] = df['TitleAbstract'].map(extract_keywords)

# Character count of the longest keyword string.
df['KeyWords'].map(len).max()

1131

In [70]:
# Calculate embedding for all papers: get_embedding (imported from `model`)
# is called once per 'TitleAbstract' string and the results are stored in a
# new 'Embedding' column.
# NOTE(review): the return type/shape of get_embedding is not visible in this
# notebook — confirm before relying on the column's dtype.
df['Embedding'] = df['TitleAbstract'].apply(get_embedding)

In [76]:
# Persist the results using paths relative to the current working directory.
# The whole DataFrame is written to CSV with to_csv's default options.
df.to_csv('df_papers_llama2.csv')
# The 'Embedding' column is additionally saved on its own as a .npy file.
# NOTE(review): if get_embedding returns array-like objects, this Series is
# presumably object-dtype, in which case np.save pickles it and np.load would
# need allow_pickle=True — confirm, and consider stacking into a 2-D array.
np.save('Embeddings.npy', df['Embedding'])