Skip to content

Commit

Permalink
Debugged & exceptions added
Browse files Browse the repository at this point in the history
  • Loading branch information
gamingflexer committed Jan 17, 2024
1 parent d1e24cf commit 2f67f06
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 21 deletions.
29 changes: 19 additions & 10 deletions src/scrapper/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import requests
from tqdm.auto import tqdm
from decouple import config
import uuid

"""
Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
Expand Down Expand Up @@ -91,7 +92,7 @@ def __init__(self, paper_id: str):
# initialize the requests session
self.session = requests.Session()

def load(self, save: bool = False):
def load(self, path_author : str ,save: bool = False):
"""Load the paper from the ArXiv API or from a local file
if it already exists. Stores the paper's text content and
meta data in self.content and other attributes.
Expand All @@ -101,22 +102,30 @@ def load(self, save: bool = False):
:type save: bool, optional
"""
# check if pdf already exists
# to_save_path = os.path.join(path_author, str(self.id)+".json")
if os.path.exists(f'papers/{self.id}.json'):
print(f'Loading papers/{self.id}.json from file')
with open(f'papers/{self.id}.json', 'r') as fp:
attributes = json.loads(fp.read())
for key, value in attributes.items():
setattr(self, key, value)
else:
res = self.session.get(self.url)
with open(f'temp.pdf', 'wb') as fp:
fp.write(res.content)
# extract text content
self._convert_pdf_to_text()
# get meta for PDF
self._download_meta()
if save:
self.save()
try:
res = self.session.get(self.url)
print(f'Downloading {self.url}')
# uuid_small = str(uuid.uuid4())[:8]
temp_pdf_path = f'./temp.pdf'
with open(temp_pdf_path, 'wb') as fp:
fp.write(res.content)
# extract text content
self._convert_pdf_to_text()
# get meta for PDF
self._download_meta()
if save:
self.save()
except Exception as e:
print(f"Error while downloading paper {self.id}: {e}")
raise e

def get_refs(self, extractor, text_splitter):
"""Get the references for the paper.
Expand Down
24 changes: 13 additions & 11 deletions src/scrapper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ def __init__(self, author_name: str):
self.author_name = author_name
self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)

def get_results_google(self, number_of_results: int = 25):
result_dict = get_google_scrape(self.author_name +" research papers arxiv.org",num=number_of_results)
def get_results_google(self, number_of_results = 25):
result_dict = get_google_scrape(str(self.author_name)+" research papers arxiv.org",num=number_of_results)
paper_links = []
for i in result_dict['organic_results']:
if "arxiv.org" in i['link']:
Expand All @@ -36,12 +36,14 @@ def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
data = {}
for i in tqdm(paper_ids):
paper = Arxiv(i)
paper.load()
paper.get_meta()
refs = paper.get_refs(
extractor=self.extractor,
text_splitter=self.text_splitter,)
paper.chunker()
paper.save_chunks(include_metadata=True, path=path_author)

try:
paper = Arxiv(i)
paper.load(path_author)
paper.get_meta()
refs = paper.get_refs(
extractor=self.extractor,
text_splitter=self.text_splitter,)
paper.chunker()
paper.save_chunks(include_metadata=True, path=path_author)
except Exception as e:
print(f"Error processing paper {i}: {e}")

0 comments on commit 2f67f06

Please sign in to comment.