Debugged & exceptions added

gamingflexer · Jan 17, 2024 · 2f67f06 · 2f67f06
1 parent d1e24cf
commit 2f67f06
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 21 deletions.
diff --git a/src/scrapper/arxiv.py b/src/scrapper/arxiv.py
@@ -10,6 +10,7 @@
 import requests
 from tqdm.auto import tqdm
 from decouple import config
+import uuid
 
 """
 Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
@@ -91,7 +92,7 @@ def __init__(self, paper_id: str):
         # initialize the requests session
         self.session = requests.Session()
 
-    def load(self, save: bool = False):
+    def load(self, path_author : str ,save: bool = False):
         """Load the paper from the ArXiv API or from a local file
         if it already exists. Stores the paper's text content and
         meta data in self.content and other attributes.
@@ -101,22 +102,30 @@ def load(self, save: bool = False):
         :type save: bool, optional
         """
         # check if pdf already exists
+        # to_save_path = os.path.join(path_author, str(self.id)+".json")
         if os.path.exists(f'papers/{self.id}.json'):
             print(f'Loading papers/{self.id}.json from file')
             with open(f'papers/{self.id}.json', 'r') as fp:
                 attributes = json.loads(fp.read())
             for key, value in attributes.items():
                 setattr(self, key, value)
         else:
-            res = self.session.get(self.url)
-            with open(f'temp.pdf', 'wb') as fp:
-                fp.write(res.content)
-            # extract text content
-            self._convert_pdf_to_text()
-            # get meta for PDF
-            self._download_meta()
-            if save:
-                self.save()
+            try:
+                res = self.session.get(self.url)
+                print(f'Downloading {self.url}')
+                # uuid_small = str(uuid.uuid4())[:8]
+                temp_pdf_path = f'./temp.pdf'
+                with open(temp_pdf_path, 'wb') as fp:
+                    fp.write(res.content)
+                # extract text content
+                self._convert_pdf_to_text()
+                # get meta for PDF
+                self._download_meta()
+                if save:
+                    self.save()
+            except Exception as e:
+                print(f"Error while downloading paper {self.id}: {e}")
+                raise e
 
     def get_refs(self, extractor, text_splitter):
         """Get the references for the paper.

diff --git a/src/scrapper/main.py b/src/scrapper/main.py
@@ -11,8 +11,8 @@ def __init__(self, author_name: str):
         self.author_name = author_name
         self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
 
-    def get_results_google(self, number_of_results: int = 25):
-        result_dict = get_google_scrape(self.author_name +" research papers arxiv.org",num=number_of_results)
+    def get_results_google(self, number_of_results = 25):
+        result_dict = get_google_scrape(str(self.author_name)+" research papers arxiv.org",num=number_of_results)
         paper_links = []
         for i in result_dict['organic_results']:
             if "arxiv.org" in i['link']:
@@ -36,12 +36,14 @@ def get_paper_details_batch(self, paper_ids: list, path: str = "./data/papers"):
         path_author = os.path.join(path, self.author_name.replace(" ", "_"))
         data = {}
         for i in tqdm(paper_ids):
-            paper = Arxiv(i)
-            paper.load()
-            paper.get_meta()
-            refs = paper.get_refs(
-            extractor=self.extractor,
-            text_splitter=self.text_splitter,)
-            paper.chunker()
-            paper.save_chunks(include_metadata=True, path=path_author)
-
+            try:
+                paper = Arxiv(i)
+                paper.load(path_author)
+                paper.get_meta()
+                refs = paper.get_refs(
+                extractor=self.extractor,
+                text_splitter=self.text_splitter,)
+                paper.chunker()
+                paper.save_chunks(include_metadata=True, path=path_author)
+            except Exception as e:
+                print(f"Error processing paper {i}: {e}")