### Fetching the papers from arXiv API 
###### Retrieving the papers in a JSON format including title, published date, abstract, authors and the link 

In [18]:
import collections
import types
from collections import defaultdict
import json
import arxiv
import time
import seaborn as sb
from semanticscholar.Paper import Paper
from tqdm import tqdm
from semanticscholar import SemanticScholar
import numpy as np
from os.path import basename
import asyncio

def search(queries=[], field="all", cats=["cs.CL", "cs.LG", 'cs.AI', 'cs.CV'], start='202201010000', end='202202010000'):
    # Use the arxiv API for the below categories
    query_string, client = "", arxiv.Client(num_retries=40, page_size=1000)
    if queries:
        query_string += "(" + " OR ".join(f"{field}:{query}" for query in queries) + ")"
    if cats:
        if query_string:
            query_string += " AND "
        query_string += "(" + " OR ".join(f"cat:{cat}" for cat in cats) + ")"
    query_string += f" AND submittedDate:[{start} TO {end}]"
    print(query_string)
    return client.results(arxiv.Search(
        query=query_string,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    ))

def _get_papers(self, paper_ids, fields: list = None):
    if not fields:
        fields = Paper.SEARCH_FIELDS

    url = f'{self._AsyncSemanticScholar.api_url}/graph/v1/paper/batch'
    fields = ','.join(fields)
    parameters = f'&fields={fields}'

    payload = {"ids": paper_ids}

    while True:
        try:
            data = asyncio.run(self._AsyncSemanticScholar._requester.get_data_async(
                url, parameters, self._AsyncSemanticScholar.auth_header, payload))
            break
        except:
            print('time out; sleep 2 seconds...')
            time.sleep(2)
            continue
    papers = [Paper(item) if item else None for item in data]
    return papers

def get_papers(file="papers.shelf", cached=False, start='202201010000', end='202404010000'):
    papers = defaultdict(list)

    if cached:
        print("Loading cached papers.")
    else:
        print("Downloading papers.")
        results = []
        batch = []
        progress = 0
        print("Progress:", 0)
        count = 1
        for result in search(start=start, end=end):
            if len(batch) < 500:
                results.append(result)
                batch.append("arxiv:" + basename(result.entry_id).split("v")[0])
                progress += 1
            else:
                print(f"{result.published.year}-{result.published.month}-Progress{count}:", progress)
                count += 1
                results, batch = [], []

        if len(results) > 0:
            print("Final Progress:", progress)

    return papers

sch = SemanticScholar(timeout=100)
sch.get_papers = types.MethodType(_get_papers, sch)

def get_citations(batch):
    papers = sch.get_papers(paper_ids=batch)
    citation_counts = []
    for p in papers:
        try:
            citation_counts.append(p.citationCount)
        except:
            citation_counts.append(0)
    return citation_counts
class Args:
    start = '202201010000'
    end = '202202010000'
    use_cache = False

args = Args()

sb.set()

papers = get_papers(cached=args.use_cache, start=args.start, end=args.end)
papers_list = [vars(a) for a in sum(dict(papers).values(), [])]
json_path = "data_collected_Jan2022-Feb2022.json"
with open(json_path, 'w', encoding='utf-8') as jsonf:
    json.dump(papers_list, jsonf, ensure_ascii=False, indent=4)



Downloading papers.
Progress: 0
(cat:cs.CL OR cat:cs.LG OR cat:cs.AI OR cat:cs.CV) AND submittedDate:[202201010000 TO 202202010000]
2022-1-Progress1: 500
2022-1-Progress2: 1000
2022-1-Progress3: 1500
2022-1-Progress4: 2000
2022-1-Progress5: 2500
2022-1-Progress6: 3000
Final Progress: 3235


In [20]:
#!/usr/bin/env python
import collections
import types
from collections import defaultdict
import json
import arxiv
import time
import seaborn as sb
from semanticscholar.Paper import Paper
from tqdm import tqdm
from semanticscholar import SemanticScholar
import numpy as np
from os.path import basename
import asyncio
from datetime import datetime

"""
    The `search_arxiv` function constructs a search query string based on provided parameters such as 
    queries, fields, categories, start date, and end date. It then uses the arXiv API to execute the 
    search query and fetches results within the specified date range. The function returns the search 
    results obtained from the arXiv repository.
"""   
def search(queries=[], field="all", cats=["cs.CL", "cs.LG", 'cs.AI', 'cs.CV'], start='202201010000', end='202202010000'):
    query_string, client = "", arxiv.Client(num_retries=40, page_size=1000)
    if queries:
        query_string += "(" + " OR ".join(f"{field}:{query}" for query in queries) + ")"
    if cats:
        if query_string:
            query_string += " AND "
        query_string += "(" + " OR ".join(f"cat:{cat}" for cat in cats) + ")"
    query_string += f" AND submittedDate:[{start} TO {end}]"
    print(query_string)
    return client.results(arxiv.Search(
        query=query_string,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    ))
"""
    The `get_papers` function retrieves academic papers from the arXiv repository within a specified 
    date range. It iterates over the search results obtained using the `search_arxiv` function and 
    extracts relevant information such as the title, authors, publication date, summary, and PDF link 
    for each paper. Progress updates are printed every 500 papers processed, and the final number of 
    papers fetched is displayed before returning the collected data.
"""
def get_papers(cached=False, start='202201010000', end='202202010000'):
    papers = []

    print("Downloading papers.")
    progress = 0
    print("Progress:", 0)
    count = 1
    for result in search(start=start, end=end):
        published_date = result.published  # Directly use the datetime object
        pdf_link = f"https://arxiv.org/pdf/{basename(result.entry_id)}.pdf"
        paper_info = {
            "title": result.title,
            "authors": [author.name for author in result.authors],
            "published": published_date.strftime('%Y-%m-%dT%H:%M:%SZ'),  # Format datetime as string if needed
            "summary": result.summary,
            "pdf_link": pdf_link
        }
        papers.append(paper_info)
        progress += 1
        if progress % 500 == 0:
            print(f"Progress{count}: {progress}")
            count += 1

    print("Final Progress:", progress)
    return papers


sb.set()

#adjust dates and parameters to fetch the papers of different periods
start_date = '202201010000'
end_date = '202202010000'
papers = get_papers(cached=False, start=start_date, end=end_date)
json_path = "data_collected_Jan2022-Feb2022.json"
with open(json_path, 'w', encoding='utf-8') as jsonf:
    json.dump(papers, jsonf, ensure_ascii=False, indent=4)


Downloading papers.
Progress: 0
(cat:cs.CL OR cat:cs.LG OR cat:cs.AI OR cat:cs.CV) AND submittedDate:[202201010000 TO 202202010000]
Progress1: 500
Progress2: 1000
Progress3: 1500
Progress4: 2000
Progress5: 2500
Progress6: 3000
Final Progress: 3241
