Skip to content

Commit

Permalink
feat: add support for bulk retrieval in paper search
Browse files Browse the repository at this point in the history
Resolves: #62
  • Loading branch information
danielnsilva committed Mar 10, 2024
1 parent 2f788fc commit 0fa8aac
Show file tree
Hide file tree
Showing 22 changed files with 876,870 additions and 125 deletions.
4 changes: 2 additions & 2 deletions README.md
Expand Up @@ -142,7 +142,7 @@ Output:

> **Warning**
>
> From the [official documentation](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_search): "Because of the subtleties of finding partial phrase matches in different parts of the document, be cautious about interpreting the total field as a count of documents containing any particular word in the query."
> From the [official documentation](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_paper_relevance_search): "Because of the subtleties of finding partial phrase matches in different parts of the document, be cautious about interpreting the total field as a count of documents containing any particular word in the query."
To search for authors by name:

Expand Down Expand Up @@ -320,7 +320,7 @@ Output:

#### ```fields_of_study: list```

Restrict results to a given list of fields of study. Check [official documentation](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_search) for a list of available fields.
Restrict results to a given list of fields of study. Check [official documentation](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_paper_relevance_search) for a list of available fields.

```python
from semanticscholar import SemanticScholar
Expand Down
34 changes: 30 additions & 4 deletions semanticscholar/AsyncSemanticScholar.py
Expand Up @@ -354,12 +354,21 @@ async def search_paper(
fields: list = None,
publication_date_or_year: str = None,
min_citation_count: int = None,
limit: int = 100
limit: int = 100,
bulk: bool = False,
sort: str = None
) -> PaginatedResults:
'''Search for papers by keyword
'''Search for papers by keyword. Performs a search query based on the \
S2 search relevance algorithm, or a bulk retrieval of basic paper \
data without search relevance (if bulk=True). Paper relevance \
search is the default behavior and returns up to 1,000 results. \
Bulk retrieval instead returns up to 10,000,000 results (1,000 \
in each page).
:calls: `GET /paper/search <https://api.semanticscholar.org/api-docs/\
graph#tag/Paper-Data/operation/get_graph_get_paper_search>`_
graph#tag/Paper-Data/operation/get_graph_paper_relevance_search>`_
:calls: `GET /paper/search/bulk <https://api.semanticscholar.org/api-docs/\
graph#tag/Paper-Data/operation/get_graph_paper_bulk_search>`_
:param str query: plain-text search query string.
:param str year: (optional) restrict results to the given range of \
Expand All @@ -380,6 +389,13 @@ async def search_paper(
with at least the given number of citations.
:param int limit: (optional) maximum number of results to return \
(must be <= 100).
:param bool bulk: (optional) bulk retrieval of basic paper data \
without search relevance (ignores the limit parameter if True \
and returns up to 1,000 results in each page).
:param str sort: (optional) sorts results (only if bulk=True) using \
<field>:<order> format, where "field" is either paperId, \
publicationDate, or citationCount, and "order" is asc \
(ascending) or desc (descending).
:returns: query results.
:rtype: :class:`semanticscholar.PaginatedResults.PaginatedResults`
'''
Expand All @@ -393,6 +409,14 @@ async def search_paper(

base_url = self.api_url + self.BASE_PATH_GRAPH
url = f'{base_url}/paper/search'

if bulk:
url += '/bulk'
if sort:
query += f'&sort={sort}'
elif sort:
warnings.warn(
'The sort parameter is only used when bulk=True.')

query += f'&year={year}' if year else ''

Expand Down Expand Up @@ -423,6 +447,8 @@ async def search_paper(

if min_citation_count:
query += f'&minCitationCount={min_citation_count}'

max_results = 10000000 if bulk else 1000

results = await PaginatedResults.create(
self._requester,
Expand All @@ -432,7 +458,7 @@ async def search_paper(
fields,
limit,
self.auth_header,
max_results=1000
max_results=max_results
)

return results
Expand Down
21 changes: 15 additions & 6 deletions semanticscholar/PaginatedResults.py
Expand Up @@ -3,6 +3,7 @@
import nest_asyncio

from semanticscholar.ApiRequester import ApiRequester
from semanticscholar.SemanticScholarException import NoMorePagesException


class PaginatedResults:
Expand Down Expand Up @@ -40,6 +41,7 @@ def __init__(
self._next = 0
self._parameters = ''
self._items = []
self._continuation_token = None
nest_asyncio.apply()

@classmethod
Expand Down Expand Up @@ -104,9 +106,11 @@ def __getitem__(self, key: int) -> Any:
return self._items[key]

def _has_next_page(self) -> bool:
has_more_results = (self._offset + self._limit) == self._next
under_limit = (self._offset + self._limit) < (self._max_results - 1)
return has_more_results and under_limit
has_token = self._continuation_token is not None
next_page_offset = self._offset + self._limit
has_more_results = next_page_offset == self._next or has_token
is_under_limit = next_page_offset < (self._max_results - 1)
return has_more_results and is_under_limit

async def _request_data(self) -> Union[dict, List[dict]]:
return await self._requester.get_data_async(
Expand All @@ -124,9 +128,10 @@ async def _async_get_next_page(self) -> Union[dict, List[dict]]:

def _get_next_page(self) -> list:

self._build_params()
if not self._has_next_page():
raise NoMorePagesException('No more pages to fetch.')

result_items = []
self._build_params()

loop = asyncio.get_event_loop()
results = loop.run_until_complete(self._request_data())
Expand All @@ -137,6 +142,9 @@ def _build_params(self) -> None:

self._parameters = f'query={self._query}' if self._query else ''

if self._continuation_token:
self._parameters += f'&token={self._continuation_token}'

fields = ','.join(self._fields)
self._parameters += f'&fields={fields}'

Expand All @@ -156,8 +164,9 @@ def _update_params(self, results: Union[dict, List[dict]]) -> list:

self._data = results['data']
self._total = results['total'] if 'total' in results else 0
self._offset = results['offset']
self._offset = results['offset'] if 'offset' in results else 0
self._next = results['next'] if 'next' in results else 0
self._continuation_token = results['token'] if 'token' in results else None

for item in results['data']:
result_items.append(self._data_type(item))
Expand Down
26 changes: 22 additions & 4 deletions semanticscholar/SemanticScholar.py
Expand Up @@ -266,12 +266,21 @@ def search_paper(
fields: list = None,
publication_date_or_year: str = None,
min_citation_count: int = None,
limit: int = 100
limit: int = 100,
bulk: bool = False,
sort: str = None
) -> PaginatedResults:
'''Search for papers by keyword
'''Search for papers by keyword. Performs a search query based on the \
S2 search relevance algorithm, or a bulk retrieval of basic paper \
data without search relevance (if bulk=True). Paper relevance \
search is the default behavior and returns up to 1,000 results. \
Bulk retrieval instead returns up to 10,000,000 results (1,000 \
in each page).
:calls: `GET /paper/search <https://api.semanticscholar.org/api-docs/\
graph#tag/Paper-Data/operation/get_graph_get_paper_search>`_
graph#tag/Paper-Data/operation/get_graph_paper_relevance_search>`_
:calls: `GET /paper/search/bulk <https://api.semanticscholar.org/api-docs/\
graph#tag/Paper-Data/operation/get_graph_paper_bulk_search>`_
:param str query: plain-text search query string.
:param str year: (optional) restrict results to the given range of \
Expand All @@ -292,6 +301,13 @@ def search_paper(
with at least the given number of citations.
:param int limit: (optional) maximum number of results to return \
(must be <= 100).
:param bool bulk: (optional) bulk retrieval of basic paper data \
without search relevance (ignores the limit parameter if True \
and returns up to 1,000 results in each page).
:param str sort: (optional) sorts results (only if bulk=True) using \
<field>:<order> format, where "field" is either paperId, \
publicationDate, or citationCount, and "order" is asc \
(ascending) or desc (descending).
:returns: query results.
:rtype: :class:`semanticscholar.PaginatedResults.PaginatedResults`
'''
Expand All @@ -308,7 +324,9 @@ def search_paper(
fields=fields,
publication_date_or_year=publication_date_or_year,
min_citation_count=min_citation_count,
limit=limit
limit=limit,
bulk=bulk,
sort=sort
)
)

Expand Down
3 changes: 3 additions & 0 deletions semanticscholar/SemanticScholarException.py
Expand Up @@ -11,3 +11,6 @@ class BadQueryParametersException(SemanticScholarException):

class ObjectNotFoundException(SemanticScholarException):
'''Paper or Author ID not found.'''

class NoMorePagesException(SemanticScholarException):
'''No more pages to fetch.'''

0 comments on commit 0fa8aac

Please sign in to comment.