# **Retrieving papers from OpenAlex API**

This notebook implements a simple pipeline to retrieve papers from the OpenAlex API based on specific criteria. The documentation of the `pyalex` library can be found [here](https://docs.openalex.org/how-to-use-the-api/api-overview).

In [173]:
from pyalex import Works, Sources
from itertools import chain
import pandas as pd

In [174]:
pa_id   = "https://openalex.org/S29331042"   # Political Analysis
lsq_id  = "https://openalex.org/S48869744"   # Legislative Studies Quarterly
ajps_id = "https://openalex.org/S90314269"   # American Journal of Political Science
apsr_id = "https://openalex.org/S176007004"  # American Political Science Review
jpla_id = "https://openalex.org/S2736728258" # Journal of Politics in Latin America
pp_id   = "https://openalex.org/S6959732"    # Party Politics
psrm_id = "https://openalex.org/S2764571748" # Political Science Research and Methods
arps_id = "https://openalex.org/S8194976"    # Annual Review of Political Science

source_ids = [pa_id, lsq_id, ajps_id, apsr_id, jpla_id, pp_id, psrm_id, arps_id]

In [175]:
journals = {}
print("Selected journals:")
for id_ in source_ids:
    source_ = Sources().filter(openalex_id=id_).get()[0]
    print("-", source_["display_name"])
    journals[id_] = source_["display_name"]

Selected journals:
- Political Analysis
- Legislative Studies Quarterly
- American Journal of Political Science
- American Political Science Review
- Journal of Politics in Latin America
- Party Politics
- Political Science Research and Methods
- Annual Review of Political Science


In [176]:
query = '''
(
  "text as data"
  OR "text-as-data"
  OR "word embeddings"
  OR "computational text analysis"
  OR "text mining"
  OR "natural language processing"
  OR nlp
)
AND
(
  legislative
  OR legislature
  OR parliament
  OR parliamentary
  OR congressional
  OR congress
  OR bill
  OR bills
  OR "legislative proposal*"
  OR "legislative text*"
  OR "political text*"
)
'''

In [177]:
def reconstruct_abstract(abstract_inverted_index):
    """
    Rebuilds the abstract text from the inverted index.
    """
    if not abstract_inverted_index:
        return None

    # create a position to word mapping
    position_word = {}
    for word, positions in abstract_inverted_index.items():
        for pos in positions:
            position_word[pos] = word

    # reconstruct the abstract by ordering words by their positions
    return " ".join(
        position_word[pos] for pos in sorted(position_word)
    )

In [178]:
works_all = []

PER_PAGE = 200
MAX_PAGES = 200  # safety cap

for journal_id in source_ids:
    paginator = (
        Works()
        .filter(primary_location={"source": {"id": journal_id}})
        .search(query)
        .select([
            "id", "doi", "title",
            "publication_year", "publication_date",
            "primary_topic",
            "cited_by_count",
            "type",
            "primary_location",
            "authorships",
            "abstract_inverted_index"
        ])
        .paginate(per_page=PER_PAGE)
    )

    n_pages = 0 # count pages retrieved
    n_papers = 0 # count papers retrieved
    
    for i, page in enumerate(paginator, start=0):
        works_all.extend(page)

        # if the next page is empty, break
        if len(page) == 0:
            break

        n_pages += 1
        n_papers += len(page)

        # if we reach the max pages, break
        if i >= MAX_PAGES:
            break
        
    print(f"Retrieved {n_papers} papers from {n_pages} pages for {journals[journal_id]}.")

Retrieved 36 papers from 1 pages for Political Analysis.
Retrieved 8 papers from 1 pages for Legislative Studies Quarterly.
Retrieved 6 papers from 1 pages for American Journal of Political Science.
Retrieved 24 papers from 1 pages for American Political Science Review.
Retrieved 1 papers from 1 pages for Journal of Politics in Latin America.
Retrieved 6 papers from 1 pages for Party Politics.
Retrieved 11 papers from 1 pages for Political Science Research and Methods.
Retrieved 5 papers from 1 pages for Annual Review of Political Science.


In [179]:
# a way of iterating the paginator object directly
# for record in chain(paginator):
#     print(record)

In [180]:
for work in works_all:
    work["abstract"] = reconstruct_abstract(work.get("abstract_inverted_index"))

In [181]:
print(f"Total works found: {len(works_all)}")

Total works found: 97


In [182]:
df = pd.DataFrame(works_all)

In [183]:
# the column primary_topic is a dictionary; we can expand it into multiple columns
primary_topics = pd.json_normalize(df['primary_topic'])
primary_locations = pd.json_normalize(df['primary_location'])
authorships = pd.json_normalize(df['authorships'])

df = pd.concat(
    [
        df.drop(columns=['primary_topic', 'primary_location', 'authorships', 'abstract_inverted_index']),
        primary_topics.add_prefix('primary_topic.'), 
        primary_locations.add_prefix('primary_location.'), 
        authorships.add_prefix('authorships.')
        ], 
    axis=1)

In [184]:
authorships_0 = pd.json_normalize(df["authorships.0"])
authorships_1 = pd.json_normalize(df["authorships.1"])
authorships_2 = pd.json_normalize(df["authorships.2"])
authorships_3 = pd.json_normalize(df["authorships.3"])
authorships_4 = pd.json_normalize(df["authorships.4"])
authorships_5 = pd.json_normalize(df["authorships.5"])
authorships_6 = pd.json_normalize(df["authorships.6"])

df = pd.concat(
    [
        df.drop(columns=[
            'authorships.0', 'authorships.1', 'authorships.2', 
            'authorships.3', 'authorships.4', 'authorships.5', 
            'authorships.6']),
        authorships_0.add_prefix('authorships.0.'),
        authorships_1.add_prefix('authorships.1.'),
        authorships_2.add_prefix('authorships.2.'),
        authorships_3.add_prefix('authorships.3.'),
        authorships_4.add_prefix('authorships.4.'),
        authorships_5.add_prefix('authorships.5.'),
        authorships_6.add_prefix('authorships.6.')
    ],
    axis=1)

In [185]:
df_flt = df[[
    "id", 
    "doi", 
    "title", 
    "publication_year", 
    "publication_date", 
    "cited_by_count", 
    "type", 
    "abstract",
    "primary_topic.display_name", 
    "primary_topic.subfield.display_name",
    "primary_location.source.display_name",
    "primary_location.is_oa",
    "authorships.0.author.display_name",
    "authorships.1.author.display_name",
    "authorships.2.author.display_name",
    "authorships.3.author.display_name",
    "authorships.4.author.display_name",
    "authorships.5.author.display_name",
    "authorships.6.author.display_name"
    ]]

In [186]:
df_flt.sort_values(by=["cited_by_count"], ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_flt.sort_values(by=["cited_by_count"], ascending=False, inplace=True)


In [188]:
df_flt.to_excel("../data/openalex_systematic_review.xlsx", index=False)