# Notebook to collect data for workshop

Emilio Lehoucq - 1/17/25

In [1]:
import requests
import time
import pandas as pd

In [2]:
# https://chatgpt.com/share/678aafcb-41bc-8004-971f-12ed99e8e37e
def fetch_openalex_works(search_terms, num_results=500, retries=3):
    """
    Fetch articles from OpenAlex API based on search terms, ensuring abstracts and metadata are included.

    Args:
        search_terms (str): Search terms combined with AND logic.
        num_results (int): Number of desired results (max 500 due to API constraints).
        retries (int): Number of retries for network/server issues.

    Returns:
        list: A list of dictionaries, each containing metadata for an article.
    """
    base_url = "https://api.openalex.org/works"
    per_page = 200  # Maximum allowed per page
    max_pages = (num_results // per_page) + (1 if num_results % per_page > 0 else 0)

    # Construct the query parameters
    search_query = f"({search_terms})"
    filters = "has_abstract:true"
    results = []
    ids = []

    for page in range(1, max_pages + 1):
        params = {
            "search": search_query,
            "filter": filters,
            "per-page": per_page,
            "page": page
        }

        attempt = 0
        while attempt < retries:
            try:
                response = requests.get(base_url, params=params, timeout=10)

                if response.status_code == 200:
                    data = response.json()

                    # Process the data
                    for work in data.get("results", []):
                        if "abstract_inverted_index" in work:
                          if len(work.get("abstract_inverted_index")) > 30: # I added this
                            if work.get("id") not in ids: # I added this
                                ids.append(work.get("id")) # I added this
                                results.append({
                                    "id": work.get("id"),
                                    "title": work.get("title"),
                                    "abstract": work.get("abstract_inverted_index"), # I changed this
                                    "year": work.get("publication_year"),
                                    "source": work.get("host_venue", {}).get("name"),
                                    "authors": [author.get("author", {}).get("display_name") for author in work.get("authorships", [])],
                                    "cites": work.get("referenced_works"),
                                    "cited_by": work.get("cited_by_count"),
                                    "topic": [concept.get("display_name") for concept in work.get("concepts", [])],
                                    "subfield": work.get("primary_location", {}).get("source"),
                                    "field": work.get("primary_location", {}).get("source_type"),
                                    "domain": work.get("primary_location", {}).get("url")
                                })

                    print(f"Page {page} fetched successfully. Total results so far: {len(results)}")

                    # Break out of the retry loop on success
                    break
                else:
                    print(f"Error {response.status_code}: {response.text}")
                    attempt += 1
                    time.sleep(2 ** attempt)

            except Exception as e: # I changed this
                print(f"Request failed: {e}")
                attempt += 1
                time.sleep(2 ** attempt)

        else:
            print(f"Failed to fetch page {page} after {retries} retries. Moving to the next page.")

        # Stop fetching if we already have enough results
        if len(results) >= num_results:
            break

    # Trim results to the requested number
    trimmed_results = results[:num_results]
    print(f"Fetched {len(trimmed_results)} results in total.")

    return trimmed_results

In [3]:
search_terms = "(environmental sustainability)"
articles = fetch_openalex_works(search_terms, num_results=1000)

Page 1 fetched successfully. Total results so far: 196
Request failed: 'NoneType' object has no attribute 'get'
Page 2 fetched successfully. Total results so far: 389
Page 3 fetched successfully. Total results so far: 584
Request failed: 'NoneType' object has no attribute 'get'
Page 4 fetched successfully. Total results so far: 776
Page 5 fetched successfully. Total results so far: 969
Fetched 969 results in total.


In [4]:
for article in articles:
  abstract = article["abstract"]
  abstract = " ".join(abstract.keys())
  if abstract[:8].lower() == "abstract":
    abstract = abstract[8:]
  article["abstract"] = abstract

In [5]:
df = pd.json_normalize(articles)

In [6]:
df.shape

(969, 24)

In [7]:
df.head()

Unnamed: 0,id,title,abstract,year,source,authors,cites,cited_by,topic,field,...,subfield.issn,subfield.is_oa,subfield.is_in_doaj,subfield.is_core,subfield.host_organization,subfield.host_organization_name,subfield.host_organization_lineage,subfield.host_organization_lineage_names,subfield.type,subfield
0,https://openalex.org/W2177463640,THE CONCEPT OF ENVIRONMENTAL SUSTAINABILITY,Species distribution models (SDMs) are numeric...,1995,,[Robert Goodland],"[https://openalex.org/W120804692, https://open...",1232,"[Abundance (ecology), Sustainability, Ecology,...",,...,"[0066-4162, 2330-1902]",False,False,True,https://openalex.org/P4310320373,Annual Reviews,[https://openalex.org/P4310320373],[Annual Reviews],journal,
1,https://openalex.org/W2037974166,Environmental Sustainability: A Definition for...,While acknowledging the need for “sustainabili...,2011,,[John Morelli],"[https://openalex.org/W1792736940, https://ope...",809,"[Sustainability, Context (archaeology), Sustai...",,...,[2159-2519],True,False,True,,,[],[],journal,
2,https://openalex.org/W2103667938,Determinants of Market Competitiveness in an E...,This article introduces a detailed new model o...,2000,,[Salah S. Hassan],"[https://openalex.org/W1981351858, https://ope...",962,"[Competitor analysis, Business, Marketing, Des...",,...,"[0047-2875, 1552-6763]",False,False,True,https://openalex.org/P4310320017,SAGE Publishing,[https://openalex.org/P4310320017],[SAGE Publishing],journal,
3,https://openalex.org/W2163150231,Information systems innovation for environment...,Human life is dependent upon the natural envir...,2010,,[Nigel P. Melville],"[https://openalex.org/W1268042937, https://ope...",775,"[Sustainability, Business, Perspective (graphi...",,...,"[0276-7783, 2162-9730]",False,False,True,https://openalex.org/P4327875293,MIS Quarterly,[https://openalex.org/P4327875293],[MIS Quarterly],journal,
4,https://openalex.org/W1510971620,Information Systems and Environmentally Sustai...,While many corporations and Information System...,2010,,"[Watson, Marie‐Claude Boudreau, Chen Chen]","[https://openalex.org/W120469747, https://open...",1032,"[Informatics, Sustainable development, Informa...",,...,"[0276-7783, 2162-9730]",False,False,True,https://openalex.org/P4327875293,MIS Quarterly,[https://openalex.org/P4327875293],[MIS Quarterly],journal,


In [8]:
df.to_csv("data.csv", index=False)