# OpenAlex Cited References
### Adapted from: https://github.com/eschares/OpenAlex-CitedReferences/tree/main with thanks to Eric Schares, Iowa State University; [eschares.github.io](eschares.github.io) and Sandra Mierz; [https://github.com/smierz](https://github.com/smierz) 
---

In [6]:
# needed dependencies
import requests
import pandas as pd
import pyarrow

In [26]:
email = "ui@openalex.org"

Create API query

In [12]:
filtered_works_url = "https://api.openalex.org/works?page=1&filter=title_and_abstract.search:sonification+OR+%22auditory+display%22,keywords.id:sonification,keywords.id:auditory-display&sort=relevance_score:desc&per_page=10&mailto=ui@openalex.org"

In [7]:
def get_metadata_using_cursor_paging(openalex_url):
    session = requests.Session()

    # url with a placeholder for cursor
    openalex_url_with_cursor = openalex_url + '&per_page=200&cursor={}'

    # loop through pages
    cursor = '*'
    while cursor:
        # set cursor value and request page from OpenAlex
        url = openalex_url_with_cursor.format(cursor)
        print(url)
        page_with_results = session.get(url).json()

        # update cursor to meta.next_cursor
        cursor = page_with_results['meta']['next_cursor']

        # return page results to user to process
        results = page_with_results['results']
        yield results

In [18]:
def extract_selected_fields(openalex_work):
    return (  openalex_work['id'],
              openalex_work['doi'],
              openalex_work['publication_year'],
              openalex_work['title'],
       #       openalex_work['host_venue']['display_name'],
       #       openalex_work['host_venue']['publisher'],
       #       openalex_work['host_venue']['issn_l'],
              len(openalex_work['referenced_works'])
            )

In [9]:
def extract_references(work):
    return [(work['id'], ref) for ref in work['referenced_works']]

In [10]:
data_folder = '../data/openAlex/test'

def store_in_file(data, column_names, filename):
    data_in_df = pd.DataFrame(data, columns=column_names)
    if filename.endswith("csv"):
        data_in_df.to_csv(filename, index=False)
    else:
        data_in_df.to_parquet(filename)

In [16]:
api_response = requests.get(filtered_works_url)
parsed_response = api_response.json()

count = parsed_response['meta']['count']
print(f"number of publications: {count}")

per_page = 200
number_of_pages_needed = int(count / per_page) + (count % per_page > 0)
print(f"number of requests needed (with per_page set to {per_page}): {number_of_pages_needed}")

number of publications: 690
number of requests needed (with per_page set to 200): 4


In [20]:
%%time

# get all publications
publications = []
pub2ref = []

results_per_page = get_metadata_using_cursor_paging(filtered_works_url)
for results in results_per_page:
    for work in results:
        publications.append(extract_selected_fields(work))
        pub2ref.extend(extract_references(work))

# store publications
store_in_file(publications, 
                  ['publication_id', 'publication_doi', 'publication_year', 'publication_title', 'num_cited_references'], 
                  f'{data_folder}/publications.csv')

# store connections from publications to their references
store_in_file(pub2ref, ['publication_id', 'reference_id'], f'{data_folder}/pub2ref.csv')

https://api.openalex.org/works?page=1&filter=title_and_abstract.search:sonification+OR+%22auditory+display%22,keywords.id:sonification,keywords.id:auditory-display&sort=relevance_score:desc&per_page=10&mailto=ui@openalex.org&per_page=200&cursor=*
https://api.openalex.org/works?page=1&filter=title_and_abstract.search:sonification+OR+%22auditory+display%22,keywords.id:sonification,keywords.id:auditory-display&sort=relevance_score:desc&per_page=10&mailto=ui@openalex.org&per_page=200&cursor=Ils1Mi4xMTI4NywgOTkuMCwgNiwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1c0MzAzMTg0MjcwJ10i
https://api.openalex.org/works?page=1&filter=title_and_abstract.search:sonification+OR+%22auditory+display%22,keywords.id:sonification,keywords.id:auditory-display&sort=relevance_score:desc&per_page=10&mailto=ui@openalex.org&per_page=200&cursor=IlsxMi43MjE3MjQsIDk4LjAsIDM0LCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzIxODg5MjY3NTAnXSI=
https://api.openalex.org/works?page=1&filter=title_and_abstract.search:sonification+OR+%22auditory+displa

In [21]:
ref_ids = [p2r[1] for p2r in pub2ref]
print(f'number of references in pub2ref: {len(ref_ids)}')
                                                   
unique_ref_ids = list(dict.fromkeys(ref_ids))
print(f'number of unique references in pub2ref: {len(unique_ref_ids)}')

number of references in pub2ref: 9844
number of unique references in pub2ref: 5887


In [22]:
def build_url_for_references(openalex_ids, per_page, mailto):
    # specify endpoint
    endpoint = 'works'

    # build the 'filter' parameter
    openalex_only_ids = [openalex_id.replace("https://openalex.org/", "") for openalex_id in openalex_ids]
    filters = f'openalex:{"|".join(openalex_only_ids)}'
    
    # put the URL together
    return f'https://api.openalex.org/{endpoint}?filter={filters}&per_page={per_page}&mailto={mailto}'

In [23]:
def get_references(reference_ids, mailto):
    chunk_size = 50
    session = requests.Session()
    
    for i in range(0, len(reference_ids), chunk_size):
        ref_ids_slice = reference_ids[i:i + chunk_size]
        references_url = build_url_for_references(ref_ids_slice, chunk_size, mailto)
        
        page_with_results = session.get(references_url)
        results = page_with_results.json()['results']
        yield results

In [24]:
count = len(unique_ref_ids)
per_page = 50
number_of_pages_needed = int(count / per_page) + (count % per_page > 0)
print(f"number of requests needed (with per_page set to {per_page}): {number_of_pages_needed}")

number of requests needed (with per_page set to 50): 118


In [28]:
%%time

# call OpenAlex API 
references = []
results_per_page = get_references(unique_ref_ids, email)
for i, results in enumerate(results_per_page):
    if i % 100 == 0: print(f'{i} requests sent')
    for work in results:
        # extract fields
        references.append(extract_selected_fields(work))

# store references
store_in_file(references, 
                  ['reference_id','reference_doi','reference_year','reference_title','reference_citation_count'],
                  f'{data_folder}/references.parquet')

0 requests sent
100 requests sent
CPU times: user 3.91 s, sys: 349 ms, total: 4.26 s
Wall time: 1min 37s


In [29]:
# references only
refs_df = pd.read_parquet(f'{data_folder}/references.parquet')
refs_df.head(10)

Unnamed: 0,reference_id,reference_doi,reference_year,reference_title,reference_citation_count
0,https://openalex.org/W1507811735,https://doi.org/10.1007/978-3-642-01129-0,2009.0,Applications of Evolutionary Computing,12
1,https://openalex.org/W2037001540,https://doi.org/10.2307/3680606,1995.0,"Auditory Display: Sonification, Audification, ...",0
2,https://openalex.org/W1517677814,https://doi.org/10.1186/1743-0003-3-11,2006.0,Recent developments in biofeedback for neuromo...,112
3,https://openalex.org/W1499933681,,1994.0,An introduction to auditory display,0
4,https://openalex.org/W167948671,,2008.0,TAXONOMY AND DEFINITIONS FOR SONIFICATION AND ...,8
5,https://openalex.org/W2018654500,https://doi.org/10.1145/1240624.1240642,2007.0,Shoogle,11
6,https://openalex.org/W1560144794,,1999.0,Listen to your Data: Model-Based Sonification ...,5
7,https://openalex.org/W1596196963,,2002.0,Sonification for Exploratory Data Analysis,0
8,https://openalex.org/W2124490600,https://doi.org/10.1007/11678816_35,2006.0,AcouMotion – An Interactive Sonification Syste...,12
9,https://openalex.org/W9119819,https://doi.org/10.1016/s0021-9290(96)00147-9,2005.0,An introduction to interactive sonification,4


In [30]:
# publications and their references
pub2ref_df = pd.read_csv(f'{data_folder}/pub2ref.csv')

df = pubs_df.join(pub2ref_df.set_index('publication_id'), on='publication_id')
df = df.join(refs_df.set_index('reference_id'), on='reference_id')
df.head(10)

NameError: name 'pubs_df' is not defined