In [2]:
import requests

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df_paper_authors = pd.read_csv('../data/author benchmarks - paper_authors.csv')

In [6]:
df_paper_authors.doi.notna().sum()

2651

In [7]:
dois = df_paper_authors['doi'].dropna().unique()

endpoint = "works"
size = 50
loop_index = 0
works = []
for list_index in range(0, len(dois), size):
    subset = dois[list_index:list_index+size]
    pipe_separated_ids = "|".join(subset)
    r = requests.get(f"https://api.openalex.org/{endpoint}?filter=doi:{pipe_separated_ids}&per-page={size}")
    results = r.json()['results']
    works.extend(results)
    loop_index += 1
print(f"collected {len(works)} works using {loop_index} api calls")

collected 2359 works using 48 api calls


In [8]:
data = []
for work in works:
    doi = work['doi'].replace('https://doi.org/', '')
    authors = []
    for position, authorship in enumerate(work['authorships']):
        author_name = authorship['author']['display_name']
        author_id = authorship['author']['id'].replace('https://openalex.org/', '')
        authors.append( (position, author_name, author_id) )
    data.append({
        'doi': doi,
        'authors': authors,
    })
openalex_author_info = pd.DataFrame(data)

In [9]:
df_paper_authors = df_paper_authors.merge(openalex_author_info, how='left', on='doi')

In [11]:
df_paper_authors.to_csv('../data/scopus_paper_authors_with_openalex_author_info.csv')

In [38]:
data = []
for work in works:
    doi = work['doi'].replace('https://doi.org/', '') if work['doi'] else None
    openalex_id = work['id'].replace('https://openalex.org/', '')
    for authorship in work['authorships']:
        author_id = authorship['author']['id'].replace('https://openalex.org/', '')

        data.append({
            'doi': doi,
            'openalex_work_id': openalex_id,
            'openalex_author_id': author_id,
        })
df_alex_paper_authors = pd.DataFrame(data)

In [48]:
_df = df_paper_authors[['scopus_author_id', 'doi']].dropna(subset=['doi'])
_df = _df.merge(df_alex_paper_authors, how='inner', on='doi')

In [52]:
modes = _df.groupby('scopus_author_id')['openalex_author_id'].agg(pd.Series.mode)
modes.to_csv('../data/author_modes.csv')

In [12]:
author_id = 'A2140778852'

works_alex = []
r = requests.get(f"https://api.openalex.org/{endpoint}?filter=author.id:{author_id}")
r.json()


{'meta': {'count': 255, 'db_response_time_ms': 46, 'page': 1, 'per_page': 25},
 'results': [{'id': 'https://openalex.org/W2150220236',
   'doi': 'https://doi.org/10.1007/s11192-009-0146-3',
   'title': 'Software survey: VOSviewer, a computer program for bibliometric mapping',
   'display_name': 'Software survey: VOSviewer, a computer program for bibliometric mapping',
   'publication_year': 2010,
   'publication_date': '2010-01-01',
   'ids': {'openalex': 'https://openalex.org/W2150220236',
    'doi': 'https://doi.org/10.1007/s11192-009-0146-3',
    'mag': '2150220236',
    'pmid': 'https://pubmed.ncbi.nlm.nih.gov/20585380',
    'pmcid': 'https://www.ncbi.nlm.nih.gov/pmc/articles/2883932'},
   'primary_location': {'is_oa': True,
    'landing_page_url': 'https://doi.org/10.1007/s11192-009-0146-3',
    'pdf_url': None,
    'source': {'id': 'https://openalex.org/S148561398',
     'display_name': 'Scientometrics',
     'issn_l': '0138-9130',
     'issn': ['1588-2861', '0138-9130'],
     'h

In [13]:
cursor = '*'

# select = ",".join((
#     'id',
#     'ids',
#     'title',
#     'display_name',
#     'publication_year',
#     'publication_date',
#     'primary_location',
#     'open_access',
#     'authorships',
#     'cited_by_count',
#     'is_retracted',
#     'is_paratext',
#     'updated_date',
#     'created_date',
# ))

# loop through pages
works_alex = []
loop_index = 0
while cursor:
    
    # set cursor value and request page from OpenAlex
    url = f"https://api.openalex.org/{endpoint}?filter=author.id:{author_id}&cursor={cursor}"
    page_with_results = requests.get(url).json()
    
    results = page_with_results['results']
    works_alex.extend(results)

    # update cursor to meta.next_cursor
    cursor = page_with_results['meta']['next_cursor']
    loop_index += 1
    if loop_index in [5, 10, 20, 50, 100] or loop_index % 500 == 0:
        print(f'{loop_index} api requests made so far')
print(f'done. made {loop_index} api requests. collected {len(works_alex)} works')

5 api requests made so far
10 api requests made so far
done. made 12 api requests. collected 255 works


In [24]:
data = []
for work in works_alex:
    doi = work['doi'].replace('https://doi.org/', '') if work['doi'] else None
    openalex_id = work['id'].replace('https://openalex.org/', '')
    data.append({
        'doi': doi,
        'openalex_id': openalex_id,
    })
df_works_alex = pd.DataFrame(data)

In [33]:
subset = df_paper_authors[df_paper_authors['scopus_author_id']==14632830700].dropna(subset=['doi'])
_df = subset.merge(df_works_alex.dropna(subset=['doi']), how='left', on='doi')

In [35]:
_df.openalex_id.isna().value_counts()

False    85
True     10
Name: openalex_id, dtype: int64

In [30]:
df_works_alex.doi.duplicated().sum()

110

In [31]:
df_works_alex.sort_values('doi')

Unnamed: 0,doi,openalex_id
71,10.1002/asi.20647,W3121267476
47,10.1002/asi.20872,W3123636951
10,10.1002/asi.21075,W3122100588
52,10.1002/asi.21354,W2950735408
12,10.1002/asi.21421,W2953107155
...,...,...
222,,W3125437681
223,,W3130697658
224,,W3140476962
232,,W3210993721


In [22]:
_df = df_works_alex.dropna(subset=['doi'])
_df[_df.doi.str.contains('qss')]

Unnamed: 0,doi,openalex_id
27,https://doi.org/10.1162/qss_a_00112,https://openalex.org/W3123554164
63,https://doi.org/10.1162/qss_a_00035,https://openalex.org/W3013794879
68,https://doi.org/10.1162/qss_e_00026,https://openalex.org/W3007990842
70,https://doi.org/10.1162/qss_a_00109,https://openalex.org/W3121088567
107,https://doi.org/10.1162/qss_e_00025,https://openalex.org/W3006840736
118,https://doi.org/10.1162/qss_e_00115,https://openalex.org/W3154335143
145,https://doi.org/10.1162/qss_a_00109/v2/response1,https://openalex.org/W3123025481
251,https://doi.org/10.1162/qss_a_00212,https://openalex.org/W4297385942
253,https://doi.org/10.1162/qss_e_00214,https://openalex.org/W4312953325


In [53]:
df_authors = pd.read_csv('../data/author benchmarks - authors.csv')

In [56]:
author_ids = df_authors['openalex_main_author_id'].dropna().unique()

endpoint = "authors"
size = 50
loop_index = 0
authors_alex = []
for list_index in range(0, len(author_ids), size):
    subset = author_ids[list_index:list_index+size]
    pipe_separated_ids = "|".join(subset)
    r = requests.get(f"https://api.openalex.org/{endpoint}?filter=openalex:{pipe_separated_ids}&per-page={size}")
    results = r.json()['results']
    authors_alex.extend(results)
    loop_index += 1
print(f"collected {len(works)} works using {loop_index} api calls")

collected 2359 works using 1 api calls


In [58]:
data = []
for au in authors_alex:
    openalex_id = au['id'].replace('https://openalex.org/', '')
    display_name = au['display_name']
    h_index = au['summary_stats']['h_index']
    affil = au['last_known_institution'].get('display_name', None)
    works_count = au['works_count']
    data.append({
        'openalex_main_author_id': openalex_id,
        'openalex_display_name': display_name,
        'openalex_num_papers': works_count,
        'openalex_affil': affil,
        'openalex_h_index': h_index,
    })

df_authors_alex_api = pd.DataFrame(data)

In [63]:
out = df_authors_alex_api.merge(df_authors[['scopus_author_id', 'openalex_main_author_id']], how='inner', on='openalex_main_author_id').sort_values('scopus_author_id')
out.to_csv('../data/df_authors_alex_api.csv', index=False)

In [77]:
def get_works_one_author(author_id):
    endpoint = 'works'
    cursor = '*'

    select = ",".join((
        'id',
        'title',
        'doi',
    ))

    # loop through pages
    works_alex = []
    loop_index = 0
    while cursor:
        
        # set cursor value and request page from OpenAlex
        url = f"https://api.openalex.org/{endpoint}?filter=author.id:{author_id}&cursor={cursor}&select={select}"
        page_with_results = requests.get(url).json()
        
        results = page_with_results['results']
        works_alex.extend(results)

        # update cursor to meta.next_cursor
        cursor = page_with_results['meta']['next_cursor']
        loop_index += 1
        # if loop_index in [5, 10, 20, 50, 100] or loop_index % 500 == 0:
        #     print(f'{loop_index} api requests made so far')
    print(f'for {author_id}: made {loop_index} api requests. collected {len(works_alex)} works')
    return works_alex

In [78]:
author_ids = df_authors_alex_api.openalex_main_author_id.dropna().unique()
display_names = df_authors_alex_api.dropna(subset=['openalex_main_author_id']).set_index('openalex_main_author_id')['openalex_display_name']
data = []
for author_id in author_ids:
    this_author_works = get_works_one_author(author_id)
    for work in this_author_works:
        doi = work['doi'].replace('https://doi.org/', '') if work['doi'] else None
        openalex_work_id = work['id'].replace('https://openalex.org/', '')
        data.append({
            'openalex_author_id': author_id,
            'openalex_author_display_name': display_names[author_id],
            'doi': doi,
            'title': work['title'],
            'openalex_work_id': openalex_work_id,
        })
df_alex_paper_authors = pd.DataFrame(data)

for A2618799584: made 312 api requests. collected 7757 works
for A1993486354: made 47 api requests. collected 1138 works
for A40084183: made 30 api requests. collected 714 works
for A98572569: made 29 api requests. collected 687 works
for A2195478976: made 26 api requests. collected 618 works
for A291762745: made 20 api requests. collected 455 works
for A2143783256: made 13 api requests. collected 300 works
for A2123775253: made 12 api requests. collected 270 works
for A2140778852: made 12 api requests. collected 255 works
for A2636854936: made 11 api requests. collected 228 works
for A2127341518: made 9 api requests. collected 199 works
for A266176790: made 9 api requests. collected 183 works
for A2122189410: made 8 api requests. collected 167 works
for A109906890: made 7 api requests. collected 145 works
for A2787483045: made 6 api requests. collected 119 works
for A2000627202: made 5 api requests. collected 80 works


In [80]:
df_alex_paper_authors.to_csv('../data/df_alex_paper_authors.csv')