In [1]:
import requests
import duckdb
import pandas as pd
from dotenv import load_dotenv

#### 1. Get OpenAlex author data snapshot file locations from most recent manifest

In [24]:
manifest = requests.get('https://openalex.s3.amazonaws.com/data/authors/manifest').json()

In [27]:
# get file paths
files = []

for entry in manifest['entries']:
    file = entry['url']
    files.append(file)

print(len(files))

89


#### 2. Install `httpfs` and review snapshot data files using `duckdb`

In [28]:
duckdb.sql("install 'httpfs'; load 'httpfs'; set s3_region='us-east-1';")

In [32]:
duckdb.sql("select * from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-08-15/part_001.gz', format='newline_delimited', compression='gzip')")

┌──────────────────────┬──────────────────────┬────────────────┬───┬──────────────┬─────────────┬──────────────────────┐
│      x_concepts      │ display_name_alter…  │ cited_by_count │ … │ created_date │ works_count │       updated        │
│ struct(score doubl…  │      varchar[]       │     int64      │   │     date     │    int64    │       varchar        │
├──────────────────────┼──────────────────────┼────────────────┼───┼──────────────┼─────────────┼──────────────────────┤
│ [{'score': 100.0, …  │ [K. Palczynski, K …  │             40 │ … │ 2023-07-21   │           2 │ 2023-08-15T21:27:1…  │
│ [{'score': 100.0, …  │ [TE Nyirenda, T Ny…  │             40 │ … │ 2023-07-21   │           6 │ 2023-08-15T18:56:5…  │
│ [{'score': 100.0, …  │ [Gabriela Popek]     │             40 │ … │ 2023-07-21   │           1 │ 2023-08-15T21:27:5…  │
│ [{'score': 100.0, …  │ [Kristine R. Greer…  │             40 │ … │ 2023-07-21   │           2 │ 2023-08-15T10:56:4…  │
│ [{'score': 89.3, '…  │ [I.Zh. 

#### 3. Explore a small sample of the records in these snapshot files to see how things are structured

In [33]:
df = duckdb.sql("select * from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-08-15/part_001.gz', format='newline_delimited', compression='gzip') limit 100").df()

In [34]:
df.columns

Index(['x_concepts', 'display_name_alternatives', 'cited_by_count',
       'most_cited_work', 'counts_by_year', 'last_known_institution', 'orcid',
       'display_name', 'summary_stats', 'works_api_url', 'ids', 'id',
       'updated_date', 'created_date', 'works_count', 'updated'],
      dtype='object')

In [35]:
df

Unnamed: 0,x_concepts,display_name_alternatives,cited_by_count,most_cited_work,counts_by_year,last_known_institution,orcid,display_name,summary_stats,works_api_url,ids,id,updated_date,created_date,works_count,updated
0,"[{'score': 100.0, 'level': 1, 'id': 'https://o...","[K. Palczynski, K Palczynski]",40,Potential Biosignatures in Super-Earth Atmosph...,"[{'cited_by_count': 1, 'year': 2023, 'works_co...","{'country_code': 'DE', 'ror': 'https://ror.org...",,K. Palczynski,"{'cited_by_count': 40, '2yr_i10_index': 0, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5003040756...,https://openalex.org/A5003040756,2023-08-15T21:27:12.541593,2023-07-21,2,2023-08-15T21:27:12.541593
1,"[{'score': 100.0, 'level': 0, 'id': 'https://o...","[TE Nyirenda, T Nyirenda, Nyirenda Te, T.E. Ny...",40,Using a bus service for transporting sputum sp...,"[{'cited_by_count': 1, 'year': 2023, 'works_co...","{'country_code': 'MW', 'ror': 'https://ror.org...",,Nyirenda Te,"{'cited_by_count': 40, '2yr_i10_index': 0, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5003051334...,https://openalex.org/A5003051334,2023-08-15T18:56:52.521905,2023-07-21,6,2023-08-15T18:56:52.521905
2,"[{'score': 100.0, 'level': 4, 'id': 'https://o...",[Gabriela Popek],40,PCSK9 signaling pathways and their potential i...,"[{'cited_by_count': 12, 'year': 2023, 'works_c...","{'country_code': 'PL', 'ror': 'https://ror.org...",,Gabriela Popek,"{'cited_by_count': 40, '2yr_i10_index': 0, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5003067678...,https://openalex.org/A5003067678,2023-08-15T21:27:56.504137,2023-07-21,1,2023-08-15T21:27:56.504137
3,"[{'score': 100.0, 'level': 1, 'id': 'https://o...","[Kristine R. Greer, K. Greer]",40,Primary Motor Area Activation during Precision...,"[{'cited_by_count': 1, 'year': 2023, 'works_co...","{'country_code': 'US', 'ror': 'https://ror.org...",,Kristine R. Greer,"{'cited_by_count': 40, '2yr_i10_index': 0, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5003102207...,https://openalex.org/A5003102207,2023-08-15T10:56:48.314371,2023-07-21,2,2023-08-15T10:56:48.314371
4,"[{'score': 89.3, 'level': 0, 'id': 'https://op...","[I.Zh. Zhalsanova, И.А. Гончарова, Irina Zhals...",40,Comorbidity of asthma and hypertension may be ...,"[{'cited_by_count': 13, 'year': 2023, 'works_c...","{'country_code': 'RU', 'ror': 'https://ror.org...",https://orcid.org/0000-0001-6848-7749,И. Ж. Жалсанова,"{'cited_by_count': 40, '2yr_i10_index': 0, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5003106718...,https://openalex.org/A5003106718,2023-08-15T08:42:33.264095,2023-07-21,28,2023-08-15T08:42:33.264095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"[{'score': 100.0, 'level': 0, 'id': 'https://o...","[Christine M. Mayr Marangon, C Marangon, Chris...",40,Techniques for Mitigating the Effects of Smoke...,"[{'cited_by_count': 10, 'year': 2023, 'works_c...","{'country_code': 'IT', 'ror': 'https://ror.org...",https://orcid.org/0000-0001-7826-2295,Christine Mayr Marangon,"{'cited_by_count': 40, '2yr_i10_index': 2, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5004582787...,https://openalex.org/A5004582787,2023-08-15T17:31:54.752480,2023-07-21,11,2023-08-15T17:31:54.752480
96,"[{'score': 100.0, 'level': 0, 'id': 'https://o...",[Yutaro Sumida],40,Chloride intracellular channel protein 2 in ca...,"[{'cited_by_count': 3, 'year': 2023, 'works_co...","{'country_code': 'JP', 'ror': 'https://ror.org...",,Yutaro Sumida,"{'cited_by_count': 40, '2yr_i10_index': 0, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5004583578...,https://openalex.org/A5004583578,2023-08-15T22:15:24.172798,2023-07-21,5,2023-08-15T22:15:24.172798
97,"[{'score': 100.0, 'level': 0, 'id': 'https://o...","[C. A. Atwood, Carlyn Atwood]",40,Effect of Multimodal Analgesia on Opioid Use A...,"[{'cited_by_count': 3, 'year': 2023, 'works_co...","{'country_code': 'US', 'ror': 'https://ror.org...",,C. A. Atwood,"{'cited_by_count': 40, '2yr_i10_index': 0, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5004619576...,https://openalex.org/A5004619576,2023-08-15T17:46:30.521235,2023-07-21,2,2023-08-15T17:46:30.521235
98,"[{'score': 100.0, 'level': 1, 'id': 'https://o...",[Shenlu Xu],40,Taste characteristics and umami mechanism of n...,"[{'cited_by_count': 20, 'year': 2023, 'works_c...","{'country_code': 'CN', 'ror': 'https://ror.org...",,Shenlu Xu,"{'cited_by_count': 40, '2yr_i10_index': 2, 'h_...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5004621160...,https://openalex.org/A5004621160,2023-08-15T14:23:28.871193,2023-07-21,2,2023-08-15T14:23:28.871193


##### Filtering IDs of authors who have been cited (`where cited_by_count != 0`)

In [37]:
duckdb.sql("select id from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-08-15/part_001.gz', format='newline_delimited', compression='gzip') where cited_by_count != 0")

┌──────────────────────────────────┐
│                id                │
│             varchar              │
├──────────────────────────────────┤
│ https://openalex.org/A5003040756 │
│ https://openalex.org/A5003051334 │
│ https://openalex.org/A5003067678 │
│ https://openalex.org/A5003102207 │
│ https://openalex.org/A5003106718 │
│ https://openalex.org/A5003120547 │
│ https://openalex.org/A5003129271 │
│ https://openalex.org/A5003144433 │
│ https://openalex.org/A5003177155 │
│ https://openalex.org/A5003196082 │
│                ·                 │
│                ·                 │
│                ·                 │
│ https://openalex.org/A5088374549 │
│ https://openalex.org/A5088419030 │
│ https://openalex.org/A5088420402 │
│ https://openalex.org/A5088439642 │
│ https://openalex.org/A5088458123 │
│ https://openalex.org/A5088481895 │
│ https://openalex.org/A5088484543 │
│ https://openalex.org/A5088494947 │
│ https://openalex.org/A5088511620 │
│ https://openalex.org/A5088519879 │
├

##### Unnesting author records by year

In [41]:
duckdb.sql("select id as author_id, unnest(counts_by_year, recursive := true) from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-08-15/part_000.gz', format='newline_delimited', compression='gzip') limit 200")

┌──────────────────────────────────┬────────────────┬───────┬─────────────┬────────────────┐
│            author_id             │ cited_by_count │ year  │ works_count │ oa_works_count │
│             varchar              │     int64      │ int64 │    int64    │     int64      │
├──────────────────────────────────┼────────────────┼───────┼─────────────┼────────────────┤
│ https://openalex.org/A5066716873 │          76342 │  2023 │        2239 │            843 │
│ https://openalex.org/A5066716873 │         116480 │  2022 │        2802 │           1380 │
│ https://openalex.org/A5066716873 │         114486 │  2021 │        2812 │           1453 │
│ https://openalex.org/A5066716873 │         103164 │  2020 │        2617 │           1298 │
│ https://openalex.org/A5066716873 │          58837 │  2019 │        2273 │            924 │
│ https://openalex.org/A5066716873 │          48643 │  2018 │        2010 │            751 │
│ https://openalex.org/A5066716873 │          41940 │  2017 │        1

##### Getting the `display_name` from `last_known_institution`

In [42]:
duckdb.sql("select id, display_name, json_extract(last_known_institution, '$.display_name'), works_count, cited_by_count from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-08-15/part_000.gz', format='newline_delimited', compression='gzip') where cited_by_count > 10")

┌──────────────────────┬──────────────────────┬─────────────────────────────────────────┬─────────────┬────────────────┐
│          id          │     display_name     │ json_extract(last_known_institution, …  │ works_count │ cited_by_count │
│       varchar        │       varchar        │                  json                   │    int64    │     int64      │
├──────────────────────┼──────────────────────┼─────────────────────────────────────────┼─────────────┼────────────────┤
│ https://openalex.o…  │ Li Zhang             │ "Beijing University of Technology"      │       31086 │         583108 │
│ https://openalex.o…  │ Eric S. Lander       │ "Center for Systems Biology"            │         979 │         474366 │
│ https://openalex.o…  │ Ahmedin Jemal        │ "American Cancer Society"               │         872 │         452731 │
│ https://openalex.o…  │ Jing Wang            │ "Institute of Mechanics"                │       25475 │         430446 │
│ https://openalex.o…  │ Lei Wan