In [4]:
import duckdb
import pandas as pd
from dotenv import load_dotenv

In [None]:
load_dotenv()

#### Review a snapshot data file using `duckdb`

In [2]:
duckdb.sql("install 'httpfs'; load 'httpfs'; set s3_region='us-east-1';")

In [21]:
duckdb.sql("select count(*) from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-07-21/part_000.gz', format='newline_delimited', compression='gzip')")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      1356380 │
└──────────────┘

In [3]:
duckdb.sql("select * from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-07-21/part_000.gz', format='newline_delimited', compression='gzip')")

┌──────────────────────┬──────────────────────┬────────────────┬───┬──────────────┬─────────────┬────────────┐
│      x_concepts      │ display_name_alter…  │ cited_by_count │ … │ created_date │ works_count │  updated   │
│ struct(score doubl…  │      varchar[]       │     int64      │   │     date     │    int64    │    date    │
├──────────────────────┼──────────────────────┼────────────────┼───┼──────────────┼─────────────┼────────────┤
│ [{'score': 100.0, …  │ [J W Chappell, JB …  │           1840 │ … │ 2023-07-21   │           3 │ 2023-07-21 │
│ [{'score': 100.0, …  │ [Runa Patel]         │           1293 │ … │ 2023-07-21   │           1 │ 2023-07-21 │
│ [{'score': 100.0, …  │ [Basket-Late Inves…  │           1273 │ … │ 2023-07-21   │           1 │ 2023-07-21 │
│ [{'score': 41.7, '…  │ [Syaiful Bahri Dja…  │           1269 │ … │ 2023-07-21   │          12 │ 2023-07-21 │
│ [{'score': 100.0, …  │ [L McVay-Boudreau]   │           1233 │ … │ 2023-07-21   │           7 │ 2023-07-21 │
│

#### Explore a small sample of the records to explore the data model

In [4]:
df = duckdb.sql("select * from read_json_auto('https://openalex.s3.amazonaws.com/data/authors/updated_date%3D2023-07-21/part_000.gz', format='newline_delimited', compression='gzip') limit 100").df()

In [5]:
df.columns

Index(['x_concepts', 'display_name_alternatives', 'cited_by_count',
       'most_cited_work', 'counts_by_year', 'last_known_institution', 'orcid',
       'display_name', 'summary_stats', 'works_api_url', 'ids', 'id',
       'updated_date', 'created_date', 'works_count', 'updated'],
      dtype='object')

In [6]:
df

Unnamed: 0,x_concepts,display_name_alternatives,cited_by_count,most_cited_work,counts_by_year,last_known_institution,orcid,display_name,summary_stats,works_api_url,ids,id,updated_date,created_date,works_count,updated
0,"[{'score': 100.0, 'level': 1, 'id': 'https://o...","[J W Chappell, JB Chappell]",1840,A simple method for the preparation of 32P-lab...,"[{'cited_by_count': 2, 'year': 2023, 'works_co...",,,J W Chappell,"{'cited_by_count': 1840, '2yr_i10_index': 0, '...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5022292925...,https://openalex.org/A5022292925,2023-07-21,2023-07-21,3,2023-07-21
1,"[{'score': 100.0, 'level': 0, 'id': 'https://o...",[Runa Patel],1293,"Forskningsmetodikens grunder. Att planera, gen...","[{'cited_by_count': 2, 'year': 2021, 'works_co...",,,Runa Patel,"{'cited_by_count': 1293, '2yr_i10_index': 0, '...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5064913229...,https://openalex.org/A5064913229,2023-07-21,2023-07-21,1,2023-07-21
2,"[{'score': 100.0, 'level': 0, 'id': 'https://o...",[Basket-Late Investigators],1273,Late Clinical Events After Clopidogrel Discont...,"[{'cited_by_count': 6, 'year': 2023, 'works_co...",,,Basket-Late Investigators,"{'cited_by_count': 1273, '2yr_i10_index': 0, '...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5067701857...,https://openalex.org/A5067701857,2023-07-21,2023-07-21,1,2023-07-21
3,"[{'score': 41.7, 'level': 0, 'id': 'https://op...",[Syaiful Bahri Djamarah],1269,Strategi Belajar Mengajar (2002),"[{'cited_by_count': 76, 'year': 2021, 'works_c...",,,Syaiful Bahri Djamarah,"{'cited_by_count': 1269, '2yr_i10_index': 0, '...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5080646503...,https://openalex.org/A5080646503,2023-07-21,2023-07-21,12,2023-07-21
4,"[{'score': 100.0, 'level': 1, 'id': 'https://o...",[L McVay-Boudreau],1233,Immunoregulatory circuits among T-cell sets. I...,"[{'cited_by_count': 1, 'year': 2023, 'works_co...",,,L McVay-Boudreau,"{'cited_by_count': 1233, '2yr_i10_index': 0, '...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5034559380...,https://openalex.org/A5034559380,2023-07-21,2023-07-21,7,2023-07-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"[{'score': 100.0, 'level': 1, 'id': 'https://o...","[J S McKinney, JS McKinney]",652,"Inositol 1,4,5-trisphosphate and inositol 1,3,...","[{'cited_by_count': 1, 'year': 2022, 'works_co...",,,J S McKinney,"{'cited_by_count': 652, '2yr_i10_index': 0, 'h...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5040071334...,https://openalex.org/A5040071334,2023-07-21,2023-07-21,8,2023-07-21
96,"[{'score': 100.0, 'level': 0, 'id': 'https://o...",[Z. P. Zhang],650,Transverse Momentum and Centrality Dependence ...,"[{'cited_by_count': 6, 'year': 2023, 'works_co...",,,Z. P. Zhang,"{'cited_by_count': 650, '2yr_i10_index': 0, 'h...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5018610379...,https://openalex.org/A5018610379,2023-07-21,2023-07-21,2,2023-07-21
97,"[{'score': 100.0, 'level': 0, 'id': 'https://o...",[Y. Li],650,Transverse Momentum and Centrality Dependence ...,"[{'cited_by_count': 6, 'year': 2023, 'works_co...",,,Y. Li,"{'cited_by_count': 650, '2yr_i10_index': 0, 'h...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5076127104...,https://openalex.org/A5076127104,2023-07-21,2023-07-21,2,2023-07-21
98,"[{'score': 100.0, 'level': 0, 'id': 'https://o...",[X. L. Wang],650,Transverse Momentum and Centrality Dependence ...,"[{'cited_by_count': 6, 'year': 2023, 'works_co...",,,X. L. Wang,"{'cited_by_count': 650, '2yr_i10_index': 0, 'h...",https://api.openalex.org/works?filter=author.i...,{'openalex': 'https://openalex.org/A5079803399...,https://openalex.org/A5079803399,2023-07-21,2023-07-21,2,2023-07-21


In [13]:
duckdb.sql("select count(*) from read_json_auto('/Users/m20/projects/open_alex_authors/data/authors/*/*.gz', format='newline_delimited', compression='gzip')")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     92840725 │
└──────────────┘

In [22]:
!aws s3 ls --summarize --human-readable --no-sign-request --recursive "s3://openalex/data/authors/"

2023-09-22 09:54:45   14.7 KiB data/authors/manifest
2023-09-22 09:54:45  468.5 MiB data/authors/updated_date=2023-07-21/part_000.gz
2023-09-22 09:54:45  477.8 MiB data/authors/updated_date=2023-07-21/part_001.gz
2023-09-22 09:54:45  481.0 MiB data/authors/updated_date=2023-07-21/part_002.gz
2023-09-22 09:54:45  482.8 MiB data/authors/updated_date=2023-07-21/part_003.gz
2023-09-22 09:54:45  487.6 MiB data/authors/updated_date=2023-07-21/part_004.gz
2023-09-22 09:54:52  514.3 MiB data/authors/updated_date=2023-07-21/part_005.gz
2023-09-22 09:54:53  514.3 MiB data/authors/updated_date=2023-07-21/part_006.gz
2023-09-22 09:54:53  514.2 MiB data/authors/updated_date=2023-07-21/part_007.gz
2023-09-22 09:54:53  514.2 MiB data/authors/updated_date=2023-07-21/part_008.gz
2023-09-22 09:54:53  514.2 MiB data/authors/updated_date=2023-07-21/part_009.gz
2023-09-22 09:55:00  514.4 MiB data/authors/updated_date=2023-07-21/part_010.gz
2023-09-22 09:55:00  514.2 MiB data/authors/updated_date=2023-07-21

##### Filtering IDs of authors who have been cited (`where cited_by_count != 0`)

In [9]:
con = duckdb.connect('open_alex_authors.duckdb')
con.sql("select id from open_alex_authors.september_2023_snapshot where cited_by_count != 0")

┌──────────────────────────────────┐
│                id                │
│             varchar              │
├──────────────────────────────────┤
│ https://openalex.org/A5056528844 │
│ https://openalex.org/A5019326658 │
│ https://openalex.org/A5056649904 │
│ https://openalex.org/A5082777400 │
│ https://openalex.org/A5016357520 │
│ https://openalex.org/A5061059691 │
│ https://openalex.org/A5024931330 │
│ https://openalex.org/A5010910238 │
│ https://openalex.org/A5032086815 │
│ https://openalex.org/A5053724899 │
│                ·                 │
│                ·                 │
│                ·                 │
│ https://openalex.org/A5072601511 │
│ https://openalex.org/A5072619972 │
│ https://openalex.org/A5087342123 │
│ https://openalex.org/A5087496243 │
│ https://openalex.org/A5003114427 │
│ https://openalex.org/A5009904364 │
│ https://openalex.org/A5020931994 │
│ https://openalex.org/A5031921539 │
│ https://openalex.org/A5044059063 │
│ https://openalex.org/A5045713584 │
├

##### Unnesting author records by year

In [10]:
con.sql("select id as author_id, unnest(counts_by_year, recursive := true) from open_alex_authors.september_2023_snapshot")

┌──────────────────────────────────┬────────────────┬───────┬─────────────┬────────────────┐
│            author_id             │ cited_by_count │ year  │ works_count │ oa_works_count │
│             varchar              │     int64      │ int64 │    int64    │     int64      │
├──────────────────────────────────┼────────────────┼───────┼─────────────┼────────────────┤
│ https://openalex.org/A5056528844 │             60 │  2023 │           0 │              0 │
│ https://openalex.org/A5056528844 │             99 │  2022 │           0 │              0 │
│ https://openalex.org/A5056528844 │            121 │  2021 │           0 │              0 │
│ https://openalex.org/A5056528844 │            129 │  2020 │           0 │              0 │
│ https://openalex.org/A5056528844 │            100 │  2019 │           0 │              0 │
│ https://openalex.org/A5056528844 │            143 │  2018 │           0 │              0 │
│ https://openalex.org/A5056528844 │            144 │  2017 │         

##### Getting the `display_name` from `last_known_institution`

In [11]:
con.sql("select id, display_name, json_extract(last_known_institution, '$.display_name'), works_count, cited_by_count from open_alex_authors.september_2023_snapshot where cited_by_count > 10")

┌──────────────────────┬──────────────────────┬─────────────────────────────────────────┬─────────────┬────────────────┐
│          id          │     display_name     │ json_extract(last_known_institution, …  │ works_count │ cited_by_count │
│       varchar        │       varchar        │                  json                   │    int64    │     int64      │
├──────────────────────┼──────────────────────┼─────────────────────────────────────────┼─────────────┼────────────────┤
│ https://openalex.o…  │ G C Ebers            │ "University of Oxford"                  │         110 │           7912 │
│ https://openalex.o…  │ Earl L. Giller       │ "Center for Telepsychology"             │         111 │           6606 │
│ https://openalex.o…  │ James W. Prichard    │ "The Alfred Hospital"                   │         170 │           6546 │
│ https://openalex.o…  │ Kiang-Teck Yeo       │ "Magee-Womens Hospital"                 │          45 │           6292 │
│ https://openalex.o…  │ Joaquim

In [12]:
con.close()