## OpenAlex API - Fetching Data

In [1]:
# basic
import sys
import os
import json
import requests
from tqdm import tqdm
import ast
import numpy as np
import pandas as pd

# debug
import pdb
from loguru import logger

# custom
from parser import work_parser, author_parser, venue_parser, institution_parser, concept_parser
from scraper import oa_work_scraper, oa_author_scraper
from utils import *

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gautamchoudhary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# 0. OpenAlex API Docs: https://docs.openalex.org/api
# 0. OpenAlex API Tutorials: https://github.com/ourresearch/openalex-api-tutorials
# 1. additional request packages: https://stackoverflow.com/a/18579484
# 2. custom tqdm: https://stackoverflow.com/questions/45808140/using-tqdm-progress-bar-in-a-while-loop 
# 3. loguru tutorial: https://medium.com/analytics-vidhya/a-quick-guide-to-using-loguru-4042dc5437a5
# 3.1 multiple loggers: https://github.com/Delgan/loguru/issues/333

In [3]:
# Publication year being harvested; every path under ./data/ below is keyed by it.
YEAR = 2017

# Per-year output folder for the log and data files written by the cells below.
# make_dirs is not defined in this notebook (presumably provided by the
# `from utils import *` star import).
year_dir = f'./data/{YEAR}'
make_dirs(year_dir)

Directory already exists!


### Works

In [3]:
# Resume point for the works scrape.
# Fresh run: INIT_PAGE, INIT_CURSOR = 0, '*'
INIT_PAGE = 482
INIT_CURSOR = 'IlsyLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzI3NzAzMTMyMDknXSI='

# Works published in YEAR that have an abstract and references, are neither
# paratext nor retracted, are proceedings articles, and are tagged with the
# computer-science concept (https://api.openalex.org/C41008148).
DATA_URL = (
    'https://api.openalex.org/works?'
    f'filter=from_publication_date:{YEAR}-01-01,'
    f'to_publication_date:{YEAR}-12-31,'
    'has_abstract:true,'
    'has_references:true,'
    'is_paratext:false,'
    'is_retracted:false,'
    'type:proceedings-article,'
    'concepts.id:C41008148'
)

# Where the scraper writes its progress log and the parsed records.
works_log_path = f'./data/{YEAR}/log.works.{YEAR}.v1.txt'
works_data_path = f'./data/{YEAR}/data.works.{YEAR}.v1.txt'

oa_work_scraper(
    data_url=DATA_URL,
    parser=work_parser,
    email='dial.19@simon.edu',
    LOG_PATH=works_log_path,
    DATA_PATH=works_data_path,
    PER_PAGE=200,
    INIT_PAGE=INIT_PAGE,
    INIT_CURSOR=INIT_CURSOR,
    dry_run=False,
)

dry run: {'count': 199293, 'db_response_time_ms': 84, 'page': 1, 'per_page': 25}


### Concepts

In [4]:
# Start the concepts scrape from the beginning (page 0, cursor '*').
INIT_PAGE = 0
INIT_CURSOR = '*'

# Concepts at any of the levels 0 through 5.
DATA_URL = 'https://api.openalex.org/concepts?filter=level:0|1|2|3|4|5'

# Where the scraper writes its progress log and the parsed records.
concepts_log_path = f'./data/{YEAR}/log.concepts.{YEAR}.v2.txt'
concepts_data_path = f'./data/{YEAR}/data.concepts.{YEAR}.v2.txt'

# The works scraper is reused here with the concept parser.
oa_work_scraper(
    data_url=DATA_URL,
    parser=concept_parser,
    email='dial.19@simon.edu',
    LOG_PATH=concepts_log_path,
    DATA_PATH=concepts_data_path,
    PER_PAGE=200,
    INIT_PAGE=INIT_PAGE,
    INIT_CURSOR=INIT_CURSOR,
    dry_run=False,
)

dry run: {'count': 65073, 'db_response_time_ms': 38, 'page': 1, 'per_page': 25}


100%|███████████████████████████████████████████████████████████████████████████| 65073/65073 [04:15<00:00, 255.05it/s]

Last log: 
  - page: 326, 
  - cursor: IlswLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvQzkzMDk3OTkyJ10i, 
  - hit_url: https://api.openalex.org/concepts?filter=level:0|1|2|3|4|5&per_page=200&cursor=Ils0NSwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL0MyNzc3NjkwNjY1J10i
Logs saved to file: ./data/2017/log.concepts.2017.v2.txt
Data saved to file: ./data/2017/data.concepts.2017.v2.txt





In [4]:
## to resume a scrape, check the last entry of its log file for {cursor, page, hit_url}

### Reading Data

In [6]:
def read_data(file_name):
    """Load a scraped dump file into a DataFrame.

    Every non-blank line of the file is parsed as a Python literal with
    ``ast.literal_eval``; the list of parsed values is passed through
    ``flatten`` and the result is handed to ``pd.DataFrame.from_dict``.

    Args:
        file_name: path of the UTF-8 text file to read.

    Returns:
        A pandas DataFrame built from the flattened records.

    Raises:
        OSError: if the file cannot be opened.
        ValueError / SyntaxError: if a non-blank line is not a valid literal.
    """
    with open(file_name, 'r', encoding='utf8') as f:
        # Blank lines (e.g. a trailing empty line) are skipped:
        # ast.literal_eval raises SyntaxError on an empty string.
        lines = [line for line in f if line.strip()]

    data = [ast.literal_eval(line) for line in tqdm(lines, desc=f'Reading {file_name}')]

    # NOTE(review): flatten is not defined in this notebook (presumably from
    # the utils star import); it appears to merge the per-line lists of
    # records into one list -- confirm.
    return pd.DataFrame.from_dict(flatten(data))

In [16]:
# Load the raw works dump for YEAR into a DataFrame.
works_dump_path = f'./data/{YEAR}/data.works.{YEAR}.v1.txt'
df = read_data(works_dump_path)

Reading ./data/2017/data.works.2017.v1.txt: 100%|████████████████████████████████████████████████████████████| 997/997 [01:32<00:00, 10.76it/s]


In [17]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(df.shape)
df.head()

(147235, 15)


Unnamed: 0,id,doi,title,type,publication_date,host_venue,open_access_is_oa,open_access_oa_status,authorships,page_count,cited_by_count,concepts,referenced_works,abstract,counts_by_year
0,W2963446712,https://doi.org/10.1109/cvpr.2017.243,Densely Connected Convolutional Networks,proceedings-article,2017-07-21,V4306400194,True,green,"[[A2114281204, [I205783295]], [A2566736780, [I...",-1,16035,"[[C41008148, 0.8167397], [C185798385, 0.806655...","[W1677182931, W1903029394, W2097117768, W21020...",Recent work has shown that convolutional netwo...,"[[2022, 2671], [2021, 4520], [2020, 4269], [20..."
1,W2963150697,https://doi.org/10.1109/iccv.2017.322,Mask R-CNN,proceedings-article,2017-03-20,V4306402512,False,closed,"[[A2164292938, [I4210114444]], [A102740216, [I...",-1,11067,"[[C41008148, 0.60304093], [C154945302, 0.41181...","[W1536680647, W1903029394, W1923115158, W19602...","We present a conceptually simple, flexible, an...","[[2022, 1721], [2021, 3369], [2020, 3040], [20..."
2,W2962793481,https://doi.org/10.1109/iccv.2017.244,Unpaired Image-to-Image Translation Using Cycl...,proceedings-article,2017-10-01,V4306419272,True,green,"[[A2112232458, [I95457486]], [A2604325483, [I9...",-1,9368,"[[C2779757391, 0.87979054], [C115961682, 0.661...","[W845365781, W1530781137, W1896934482, W190302...",Image-to-image translation is a class of visio...,"[[2022, 1278], [2021, 2657], [2020, 2620], [20..."
3,W2963073614,https://doi.org/10.1109/cvpr.2017.632,Image-to-Image Translation with Conditional Ad...,proceedings-article,2017-07-21,V4306417987,True,green,"[[A2077136294, [I95457486]], [A2112232458, [I9...",-1,9104,"[[C2779757391, 0.80383134], [C115961682, 0.735...","[W845365781, W1903029394, W1905829557, W197242...",We investigate conditional adversarial network...,"[[2022, 1118], [2021, 2501], [2020, 2440], [20..."
4,W2963351448,https://doi.org/10.1109/iccv.2017.324,Focal Loss for Dense Object Detection,proceedings-article,2017-08-07,V4306400194,True,green,"[[A2618037599, [I205783295]], [A2642611022, [I...",-1,7809,"[[C94915269, 0.8226304], [C41008148, 0.777915]...","[W1536680647, W1903029394, W2031489346, W20369...",The highest accuracy object detectors to date ...,"[[2022, 1549], [2021, 2795], [2020, 2048], [20..."


In [18]:
# no. of unique authors
authors = np.unique([j 
                     for i in df['authorships'].map(lambda x: [i[0] for i in x]).tolist() 
                        for j in i
                    ])
len(authors)

344607

In [19]:
# institutions
insts = np.unique([k
                   for i in df['authorships'].map(lambda x: [i[1] for i in x]).tolist() 
                       for j in i
                           for k in j
                  ])
len(insts)

13227

In [20]:
# host venues
venues = np.unique(df['host_venue'].tolist())
len(venues)

1854

### Authors

In [17]:
# authors
oa_author_scraper(
    data_url='https://api.openalex.org/authors?filter=openalex:',
    author_ids=authors[270000:],
    parser=author_parser,
    email='keyb.1297@wisco.edu',
    LOG_PATH=f'./data/{YEAR}/log.authors.{YEAR}.v1.part1.txt', 
    DATA_PATH=f'./data/{YEAR}/data.authors.{YEAR}.v1.part1.txt'
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████▉| 74585/74607 [14:01<00:00, 88.59it/s]

Last log: 
  - page: 1492, 
  - hit_url: https://api.openalex.org/authors?filter=openalex:A9985198|A998713140|A9988965|A998909512|A99930888|A999725237|A999784343&per_page=50
Logs saved to file: log.authors.2017.v1.part1.txt
Data saved to file: data.authors.2017.v1.part1.txt





### Venues

In [12]:
# venues
oa_author_scraper(
    data_url='https://api.openalex.org/venues?filter=openalex:',
    author_ids=venues,
    parser=venue_parser,
    email='watcher.1997@cs.iitr.ac.in',
    LOG_PATH=f'./data/{YEAR}/log.venues.{YEAR}.v1.txt', 
    DATA_PATH=f'./data/{YEAR}/data.venues.{YEAR}.v1.txt'
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1854/1854 [00:19<00:00, 97.04it/s]

Last log: 
  - page: 37, 
  - hit_url: https://api.openalex.org/venues?filter=openalex:V89276529|V8962223|V9551102|V96331937&per_page=50
Logs saved to file: log.venues.2017.v1.txt
Data saved to file: data.venues.2017.v1.txt





### Institutions

In [13]:
# institutions
oa_author_scraper(
    data_url='https://api.openalex.org/institutions?filter=openalex:',
    author_ids=insts,
    parser=institution_parser,
    email='watcher.1997@cs.iitr.ac.in',
    LOG_PATH=f'./data/{YEAR}/log.insts.{YEAR}.v1.txt', 
    DATA_PATH=f'./data/{YEAR}/data.insts.{YEAR}.v1.txt'
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 13227/13227 [02:22<00:00, 92.81it/s]

Last log: 
  - page: 264, 
  - hit_url: https://api.openalex.org/institutions?filter=openalex:I99399873|I99412970|I99418890|I99434035|I99464096|I99490713|I99501251|I99507834|I99542240|I99552915|I99555447|I99561568|I99568074|I99601430|I99613584|I99682543|I99686982|I99712911|I99714305|I99729588|I99731219|I99840328|I99861883|I99908691|I99942243|I99944750|I99977706&per_page=50
Logs saved to file: log.insts.2017.v1.txt
Data saved to file: data.insts.2017.v1.txt





### Converting to DataFrame

In [21]:
# Saving to DataFrame for quick loading
#
# Reads the raw dump of every entity type, drops rows with a repeated `id`,
# and writes each table as both CSV and pickle under ./data/{YEAR}/.

# NOTE(review): the authors scrape cell in this notebook writes
# `data.authors.{YEAR}.v1.part1.txt`; presumably the parts are merged into
# `data.authors.{YEAR}.v1.txt` outside this notebook -- confirm.
works = read_data(f'./data/{YEAR}/data.works.{YEAR}.v1.txt')
authors = read_data(f'./data/{YEAR}/data.authors.{YEAR}.v1.txt')
venues = read_data(f'./data/{YEAR}/data.venues.{YEAR}.v1.txt')
insts = read_data(f'./data/{YEAR}/data.insts.{YEAR}.v1.txt')
# The concepts scrape cell writes `data.concepts.{YEAR}.v2.txt`, so that file
# is read here (this used to point at `.v1.txt`, which no visible cell produces).
concepts = read_data(f'./data/{YEAR}/data.concepts.{YEAR}.v2.txt')

# Label -> table; the label is used in the messages and the output file names.
entity_frames = {
    'works': works,
    'authors': authors,
    'venues': venues,
    'insts': insts,
    'concepts': concepts,
}

# remove duplicate records
for entity_name, entity_df in entity_frames.items():
    n_duplicates = len(entity_df) - entity_df['id'].nunique()
    if n_duplicates > 0:
        print (f"duplicate {entity_name}, dropping {n_duplicates} records")
        # In place, so the module-level names above refer to the de-duplicated table.
        entity_df.drop_duplicates(['id'], inplace=True)

# save csv
for entity_name, entity_df in entity_frames.items():
    entity_df.to_csv(f'./data/{YEAR}/{entity_name}.{YEAR}.v2.csv', index=None)

# save pickle
for entity_name, entity_df in entity_frames.items():
    entity_df.to_pickle(f'./data/{YEAR}/{entity_name}.{YEAR}.v2.pkl')

Reading ./data/2017/raw/data.works.2017.v1.txt: 100%|█| 997/997 [01:31<00:00, 10
Reading ./data/2017/raw/data.authors.2017.v1.txt: 100%|█| 6893/6893 [01:24<00:00
Reading ./data/2017/raw/data.venues.2017.v1.txt: 100%|█| 38/38 [00:00<00:00, 186
Reading ./data/2017/raw/data.insts.2017.v1.txt: 100%|█| 265/265 [00:15<00:00, 17


duplicate works, dropping 163 records
