## OpenAlex API - Fetching Data

In [1]:
# basic
import sys
import os
import json
import requests
from tqdm import tqdm
import ast
import numpy as np
import pandas as pd

# debug
import pdb
from loguru import logger

# custom
from parser import work_parser, author_parser, venue_parser, institution_parser
from scraper import oa_work_scraper, oa_author_scraper

In [2]:
# 0. OpenAlex API Docs: https://docs.openalex.org/api
# 0. OpenAlex API Tutorials: https://github.com/ourresearch/openalex-api-tutorials
# 1. additional request packages: https://stackoverflow.com/a/18579484
# 2. custom tqdm: https://stackoverflow.com/questions/45808140/using-tqdm-progress-bar-in-a-while-loop 
# 3. loguru tutorial: https://medium.com/analytics-vidhya/a-quick-guide-to-using-loguru-4042dc5437a5
# 3.1 multiple loggers: https://github.com/Delgan/loguru/issues/333

### Works

In [3]:
# Starting point for the crawl: page counter 0 and the '*' cursor.
# To resume an interrupted run, swap in the page/cursor pair from the log, e.g.:
# INIT_PAGE, INIT_CURSOR = 298, 'Ils5MCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1cyNjI0MTg2MjY4J10i'
INIT_PAGE = 0
INIT_CURSOR = '*'

# Filter clauses for the /works endpoint, joined with ',' into one `filter=` value.
# concept id - computer science: https://api.openalex.org/C41008148
WORK_FILTERS = [
    'from_publication_date:2012-01-01',
    'to_publication_date:2012-12-31',
    'has_abstract:true',
    'has_references:true',
    'is_paratext:false',
    'is_retracted:false',
    'type:proceedings-article',
    'concepts.id:C41008148',
]
DATA_URL = 'https://api.openalex.org/works?filter=' + ','.join(WORK_FILTERS)

# Run the works scraper over every result page; logs and parsed data go to the
# two text files named below.
oa_work_scraper(
    data_url=DATA_URL,
    parser=work_parser,
    email='watcher.1997@cs.iitr.ac.in',
    LOG_PATH='./log.works.2012.v1.txt',
    DATA_PATH='./data.works.2012.v1.txt',
    PER_PAGE=200,
    INIT_PAGE=INIT_PAGE,
    INIT_CURSOR=INIT_CURSOR,
    dry_run=False,
)

dry run: {'count': 164444, 'db_response_time_ms': 297, 'page': 1, 'per_page': 25}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 164444/164444 [17:26<00:00, 157.11it/s]

Last log: 
  - page: 823, 
  - cursor: IlswLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzk5OTM2NDYwNyddIg==, 
  - hit_url: https://api.openalex.org/works?filter=from_publication_date:2012-01-01,to_publication_date:2012-12-31,has_abstract:true,has_references:true,is_paratext:false,is_retracted:false,type:proceedings-article,concepts.id:C41008148&per_page=200&cursor=IlswLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzc1ODQ3OTc0NiddIg==
Logs saved to file: ./log.works.2012.v1.txt
Data saved to file: ./data.works.2012.v1.txt





In [None]:
## to resume, check the last entry of the log file for {cursor, page, hit_url} and pass the page/cursor as INIT_PAGE/INIT_CURSOR

### Reading Data

In [2]:
DATA_FILE = './data.works.2012.v1.txt'

# Every line of DATA_FILE is parsed as a Python literal; downstream cells treat
# each parsed line as a list of work records.
# ast.literal_eval only accepts literals (no arbitrary code execution), but it
# raises SyntaxError on an empty string, so blank / whitespace-only lines
# (e.g. a trailing newline at the end of the file) are skipped.
with open(DATA_FILE, 'r') as f:
    works = [
        ast.literal_eval(line)
        for line in tqdm(f.readlines())
        if line.strip()
    ]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 823/823 [00:24<00:00, 33.83it/s]


In [3]:
# Flatten the per-line lists held in `works` into a single list of records,
# then build one DataFrame row per record.
works_x = [record for page_records in works for record in page_records]
df = pd.DataFrame(works_x)

print(df.shape)
df.head()

(128097, 15)


Unnamed: 0,id,doi,title,type,publication_date,host_venue,open_access_is_oa,open_access_oa_status,authorships,page_count,cited_by_count,concepts,referenced_works,abstract,counts_by_year
0,W2150066425,https://doi.org/10.1109/cvpr.2012.6248074,Are we ready for autonomous driving? The KITTI...,proceedings-article,2012-06-16,V4306417987,False,closed,"[[A2182886034, [I102335020]], [A2167431718, [I...",-1,6636,"[[C154945302, 0.7994821], [C5799516, 0.7514366...","[W1578985305, W1964814179, W1968799614, W19903...","Today, visual recognition systems are still ra...","[[2022, 570], [2021, 1427], [2020, 1363], [201..."
1,W2158782408,https://doi.org/10.1109/iros.2012.6386109,MuJoCo: A physics engine for model-based control,proceedings-article,2012-12-24,V4306418941,False,closed,"[[A2028747364, [I201448701]], [A2086424281, [I...",-1,2177,"[[C190390380, 0.9146959], [C169590947, 0.71768...","[W154144259, W639693478, W1992116952, W2004116...",We describe a new physics engine tailored to m...,"[[2022, 155], [2021, 616], [2020, 601], [2019,..."
2,W2141125852,https://doi.org/10.1109/cvpr.2012.6248110,Multi-column deep neural networks for image cl...,proceedings-article,2012-06-16,V4306417987,True,green,"[[A1966905718, [I2614128279]], [A2343548174, [...",-1,1970,"[[C190502265, 0.8540422], [C41008148, 0.794758...","[W1601963269, W1968995181, W1987778624, W20268...",Traditional methods of computer vision and mac...,"[[2022, 126], [2021, 237], [2020, 269], [2019,..."
3,W2021851106,https://doi.org/10.1109/iros.2012.6385773,A benchmark for the evaluation of RGB-D SLAM s...,proceedings-article,2012-12-24,V4306418941,True,green,"[[A2973958393, [I62916508]], [A1927103669, [I3...",-1,1901,"[[C31972630, 0.78334284], [C154945302, 0.75840...","[W1514909517, W1556098723, W1656165940, W19652...","In this paper, we present a novel benchmark fo...","[[2022, 142], [2021, 317], [2020, 330], [2019,..."
4,W2100960835,https://doi.org/10.1145/2090236.2090255,Fairness through awareness,proceedings-article,2012-01-08,V4306418069,True,green,"[[A208343995, [I1290206253]], [A2301637896, [I...",-1,1655,"[[C41008148, 0.59133285], [C95623464, 0.542300...","[W178579835, W632002811, W1599656298, W1873763...","We study fairness in classification, where ind...","[[2022, 229], [2021, 453], [2020, 353], [2019,..."


In [4]:
# Number of unique authors.
# Each `authorships` cell is a list of entries whose first element is the
# author id; collect that element from every entry of every work and de-duplicate.
authors = np.unique([
    authorship[0]
    for work_authorships in df['authorships']
    for authorship in work_authorships
])
len(authors)

282636

In [5]:
# Number of unique institutions.
# The second element of every authorship entry is a list of institution ids;
# gather all of them across all works and de-duplicate.
insts = np.unique([
    inst_id
    for work_authorships in df['authorships']
    for authorship in work_authorships
    for inst_id in authorship[1]
])
len(insts)

12015

In [6]:
# Number of unique host venues: de-duplicate the `host_venue` column values.
host_venue_ids = df['host_venue'].tolist()
venues = np.unique(host_venue_ids)
len(venues)

1713

### Authors

In [13]:
# Authors: fetch records for the unique author ids collected from the works.
# This run only covers ids from offset 190000 to the end of `authors`
# (its log/data files are labelled "part3").
AUTHORS_START = 190000
author_batch = authors[AUTHORS_START:]

oa_author_scraper(
    data_url='https://api.openalex.org/authors?filter=openalex:',
    author_ids=author_batch,
    parser=author_parser,
    email='laptop.1997@wisconsin.edu',
    LOG_PATH='log.authors.2012.v3.part3.txt',
    DATA_PATH='data.authors.2012.v3.part3.txt',
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████▉| 92588/92636 [18:08<00:00, 85.08it/s]

Last log: 
  - page: 1852, 
  - hit_url: https://api.openalex.org/authors?filter=openalex:A99512508|A99531987|A99533699|A995615555|A99562039|A995647421|A995685755|A995843601|A996043393|A996076943|A99634055|A99643244|A996611473|A997073975|A99770489|A997748992|A997878525|A997880244|A998162854|A99825556|A998286317|A99843963|A9985198|A99871570|A99883087|A998882806|A998909512|A99906037|A999226360|A999235383|A999507011|A999665357|A99967126|A99975585|A999838833|A9999165&per_page=50
Logs saved to file: log.authors.2012.v3.part3.txt
Data saved to file: data.authors.2012.v3.part3.txt





### Venues

In [11]:
# Venues: the author scraper is reused here, so the venue ids are handed over
# through its `author_ids` argument and parsed with `venue_parser`.
VENUES_URL = 'https://api.openalex.org/venues?filter=openalex:'

oa_author_scraper(
    data_url=VENUES_URL,
    author_ids=venues,
    parser=venue_parser,
    email='watcher.1997@cs.iitr.ac.in',
    LOG_PATH='log.venues.2012.v1.txt',
    DATA_PATH='data.venues.2012.v1.txt',
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1713/1713 [00:18<00:00, 91.71it/s]

Last log: 
  - page: 34, 
  - hit_url: https://api.openalex.org/venues?filter=openalex:V4306534857|V4306534858|V4306534922|V43295729|V44569693|V46576051|V49974971|V50690046|V5240358|V61861333|V80171551|V897311980|V998850158&per_page=50
Logs saved to file: log.venues.2012.v1.txt
Data saved to file: data.venues.2012.v1.txt





### Institutions

In [12]:
# Institutions: the author scraper is reused here, so the institution ids are
# handed over through its `author_ids` argument and parsed with
# `institution_parser`.
INSTITUTIONS_URL = 'https://api.openalex.org/institutions?filter=openalex:'

oa_author_scraper(
    data_url=INSTITUTIONS_URL,
    author_ids=insts,
    parser=institution_parser,
    email='watcher.1997@cs.iitr.ac.in',
    LOG_PATH='log.insts.2012.v1.txt',
    DATA_PATH='data.insts.2012.v1.txt',
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12015/12015 [02:15<00:00, 88.78it/s]

Last log: 
  - page: 240, 
  - hit_url: https://api.openalex.org/institutions?filter=openalex:I99568074|I99601430|I99613584|I99677737|I99682543|I99686982|I99712911|I99729588|I99731219|I99861883|I99908691|I99939339|I99942243|I99977706|I99978038&per_page=50
Logs saved to file: log.insts.2012.v1.txt
Data saved to file: data.insts.2012.v1.txt



