# Collecter des données OpenAlex

## Découvrir l'API

Jeter un coup d'œil à la documentation de l'API : https://docs.openalex.org/how-to-use-the-api/api-overview

In [2]:
import yaml

# Load the local credentials file into `config` so that secrets (e.g. the
# OpenAlex token read later as config['token-openalex']) stay out of the notebook.
# safe_load only builds plain Python types (dict, list, str, numbers), which is
# all a credentials mapping needs, and avoids the object construction that
# yaml.load with FullLoader permits. The encoding is pinned so that the result
# does not depend on the platform's default locale.
with open('creds.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

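Exemple de recherche dans l'interface web d'OpenAlex (titre et résumé contenant « computational social science ») : https://openalex.org/works?page=1&filter=title_and_abstract.search:%22computational+social+science%22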

In [5]:
import requests

# Endpoint that lists OpenAlex "works".
url = "https://api.openalex.org/works"

# The token comes from the `config` mapping loaded from creds.yaml in an earlier cell.
headers = {
    "Authorization": f"Bearer {config['token-openalex']}"
}

# First exploratory call: page 1, 25 records per page.
# NOTE(review): the collection loop later in this notebook searches with
# `filter=title_and_abstract.search:...` rather than `q` -- confirm that `q`
# is the parameter intended here.
params = {
    "q": "computational social science",
    "per-page": 25,
    "page": 1,
}

# Without an explicit timeout, requests waits indefinitely on a stalled
# connection, which would freeze the kernel.
r = requests.get(url, headers=headers, params=params, timeout=30)
r.raise_for_status()  # fail loudly on HTTP 4xx/5xx instead of parsing an error body
data = r.json()
#print(data["results"])


In [7]:
# data["results"][0]

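Regarder l'abstract : il est fourni sous forme d'index inversé (champ `abstract_inverted_index`), décrit dans la documentation ci-dessous.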

https://docs.openalex.org/api-entities/works/work-object

Récupérer toutes les données, page par page, avec la pagination par curseur

In [None]:
import requests
import time


# --- Parameters of the collection -------------------------------------------
keyword = "computational social science"
per_page = 25
max_results = None   # set to an int to cap the number of works kept; None = everything
request_timeout = 30  # seconds; without it a stalled connection blocks the kernel forever

url = "https://api.openalex.org/works"

# The token comes from the `config` mapping loaded from creds.yaml in an earlier cell.
headers = {
    "Authorization": f"Bearer {config['token-openalex']}"
}

# Cursor paging: "*" requests the first page, then each response supplies the
# cursor for the next one in meta.next_cursor.
cursor = "*"
all_works = []
n = 0

while True:
    params = {
        # Exact-phrase search in title and abstract (the keyword is quoted).
        "filter": f'title_and_abstract.search:"{keyword}"',
        "per-page": per_page,
        # Only download the fields used afterwards.
        "select": "id,title,abstract_inverted_index,publication_year",
        "cursor": cursor
    }

    r = requests.get(url, headers=headers, params=params, timeout=request_timeout)
    r.raise_for_status()
    data = r.json()

    results = data.get("results", [])
    total = data["meta"]["count"]

    # An empty page means there is nothing left to fetch.
    if not results:
        break

    all_works.extend(results)
    n += len(results)

    print(f"Fetched {n} / {total} (cursor={cursor})")

    if max_results is not None and n >= max_results:
        break

    cursor = data["meta"].get("next_cursor")
    if not cursor:
        break

    # Short pause between calls (presumably to stay within the API rate limits).
    time.sleep(0.2)

# Pages arrive in chunks of `per_page`, so the last page can overshoot the cap:
# trim the list so that `max_results` is an actual upper bound.
if max_results is not None:
    del all_works[max(max_results, 0):]
    n = len(all_works)

print(f"\nDone. Collected {len(all_works)} works total.\n")


In [14]:
import pickle
import os

# Create the output folder if needed; exist_ok avoids an error on re-runs.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Persist the collected works in binary pickle format so they can be
# reloaded later without re-running the collection.
with open("data/all_works.pkl", "wb") as pkl_file:
    pickle.dump(all_works, pkl_file)

## Existence de wrappers

Par exemple : pyalex : https://github.com/J535D165/pyalex

Nécessité d'un token : il faut créer un compte pour l'obtenir.

## Mettre sous la forme d'un dataframe

In [15]:
import pandas as pd

In [27]:
def reconstruct_abstract(inv_index):
    """Rebuild a plain-text abstract from an inverted index.

    Parameters
    ----------
    inv_index : dict[str, list[int]] or None
        Mapping from each word to the list of positions at which it occurs
        in the abstract.

    Returns
    -------
    str or None
        The words joined by single spaces in position order, or ``None``
        when the index is missing, empty, or contains no position at all.

    Notes
    -----
    Positions that no word claims are skipped rather than raising, so an
    index with gaps still yields a string. If two words claim the same
    position, the one that comes later in the mapping wins.
    """
    if not inv_index:
        return None

    # Flip the index: position -> word. A later word overwrites an earlier
    # one at the same position.
    word_at = {
        pos: word
        for word, positions in inv_index.items()
        for pos in positions
    }

    # Every position list was empty: there is nothing to rebuild.
    if not word_at:
        return None

    # Walking the sorted positions restores reading order and ignores gaps.
    return " ".join(word_at[pos] for pos in sorted(word_at))

In [32]:
# Try the reconstruction on one collected work (index 200 is an arbitrary pick).
sample_work = all_works[200]
reconstruct_abstract(sample_work["abstract_inverted_index"])

'Deep learning has achieved a remarkable performance breakthrough in several fields, most notably in speech recognition, natural language processing, and computer vision. In particular, convolutional neural network (CNN) architectures currently produce state-of-the-art performance on a variety of image analysis tasks such as object detection and recognition. Most of deep learning research has so far focused on dealing with 1D, 2D, or 3D Euclidean-structured data such as acoustic signals, images, or videos. Recently, there has been an increasing interest in geometric deep learning, attempting to generalize deep learning methods to non-Euclidean structured data such as graphs and manifolds, with a variety of applications from the domains of network analysis, computational social science, or computer graphics. In this paper, we propose a unified framework allowing to generalize CNN architectures to non-Euclidean domains (graphs and manifolds) and learn local, stationary, and compositional

In [16]:
# Names of the work fields of interest (presumably the columns to retain
# when building the DataFrame -- confirm against the cells that use it).
fields_to_keep = [
    "id",
    "doi",
    "title",
    "display_name",
    "relevance_score",
    "publication_year",
    "publication_date",
    "open_access",
    "abstract",
]

{'id': 'https://openalex.org/W2159397589',
 'doi': 'https://doi.org/10.1126/science.1167742',
 'title': 'Computational Social Science',
 'display_name': 'Computational Social Science',
 'relevance_score': 1368.2181,
 'publication_year': 2009,
 'publication_date': '2009-02-06',
 'ids': {'openalex': 'https://openalex.org/W2159397589',
  'doi': 'https://doi.org/10.1126/science.1167742',
  'mag': '2159397589',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/19197046'},
 'language': 'en',
 'primary_location': {'id': 'doi:10.1126/science.1167742',
  'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1126/science.1167742',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S3880285',
   'display_name': 'Science',
   'issn_l': '0036-8075',
   'issn': ['0036-8075', '1095-9203'],
   'is_oa': False,
   'is_in_doaj': False,
   'is_core': True,
   'host_organization': 'https://openalex.org/P4310315823',
   'host_organization_name': 'American Association for the Advancement of Science',
 

In [26]:
import json

# Serialize the first collected work, then write it out as a standalone
# example file that can be inspected in an editor.
example_payload = json.dumps(all_works[0])
with open("example.json", "w") as example_file:
    example_file.write(example_payload)