In [1]:
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
import requests, os
import pandas as pd

In [2]:
# Connection settings are read from the environment; load_dotenv() first loads
# them from a local .env file, which is expected to define ES_HOST, ES_USER and
# ES_PASSWORD.
load_dotenv()

ES_HOST = os.getenv("ES_HOST")
ES_PASSWORD = os.getenv("ES_PASSWORD")
ES_USER = os.getenv("ES_USER")

# Fail early with a clear message: without ES_HOST every URL built below would
# start with the literal text "None/" and only fail later, at request time.
if not ES_HOST:
    raise RuntimeError(
        "ES_HOST is not set: add ES_HOST, ES_USER and ES_PASSWORD to the .env file"
    )

es = Elasticsearch(ES_HOST, http_auth=(ES_USER, ES_PASSWORD))
# Base URL of the publications index, used by the raw HTTP calls below.
INDEX_PUBLICATIONS = f'{ES_HOST}/scanr-publications'

In [3]:
# See complete doc 
# https://www.elastic.co/guide/en/elasticsearch/reference/8.11/paginate-search-results.html#search-after

# To get more than 10k results, it is possible to use the 'search_after' feature.
# This requires all documents to be sorted. To make sure the order remains consistent
# across multiple requests, a PIT (point in time) has to be created, and deleted at the end.

def get_pit(keep_alive='1m'):
    """Open a point in time (PIT) on the publications index and return its id.

    Args:
        keep_alive: value sent as the ``keep_alive`` query parameter of the
            ``_pit`` request (defaults to ``'1m'``).

    Returns:
        The ``id`` field of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (for example wrong credentials or an unknown index).
    """
    r = requests.post(
        f'{INDEX_PUBLICATIONS}/_pit',
        params={'keep_alive': keep_alive},
        auth=(ES_USER, ES_PASSWORD),
    )
    # Surface HTTP errors explicitly instead of a confusing KeyError on 'id'.
    r.raise_for_status()
    return r.json()['id']

def delete_pit(pit):
    """Ask the server to delete the point in time identified by ``pit``.

    Sends a DELETE request to ``{ES_HOST}/_pit`` with the id in the JSON body.
    The response is not inspected and nothing is returned.
    """
    credentials = (ES_USER, ES_PASSWORD)
    payload = {"id": pit}
    requests.delete(f'{ES_HOST}/_pit', json=payload, auth=credentials)

In [4]:
def get_all_results(query, page_size=1000, max_pages=None):
    """Retrieve every document matching ``query`` using PIT + ``search_after``.

    Pages are requested one after another, sorted by ``year`` then
    ``id.keyword``; the sort values of the last hit of a page are sent as
    ``search_after`` for the next page. Paging stops at the first empty page.
    A running count is printed after each page and a summary at the end.

    Args:
        query: the Elasticsearch query clause placed under ``"query"`` in the
            search body.
        page_size: number of hits requested per page (``"size"``).
        max_pages: optional cap on the number of pages fetched. ``None`` (the
            default) means no cap, so results are never silently truncated.

    Returns:
        A list with the ``_source`` of every hit, in sort order.

    Raises:
        requests.HTTPError: if opening the PIT or any search request returns an
            error status.
    """
    pit = get_pit()
    all_data = []
    body = {
        "size": page_size,
        "query": query,
        "sort": [{"year": "asc", "id.keyword": "asc"}],
        "track_total_hits": False,
    }
    try:
        page = 0
        while max_pages is None or page < max_pages:
            body['pit'] = {'id': pit, "keep_alive": "1m"}
            r_tmp = requests.post(f'{ES_HOST}/_search', json=body, auth=(ES_USER, ES_PASSWORD))
            # Fail loudly on HTTP errors instead of a KeyError on 'hits'.
            r_tmp.raise_for_status()
            current_res = r_tmp.json()
            # The PIT id may change between requests: always continue with the
            # most recently received one (also the one to delete at the end).
            pit = current_res.get('pit_id', pit)
            current_hits = current_res['hits']['hits']
            if not current_hits:
                break
            all_data.extend(hit['_source'] for hit in current_hits)
            # Resume the next page right after the last hit of this one.
            body['search_after'] = current_hits[-1]['sort']
            print(len(all_data), end=',')
            page += 1
    finally:
        # Always release the PIT, even when a request failed midway.
        delete_pit(pit)
    print()
    print(f'{len(all_data)} records have been retrieved')
    return all_data

In [5]:
# Example query: every publication flagged with bso_local_affiliation 180070039
# whose year is between 2020 and 2024 (both bounds included).
query = {
    "bool": {
        "must": [
            {
                "term": {"bso_local_affiliations.keyword": "180070039"},
            },
            {
                "range": {
                    "year": {"gte": 2020, "lte": 2024},
                },
            },
        ],
    },
}

In [6]:
# Download every record matching the query, then load the records into a
# pandas DataFrame for inspection and export.
all_data = get_all_results(query)
df = pd.DataFrame(data=all_data)

1000,2000,3000,4000,5000,6000,7000,8000,9000,10000,11000,12000,13000,14000,15000,16000,17000,18000,19000,20000,21000,22000,23000,24000,25000,26000,27000,28000,29000,30000,31000,32000,33000,34000,35000,36000,37000,38000,39000,40000,41000,42000,43000,44000,45000,46000,47000,48000,49000,50000,51000,52000,53000,54000,55000,56000,57000,58000,59000,60000,61000,62000,63000,64000,65000,66000,67000,68000,69000,70000,70036,
70036 records have been retrieved


In [13]:
# Show one randomly picked row as a quick look at the retrieved data.
df.sample(n=1)

Unnamed: 0,id,predict_teds,title,summary,doiUrl,externalIds,year,publicationDate,type,productionType,...,software,co_authors,co_institutions,co_domains,co_countries,projects,co_software,co_projects,acknowledgments,tags
8006,doi10.1146/annurev-cellbio-012820-103850,"[{'label': 'not_ipcc', 'probability': 0.976934...",{'default': 'Shaping Organs: Shared Structural...,{'default': 'Development encapsulates the morp...,http://doi.org/10.1146/annurev-cellbio-012820-...,"[{'type': 'doi', 'id': '10.1146/annurev-cellbi...",2020,2020-10-06T00:00:00,journal-article,publication,...,,[idref08943868X###Olivier Hamant###FR---idref2...,[130008121###FR_Ecole normale supérieure de Ly...,[Q1172449###encapsulates---Q11936292###mechano...,[France---Singapore],,,,,


In [15]:
# Export the DataFrame to a JSON file in 'records' orientation
# (a list with one object per row).
df.to_json(path_or_buf='data_from_scanr.json', orient='records')