In [1]:
import requests
import pandas as pd

In [2]:
SCANR_API_URL='https://scanr-api.enseignementsup-recherche.gouv.fr/elasticsearch/publications/_search'
SCANR_API_TOKEN='Basic *****'
header = {'Authorization': SCANR_API_TOKEN}

In [82]:
must_block = []
for q in ['athlete']:
    must_block.append( {
                    "query_string": {
                        "fields": ["title.default",
                        "title.fr", "title.en",
                                   "keywords.en", "keywords.fr", "keywords.default",
                                   "domains.label.default", "domains.label.fr", "domains.label.en",
                                   "summary.default", "summary.fr", "summary.en",
                                  "alternativeSummary.default", "alternativeSummary.fr", "alternativeSummary.en"],
                        "query": f'"{q}"'
                    }})

json = {
        "size": 10000,
        "query": {
            "bool": {
                "filter":[
                                   {
                       "terms": {"authors.role.keyword": ["author", "directeurthese"]}
                    },
                    {
                       "terms": {"year": [2018, 2019, 2020, 2021, 2022, 2023]}
                    } 
                ],
                "must": must_block
            }
        },
        "aggs":{
            "idref":{ "terms": {
                "field": "authors.person.id.keyword",
                "size":10
            }
                    }
        }
    }

r = requests.post(SCANR_API_URL, json=json, headers=header).json()
    

In [101]:
r['aggregations']

{'idref': {'doc_count_error_upper_bound': 0,
  'sum_other_doc_count': 4212,
  'buckets': [{'key': 'idref128108630', 'doc_count': 44},
   {'key': 'idref145417093', 'doc_count': 39},
   {'key': 'idref113270283', 'doc_count': 29},
   {'key': 'idref172802024', 'doc_count': 28},
   {'key': 'idref087560917', 'doc_count': 23},
   {'key': 'idref203009983', 'doc_count': 23},
   {'key': 'idref160686962', 'doc_count': 22},
   {'key': 'idref035682485', 'doc_count': 20},
   {'key': 'idref074621866', 'doc_count': 20},
   {'key': 'idref074575902', 'doc_count': 19}]}}

In [102]:
len(r['hits']['hits'])

1518

In [85]:
import networkx as nx

In [99]:
NB_MAX_COAUTHORS = 20
NB_MIN_PUBLICATIONS = 5

G=nx.Graph()

all_edges = {}
    
for e in r['hits']['hits']:
    elt = e['_source']
    authors = elt.get('authors')
    if len(authors) > NB_MAX_COAUTHORS:
        continue
    currentNodes = []
    for aut in elt.get('authors'):
        if 'person' in aut:
            #currentNode = aut['person']['fullName'] + '(' + aut['person']['id'] + ')'
            currentNode = aut['person']['id']
            #print(currentNode)
            #print(1/0)
        elif 'fullName' in aut:
            currentNode = aut['fullName']
        else:
            continue
        currentNodes.append(currentNode)
    for node in currentNodes:
        if node not in all_edges:
            all_edges[node] = {'nb_publis': 0, 'coauthors':{}}
        all_edges[node]['nb_publis'] += 1
        for j_node in currentNodes:
            if node <  j_node:
                if j_node not in  all_edges[node]['coauthors']:
                    all_edges[node]['coauthors'][j_node] = 0
                all_edges[node]['coauthors'][j_node] += 1
                
            
for n in all_edges:
    if all_edges[n]['nb_publis'] < NB_MIN_PUBLICATIONS:
        continue
    for m in all_edges[n]['coauthors']:
        G.add_edge(n, m, weight = all_edges[n]['coauthors'][m])
                

print(G.number_of_nodes())
print(G.number_of_edges())

#print(len(nb_publi_per_node))
#print(len(edges))
nx.write_graphml_lxml(G, 'athelete.graphml')

1042
2173


In [97]:
#all_edges['idref050803476']

In [98]:
all_edges['idref128108630']

{'nb_publis': 45,
 'coauthors': {'idref258075392': 14,
  'idref172802024': 29,
  'idref172801869': 2,
  'idref201723441': 3,
  'idref160177626': 14,
  'idref233667954': 13,
  'idref167362852': 1,
  'idref132431610': 5,
  'idref185611451': 1,
  'idref195768191': 1,
  'idref157221849': 2,
  'idref186344295': 1,
  'idref230609732': 2,
  'idref151243069': 1,
  'idref258716304': 1,
  'idref230622097': 1}}