In [1]:
import os

import pandas as pd
import requests

In [2]:
# Endpoint of the scanR Elasticsearch "publications" index used by the search below.
SCANR_API_URL = 'https://scanr-api.enseignementsup-recherche.gouv.fr/elasticsearch/publications/_search'
# Credentials should not live in the notebook: take the full Authorization header
# value (e.g. "Basic <base64>") from the SCANR_API_TOKEN environment variable when
# it is set, and fall back to the redacted placeholder otherwise.
SCANR_API_TOKEN = os.environ.get('SCANR_API_TOKEN', 'Basic *****')
# HTTP headers sent with every request to the API.
header = {'Authorization': SCANR_API_TOKEN}

In [82]:
# Build the search request and run it against the API.
#
# Each keyword becomes one exact-phrase `query_string` clause (the keyword is
# wrapped in double quotes) matched against the title, keyword, domain-label
# and summary fields listed below.
search_fields = [
    "title.default", "title.fr", "title.en",
    "keywords.en", "keywords.fr", "keywords.default",
    "domains.label.default", "domains.label.fr", "domains.label.en",
    "summary.default", "summary.fr", "summary.en",
    "alternativeSummary.default", "alternativeSummary.fr", "alternativeSummary.en",
]
must_block = [
    {"query_string": {"fields": search_fields, "query": f'"{q}"'}}
    for q in ['athlete']
]

# NOTE: the name `json` shadows the standard-library module of the same name;
# it is kept because a later cell displays this variable.
json = {
    "size": 10000,
    "query": {
        "bool": {
            # Filters restrict the result set without affecting scoring.
            "filter": [
                {"terms": {"authors.role.keyword": ["author", "directeurthese"]}},
                {"terms": {"year": [2018, 2019, 2020, 2021, 2022, 2023]}},
            ],
            "must": must_block,
        }
    },
    # Top 10 most frequent person identifiers among the matching publications.
    "aggs": {
        "idref": {
            "terms": {
                "field": "authors.person.id.keyword",
                "size": 10,
            }
        }
    },
}

# `requests` applies no timeout by default, so set one to avoid hanging forever,
# and fail loudly on HTTP errors (e.g. a rejected token) instead of parsing an
# error payload as if it were search results.
response = requests.post(SCANR_API_URL, json=json, headers=header, timeout=60)
response.raise_for_status()
r = response.json()
    

In [104]:
# Display the request body built for the search (size, filters, must clauses, aggregation).
json

{'size': 10000,
 'query': {'bool': {'filter': [{'terms': {'authors.role.keyword': ['author',
       'directeurthese']}},
    {'terms': {'year': [2018, 2019, 2020, 2021, 2022, 2023]}}],
   'must': [{'query_string': {'fields': ['title.default',
       'title.fr',
       'title.en',
       'keywords.en',
       'keywords.fr',
       'keywords.default',
       'domains.label.default',
       'domains.label.fr',
       'domains.label.en',
       'summary.default',
       'summary.fr',
       'summary.en',
       'alternativeSummary.default',
       'alternativeSummary.fr',
       'alternativeSummary.en'],
      'query': '"athlete"'}}]}},
 'aggs': {'idref': {'terms': {'field': 'authors.person.id.keyword',
    'size': 10}}}}

In [101]:
# Inspect the aggregations returned by the search; the request asks for a
# 10-bucket terms aggregation named 'idref' on `authors.person.id.keyword`.
r['aggregations']

{'idref': {'doc_count_error_upper_bound': 0,
  'sum_other_doc_count': 4212,
  'buckets': [{'key': 'idref128108630', 'doc_count': 44},
   {'key': 'idref145417093', 'doc_count': 39},
   {'key': 'idref113270283', 'doc_count': 29},
   {'key': 'idref172802024', 'doc_count': 28},
   {'key': 'idref087560917', 'doc_count': 23},
   {'key': 'idref203009983', 'doc_count': 23},
   {'key': 'idref160686962', 'doc_count': 22},
   {'key': 'idref035682485', 'doc_count': 20},
   {'key': 'idref074621866', 'doc_count': 20},
   {'key': 'idref074575902', 'doc_count': 19}]}}

In [102]:
# Number of hit documents actually returned in the response (the request caps this at `size`).
len(r['hits']['hits'])

1518

In [85]:
import networkx as nx

In [120]:
# Build a co-authorship graph from the search hits and export it as GraphML.
#
# Nodes are author display names: an author carrying a `person` record is
# labelled with the first `fullName` seen for that person id, any other author
# with its own `fullName`. Authors with neither are ignored.

# person id -> first full name encountered for that id
fullNameIdref = {}

# Publications listing more authors than this are skipped entirely.
NB_MAX_COAUTHORS = 20
# Authors with fewer publications than this are left out of the graph.
NB_MIN_PUBLICATIONS = 5

nb_removed = 0

G = nx.Graph()

# label -> {'nb_publis': publication count,
#           'coauthors': {other label: number of shared publications}}
# Each pair is recorded once, under the lexicographically smaller label.
all_edges = {}

for e in r['hits']['hits']:
    elt = e['_source']
    # A record without an 'authors' field contributes nothing instead of
    # raising on len(None).
    authors = elt.get('authors') or []
    if len(authors) > NB_MAX_COAUTHORS:
        print('remove publi ' + str(elt['id']))
        nb_removed += 1
        continue

    currentNodes = []
    for aut in authors:
        if 'person' in aut:
            idref = aut['person']['id']
            if idref not in fullNameIdref:
                fullNameIdref[idref] = aut['fullName']
            currentNode = fullNameIdref[idref]
        elif 'fullName' in aut:
            currentNode = aut['fullName']
        else:
            continue
        currentNodes.append(currentNode)

    # An author listed more than once on the same record must count as a single
    # publication (and a single co-authorship); dict.fromkeys keeps first-seen order.
    currentNodes = list(dict.fromkeys(currentNodes))

    for node in currentNodes:
        entry = all_edges.setdefault(node, {'nb_publis': 0, 'coauthors': {}})
        entry['nb_publis'] += 1
        for j_node in currentNodes:
            # Strict ordering stores each unordered pair exactly once.
            if node < j_node:
                entry['coauthors'][j_node] = entry['coauthors'].get(j_node, 0) + 1

# Apply the publication threshold to BOTH endpoints of every edge. Checking only
# the node that owns the edge entry would let add_edge() implicitly create
# below-threshold nodes (with no `size` attribute), and would make the outcome
# depend on the alphabetical order of the two names.
kept_nodes = {
    label for label, info in all_edges.items()
    if info['nb_publis'] >= NB_MIN_PUBLICATIONS
}

for n in all_edges:
    if n in kept_nodes:
        G.add_node(n, size=all_edges[n]['nb_publis'])

for n in all_edges:
    if n not in kept_nodes:
        continue
    for m, shared in all_edges[n]['coauthors'].items():
        if m in kept_nodes:
            G.add_edge(n, m, weight=shared)

print()
print('removed '+str(nb_removed))
print('nb nodes = '+str(G.number_of_nodes()))
print('nb edges = '+str(G.number_of_edges()))

# NOTE(review): the output file name is spelled 'athelete'; left unchanged in
# case the file is consumed elsewhere under that name.
nx.write_graphml_lxml(G, 'athelete.graphml')

remove publi doi10.5114/biolsport.2022.117576
remove publi doi10.1007/s40279-022-01776-y
remove publi doi10.1123/ijspp.2021-0543
remove publi doi10.1007/s40279-021-01573-z
remove publi doi10.1186/s40798-022-00469-0
remove publi doi10.1177/2047487319834852
remove publi doi10.3389/fnut.2022.925092
remove publi doi10.1136/bjsports-2022-105567
remove publi doi10.1007/s40279-021-01601-y
remove publi doi10.1136/bmjsem-2021-001273
remove publi doi10.1136/bjsports-2022-105759
remove publi doi10.1371/journal.pone.0243354
remove publi doi10.3389/fphys.2022.904778
remove publi doi10.1007/s40279-021-01502-0
remove publi doi10.1136/bjsports-2021-104087
remove publi doi10.1371/journal.pone.0257719
remove publi doi10.1161/circresaha.119.316386
remove publi doi10.1186/s12955-021-01825-6
remove publi doi10.1016/j.hfc.2018.03.010
remove publi doi10.1016/j.phrs.2020.104719
remove publi doi10.1016/j.ijcha.2021.100790
remove publi doi10.1093/eurheartj/ehy730
remove publi doi10.1097/ee9.0000000000000166
rem

In [112]:
#aut

In [113]:
# Look up one author's entry by display name: publication count plus the
# co-authors recorded under this name with their shared-publication counts.
all_edges['François Carré']

{'nb_publis': 18,
 'coauthors': {'Frédéric Schnell': 9,
  'Nathalie Behar': 1,
  'Solène Le Douairon Lahaye': 3,
  'Gaëlle Kervio': 1,
  'Vincent Menard': 1,
  'Thibault Lachard': 1,
  'Guy Carrault': 2,
  'Mats Borjesson': 2,
  'Mikael Dellborg': 2,
  'Josef Niebauer': 2,
  'Martin Halle': 2,
  'Paolo Emilio Adami': 1,
  'Stefano Caselli': 2,
  'Michael Papadakis': 2,
  'Hanne Rasmusen': 2,
  'Luis Serratosa': 2,
  'Sanjay Sharma': 2,
  'T. Fourme': 1,
  'P. Chevalier': 1,
  'P. De Groote': 1,
  'I. Denjoy': 1,
  'Stéphane Doutreleau': 2,
  'G. Habib': 1,
  'N. Mansencal': 1,
  'P. Maury': 1,
  'V. Probst': 1,
  'P. Reant': 1,
  'J. Trochu': 1,
  'L. Uzan': 1,
  'P. Charron': 1,
  'Jean-Claude Chatard': 1,
  'Richard Donnadieu': 1,
  'Jean-Paul Grangeon': 1,
  'Jean-Marie Sabot': 1,
  'Régis Dacquin': 1,
  'François-Xavier Raby': 1,
  'Gérard Papouin': 1,
  'Satu Viali': 1,
  'Frédéric Roche': 1,
  'Karl Isaaz': 1,
  'Jean-Claude Barthélémy': 1,
  'Mathias Poussel': 2,
  'Laure Joly':

In [97]:
#all_edges['idref050803476']

- spatialisation ForceAtlas2 (avec / sans LinLog ?)
- taille des nœuds = nombre de publications
- modularity class ou inférence statistique pour colorer les nœuds


In [98]:
# NOTE(review): this cell was executed (In [98]) before the current version of
# the graph-building cell (In [120]), which keys `all_edges` by author full
# name rather than by idref. On a fresh Restart & Run All this lookup will
# presumably raise KeyError — confirm, then look the author up by name or remove the cell.
all_edges['idref128108630']

{'nb_publis': 45,
 'coauthors': {'idref258075392': 14,
  'idref172802024': 29,
  'idref172801869': 2,
  'idref201723441': 3,
  'idref160177626': 14,
  'idref233667954': 13,
  'idref167362852': 1,
  'idref132431610': 5,
  'idref185611451': 1,
  'idref195768191': 1,
  'idref157221849': 2,
  'idref186344295': 1,
  'idref230609732': 2,
  'idref151243069': 1,
  'idref258716304': 1,
  'idref230622097': 1}}