In [12]:
import requests

In [13]:
# Search endpoint of the Elasticsearch index used by the query cells of this notebook.
url = "https://cluster.elasticsearch.dataesr.ovh/bsso-publications/_search"
# The Authorization header value is read from a local file rather than hardcoded here.
# The context manager guarantees the file handle is closed once the value is read
# (a bare open(...).read() leaves the handle open until garbage collection).
with open("../../.env_es_bsso", "r") as credentials_file:
    header = {'Authorization': credentials_file.read().strip()}

## 1.4 Les archives

### 1.4.1 Quelle est la dynamique d’ouverture de la santé parmi les archives ?

In [15]:
# Snapshot ("millésime") whose open-access details are queried;
# the same request also applies to the 2018, 2019 and 2020 snapshots.
year = "2021Q1"
oa_prefix = "oa_details." + year

# Keep only the documents whose host type is "repository" for this snapshot.
repository_filter = {"term": {oa_prefix + ".oa_host_type": "repository"}}
# Bucket the matching documents by publication year.
publication_year_agg = {"terms": {"field": "publication_year"}}

params = {
    "size": 0,  # no hits requested, only the aggregation result
    "query": {"bool": {"filter": [repository_filter]}},
    "aggs": {"by_publication_year": publication_year_agg},
}

# The last expression is the cell output: the list of publication-year buckets.
response = requests.get(url, json=params, headers=header)
response.json()['aggregations']['by_publication_year']['buckets']

[{'key': 2020.0, 'doc_count': 28239},
 {'key': 2019.0, 'doc_count': 25598},
 {'key': 2018.0, 'doc_count': 23889},
 {'key': 2017.0, 'doc_count': 22624},
 {'key': 2016.0, 'doc_count': 20711},
 {'key': 2015.0, 'doc_count': 18420},
 {'key': 2014.0, 'doc_count': 14868},
 {'key': 2013.0, 'doc_count': 10051},
 {'key': 2021.0, 'doc_count': 3331},
 {'key': 2012.0, 'doc_count': 303}]

_Question : pourquoi `oa_details` n'est-il pas un simple tableau d'objets avec un champ "millésime" (ou autre), afin de pouvoir faire une agrégation sur ce champ et donc une seule grosse requête plutôt qu'une requête par millésime ?_

### 1.4.2 Quelles archives ouvertes sont les plus utilisées en santé ?

In [17]:
# Snapshot ("millésime") to query; the same request also applies to 2018, 2019 and 2020.
year = "2021Q1"
oa_prefix = "oa_details." + year

# Restrict the search to documents hosted on a repository for this snapshot.
host_type_clause = {"term": {oa_prefix + ".oa_host_type": "repository"}}

# Terms aggregation on the repository names: documents lacking the field are
# counted under the "MISSING" key, and up to 200 buckets are requested.
repository_terms = {
    "field": oa_prefix + ".repositories.keyword",
    "missing": "MISSING",
    "size": 200,
}

params = {
    "size": 0,  # no hits requested, only the aggregation result
    "query": {"bool": {"filter": [host_type_clause]}},
    "aggs": {"by_repository": {"terms": repository_terms}},
}

# The last expression is the cell output: the list of repository buckets.
response = requests.get(url, json=params, headers=header)
response.json()['aggregations']['by_repository']['buckets']

[{'key': 'www.ncbi.nlm.nih.gov', 'doc_count': 120214},
 {'key': 'europepmc.org', 'doc_count': 103256},
 {'key': 'HAL', 'doc_count': 55321},
 {'key': 'pdfs.semanticscholar.org', 'doc_count': 32544},
 {'key': 'arxiv.org', 'doc_count': 8266},
 {'key': 'discovery.ucl.ac.uk', 'doc_count': 2789},
 {'key': 'www.biorxiv.org', 'doc_count': 2309},
 {'key': 'www.pure.ed.ac.uk', 'doc_count': 2219},
 {'key': 'www.research.ed.ac.uk', 'doc_count': 2205},
 {'key': 'spiral.imperial.ac.uk', 'doc_count': 1856},
 {'key': 'escholarship.org', 'doc_count': 1779},
 {'key': 'eprints.whiterose.ac.uk', 'doc_count': 1734},
 {'key': 'univoak.eu', 'doc_count': 1723},
 {'key': 'digital.csic.es', 'doc_count': 1671},
 {'key': 'ora.ox.ac.uk', 'doc_count': 1601},
 {'key': 'archimer.ifremer.fr', 'doc_count': 1538},
 {'key': 'www.zora.uzh.ch', 'doc_count': 1476},
 {'key': 'helda.helsinki.fi', 'doc_count': 1421},
 {'key': 'www.repository.cam.ac.uk', 'doc_count': 1411},
 {'key': 'www.osti.gov', 'doc_count': 1290},
 {'key': 

_Question : à combien de repositories s'arrête-t-on ? Certains repositories ne devraient-ils pas être fusionnés (je n'ai pas trouvé d'exemple en ce sens) ?_

_Erreur : pour certaines publications, il manque le champ `oa_details.<millésime>.repositories.keyword`. Ex. : https://cluster.elasticsearch.dataesr.ovh/bsso-publications/_search?q=_id:1573. `{'key': 'MISSING', 'doc_count': 257}`_

### 1.4.3 Quelle est la dynamique de dépôt par archive ouverte en santé ?

In [22]:
# Snapshot ("millésime") whose open-access details are queried.
year = "2021Q1"
params = {
  "size": 0,  # no hits requested, only the aggregation result
  "query": {
    "bool": {
      # Keep only the documents whose host type is "repository" for this snapshot.
      "filter": [{
        "term": {
          "oa_details." + year + ".oa_host_type": "repository"
        }
      }]
    }
  },
  "aggs": {
    # Named after the field it buckets on (repositories); the previous name
    # "by_discipline" was misleading. The bucket list itself is unchanged.
    "by_repository": {
      "terms": {
        "field": "oa_details." + year + ".repositories.keyword",
        "missing": "MISSING",  # documents lacking the field are counted under this key
        "size": 200
      },
      # Within each repository bucket, split the documents by publication year.
      "aggs": {
        "by_publication_year": {
          "terms": {
            "field": "publication_year"
          }
        }
      }
    }
  }
}

# The last expression is the cell output: one bucket per repository, each
# carrying its nested "by_publication_year" buckets.
requests.get(url, json=params, headers=header).json()['aggregations']['by_repository']['buckets']

[{'key': 'www.ncbi.nlm.nih.gov',
  'doc_count': 120214,
  'by_publication_year': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': 2020.0, 'doc_count': 21565},
    {'key': 2019.0, 'doc_count': 18189},
    {'key': 2018.0, 'doc_count': 17003},
    {'key': 2017.0, 'doc_count': 15852},
    {'key': 2016.0, 'doc_count': 14517},
    {'key': 2015.0, 'doc_count': 13130},
    {'key': 2014.0, 'doc_count': 10366},
    {'key': 2013.0, 'doc_count': 6782},
    {'key': 2021.0, 'doc_count': 2572},
    {'key': 2012.0, 'doc_count': 238}]}},
 {'key': 'europepmc.org',
  'doc_count': 103256,
  'by_publication_year': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': 2019.0, 'doc_count': 16814},
    {'key': 2018.0, 'doc_count': 16657},
    {'key': 2017.0, 'doc_count': 15657},
    {'key': 2016.0, 'doc_count': 14323},
    {'key': 2015.0, 'doc_count': 13007},
    {'key': 2014.0, 'doc_count': 10241},
    {'key': 2020.0, 'doc_count': 9594},


_Question : Cette requête est basée sur le millésime "2021Q1". Est-ce bien la façon dont ce graphe a été créé ?_

### 1.4.4 Quelle place occupe HAL dans la dynamique des archives ouvertes en santé ?

In [21]:
# Snapshot ("millésime") to query; the same request also applies to 2018, 2019 and 2020.
year = "2021Q1"
params = {
  "size": 0,  # no hits requested, only the aggregation result
  "query": {
    "bool": {
      # Keep only the documents whose host type is "repository" for this snapshot.
      "filter": [{
        "term": {
          "oa_details." + year + ".oa_host_type": "repository"
        }
      }]
    }
  },
  "aggs": {
    # Named after the field it buckets on (repositories); the previous name
    # "by_discipline" was misleading. The bucket list itself is unchanged.
    "by_repository": {
      "terms": {
        "field": "oa_details." + year + ".repositories.keyword",
        "missing": "MISSING",  # documents lacking the field are counted under this key
        "size": 200
      }
    }
  }
}

# The last expression is the cell output: the list of repository buckets.
requests.get(url, json=params, headers=header).json()['aggregations']['by_repository']['buckets']

[{'key': 'www.ncbi.nlm.nih.gov', 'doc_count': 120214},
 {'key': 'europepmc.org', 'doc_count': 103256},
 {'key': 'HAL', 'doc_count': 55321},
 {'key': 'pdfs.semanticscholar.org', 'doc_count': 32544},
 {'key': 'arxiv.org', 'doc_count': 8266},
 {'key': 'discovery.ucl.ac.uk', 'doc_count': 2789},
 {'key': 'www.biorxiv.org', 'doc_count': 2309},
 {'key': 'www.pure.ed.ac.uk', 'doc_count': 2219},
 {'key': 'www.research.ed.ac.uk', 'doc_count': 2205},
 {'key': 'spiral.imperial.ac.uk', 'doc_count': 1856}]

_Idée : post-traiter le résultat pour regrouper / sommer tous les repositories autres que 'HAL'._

## 1.5 Les affiliations

### 1.5.1 Taux d’ouverture des publications françaises, dans le domaine de la santé, par millésime tous types d’établissements confondus

In [23]:
# Snapshot ("millésime") to query; the same request also applies to 2018, 2019 and 2020.
year = "2021Q1"
# Wildcard pattern applied to "french_affiliations_types".
# Possible values: *, university, hospital, cnrs, inserm.
# Deliberately not named `filter`, which would shadow the Python builtin
# for the rest of the kernel session.
affiliation_type_filter = '*'
params = {
  "size": 0,  # no hits requested, only the aggregation result
  "query": {
    "bool": {
      "filter": [{
        "wildcard": {
          "french_affiliations_types": affiliation_type_filter
        }
      }]
    }
  },
  "aggs": {
    "by_publication_year": {
      "terms": {
        "field": "publication_year"
      },
      # Within each publication year, split the documents on the snapshot's is_oa flag.
      "aggs": {
        "by_is_oa": {
          "terms": {
            "field": "oa_details." + year + ".is_oa"
          }
        }
      }
    }
  }
}

# The last expression is the cell output: one bucket per publication year,
# each carrying its nested "by_is_oa" buckets.
requests.get(url, json=params, headers=header).json()['aggregations']['by_publication_year']['buckets']

{'by_publication_year': {'doc_count_error_upper_bound': 0,
  'sum_other_doc_count': 0,
  'buckets': [{'key': 2020.0,
    'doc_count': 48983,
    'by_is_oa': {'doc_count_error_upper_bound': 0,
     'sum_other_doc_count': 0,
     'buckets': [{'key': 1, 'key_as_string': 'true', 'doc_count': 28890},
      {'key': 0, 'key_as_string': 'false', 'doc_count': 20093}]}},
   {'key': 2019.0,
    'doc_count': 40770,
    'by_is_oa': {'doc_count_error_upper_bound': 0,
     'sum_other_doc_count': 0,
     'buckets': [{'key': 1, 'key_as_string': 'true', 'doc_count': 25961},
      {'key': 0, 'key_as_string': 'false', 'doc_count': 14809}]}},
   {'key': 2017.0,
    'doc_count': 39868,
    'by_is_oa': {'doc_count_error_upper_bound': 0,
     'sum_other_doc_count': 0,
     'buckets': [{'key': 1, 'key_as_string': 'true', 'doc_count': 24032},
      {'key': 0, 'key_as_string': 'false', 'doc_count': 15836}]}},
   {'key': 2018.0,
    'doc_count': 39819,
    'by_is_oa': {'doc_count_error_upper_bound': 0,
     'sum_

### 1.5.2 Evolution du taux d’ouverture des publications en santé par types d’établissement entre millésimes (Les données ne sont pas encore prêtes)

### 1.5.3 Quel impact le pays d’affiliation des auteurs a-t-il sur le taux d’ouverture en santé ?

In [24]:
# Snapshot ("millésime") to query; the same request also applies to 2018, 2019 and 2020.
year = "2021Q1"
params = {
  "size": 0,  # no hits requested, only the aggregation result
  "aggs": {
    "by_publication_year": {
      "terms": {
        "field": "publication_year"
      },
      "aggs": {
        # Within each publication year, split on the "author_useful_rank_fr" field...
        "by_author_useful_rank_fr": {
          "terms": {
            "field": "author_useful_rank_fr"
          },
          "aggs": {
            # ...then on the snapshot's is_oa flag.
            "by_is_oa": {
              "terms": {
                "field": "oa_details." + year + ".is_oa"
              }
            }
          }
        }
      }
    }
  }
}

# The last expression is the cell output: the whole "aggregations" object.
# The former trailing empty subscript `[]` was a SyntaxError and has been removed.
requests.get(url, json=params, headers=header).json()['aggregations']

{'by_publication_year': {'doc_count_error_upper_bound': 0,
  'sum_other_doc_count': 0,
  'buckets': [{'key': 2020.0,
    'doc_count': 54356,
    'by_author_useful_rank_fr': {'doc_count_error_upper_bound': 0,
     'sum_other_doc_count': 0,
     'buckets': [{'key': 1,
       'key_as_string': 'true',
       'doc_count': 38521,
       'by_is_oa': {'doc_count_error_upper_bound': 0,
        'sum_other_doc_count': 0,
        'buckets': [{'key': 1, 'key_as_string': 'true', 'doc_count': 21427},
         {'key': 0, 'key_as_string': 'false', 'doc_count': 17094}]}},
      {'key': 0,
       'key_as_string': 'false',
       'doc_count': 15835,
       'by_is_oa': {'doc_count_error_upper_bound': 0,
        'sum_other_doc_count': 0,
        'buckets': [{'key': 1, 'key_as_string': 'true', 'doc_count': 10869},
         {'key': 0, 'key_as_string': 'false', 'doc_count': 4966}]}}]}},
   {'key': 2019.0,
    'doc_count': 45324,
    'by_author_useful_rank_fr': {'doc_count_error_upper_bound': 0,
     'sum_other

### 1.5.4 Classement des 10 pays d'affiliation des auteurs de rang utile avec lesquels la France collabore le plus, selon le taux d’accès ouvert de leurs publications en santé 

In [None]:
# Snapshot ("millésime") to query; the same request also applies to 2018, 2019 and 2020.
year = "2021Q1"

# Both term clauses must match: "author_useful_rank_fr" is "true" and the
# document is flagged open access for the chosen snapshot.
useful_rank_clause = {"term": {"author_useful_rank_fr": "true"}}
is_oa_clause = {"term": {"oa_details." + year + ".is_oa": "true"}}

# Bucket on the affiliation countries, leaving out the "FR" key,
# and request up to 200 buckets.
country_terms = {
    "field": "affiliations.countries.keyword",
    "exclude": "FR",
    "size": 200,
}

params = {
    "size": 0,  # no hits requested, only the aggregation result
    "query": {"bool": {"filter": [useful_rank_clause, is_oa_clause]}},
    "aggs": {"by_country": {"terms": country_terms}},
}

# The last expression is the cell output: the whole "aggregations" object.
response = requests.get(url, json=params, headers=header)
response.json()['aggregations']