In [1]:
import csv
import json
import requests

from collections import OrderedDict
from itertools import combinations

In [3]:
# Elasticsearch search endpoint queried by this notebook
url = "https://cluster.elasticsearch.dataesr.ovh/bso-publications-staging/_search"
# The Authorization header value is read from a local file kept outside the notebook.
# The context manager closes the file handle as soon as the value has been read.
with open("../../.env_es_bsso", "r") as credentials_file:
    headers = { "Authorization": credentials_file.read().strip() }
# Page size requested from the composite aggregation
AGGREGATION_SIZE = 500

In [3]:
# Bool query with an empty filter list: no filter clause is applied.
# Add clauses to the "filter" list to restrict the search.
query = {"bool": {"filter": []}}

In [4]:
# Use the Composite aggregation to page through every (ISSN, publisher) pair
# https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-composite-aggregation.html
# Single source of truth for the aggregation name: it is used both to declare
# the aggregation and to read it back from the response.
AGGREGATION_NAME = "by_issn"
json_query = {
  "size": 0,
  "query": query,
  "aggs": {
    AGGREGATION_NAME: {
      "composite": {
        "size": AGGREGATION_SIZE,
        "sources": [
            { "issn": { "terms": { "field": "journal_issn_l.keyword" } } },
            { "publisher": { "terms": { "field": "publisher.keyword" } } },
        ],
      },
    }
  }
}

results = []
while True:
    http_response = requests.post(url, json=json_query, headers=headers)
    # Fail loudly on HTTP errors instead of silently collecting zero buckets
    http_response.raise_for_status()
    response = http_response.json().get('aggregations', {}).get(AGGREGATION_NAME, {})
    results += response.get('buckets', [])
    after = response.get('after_key')
    if not after:
        # No "after_key" in the response: there is no further page to request
        break
    # Ask for the page that follows the last bucket already received
    json_query["aggs"][AGGREGATION_NAME]["composite"]["after"] = after

print(len(results))
# We have 38.266 different ISSN x publishers

38266


In [5]:
# Group the aggregation buckets by ISSN: each ISSN maps to the list of
# publishers found with it, in bucket order.
issns = {}
for bucket in results:
    bucket_key = bucket.get('key', {})
    issns.setdefault(bucket_key.get('issn'), []).append(bucket_key.get('publisher'))
print(len(issns))
# We have 29.766 different ISSNs

29766


In [6]:
# Group the ISSNs by their exact set of publishers.
# The id of a group is the lowercased concatenation of its publisher names.
# NOTE(review): names are joined without a separator, so two different lists of
# names could in principle produce the same id - confirm this is acceptable.
publishers = {}
for issn, names in issns.items():
    publisher_id = ''.join(names).lower()
    entry = publishers.setdefault(publisher_id, { "names": names, "issns": [] })
    entry["issns"].append(issn)

# Store the number of ISSNs attached to each group of publishers
for entry in publishers.values():
    entry["count"] = len(entry.get("issns", []))

with open('publishers.json', 'w') as output_file:
    json.dump(publishers, output_file, indent=4)

print(len(publishers))
# We have 5.466 different publishers or multiple publishers

5466


In [7]:
# Keep only the groups made of more than one publisher name
multiple_publishers = {
    publisher_id: entry
    for publisher_id, entry in publishers.items()
    if len(entry.get("names")) > 1
}

with open('multiple_publishers.json', 'w') as output_file:
    json.dump(multiple_publishers, output_file, indent=4)

print(len(multiple_publishers))
# We have 1.363 combination of multiple publishers

1363


In [8]:
# Display the distribution: for each number of publisher names in a group,
# count how many groups have that size.
lengths = {}
for entry in multiple_publishers.values():
    size = len(entry.get("names", []))
    lengths[size] = lengths.get(size, 0) + 1

OrderedDict(sorted(lengths.items()))

OrderedDict([(2, 1097),
             (3, 195),
             (4, 42),
             (5, 17),
             (6, 5),
             (7, 3),
             (8, 1),
             (9, 1),
             (10, 1),
             (13, 1)])

In [9]:
# Export into CSV: one row per combination of publishers, followed by the
# ISSNs they share and the number of those ISSNs.
# The number of publisher columns follows the longest combination found, so a
# row can never be wider than the header.
max_publishers = max((len(entry.get('names')) for entry in multiple_publishers.values()), default=0)
header = ["publisher_{:02d}".format(index) for index in range(1, max_publishers + 1)]
rows = [header + ["issns", "number_of_issns"]]
for entry in multiple_publishers.values():
    # Use a dedicated name: rebinding `publishers` here would overwrite the
    # dict built earlier and break a re-run of the cells that read it.
    names = entry.get('names')
    # Pad with empty strings so every row has the same number of columns
    row = names + [''] * (max_publishers - len(names))
    row.append(', '.join(entry.get('issns')))
    row.append(entry.get('count'))
    rows.append(row)


# newline='' lets the csv module control the line endings itself
with open('multiple_publishers.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(rows)

In [10]:
# Create pairs of publishers: every 2-combination of the publishers attached
# to the same ISSN, with the list and the number of ISSNs they share.
pairs_by_id = {}
# `issn_publishers` is a dedicated name so that the `publishers` dict built
# earlier is not overwritten by this cell.
for issn, issn_publishers in issns.items():
    if len(issn_publishers) > 1:
        for pair_publisher in combinations(issn_publishers, 2):
            # The id of a pair is the lowercased concatenation of its two names
            pair_publishers_id = ''.join(pair_publisher).lower()
            if pair_publishers_id not in pairs_by_id:
                pairs_by_id[pair_publishers_id] = { "publisher_01": pair_publisher[0], "publisher_02": pair_publisher[1], "issns": [], "count": 0 }
            pairs_by_id[pair_publishers_id]["issns"].append(issn)
            pairs_by_id[pair_publishers_id]["count"] += 1

results = list(pairs_by_id.values())
# Explicit column names (same order as the dicts above), so the export also
# works when no pair was found.
keys = ["publisher_01", "publisher_02", "issns", "count"]

# newline='' lets the csv module control the line endings itself
with open('pair_publishers.csv', 'w', newline='') as csv_file:
    dict_writer = csv.DictWriter(csv_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(results)

print(len(results))
# We have 1.593 pairs of publishers

1593


In [11]:
# Keep only the pairs of publishers that share at least 5 ISSNs
MIN_SHARED_ISSNS = 5
results = [result for result in results if result.get("count") >= MIN_SHARED_ISSNS]
# Explicit column names, so the export also works when no pair passes the filter
keys = ["publisher_01", "publisher_02", "issns", "count"]

# newline='' lets the csv module control the line endings itself
with open('filtered_pair_publishers.csv', 'w', newline='') as csv_file:
    dict_writer = csv.DictWriter(csv_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(results)

print(len(results))
# We have 182 pairs of publishers that have at least 5 ISSNs in common

182
