In [None]:
import json
import re
import subprocess

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
import numpy as np
import matplotlib.ticker as mticker

from urllib.request import urlopen, HTTPError
import requests
from wordcloud import WordCloud, STOPWORDS

In [None]:
def make_wordcloud(df, column_name, ftitle):
    '''
    Build a word cloud from one text column and save it as a PNG.

    df: dataframe holding the text
    column_name: column whose entries are joined (space-separated) into one text
    ftitle: file name stem; the image is written to ./figs/{ftitle}.png

    Returns nothing. The figure is not closed here.
    '''

    # Missing entries (None/NaN) cannot be joined as strings, so drop them
    # instead of letting ' '.join raise a TypeError.
    text = ' '.join(df[column_name].dropna().astype(str))

    # Words excluded in addition to the WordCloud default stopwords.
    exclude = [
        'using',
        'CMS',
        'open',
        'data',
        'collider',
        'event',
        'TeV',
        'analysis',
        'based',
        'LHC',
        'particle',
        'end',
        'high',
        'energy',
        'physics',
        'new',
        'experiment'
    ]

    stopwords = set(STOPWORDS).union(exclude)

    wc = WordCloud(
        background_color='white',
        stopwords=stopwords,
        width=600,
        height=400
    ).generate(text)

    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')

    plt.savefig(f'./figs/{ftitle}.png', dpi=200)

In [None]:
def handle_doi_queries(dois):
    '''
    Look up each DOI with the doi.org handle API and collect record ids.

    dois: iterable of DOI strings

    Returns a list with the last path segment of the URL found in the second
    entry of the response's "values" list, one item per DOI that resolved.
    DOIs that cannot be resolved are reported with print and skipped.
    '''

    doi_url = 'https://doi.org/api/handles/'
    recids = []

    for doi in dois:
        # Some of the DOIs referenced are invalid, either having an extra ":"
        # appended to the end or having the year in parentheses appended.
        # In the former case they are duplicates and we can skip.
        # In the latter clean them up.
        if ':' in doi:
            continue

        # Strip only a trailing "(YYYY)"; a DOI that merely contains
        # parentheses elsewhere is left untouched.
        doi = re.sub(r'\s*\(\d{4}\)\s*$', '', doi)

        try:
            # The timeout keeps one unresponsive request from hanging the cell.
            response = requests.get(f'{doi_url}{doi}', timeout=30).json()

            url = response['values'][1]['data']['value']
            recid = url.split('/')[-1]

            recids.append(recid)

        except (
            KeyError,
            IndexError,
            TypeError,
            AttributeError,
            ValueError,
            requests.RequestException
        ):
            # Unexpected response shape, invalid JSON or a network failure:
            # report the DOI and carry on with the rest.
            print('Error '+doi)

    return recids
    
def resolve_dois(df):
    '''
    Add a "codp_recids" column to df (in place) holding, for every row,
    the record ids resolved from that row's "dois_referenced" list.

    Background: the references returned by the Inspire API often carry
    dataset information in "misc", frequently including the CODP records,
    but in several cases they do not. The DOIs are always available, so
    the DOI API

    https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation

    is used to resolve the CODP record URL.

    The same information could be fetched with the CODP CLI
    (probably the better option, but this is what is used for now).
    '''
    df['codp_recids'] = df['dois_referenced'].map(handle_doi_queries)

In [None]:
def process_dataframe(input_json):
    '''
    Load paper records from JSON, drop papers by excluded authors and
    order the remainder by date.

    input_json: path (or file-like object) passed to pd.read_json; the
        records need "authors" (list of name strings) and "date" columns

    Returns a new dataframe sorted by date with a fresh 0..n-1 index, an
    "index" column holding that same running number, a datetime "date"
    column and an "exclude" column (an empty list on every kept row).
    The row count is printed before and after filtering.
    '''

    exclude_names = [
        'McCauley',
        'Bellis',
        'Lange',
        'Tibor',
        'Šimko',  # was mis-encoded as 'Å imko' (UTF-8 bytes read as Latin-1)
        'Carerra',  # NOTE(review): possibly a misspelling of 'Carrera' -- confirm
        'Geiser',
        'Lassila-Perini',
        'Dallmeier-Tiessen',
        'Calderon',
        'Rao',
        'Socher',
        'Herterich'
    ]

    df = pd.read_json(input_json)

    def matched_names(authors):
        # A missing author list (None/NaN) is treated as "no authors".
        if not isinstance(authors, (list, tuple)):
            authors = []
        return [
            name for name in exclude_names
            if any(name in author for author in authors)
        ]

    # New column: the names from exclude_names that occur as a substring
    # of any entry in the row's author list.
    df['exclude'] = df['authors'].map(matched_names)

    print(df.shape[0])

    # Keep only the rows without any excluded name.
    df = df[df['exclude'].map(len) == 0]

    print(df.shape[0])

    # Order by date, renumber the index, then expose that numbering as an
    # "index" column. Chaining returns new frames, so the filtered slice
    # above is never modified in place.
    df = (
        df.sort_values(by='date')
        .reset_index(drop=True)
        .reset_index()
    )

    # Format date
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

    return df

In [None]:
def make_histogram(df, date_range, nbins, title, ftitle):
    '''
    Draw a bar histogram of the years in df["date"] on the current
    pyplot figure and save it to ./figs/{ftitle}.png.

    df: frame with a "date" column convertible to datetime64[Y]
    date_range: (low, high) passed to np.histogram as range, expressed as
        the integer form of datetime64[Y] values
    nbins: number of bins
    title: plot title
    ftitle: file name stem of the saved PNG
    '''

    # Integer representation of the year of every paper.
    years = df['date'].to_numpy(dtype='datetime64[Y]').astype(int)

    counts, edges = np.histogram(years, bins=nbins, range=date_range)

    # Turn the (float) bin edges back into year-resolution dates.
    edges = edges.astype(int).astype('datetime64[Y]')

    plt.bar(edges[:-1], counts, width=np.diff(edges), ec='black', align='edge')

    axes = plt.gca()
    axes.set_xticks(edges)
    axes.set_xticklabels(edges, rotation=45)

    plt.title(title)

    plt.tight_layout()
    plt.savefig(f'./figs/{ftitle}.png', dpi=200)

In [None]:
# Dates (YYYY-MM-DD) that add_release_dates marks with arrows on a plot;
# presumably the CMS open data release dates -- confirm.
data_releases = [
    '2014-11-20', '2016-04-22', '2017-12-20', '2019-07-18', '2020-08-27',
    '2020-12-21', '2021-12-20', '2022-12-05', '2023-09-18', '2024-04-02',
]

These papers reference a CMS open data DOI

In [None]:
# Load the Inspire records; process_dataframe prints the row count
# before and after the author filter.
idf = process_dataframe('data/inspire.json')

In [None]:
# Adds the 'codp_recids' column to idf in place. This makes one doi.org
# request per referenced DOI, so the cell needs network access.
resolve_dois(idf)

In [None]:
# Today's date as YYYY-MM-DD; used in the figure titles.
date_generated = dt.date.today().isoformat()
print(date_generated)

In [None]:
# Preview the first rows of the processed Inspire frame.
idf.head()

In [None]:
# Preview the last rows (the latest dates, since the frame is sorted by date).
idf.tail()

In [None]:
def add_release_dates(
    ax,
    releases=None,
    xlim=(dt.date(2014, 1, 1), dt.date(2026, 1, 1)),
    arrow_tip_y=-3,
    arrow_tail_y=-2
):
    '''
    Mark release dates on a date axis with one arrow per release.

    ax: matplotlib axes whose x axis holds dates
    releases: iterable of 'YYYY-MM-DD' strings; defaults to the
        module-level data_releases list
    xlim: (first, last) date applied as the x-axis limits
    arrow_tip_y: y coordinate the arrow points at
    arrow_tail_y: y coordinate the arrow starts from

    The defaults reproduce the values that used to be hard-coded here, so
    add_release_dates(ax) behaves as before. Note that a later cell in this
    notebook defines a function with the same name.
    '''

    if releases is None:
        releases = data_releases

    ax.set_xlim(list(xlim))

    for dr in releases:

        date = dt.datetime.strptime(dr, '%Y-%m-%d').date()

        # Empty label: only the arrow is drawn, from xytext (tail) to xy (tip).
        ax.annotate(
            "",
            xy=(date, arrow_tip_y),
            xytext=(date, arrow_tail_y),
            arrowprops=dict(facecolor='black', shrink=0.01),
        )


# Running paper count versus publication date: 'index' is each paper's
# 0-based position in the date-sorted frame.
ax = idf.plot.scatter(x='date', y='index')

ax.set_title(f'Papers citing CMS Open Data DOIs [Inspire] \n{date_generated}', fontsize=16)
ax.set_xlabel('Date published', fontsize=14)
ax.set_ylabel('Number of papers', fontsize=14)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Release-date arrows are currently switched off: add_release_dates(ax)
plt.savefig('./figs/inspire-npapers.png', dpi=200)

In [None]:
# One bin per year, 2017 up to (not including) 2026; the range is given in
# the integer form of datetime64[Y] that make_histogram bins on.
make_histogram(
    idf,
    date_range=(
        np.datetime64('2017').astype(int),
        np.datetime64('2026').astype(int),
    ),
    nbins=9,
    title=f'Papers citing CMS Open Data DOIs [Inspire] \n{date_generated}',
    ftitle='inspire-npapers-hist',
)

In [None]:
# Split the papers into those without citations (only counted) and those
# with at least one (kept as nzdf for the histogram).
uncited = idf['citations'] == 0
cited = idf['citations'] > 0

nzero = len(idf[uncited])
nzdf = idf[cited]

print(f'{nzero} papers with 0 citations')
print(f'{len(nzdf)} papers with > 0 citations')

In [None]:
# Histogram of citation counts for papers with at least one citation:
# 15 bins of width 10 covering 0-150.
# NOTE: np.histogram ignores values outside `range`, so any paper with more
# than 150 citations is silently left out of the plot.
h, b = np.histogram(
    nzdf['citations'],
    range=(0,150),
    bins=15
)

# Bars span [edge_i, edge_i+1): left-aligned on the bin edges.
plt.bar(b[:-1], h, width=np.diff(b), ec='black', align='edge')

# Put a tick on every bin edge and label it with the integer edge value.
plt.gca().set_xticks(b)
plt.xticks(fontsize=12)
plt.gca().set_xticklabels(b.astype(int), rotation=45)
plt.yticks(fontsize=12)

# Log scale on the count axis.
plt.gca().set_yscale('log')
plt.title(
    f'Number of citations of papers citing CMS Open Data DOIs [Inspire] \n {nzero} papers with 0 citations excluded \n{date_generated}'
)

plt.tight_layout()

plt.savefig('./figs/inspire-citations.png', dpi=200)

In [None]:
# Word cloud of the Inspire paper titles; saved to ./figs/inspire-wc-title.png.
make_wordcloud(idf, 'title', 'inspire-wc-title')

In [None]:
# Word cloud of the Inspire abstracts; saved to ./figs/inspire-wc-abstract.png.
make_wordcloud(idf, 'abstract', 'inspire-wc-abstract')

In [None]:
def get_codp_title(recid):
    '''
    Ask the cernopendata-client CLI for the title of one record.

    recid: record id as a string

    Returns whatever the command wrote to stdout, decoded as UTF-8 and not
    stripped or checked for errors.
    '''
    command = [
        'cernopendata-client',
        'get-metadata',
        '--recid',
        recid,
        '--output-value',
        'title',
    ]

    completed = subprocess.run(command, stdout=subprocess.PIPE)

    return completed.stdout.decode('utf-8')

In [None]:
def get_codp_categories(recid):
    '''
    Ask the cernopendata-client CLI for the categories of one record.

    recid: record id as a string

    Returns a (primary, secondary) tuple taken from the JSON object printed
    by the command. ('', []) is returned when the output contains 'ERROR',
    is empty, or is not a JSON object; a missing or null "primary" /
    "secondary" entry falls back to '' / [] respectively.
    '''

    results = subprocess.run([
        'cernopendata-client',
        'get-metadata',
        '--recid',
        recid,
        '--output-value',
        'categories'],
        stdout=subprocess.PIPE
    )

    results = results.stdout.decode('utf-8')

    if 'ERROR' in results or not results.strip():
        return '', []

    try:
        categories = json.loads(results)
    except json.JSONDecodeError:
        # Output that is not JSON is treated like an error message.
        return '', []

    if not isinstance(categories, dict):
        return '', []

    # Not every record necessarily carries both keys.
    primary = categories.get('primary') or ''
    secondary = categories.get('secondary') or []

    return primary, secondary

In [None]:
# For every paper, fetch the title of each referenced record
# (one cernopendata-client subprocess call per record id).
titles = idf['codp_recids'].map(lambda x: [get_codp_title(str(rid)) for rid in x])

In [None]:
# For every paper, fetch the (primary, secondary) categories of each
# referenced record (one cernopendata-client subprocess call per record id).
categories = idf['codp_recids'].map(lambda x: [get_codp_categories(str(rid)) for rid in x])

In [None]:
# Flatten the per-paper lists of (primary, secondary) pairs into two flat
# lists, skipping empty primaries and empty secondary lists.
category_pairs = [pair for paper_pairs in categories for pair in paper_pairs]

primary_categories = [pair[0] for pair in category_pairs if pair[0]]
secondary_categories = [
    secondary
    for pair in category_pairs
    if pair[1]
    for secondary in pair[1]
]

print(primary_categories)
print(secondary_categories)

In [None]:
# Horizontal bar chart of how often each primary category occurs
# (bars in order of first appearance, not sorted by count).
primary_counts = pd.Series(primary_categories).value_counts(sort=False)

primary_counts.plot.barh(
    title=f'Primary categories of datasets cited [Inspire]\n{date_generated}'
)

plt.tight_layout()
plt.savefig('./figs/inspire-dataset-primary-categories.png', dpi=200)

In [None]:
# Horizontal bar chart of how often each secondary category occurs
# (bars in order of first appearance, not sorted by count).
secondary_counts = pd.Series(secondary_categories).value_counts(sort=False)

secondary_counts.plot.barh(
    title=f'Secondary categories of datasets cited [Inspire]\n{date_generated}'
)

plt.tight_layout()
plt.savefig('./figs/inspire-dataset-secondary-categories.png', dpi=200)

In [None]:
# Strip trailing whitespace (e.g. the newline of the CLI output) from every
# title. Note that this rebinds `titles` from a Series to a list of lists.
titles = [[t.rstrip() for t in title] for title in titles]

In [None]:
# Pick the dataset information out of record titles shaped like /A/B/C:
# - A: dataset name
# - B: its part before the first '-' is the era (Run201XY)
# - C: data tier
# Titles without exactly three '/' are collected in `others`.

dataset_names = []
dataset_eras = []
dataset_tiers = []

others = []

for record_titles in titles:
    for record_title in record_titles:

        if record_title.count('/') != 3:
            others.append(record_title)
            continue

        parts = record_title.split('/')
        name, processing, tier = parts[1], parts[2], parts[3]

        if 'Run201' in processing:
            dataset_eras.append(processing.split('-')[0])

        dataset_names.append(name)
        dataset_tiers.append(tier)
    

In [None]:
# Horizontal bar chart of how often each data tier is cited.
tier_counts = pd.Series(dataset_tiers).value_counts(sort=False)

tier_counts.plot.barh(title=f'Data tiers cited [Inspire] \n{date_generated}')

plt.tight_layout()
plt.savefig('./figs/inspire-datatiers.png', dpi=200)

In [None]:
# Horizontal bar chart of how often each dataset era is cited.
era_counts = pd.Series(dataset_eras).value_counts(sort=False)

era_counts.plot.barh(title=f'Dataset eras cited [Inspire]\n{date_generated}')

plt.tight_layout()
plt.savefig('./figs/inspire-dataset-eras.png', dpi=200)

In [None]:
# Horizontal bar chart of dataset names, sorted by how often each is cited;
# the tall figure leaves room for one bar per distinct name.
name_counts = pd.Series(dataset_names).value_counts(sort=True)

name_counts.plot.barh(
    title=f'Dataset names cited [Inspire]\n{date_generated}',
    figsize=(12, 20),
)

plt.tight_layout()
plt.savefig('./figs/inspire-dataset-names.png', dpi=200)

In [None]:
# Group the dataset names by everything before their first underscore
# ("Foo_Bar_Baz" -> "Foo_*") and chart the group sizes, largest count first.
dsn = pd.Series(dataset_names)

groups = dsn.str.split("_", n=1).str[0] + "_*"

group_counts = groups.value_counts(sort=True)
group_counts.plot.barh(
    title=f'Dataset names cited [Inspire]\n{date_generated}',
    figsize=(12, 20),
)

plt.tight_layout()
plt.savefig('./figs/inspire-dataset-names-groups.png', dpi=200)

In [None]:
# Keep only the truthy 'publication' entries (drops None and '').
# NOTE(review): NaN is truthy, so NaN entries would be kept -- confirm the
# column never holds NaN.
publications = [p for p in idf['publication'].values if p]

In [None]:
# Horizontal bar chart of the publications, sorted by number of papers.
publication_counts = pd.Series(publications).value_counts(sort=True)

publication_counts.plot.barh(
    title=f'Publications [Inspire]\n{date_generated}',
    figsize=(12, 20),
)

plt.tight_layout()
plt.savefig('./figs/inspire-publications.png', dpi=200)

In [None]:
# Horizontal bar chart of the number of papers per document type.
document_type_counts = idf['document_type'].value_counts(sort=False)

document_type_counts.plot.barh(title=f'Publication type [Inspire]\n{date_generated}')

plt.tight_layout()
plt.savefig('./figs/inspire-publication-type.png', dpi=200)

In [None]:
# Load the arXiv records; process_dataframe prints the row count
# before and after the author filter.
adf = process_dataframe('data/arxiv.json')

In [None]:
# Preview the first rows of the processed arXiv frame.
adf.head()

In [None]:
# Preview the last rows (the latest dates, since the frame is sorted by date).
adf.tail()

In [None]:
def add_release_dates(
    ax,
    releases=None,
    xlim=(dt.date(2014, 1, 1), dt.date(2024, 12, 31)),
    arrow_tip_y=-1.2,
    arrow_tail_y=0
):
    '''
    Mark release dates on a date axis with one arrow per release.

    ax: matplotlib axes whose x axis holds dates
    releases: iterable of 'YYYY-MM-DD' strings; defaults to the
        module-level data_releases list
    xlim: (first, last) date applied as the x-axis limits
    arrow_tip_y: y coordinate the arrow points at
    arrow_tail_y: y coordinate the arrow starts from

    The defaults reproduce the values that used to be hard-coded here, so
    add_release_dates(ax) behaves as before. Note that this definition
    replaces the function of the same name defined in an earlier cell.
    '''

    if releases is None:
        releases = data_releases

    ax.set_xlim(list(xlim))

    for dr in releases:

        date = dt.datetime.strptime(dr, '%Y-%m-%d').date()

        # Empty label: only the arrow is drawn, from xytext (tail) to xy (tip).
        ax.annotate(
            "",
            xy=(date, arrow_tip_y),
            xytext=(date, arrow_tail_y),
            arrowprops=dict(facecolor='black', shrink=0.01),
        )

# Running paper count versus publication date: 'index' is each paper's
# 0-based position in the date-sorted frame.
ax = adf.plot.scatter(
    x='date',
    y='index',
    title=f'Papers containing "CMS Open Data" in the abstract [arXiv] \n{date_generated}',
)

ax.set_ylabel('Number of papers')
ax.set_xlabel('Date published')

# Release-date arrows are currently switched off: add_release_dates(ax)

plt.tight_layout()
plt.savefig('./figs/arxiv-npapers.png', dpi=200)

In [None]:
# One bin per year, 2017 up to (not including) 2026; the range is given in
# the integer form of datetime64[Y] that make_histogram bins on.
make_histogram(
    adf,
    date_range=(
        np.datetime64('2017').astype(int),
        np.datetime64('2026').astype(int),
    ),
    nbins=9,
    title=f'Papers containing "CMS Open Data" in the abstract [arXiv] \n{date_generated}',
    ftitle='arxiv-npapers-hist',
)

In [None]:
# Word cloud of the arXiv paper titles; saved to ./figs/arxiv-wc-title.png.
make_wordcloud(adf, 'title', 'arxiv-wc-title')

In [None]:
# Word cloud of the arXiv abstracts; saved to ./figs/arxiv-wc-abstract.png.
make_wordcloud(adf, 'abstract', 'arxiv-wc-abstract')