In [2]:
import json
from pprint import pprint
import plotly.express as px
import pandas as pd
import chart_studio.plotly as csp
from analytics.prov import ProvenanceAnalyser


In [3]:
import json
analysis_results = json.load(open('provenance_analysis_results.json', 'r', encoding='utf-8'))

# # If the file's content is not already sorted, sort it as follows (the dictionary must be sorted for the visualization to work properly)
# analysis_results = ProvenanceAnalyser.sort_prov_analysis_results(analysis_results)

In [4]:
pprint(analysis_results)


{'': {'https://api.crossref.org/': {'omid_only': 356955, 'other_pids': 324066},
      'https://api.crossref.org/snapshots/monthly/2023/09/all.json.tar.gz': {'omid_only': 71227,
                                                                             'other_pids': 149362},
      'https://api.datacite.org/': {'omid_only': 9, 'other_pids': 162061},
      'https://api.datacite.org/ https://api.crossref.org/': {'omid_only': 0,
                                                              'other_pids': 287},
      'https://doi.org/10.5281/zenodo.7845968': {'omid_only': 0,
                                                 'other_pids': 355075},
      'https://doi.org/10.5281/zenodo.7845968 https://api.crossref.org/': {'omid_only': 0,
                                                                           'other_pids': 16},
      'https://nih.figshare.com/collections/iCite_Database_Snapshots_NIH_Open_Citation_Collection_/4586573/42': {'omid_only': 1,
                                     

In [5]:
# count all the values in the nested dictionaries
total = 0
for k_type, v_source in analysis_results.items():
    for k1, v1 in v_source.items():
        total += sum(v1.values())
print(total)



18131618


In [11]:



# Create a DataFrame to make it easier to work with the data
df = pd.DataFrame(analysis_results).T  # Transpose the DataFrame to have 'br types' as columns

# Add new columns for sum of 'omid_only' and 'other_pids'
df['omid_only_sum'] = df.apply(lambda row: sum(item.get('omid_only', 0) if isinstance(item, dict) else 0 for item in row), axis=1)
df['other_pids_sum'] = df.apply(lambda row: sum(item.get('other_pids', 0) if isinstance(item, dict) else 0 for item in row), axis=1)
# Add new column for sum of all values
df['Number of BRs'] = df['omid_only_sum'] + df['other_pids_sum']


# Reset the index to have 'br types' as a regular column
df.reset_index(inplace=True)
df.rename(columns={'index': 'BR Type'}, inplace=True)
df['BR Type'].replace('', 'Unknown', inplace=True)  # Replace empty type string with 'Unknown'

# Create a new column for the legend labels
df['Legend Label'] = df['BR Type'] + ' (' + df['Number of BRs'].astype(str) + ')'


# Create the bar chart
fig = px.bar(df, x='BR Type', y='Number of BRs', text='Number of BRs', color='Legend Label',
             # labels={'Number of BRs': 'Number of BRs', 'omid_only_sum': 'Omid Only', 'With other PIDs': 'other_pids_sum'},
             title='Number of non-mapped BRs per BR Type',
             hover_name='Legend Label',
             hover_data=['omid_only_sum', 'other_pids_sum'],
             # custom_data=df[df.columns[1:15]].fillna('0')
             )

# Customize the layout
fig.update_layout(xaxis_title='BR Type', yaxis_title='Number of BRs')


# print to html file
fig.write_html('graphs/non_mapped_brs_per_br_type.html')

fig.show()


In [7]:
tmp_df = pd.read_json('provenance_analysis_results.json', orient='index')

columns_to_keep = ['proceedings', 'journal issue', 'book', 'journal volume', 'dataset', 'Unknown', 'journal article', 'reference book', 'report', 'journal']
tmp_df = tmp_df.T

# rename column with empty string to 'Unknown'
tmp_df.rename(columns={'': 'Unknown'}, inplace=True)
# rename index column to 'source'
tmp_df.index.rename('source', inplace=True)
# tmp_df.rename(columns={'index': 'source'}, inplace=True)
reduced_df = tmp_df[columns_to_keep]
reduced_df 

# reduced_df['type'].fillna('Unknown', inplace=True)

Unnamed: 0_level_0,proceedings,journal issue,book,journal volume,dataset,Unknown,journal article,reference book,report,journal
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://api.crossref.org/,"{'omid_only': 5046154, 'other_pids': 31}","{'omid_only': 4667606, 'other_pids': 79827}","{'omid_only': 2405267, 'other_pids': 91797}","{'omid_only': 1440942, 'other_pids': 95}","{'omid_only': 0, 'other_pids': 46}","{'omid_only': 356955, 'other_pids': 324066}","{'omid_only': 0, 'other_pids': 4983}","{'omid_only': 186140, 'other_pids': 25}","{'omid_only': 0, 'other_pids': 15}","{'omid_only': 56561, 'other_pids': 52}"
https://api.crossref.org/snapshots/monthly/2023/09/all.json.tar.gz,"{'omid_only': 324639, 'other_pids': 0}","{'omid_only': 202652, 'other_pids': 15}","{'omid_only': 2626, 'other_pids': 16272}","{'omid_only': 106965, 'other_pids': 0}",,"{'omid_only': 71227, 'other_pids': 149362}","{'omid_only': 0, 'other_pids': 484}","{'omid_only': 2286, 'other_pids': 1}",,"{'omid_only': 5000, 'other_pids': 3}"
https://doi.org/10.5281/zenodo.7845968,"{'omid_only': 0, 'other_pids': 11487}","{'omid_only': 1075, 'other_pids': 0}","{'omid_only': 0, 'other_pids': 1247}",,"{'omid_only': 0, 'other_pids': 203240}","{'omid_only': 0, 'other_pids': 355075}","{'omid_only': 0, 'other_pids': 202018}",,"{'omid_only': 0, 'other_pids': 1993}","{'omid_only': 0, 'other_pids': 19}"
https://nih.figshare.com/collections/iCite_Database_Snapshots_NIH_Open_Citation_Collection_/4586573/42,"{'omid_only': 786, 'other_pids': 0}","{'omid_only': 102115, 'other_pids': 0}","{'omid_only': 3757, 'other_pids': 0}","{'omid_only': 22602, 'other_pids': 0}",,"{'omid_only': 1, 'other_pids': 153}","{'omid_only': 0, 'other_pids': 42009}","{'omid_only': 1, 'other_pids': 0}",,"{'omid_only': 40080, 'other_pids': 1499}"
https://doi.org/10.5281/zenodo.7845968 https://api.crossref.org/,"{'omid_only': 0, 'other_pids': 17}","{'omid_only': 2830, 'other_pids': 37}","{'omid_only': 0, 'other_pids': 2}","{'omid_only': 1730, 'other_pids': 0}","{'omid_only': 0, 'other_pids': 190}","{'omid_only': 0, 'other_pids': 16}","{'omid_only': 0, 'other_pids': 57}",,,"{'omid_only': 5, 'other_pids': 0}"
https://api.datacite.org/ https://api.crossref.org/,"{'omid_only': 1, 'other_pids': 0}","{'omid_only': 457, 'other_pids': 23}","{'omid_only': 1, 'other_pids': 28}","{'omid_only': 436, 'other_pids': 0}","{'omid_only': 0, 'other_pids': 3521}","{'omid_only': 0, 'other_pids': 287}","{'omid_only': 0, 'other_pids': 7}",,"{'omid_only': 0, 'other_pids': 115}",
https://nih.figshare.com/collections/iCite_Database_Snapshots_NIH_Open_Citation_Collection_/4586573/42 https://doi.org/10.5281/zenodo.7845968,,"{'omid_only': 3847, 'other_pids': 0}",,"{'omid_only': 910, 'other_pids': 0}",,,"{'omid_only': 0, 'other_pids': 374}",,,"{'omid_only': 13, 'other_pids': 21}"
https://nih.figshare.com/collections/iCite_Database_Snapshots_NIH_Open_Citation_Collection_/4586573/42 https://api.crossref.org/,,"{'omid_only': 3305, 'other_pids': 120}",,"{'omid_only': 2125, 'other_pids': 0}",,,"{'omid_only': 0, 'other_pids': 2246}",,,"{'omid_only': 0, 'other_pids': 3}"
https://api.crossref.org/snapshots/monthly/2023/09/all.json.tar.gz https://api.crossref.org/,,"{'omid_only': 0, 'other_pids': 92}","{'omid_only': 0, 'other_pids': 589}",,,,,,,
https://nih.figshare.com/collections/iCite_Database_Snapshots_NIH_Open_Citation_Collection_/4586573/42 https://doi.org/10.5281/zenodo.7845968 https://api.crossref.org/,,"{'omid_only': 12, 'other_pids': 13}",,"{'omid_only': 27, 'other_pids': 0}",,,"{'omid_only': 0, 'other_pids': 31}",,,


In [8]:
# reduced_df.to_latex('provenance_analysis_results.tex', index=False)

In [9]:

def get_tot_contribution_by_source(sources_for_type:dict):
    
    res = dict()
    for k, v in sources_for_type.items():
        if ' ' not in k:
            if k in res:
                res[k] += v
            else:
                res[k] = v
        else:
            for single_source in k.split():
                if single_source in res:
                    res[single_source] += v
                else:
                    res[single_source] = v
    return res


In [10]:
ji = sorted_res['journal issue']
ji_by_source = get_tot_contribution_by_source(sorted_res['journal issue'])
pprint(ji)
print('\n')
pprint(ji_by_source)

NameError: name 'sorted_res' is not defined

In [None]:
import plotly.express as px


def visualize_sources_proportion(sources_for_type:dict, title:str):
    """
    Shows a pie chart of the proportion of contributions from each single source for a given type of bibliographic resource. Each single source's contribution is the results from the sum of all the bibliographic entities that have the source in their provenance data, therefore it is not necessarily a unique contribution, but rather it includes both the number of resources that are provided exclusively by the source and the number of resources that are provided by the source and other sources.
    :param sources_for_type: 
    :param title: 
    :return: 
    """
    data = get_tot_contribution_by_source(sources_for_type)
    labels = list(data.keys())
    values = list(data.values())

    fig = px.pie(values=values, names=labels, title=title)
    fig.show()
    csp.plot(fig, filename=title, auto_open=False, sharing='public')

In [None]:
visualize_sources_proportion(sorted_res['journal issue'], 'Journal Issue')
visualize_sources_proportion(sorted_res['journal volume'], 'Journal Volume')
visualize_sources_proportion(sorted_res['dataset'], 'Dataset')
visualize_sources_proportion(sorted_res[''], 'Unknown')
visualize_sources_proportion(sorted_res['journal article'], 'Journal Article')

In [None]:
visualize_sources_proportion(sorted_res['journal'], 'Journal')

In [None]:
assert False
for k, v in sorted_res.items():
    visualize_sources_proportion(v, k)