In [1]:
# import sqlite3 as sql
# import json 
# 
# db_path = 'E:/provenance.db'
# lookup_br = 'https://w3id.org/oc/meta/br/06201134857'
# 
# with sql.connect(db_path) as conn:
#     c = conn.cursor()
#     query = 'SELECT source_uri FROM Provenance WHERE br_uri = ?'
#     c.execute(query, (lookup_br, ))
#     
#     rows = c.fetchall()
# 
#     for row in rows:
#         res = json.loads(row[0])
#         print(res)
    

In [2]:
import json
analysis_results = json.load(open('provenance_analysis_results_11nov.json', 'r', encoding='utf-8'))


In [15]:
def sort_prov_analysis_results(provenance_data:dict):
    """
    Sort the results of the analysis on provenance data by the sum of values in nested dictionaries in descending order. Each nested dictionary is also sorted by values in descending order.
    :param provenance_data: 
    :return: 
    """
    for key in provenance_data:
        provenance_data[key] = dict(sorted(provenance_data[key].items(), key=lambda x: x[1], reverse=True))

    # Sort the outer dictionary by the sum of values in nested dictionaries in descending order
    result = dict(sorted(provenance_data.items(), key=lambda x: sum(x[1].values()), reverse=True))

    return result

In [17]:
from pprint import pprint

other_results = json.load(open('provenance_analysis_results_11nov_id_info.json', 'r', encoding='utf-8'))
pprint(sort_prov_analysis_results(other_results))


TypeError: '<' not supported between instances of 'dict' and 'dict'

In [None]:
sorted_res = sort_prov_analysis_results(analysis_results)
# count all the values in the nested dictionaries
total = 0
for k, v in sorted_res.items():
    total += sum(v.values())
print(total)

sorted_res

In [None]:
import plotly.express as px
import pandas as pd
import chart_studio.plotly as csp



# Create a DataFrame to make it easier to work with the data
df = pd.DataFrame(sorted_res).T  # Transpose the DataFrame to have 'br types' as columns

# Sum the values for each 'br type'
df['Number of BRs'] = df.sum(axis=1)

# Reset the index to have 'br types' as a regular column
df.reset_index(inplace=True)
df.rename(columns={'index': 'BR Type'}, inplace=True)
df['BR Type'].replace('', 'Unknown', inplace=True)  # Replace empty type string with 'Unknown'

# Create a new column for the legend labels
df['Legend Label'] = df['BR Type'] + ' (' + df['Number of BRs'].astype(str) + ')'


# Create the bar chart
fig = px.bar(df, x='BR Type', y='Number of BRs', text='Number of BRs', color='Legend Label',
             labels={'Number of BRs': 'Number of BRs'},
             title='Number of non-mapped BRs per BR Type (with provenance information)',
             hover_name='Legend Label',
             # hover_data=df[df.columns[1:13]].fillna('0'),
             custom_data=df[df.columns[1:13]].fillna('0')
             )

# Customize the layout
fig.update_layout(xaxis_title='BR Type', yaxis_title='Number of BRs')

# display hover info for the sum of values in nested dictionaries

hover_template = '<b>%{hovertext}</b><br><br>' + \
                 'Number of BRs: %{y}<br><br>' + \
                 '<b>Provenance Data</b><br>' + \
                 'Crossref: %{customdata[0]}<br>' + \
                 'Zenodo: %{customdata[1]}<br>' + \
                 'Pubmed: %{customdata[2]}<br>' + \
                 'Zenodo + Crossref: %{customdata[3]}<br>' + \
                 'Datacite + Crossref: %{customdata[4]}<br>' + \
                 'Zenodo + Pubmed: %{customdata[5]}<br>' + \
                 'Pubmed + Crossref: %{customdata[6]}<br>' + \
                 'Zenodo + Crossref + PubMed: %{customdata[7]}<br>' + \
                    'Datacite: %{customdata[8]}<br>' + \
                    'Crossref + Zenodo + Datacite: %{customdata[9]}<br>' + \
                    'Datacite + Zenodo: %{customdata[10]}<br>' + \
                    'Zenodo + Datacite + PubMed: %{customdata[11]}<br>'

fig.update_traces(hovertemplate=hover_template)




fig.show()
csp.plot(fig, filename='non_mapped_brs_provenance', auto_open=False, sharing='public')#, fileopt='new')


In [None]:
from pprint import pprint

In [None]:

def get_tot_contribution_by_source(sources_for_type:dict):
    """
    
    :param sources_for_type: 
    :return: 
    """
    
    res = dict()
    for k, v in sources_for_type.items():
        if ' ' not in k:
            if k in res:
                res[k] += v
            else:
                res[k] = v
        else:
            for single_source in k.split():
                if single_source in res:
                    res[single_source] += v
                else:
                    res[single_source] = v
    return res


In [None]:
ji = sorted_res['journal issue']
ji_by_source = get_tot_contribution_by_source(sorted_res['journal issue'])
pprint(ji)
print('\n')
pprint(ji_by_source)

In [None]:
import plotly.express as px


def visualize_sources_proportion(sources_for_type:dict, title:str):
    """
    Shows a pie chart of the proportion of contributions from each single source for a given type of bibliographic resource. Each single source's contribution is the results from the sum of all the bibliographic entities that have the source in their provenance data, therefore it is not necessarily a unique contribution, but rather it includes both the number of resources that are provided exclusively by the source and the number of resources that are provided by the source and other sources.
    :param sources_for_type: 
    :param title: 
    :return: 
    """
    data = get_tot_contribution_by_source(sources_for_type)
    labels = list(data.keys())
    values = list(data.values())

    fig = px.pie(values=values, names=labels, title=title)
    fig.show()
    csp.plot(fig, filename=title, auto_open=False, sharing='public')

In [None]:
visualize_sources_proportion(sorted_res['journal issue'], 'Journal Issue')
visualize_sources_proportion(sorted_res['journal volume'], 'Journal Volume')
visualize_sources_proportion(sorted_res['dataset'], 'Dataset')
visualize_sources_proportion(sorted_res[''], 'Unknown')
visualize_sources_proportion(sorted_res['journal article'], 'Journal Article')

In [None]:
visualize_sources_proportion(sorted_res['journal'], 'Journal')

In [None]:
assert False
for k, v in sorted_res.items():
    visualize_sources_proportion(v, k)