In [16]:
from analytics.helper import *
import chart_studio.plotly as csp
import pandas as pd
import plotly.express as px

In [17]:
csv_file = 'mm_latest/multi_mapped_omids.csv'

df = pd.read_csv(csv_file, sep=',', header=None, names=['omid', 'openalex_id', 'type'])

In [18]:
df

Unnamed: 0,omid,openalex_id,type
0,omid,openalex_id,type
1,omid:br/061202342190,W4300573357 W3121236676,proceedings article
2,omid:br/061501314700,W3103430593 W4231214420,journal article
3,omid:br/061501311841,W3209174425 W3200094642,journal article
4,omid:br/061501311942,W3201195150 W4293777174,journal article
...,...,...,...
173509,omid:br/06190311441,W2197031826 W2172190194,journal article
173510,omid:br/06190311461,W1581076644 W4210857697,journal article
173511,omid:br/06190311043,W1594173748 W4298838588,journal article
173512,omid:br/06190311543,W3123497509 W4298115774,journal article


In [19]:
# ----------------LOG SCALE HISTOGRAM (PRIMARY ENTITIES)----------------

viz_df = add_columns_to_df(df)
viz_df['type'].fillna('unspecified', inplace=True)

hist_data_df = viz_df.groupby(['oaid_count', 'type', 'composition']).size().reset_index(name='frequency')


# define new custom legend names (including the total number of occurrences for each type)    
legend_names = {brtype: f"{brtype} ({viz_df['type'].value_counts(dropna=False).get(brtype)})" for brtype in viz_df['type'].unique()}


fig = px.bar(hist_data_df, x='oaid_count', y='frequency', color='type', hover_data=['composition'], log_y=True)

fig.for_each_trace(lambda t: t.update(name = legend_names[t.name]))

fig.update_layout(title='Distribution of OpenAlex ID counts by type',
                  xaxis_title='Number of OpenAlex IDs for a single OMID',
                  yaxis_title='Frequency (log)', yaxis_type='log')

# write html file of the log scale histogram
fig.write_html('graphs/multi_mapped_latest.html')

fig.show()

# upload the log scale histogram to plotly chart studio
# csp.plot(pents_fig, filename='multi_mapped_latest', auto_open=False, sharing='public', fileopt='new')#, fileopt='new')

In [20]:
fig = px.histogram(
    hist_data_df,
    x='oaid_count',
    y='frequency', 
    range_x=[0, 60], 
    nbins=2000, 
    log_y=True, 
    histfunc='sum', 
    text_auto=True, 
    histnorm='percent',
    labels={'oaid_count': 'Number of OpenAlex IDs for a single OMID', 'frequency': 'Frequency'},
) #nbins=2000,  histfunc='sum', text_auto=True
fig.show()
fig.write_html('graphs/mm_histogram_reduced.html')

# csp.plot(fig, filename='mm_histogram_reduced', auto_open=False, sharing='public', fileopt='new')#, fileopt='new')




## Attention
the following is only for creating the histogram for the paper, to be laer adapted by hand via plotly chart studio web interface.


In [45]:
assert False


mod_df = hist_data_df
mod_df['oaid_count'] = hist_data_df['oaid_count'].apply(lambda x: 63 if x > 60 else x)


fig = px.histogram(
    mod_df,
    x='oaid_count',
    y='frequency', 
    # range_x=[0, 61], 
    nbins=100, 
    log_y=True, 
    histfunc='sum', 
    text_auto=True,
    histnorm='percent',
    labels={'oaid_count': 'Number of OpenAlex IDs mapped to a single OMID', 'frequency': 'Percentage (log scale)'},
    
) #nbins=2000,  histfunc='sum', text_auto=True


tickvals = [0, 2, 10, 20, 30, 40, 50, 60, 63]  # 63 is the max value for the x axis, after having purposefully reduced all the values > 60 to 63
ticktext = ['0', '2', '10', '20', '30', '40', '50', '60', '>60']


fig.update_layout(
    xaxis=dict(
        tickvals=tickvals,  # Sets the values at which ticks on this axis appear.
        ticktext=ticktext,  # Sets the text displayed at the ticks position via `tickvals`
    )
)
fig.show()

# fig.write_html('graphs/optimized_mm_histogram.html')
csp.plot(fig, filename='optimized_mm_histogram', auto_open=False, sharing='public', fileopt='new')#, fileopt='new')




'https://plotly.com/~eliarizzetto/412/'