# OpenAlex Cited References
### [STI 2023](https://www.sti2023.org/) Conference - Leiden, Netherlands, September 27-29, 2023
#### Eric Schares, Iowa State University; [eschares.github.io](https://eschares.github.io)
#### Sandra Mierz; [https://github.com/smierz](https://github.com/smierz) 
---

# Part 2. Graph and explore Citation Data
### This assumes you have already used the OpenAlex API to collect publication and reference records (see Part 1)
### This notebook expects the following files:
- publications.csv
- pub2ref.csv
- references.csv or .parquet (smaller file size)

In [None]:
import pandas as pd
import plotly.express as px
import pyarrow

In [None]:
# Folder holding the three input files listed at the top of this notebook.
# Switch datasets by changing DATA_DIR only; every path below is built from it.
# DATA_DIR = 'files/one_month'            # One month test data
DATA_DIR = '../files/ISU_2021_fullyear'   # One year

pubs_df = pd.read_csv(f'{DATA_DIR}/publications.csv')
pub2ref_df = pd.read_csv(f'{DATA_DIR}/pub2ref.csv')
# References are read from the parquet file; swap in the .csv line if that is the file you have
#refs_df = pd.read_csv(f'{DATA_DIR}/references.csv')
refs_df = pd.read_parquet(f'{DATA_DIR}/references.parquet')

In [None]:
# join tables on id fields - that's why it is important to keep unique openalex ids
### prerequisite for joining: no duplicates in publications or references !
# De-duplicate on the id columns that the joins below use as keys. Dropping only rows
# that are identical in every column can still leave a repeated id (rows that differ in
# any other column), and a repeated id multiplies the matching rows in the join.
# The first row seen for each id is kept.

pubs_df = pubs_df.drop_duplicates(subset='publication_id')
refs_df = refs_df.drop_duplicates(subset='reference_id')

In [None]:
### joining
# Every pub2ref row is extended with the columns of its citing ("parent") publication,
# looked up through publication_id.
# The first two columns come from pub2ref, the remaining ones from pubs_df.
# Parent information repeats on each row sharing that parent, which keeps track of
# what parent cited what reference.

pubs_by_id = pubs_df.set_index('publication_id')
df = pub2ref_df.join(pubs_by_id, on='publication_id')
df.head(3)

In [None]:
# Move reference_id to the end: pop() removes the column and returns it,
# and assigning it back under the same name appends it as the last column.
df['reference_id'] = df.pop('reference_id')

In [None]:
# Look up each row's reference_id in refs_df and append that reference's columns
refs_by_id = refs_df.set_index('reference_id')
df = df.join(refs_by_id, on='reference_id')
df.head(3)

In [None]:
# Year delta: how many years old a reference was when it got cited
# (year of the citing publication minus year of the reference)
df['year_delta'] = df['publication_year'].sub(df['reference_year'])
df.head(3)

In [None]:
# Save combined table
# Optional, may be a large file, turn on if wanted

#df.to_csv('files/combined.csv')

### Basic stats about dataframe

In [None]:
# df was built by joining onto pub2ref_df, so df.shape[0] counts publication -> reference rows
print(f"This dataset contains {pubs_df.shape[0]} publications and their {df.shape[0]} cited references.")

In [None]:
# (rows, columns) of the combined table
df.shape

In [None]:
# summary statistics by column for the combined table
df.describe()

In [None]:
# Multi-index view: the publication-level columns plus reference_id become the row index,
# which makes the connection from each publication to its references visible
index_cols = [
    'publication_id',
    'publication_doi',
    'publication_year',
    'publication_title',
    'publication_journal',
    'publication_publisher',
    'publication_journal_issn',
    'num_cited_references',
    'reference_id',
]
df_grouped = df.set_index(index_cols)
df_grouped.head(3)

---
## 2.1 Look at publication ("parent") data first

In [None]:
# Order publications from most to fewest cited references and renumber the rows from 0
pubs_df = (
    pubs_df
    .sort_values(by='num_cited_references', ascending=False)
    .reset_index(drop=True)
)
pubs_df.head(3)

### Average and median number of references per paper

In [None]:
# Summary statistics of the publication table; the mean and 50% rows of
# num_cited_references give the average and median number of references per paper
pubs_df.describe()

In [None]:
# Number of references in each unique paper:
# counts the rows of df per publication DOI, largest count first
df['publication_doi'].value_counts()

In [None]:
# How many publications report each number of cited references, ordered by that number
pubs_df['num_cited_references'].value_counts().sort_index()

### OpenAlex reports 0 references for some papers, even though manual investigation shows there are references there. 

### Not all reference data is openly available. See the Initiative for Open Citations [I4OC](https://i4oc.org/)

In [None]:
# number of publications with 0 reported references
zero_ref_mask = pubs_df['num_cited_references'] == 0
len(pubs_df[zero_ref_mask])

In [None]:
# percentage of publications with 0 reported references
# (rows with num_cited_references == 0 divided by all rows of pubs_df, shown with two decimals)
print(f"{(pubs_df.loc[pubs_df['num_cited_references']==0].shape[0] / pubs_df.shape[0])*100:.2f}% of publications in this set have 0 references reported")

---
### Make plots

In [None]:
# Colour lookup for the histogram below: every non-zero reference count maps to blue,
# and the 0 bucket is added last in light gray so it stands out
nonzero_counts = pubs_df.loc[pubs_df['num_cited_references'] != 0, 'num_cited_references']
color_dict = dict.fromkeys(nonzero_counts, 'blue')
color_dict[0] = 'lightgray'

In [None]:
# Histogram of references per publication; bars are coloured through color_dict
fig = px.histogram(
    pubs_df,
    x='num_cited_references',
    color='num_cited_references',
    color_discrete_map=color_dict,
    nbins=50,
    title=f'Histogram of the Number of Cited References in {pubs_df.shape[0]} Publications<br>Num_references=0 shown in light gray',
)
# one legend entry per reference count would be noise, so hide the legend
fig.update_layout(showlegend=False)

In [None]:
# Cumulative distribution (in percent) of the number of cited references, all publications
px.ecdf(
    pubs_df,
    x='num_cited_references',
    ecdfnorm='percent',
    title=f'Cumulative Distribution of the Number of Cited References in {pubs_df.shape[0]} Publications',
)

## Remove publications with 0 reported references

In [None]:
# Keep only publications whose reported reference count is not 0
has_refs = pubs_df['num_cited_references'].ne(0)
pubs_df_no_zeros = pubs_df.loc[has_refs]
pubs_df_no_zeros.head(3)

In [None]:
# Summary statistics again, now without the publications that report 0 references
pubs_df_no_zeros.describe()

In [None]:
# Histogram of references per publication with the 0-reference publications removed;
# text_auto prints each bar's count on the bar
px.histogram(
    pubs_df_no_zeros,
    x='num_cited_references',
    nbins=50,
    text_auto=True,
    title=f'Histogram of the Number of Cited References in {pubs_df_no_zeros.shape[0]} Publications<br>Num_references=0 *removed*',
)

In [None]:
# Cumulative distribution (in percent) with the 0-reference publications removed
px.ecdf(
    pubs_df_no_zeros,
    x='num_cited_references',
    ecdfnorm='percent',
    width=750,
    title=f'Cumulative Distribution of the Number of Cited References in {pubs_df_no_zeros.shape[0]} Publications<br>Num_references=0 *removed*',
)



---
## 2.2 What journals were cited by these publications?

In [None]:
# Number of journals: describe() on this column reports count, unique, top and freq
df['reference_journal'].describe()

In [None]:
# Count rows of df per (journal, publisher) pair, most cited first.
# reset_index(name=...) names the count column explicitly. The default label of that
# column depends on the pandas version (0 before 2.0, 'count' from 2.0 on), so renaming
# column 0 afterwards silently does nothing on newer pandas and 'citations' never appears.
journal_counts = (
    df.value_counts(['reference_journal', 'reference_publisher'])
    .reset_index(name='citations')
    .rename(columns={'reference_journal': 'title'})
)
journal_counts.index.name = 'index'

# Top 10 most cited journals
journal_counts.head(10)

In [None]:
# Save top 10 journals out to .csv file
# NOTE(review): this writes under 'files/' relative to the working directory, while the
# input data is read from '../files/' - confirm that folder exists and is the intended target
journal_counts.head(10).to_csv("files/journal_counts.csv")

In [None]:
# Bar chart of the first 50 rows of journal_counts (the most cited journals),
# one bar per title, coloured by publisher
top_journals = journal_counts.head(50)
px.bar(
    top_journals,
    x="title",
    y="citations",
    color="reference_publisher",
    color_discrete_sequence=px.colors.qualitative.Set1,
    height=900,
    title=f"Top 50 journals cited in this set, by Title and Publisher",
)

In [None]:
# How many unique titles were cited at least once within each publisher?
# journal_counts has one row per (title, publisher) pair, so counting titles per
# publisher gives that number; publishers with the most titles come first
titles_per_publisher = journal_counts.groupby("reference_publisher")["title"].count()
title_counts = titles_per_publisher.sort_values(ascending=False)
title_counts.head(10)

---
## 2.3 What publishers?

In [None]:
# Number of publishers: describe() on this column reports count, unique, top and freq
df['reference_publisher'].describe()

In [None]:
# Count rows of df per publisher, most cited first.
# reset_index(name=...) names the count column explicitly. Its default label depends on
# the pandas version (0 before 2.0, 'count' from 2.0 on), so renaming column 0 afterwards
# does nothing on newer pandas and the 'citations' lookup below would raise a KeyError.
publisher_counts = df.value_counts(['reference_publisher']).reset_index(name='citations')
publisher_counts.index.name = 'index'
# share of all rows in df, expressed in percent
publisher_counts['percentage'] = (publisher_counts['citations'] / df.shape[0]) * 100

#preview
publisher_counts.head(20)

In [None]:
# Citation counts for the first 20 rows of publisher_counts, one colour per publisher;
# text_auto prints the value on each bar
px.histogram(
    publisher_counts.head(20),
    x="reference_publisher",
    y="citations",
    color="reference_publisher",
    text_auto=True,
    height=900,
    title=f"Publishers cited in this set",
)

In [None]:
# Same first 20 publishers, plotting the percentage column instead of the raw counts
px.histogram(
    publisher_counts.head(20),
    x="reference_publisher",
    y="percentage",
    color="reference_publisher",
    height=900,
    title=f"Publishers cited in this set",
)

---
## 2.4 What articles were cited?

In [None]:
# Number of articles: describe() on this column reports count, unique, top and freq
df['reference_title'].describe()

In [None]:
# Count rows of df per cited article (DOI, title, journal), most cited first.
# reset_index(name=...) names the count column explicitly. Its default label depends on
# the pandas version (0 before 2.0, 'count' from 2.0 on), so renaming column 0 afterwards
# silently does nothing on newer pandas.
article_counts = (
    df.value_counts(['reference_doi', 'reference_title', 'reference_journal'])
    .reset_index(name='citations')
)
article_counts.index.name = 'index'

#preview
article_counts.head(10)

---
## 2.5 What years were those referenced articles published?

### Oldest Reference is:

In [None]:
# All rows whose reference carries the smallest reference_year in the dataset
oldest_year = df['reference_year'].min()
df.loc[df['reference_year'] == oldest_year]

### From what year?

In [None]:
# The reference_year value of those oldest rows, selected with a single .loc lookup
df.loc[df['reference_year'] == df['reference_year'].min(), 'reference_year']

### Make plots

In [None]:
# Histogram of the year each cited reference was published
fig_years_count = px.histogram(
    df,
    x='reference_year',
    nbins=400,
    title=f'Histogram of Cited Year<br>{pubs_df.shape[0]} Publications and {df.shape[0]} References',
)

# reverse the x axis direction
fig_years_count.layout.xaxis.autorange = "reversed"

fig_years_count

In [None]:
# Same histogram, normalised to a probability density instead of raw counts
fig_years_prob = px.histogram(
    df,
    x='reference_year',
    nbins=400,
    histnorm='probability density',
    title=f'Probability Density of the Cited Year<br>{pubs_df.shape[0]} Publications and {df.shape[0]} References<br>Zoomed in',
)

# reverse the x axis direction
fig_years_prob.layout.xaxis.autorange = "reversed"

fig_years_prob.show()

In [None]:
# Cumulative distribution (in percent) of reference_year, drawn as markers without lines
# NOTE(review): no color= column is passed here, so color_discrete_map looks unused - confirm
fig5 = px.ecdf(
    df,
    x='reference_year',
    ecdfnorm='percent',
    markers=True,
    lines=False,
    color_discrete_map={'red':'red', 'blue':'blue'},
    title=f'Cumulative Distribution of Year of Citation<br>{df.shape[0]} references',
)
fig5.update_layout(showlegend=False)

---
## 2.6 Track one publication of interest and its references

### Add 'color' column to control the colors and change color of one DOI to track it on the plot

In [None]:
# Change DOI in this line
red_doi = 'https://doi.org/10.1021/acs.accounts.0c00741'

#https://doi.org/10.1021/acs.accounts.0c00741   used for example
#https://doi.org/  # oldest publication
#https://doi.org/10.5252/adansonia2021v43a8   # publication with oldest average reference at 138 years
#https://doi.org/10.1386/ijia_00033_1  - 0 average year, referenced 1 work, which is itself?

# Every row starts out blue; the rows of the tracked DOI are recoloured in the next cell
df['color'] = 'blue'

# Title of the tracked publication. The DOI above is meant to be edited by hand, so fail
# with an explicit message when it matches no row instead of an IndexError from .iloc[0].
red_titles = df.loc[df['publication_doi'] == red_doi, 'publication_title']
if red_titles.empty:
    raise ValueError(
        f"DOI {red_doi!r} was not found in df['publication_doi']; choose a DOI that is part of this dataset"
    )
red_title = red_titles.iloc[0]
red_title

In [None]:
# Recolour every row whose citing publication is the tracked DOI
is_tracked = df['publication_doi'].eq(red_doi)
df.loc[is_tracked, 'color'] = 'red'

In [None]:
# Double check the number that you changed to red, should match number of references in that DOI
# (shows how many rows carry each value of the color column)
df['color'].value_counts()

In [None]:
# red_count holds the red rows themselves (a DataFrame); its length is the number of them
red_count = df.loc[df['color'].eq('red')]
len(red_count)

In [None]:
# Histogram of reference years, split by the color column so the tracked publication's
# references are drawn in red and all other rows in blue
fig_trackone_count = px.histogram(
    df,
    x='reference_year',
    color='color',
    nbins=200,
    color_discrete_map={'red':'red', 'blue':'blue'},
    category_orders={"color":['blue','red']},
    hover_data={'color':False,
                'reference_title':True},
    title=f'Years when Cited References were published<br>Red: "{red_title}"',
)
fig_trackone_count.update_layout(showlegend=False)
# reverse the x axis direction
fig_trackone_count.layout.xaxis.autorange = "reversed"

fig_trackone_count.show()

In [None]:
# Notched box plots of reference_year, one per colour group, with every point shown;
# hovering a point lists the reference title and the citing publication's year and title
fig_trackone_box = px.box(
    df,
    x='reference_year',
    color='color',
    points='all',
    notched=True,
    color_discrete_map={'red':'red', 'blue':'blue'},
    category_orders={"color":['blue','red']},
    hover_data={'color':False,
                'reference_title':True,
                'publication_year':True,
                'publication_title':True},
    title=f'Years when Cited References were published<br>Red: "{red_title}"',
)
fig_trackone_box.update_layout(showlegend=False)

In [None]:
# Cumulative distribution (in percent) of reference_year per colour group, markers only;
# the title reports how many rows are blue and how many are red
fig_trackone_ecdf = px.ecdf(
    df,
    x='reference_year',
    color='color',
    ecdfnorm='percent',
    markers=True,
    lines=False,
    color_discrete_map={'red':'red', 'blue':'blue'},
    hover_data={'color':False,
                'reference_title':True,
                'reference_year':True},
    title=f'Blue: Cumulative Distribution of Year for {df.shape[0] - red_count.shape[0]} references<br>Red: {red_count.shape[0]} references in "{red_title}"',
)
fig_trackone_ecdf.update_layout(showlegend=False, width=650)

# reverse the x axis direction
fig_trackone_ecdf.layout.xaxis.autorange = "reversed"

fig_trackone_ecdf.show()

---
## 2.7 Calculate the year delta, or how many years old a reference was when it got cited

In [None]:
# Summary statistics of year_delta (publication_year minus reference_year) over all rows of df
df['year_delta'].describe()

### Group by publication, get one number per publication that shows the average age of its references

In [None]:
# One row per publication: mean year_delta of its references and the number of rows it has in df.
# Group on publication_id rather than on the title alone: distinct publications that share
# a title would otherwise be merged into one row, and dropna=False keeps publications whose
# title is missing. publication_title stays in the key so it is still available as a column.
df2 = (
    df.groupby(['publication_id', 'publication_title'], dropna=False)
    .agg(avg_year_delta=('year_delta', 'mean'), num_references=('reference_title', 'size'))
    .reset_index()
)

df2.sample(5)  # show 5 random results

In [None]:
# Order publications by the average age of their references, smallest average first
df2 = df2.sort_values('avg_year_delta')
df2

In [None]:
# Save the per-publication averages, without the row index
# NOTE(review): this writes under 'files/' relative to the working directory, while the
# input data is read from '../files/' - confirm that folder exists and is the intended target
df2.to_csv('files/avg_year_deltas.csv', index=False)

In [None]:
# Summary statistics of the per-publication average reference age
df2['avg_year_delta'].describe()

In [None]:
# Cumulative distribution of avg_year_delta across the rows of df2
px.ecdf(
    df2,
    x='avg_year_delta',
    title=f'Cumulative Distribution of the Average Age of Reference by Publication<br>{df2.shape[0]} Publications<br>',
)