# Paper Trends

## Imports

In [None]:
%load_ext autoreload
%autoreload 2
%aimport
%matplotlib inline

In [None]:
import os
import sys
# Add the grandparent of the current working directory to sys.path (once),
# so modules located there can be imported from this notebook.
nb_dir = os.path.dirname(os.path.split(os.getcwd())[0])
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [None]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from turicreate import SFrame, load_sframe
from pathlib import Path
import turicreate.aggregate as agg
import numpy as np

In [None]:
import json
import os
import matplotlib.pyplot as plt
import pandas as pd
import math
import glob
import ntpath
from tqdm import tqdm
import seaborn as sns
from  matplotlib.ticker import FuncFormatter
import datetime
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

## Utility Functions

In [None]:
def convert_to_barchart_format(sf,x, year_column="Year", count_column="count",  year_range=(1786,2019)):
    """Expand per-year counts into a dense table of cumulative values and ranks.

    Args:
        sf: SFrame holding at least the columns ``x``, ``year_column`` and
            ``count_column``. NOTE: its ``year_column`` is cast to float in
            place, so the caller's SFrame is modified.
        x: Name of the category column (becomes ``name`` in the result).
        year_column: Name of the year column (becomes ``year``).
        count_column: Name of the per-year count column.
        year_range: ``(first, last)`` years, both inclusive, of the dense grid.

    Returns:
        pandas DataFrame with the columns ``year``, ``name``, ``value``
        (running sum of the count per category), ``lastValue`` (``value`` of
        the previous row of the same category, 0 for the first row) and
        ``rank`` (descending rank of ``value`` within each year).
    """
    # One float entry per year, endpoints included.
    all_years = np.linspace(year_range[0], year_range[1], year_range[1] - year_range[0] + 1).tolist()
    # Floats on both sides so the join keys have the same type.
    sf[year_column] = sf[year_column].astype(float)

    # Build the full (category, year) grid.
    grid_sf = SFrame()
    for category in tqdm(sf[x].unique()):
        category_sf = SFrame()
        category_sf[x] = [category] * len(all_years)
        category_sf[year_column] = all_years
        grid_sf = grid_sf.append(category_sf)

    # Right join keeps every grid row; combinations without data get count 0.
    sf = sf.join(grid_sf, how="right").sort(year_column)
    sf = sf.fillna(count_column, 0)

    df = sf.to_dataframe().sort_values([x, year_column])
    df['value'] = df.groupby([x])[count_column].cumsum()
    df["lastValue"] = df.groupby([x])["value"].shift(1)
    # The first row of each category has no previous value.
    df = df.fillna(0)
    df["rank"] = df.groupby([year_column])["value"].rank(ascending=False)
    renamed = df.rename(columns={x: "name", year_column: "year", count_column: "count"})
    return renamed[["year", "name", "value", "lastValue", "rank"]]

In [None]:
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` that hold at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

def get_d(sf_corr, diseases_id):
    """Yield ``(label, rows)`` for every ``id`` group with more than 5 rows.

    ``label`` is ``"<id>: <Disease name in title case>"``; the name is read
    from the first row of ``diseases_id`` whose ``id`` equals the group key.
    ``rows`` is the group's sub-frame sorted by ``year``.
    """
    for disease_key, rows in sf_corr.groupby("id"):
        if len(rows) <= 5:
            continue
        matching = diseases_id[diseases_id['id'] == disease_key]
        disease_title = matching[0]['Disease'].title()
        yield f"{disease_key}: {disease_title}", rows.sort_values("year")

In [None]:


sns.set(style="ticks")
def create_gird(df, col, hue,x,y,sharey=True, legend=False):
    """Draw a wrapped grid of line plots, one facet per value of ``col``.

    Args:
        df: pandas DataFrame with the columns named by ``col``, ``hue``,
            ``x`` and ``y``.
        col: Column whose values define the facets (three facets per row).
        hue: Column whose values define the line colours.
        x, y: Columns plotted on the horizontal / vertical axis.
        sharey: Whether all facets share one y axis.
        legend: Whether to add a legend to the grid.

    Returns:
        The seaborn ``FacetGrid`` that was drawn.
    """
    four_hues = sns.color_palette("hls", 4)
    grid = sns.FacetGrid(
        df,
        col=col,
        hue=hue,
        palette=four_hues,
        sharey=sharey,
        col_wrap=3,
        height=4.5,
    )

    # Show x ticks as whole numbers; set on whichever axes are current after
    # the grid has been created.
    whole_number_ticks = FuncFormatter(lambda value, _pos: int(value))
    plt.gca().xaxis.set_major_formatter(whole_number_ticks)

    # Dotted grey reference line at y = 0 in every facet.
    grid.map(plt.axhline, y=0, ls=":", c=".5")

    # The actual series.
    grid.map(plt.plot, x, y)
    grid.set_titles("{col_name}")

    if legend:
        grid.add_legend()

    # Tighten the spacing between facets.
    grid.fig.tight_layout(w_pad=1)
    return grid

## Analysis

In [None]:
# Diseases the analysis focuses on.
spothlight = ["SARS","MERS Coronavirus", "Avian Influenza","Ebola", "Influenza", "HIV/AIDS","Hepatitis B","Hepatitis C", "Swine Flu"]
# Parallel to `spothlight`: the first year kept for each disease (the two
# lists are zipped together when the per-disease rows are selected).
years = [2002,2012,1878,1976,1878,1981,1966,1987,1918 ]
# Reference-count threshold: only rows with "Ref Number" > min_refs are kept.
min_refs = 5

### Data Loading

In [None]:
# Disease id table and disease name table.
diseases_id= load_sframe("Data/diseases_id.csv")
disease_names = SFrame.read_csv("Data/disease_names.csv")

General MAG Medicine Publications:

In [None]:
# General Medicine publications from MAG.
med_mag = load_sframe("Data/mag/med_mag.sframe")

In [None]:
# Number of rows in the general Medicine table.
len(med_mag)

MAG Medicine Publications about the specific diseases:


In [None]:
# Medicine publications that are tagged with a disease.
diseases_mag = load_sframe("Data/mag/diseases_med_mag.sframe")

General MAG Virology Publications:

In [None]:
# Number of rows in the disease-specific Medicine table.
len(diseases_mag)

In [None]:
# General Virology publications from MAG.
viro_mag = load_sframe("Data/mag/viro_mag.sframe")

MAG Virology Publications about the specific diseases:

In [None]:
# Number of rows in the general Virology table.
len(viro_mag)

In [None]:
# Virology publications that are tagged with a disease.
diseases_viro_mag = load_sframe("Data/mag/diseases_viro_mag.sframe")

In [None]:
# Number of rows in the disease-specific Virology table.
len(diseases_viro_mag)

### Number of papers by disease after 2001

In [None]:
# Keep rows with Year > 2001 (2001 itself is excluded) and more than
# `min_refs` references.
diseases = diseases_mag[(diseases_mag["Year"]>2001)&(diseases_mag["Ref Number"]>min_refs)]

In [None]:
# Restrict to the spotlight diseases and count the rows per disease.
diseases = diseases.filter_by(spothlight, "disease")["disease"].value_counts()

In [None]:
# Column names used by the bar plot in the next cell (the y column is
# referenced there by exactly this spelling).
diseases = diseases.rename({"value":"Disease", "count": "Numer of Papers"})

In [None]:
# Bar chart of the paper count per spotlight disease, saved as SVG.
plt.figure(figsize=(20,10))
sns.set()
colors = ["#4374B3", "#4374B3"]
# Set your custom color palette
sns.set_palette(sns.color_palette(colors))

# Every bar is drawn in the single colour passed via `color`.
ax = sns.barplot(x="Disease", y="Numer of Papers", data=diseases.to_dataframe(), color="#4374B3")
# Rotate the disease names so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
plt.tight_layout()
plt.savefig("output/Papers/disease_count.svg")


We filter out all publications that are not academic papers (editorials, letters, etc.).
These types of publication rarely cite other papers, so filtering by the number of references removes them from the dataset.

In [None]:
# Keep only rows with more than `min_refs` references; the disease tables are
# additionally restricted to the spotlight diseases. The four names are
# rebound, so every later cell sees the filtered tables.
med_mag = med_mag[med_mag["Ref Number"]>min_refs]
viro_mag = viro_mag[viro_mag["Ref Number"]>min_refs]
diseases_mag = diseases_mag[diseases_mag["Ref Number"]>min_refs].filter_by(spothlight, "disease")
diseases_viro_mag = diseases_viro_mag[diseases_viro_mag["Ref Number"]>min_refs].filter_by(spothlight, "disease")

### Publications - Citation

#### NPR

Publication data normalization

In [None]:
def nomalize_disease_publications(diseases_sf, general_sf):
    """Compute the normalized paper rate (NPR) per disease and year.

    NPR = papers about the disease in a year / all papers of ``general_sf``
    in that year.

    Args:
        diseases_sf: SFrame with the columns ``disease`` and ``Year``.
        general_sf: SFrame with a ``Year`` column; supplies the yearly totals.

    Returns:
        SFrame with ``Disease``, ``Year``, ``Number of papers``,
        ``Total Number of papers`` and ``NPR``, sorted by disease then year.
    """
    per_disease_year = diseases_sf.groupby(["disease", "Year"], {"Number of papers": agg.COUNT()})
    per_year = general_sf.groupby("Year", {"Total Number of papers": agg.COUNT()})
    merged = per_disease_year.join(per_year, {"Year": "Year"})
    merged["NPR"] = merged["Number of papers"] / merged["Total Number of papers"]
    merged = merged.rename({"disease": "Disease"})
    return merged.sort(["Disease", "Year"])
    

In [None]:
# NPR per disease and year, normalized by the Virology and the Medicine
# corpus respectively.
diseases_pub_count_viro = nomalize_disease_publications(diseases_viro_mag, viro_mag)
diseases_pub_count_med = nomalize_disease_publications(diseases_mag, med_mag)

In [None]:
# Tag each table with its corpus and stack them into one SFrame.
# NOTE(review): later cells filter on the exact label "Virolgy" (sic), so the
# spelling must be changed everywhere at once or not at all.
diseases_pub_count_viro["Type"] = "Virolgy"
diseases_pub_count_med["Type"] = "Medicine"
diseases_pub_count = diseases_pub_count_viro.append(diseases_pub_count_med)


In [None]:
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` that hold at most ``n`` items.

    Same definition as the ``chunks`` helper in the Utility Functions section;
    this later one is the binding in effect after the cell runs.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

def get_data(sf_corr):
    """Yield, per ``Disease`` group with more than 5 rows, its rows sorted by ``Year``.

    ``sf_corr`` is a pandas DataFrame with ``Disease`` and ``Year`` columns.
    """
    for _disease, rows in sf_corr.groupby("Disease"):
        if len(rows) <= 5:
            continue
        yield rows.sort_values("Year")

Filter the data:

In [None]:
# For every spotlight disease keep only the rows from its start year onward.
pub = SFrame()
for d,y in zip(spothlight, years):
    pub = pub.append( diseases_pub_count[(diseases_pub_count["Disease"]==d)&(diseases_pub_count["Year"]>=y)])

In [None]:
# Copy of NPR under the long name used as the y-axis column in the plots.
pub["Normalized Paper Rate"] = pub["NPR"]

Generate SVG

In [None]:

sns.set(font_scale=1.3)
plt.rc('text', usetex=False)

# One DataFrame per disease (Virology corpus, 1980 onward), plotted in pages
# of up to 20 diseases. No plt.figure() call is needed here: create_gird
# builds its own figure through seaborn's FacetGrid.
des = list(get_data(pub[(pub["Year"]>=1980)&(pub["Type"]== "Virolgy")].to_dataframe()))
# Ceiling division, so the progress bar total equals the number of pages.
n_pages = math.ceil(len(des) / 20)
for i, curr_f in enumerate(tqdm(chunks(des, 20), total=n_pages)):
    create_gird(pd.concat(curr_f),"Disease","Type","Year", "Normalized Paper Rate",False,False)
    plt.savefig(f"output/Papers/Virolgy_NPR_{i}.svg")
    # Figures are left open; presumably so the inline backend displays them.



In [None]:

sns.set(font_scale=1.3)
plt.rc('text', usetex=False)

# One DataFrame per disease (Medicine corpus, 1980 onward), plotted in pages
# of up to 20 diseases. No plt.figure() call is needed here: create_gird
# builds its own figure through seaborn's FacetGrid.
des = list(get_data(pub[(pub["Year"]>=1980)&(pub["Type"]== "Medicine")].to_dataframe()))
# Ceiling division, so the progress bar total equals the number of pages.
n_pages = math.ceil(len(des) / 20)
for i, curr_f in enumerate(tqdm(chunks(des, 20), total=n_pages)):
    create_gird(pd.concat(curr_f),"Disease","Type","Year", "Normalized Paper Rate",False,False)
    plt.savefig(f"output/Papers/Medicine_NPR_{i}.svg")
    # Figures are left open; presumably so the inline backend displays them.



Generate multi-page PDF

In [None]:

sns.set(font_scale=1.3)

# The with statement makes sure the PdfPages object is closed properly at the
# end of the block, even if an exception occurs.
with PdfPages('output/Papers/Medicine_NPR.pdf') as pdf:
    plt.rc('text', usetex=False)
    # No plt.figure() call is needed: create_gird builds its own figure
    # through seaborn's FacetGrid, and that figure is what gets saved.
    des = list(get_data(pub[(pub["Year"]>=1980)&(pub["Type"]== "Medicine")].to_dataframe()))
    # Ceiling division, so the progress bar total equals the number of pages.
    n_pages = math.ceil(len(des) / 20)
    for i, curr_f in enumerate(tqdm(chunks(des, 20), total=n_pages)):
        create_gird(pd.concat(curr_f),"Disease","Type","Year", "Normalized Paper Rate",False,False)
        # One PDF page per grid; close the figure once it has been written.
        pdf.savefig()
        plt.close()



In [None]:
# Overwrite the plotting column with the natural log of NPR. Plot cells that
# read "Normalized Paper Rate" show log values once this cell has run.
pub["Normalized Paper Rate"] = np.log(pub["NPR"])

In [None]:
import plotly.express as px

# Interactive line chart, one line per disease: Virology corpus, years after 1959.
fig = px.line(pub[(pub["Type"]=="Virolgy")&(pub["Year"]>1959)].to_dataframe(), x="Year", y="Normalized Paper Rate",color="Disease", width=1600, height=800)
# Horizontal legend positioned at x=0, y=1.1; larger font.
fig.update_layout({"legend":{"x":0,"y":1.1}, "legend_orientation":"h"},  font=dict(
        size=20,
    ))


fig.show()

In [None]:
# import plotly.io as pio
# pio.orca.config.server_url = "http://localhost:9091"
# fig.write_image("output/Papers/disease-npr.svg")


Plot Similarity Using DTW

In [None]:
# Virology NPR series for 1980-2018, as a pandas DataFrame.
data = pub[(pub["Year"]>=1980)&(pub["Type"]== "Virolgy")&(pub["Year"]<2019)][["Disease","Year","NPR"]].to_dataframe()

In [None]:
# Chronological order within each disease, so each group is a time series.
data = data.sort_values(["Disease","Year"])

In [None]:
from tslearn.metrics import dtw 

# DTW distance between the NPR series of every ordered pair of diseases
# (self-pairs and both directions included), stored in long format.
res= {"Disease1":[], "Disease2":[], "dtw":[]}
for d1, df1 in data.groupby("Disease"):
    for d2, df2 in data.groupby("Disease"):
        res["Disease1"].append(d1)
        res["Disease2"].append(d2)
        disease1 = df1["NPR"].values
        disease2 = df2["NPR"].values
        res["dtw"].append(dtw(disease1, disease2))


In [None]:
# One NPR array per disease, in groupby order; input for the clustering below.
piv_data = []
for d, df in data.groupby("Disease"):
    piv_data.append(df["NPR"].values)

In [None]:
sns.set(font_scale=2.0)
# Square disease-by-disease matrix of the DTW distances.
corr = pd.DataFrame(res).pivot(index='Disease1', columns='Disease2', values='dtw')
# Mask the upper triangle including the diagonal, so each pair is shown once.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    plt.figure(figsize=(40,20))
    # The colour scale is capped at 0.3.
    ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True, annot=True, fmt='0.3f', cmap=sns.light_palette("#cc0000" , reverse=True, as_cmap=True))
    plt.savefig("output/Papers/dtw_npr.svg")


In [None]:
from collections import defaultdict

from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

# Cluster the per-disease NPR series into two groups with DTW k-means.
# random_state is fixed because the centroid initialisation is random;
# without it the cluster assignment can change from run to run.
km = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=10, tol=1e-5, random_state=0).fit(to_time_series_dataset(piv_data))

# Cluster label -> disease names. This relies on `piv_data` and `corr.index`
# listing the diseases in the same order (both are derived from `data`).
clusters = defaultdict(list)
for d, c in zip(corr.index, km.labels_):
    clusters[c].append(d)
clusters

#### NCR

In [None]:
# Calculate the number of citations for each disease per year.
def diseses_citations_year(publication_sf):
    """Sum the citations per disease and citation year.

    Args:
        publication_sf: SFrame with a ``disease`` column and a dict column
            ``Dict of Year_Citation Number`` (citation year -> citations).

    Returns:
        SFrame with the columns ``disease``, ``year`` (int) and ``Citations``.
    """
    # One row per (paper, citation year); entries without a value are dropped.
    stacked = publication_sf.stack(
        "Dict of Year_Citation Number",
        new_column_name=["cite year", "Citations"],
        drop_na=True,
    )
    yearly = stacked.groupby(["disease", "cite year"], {"Citations": agg.SUM("Citations")})
    yearly["cite year"] = yearly["cite year"].astype(int)
    return yearly.rename({"cite year": "year"})

In [None]:
# Citations per disease and year, Virology corpus.
disease_citations_viro = diseses_citations_year(diseases_viro_mag)

In [None]:
# Citations per disease and year, Medicine corpus.
disease_citations_med = diseses_citations_year(diseases_mag)

In [None]:
# The total number of citations for a year, used to normalize the data.
def citaion_year_mag(publication_sf):
    """Sum the citations of all publications per citation year.

    Args:
        publication_sf: SFrame with the dict column
            ``Dict of Year_Citation Number`` (citation year -> citations).

    Returns:
        SFrame with the columns ``year`` (int) and ``Total Citations``.
    """
    stacked = publication_sf.stack(
        "Dict of Year_Citation Number",
        new_column_name=["cite year", "Citations"],
        drop_na=True,
    )
    stacked = stacked.rename({"cite year": "year"})
    totals = stacked.groupby(["year"], operations={"Total Citations": agg.SUM("Citations")})
    # Cast the key to int, as diseses_citations_year does, so the two results
    # can be joined on "year" for every corpus and not only for the one that
    # is cast by hand afterwards.
    totals["year"] = totals["year"].astype(int)
    return totals

In [None]:
# Yearly citation totals over the whole Virology corpus.
citations_year_viro = citaion_year_mag(viro_mag)

In [None]:
# Yearly citation totals over the whole Medicine corpus.
citations_year_med = citaion_year_mag(med_mag)

In [None]:
# Make sure the Medicine year key is an int.
# NOTE(review): no matching cast is applied to citations_year_viro in this notebook.
citations_year_med["year"] = citations_year_med["year"].astype(int)

In [None]:
# Display the years ordered by total citations, largest first.
citations_year_med.sort("Total Citations",False)

Medicine citations over time

In [None]:
# Line plot of the total Medicine citations by year.
citations_year_med.to_dataframe().sort_values("year").plot(x="year", y="Total Citations")

Citation data normalization

In [None]:
def norm_disease_citations(disease_citations, citations_year):
    """Attach yearly totals and the normalized citation share to each row.

    Args:
        disease_citations: SFrame with ``year`` and ``Citations`` columns.
        citations_year: SFrame with ``year`` and ``Total Citations`` columns.

    Returns:
        The joined SFrame with an added ``Citations Norm`` column
        (``Citations`` / ``Total Citations``), further joined with the
        module-level ``disease_names`` table.
    """
    with_totals = disease_citations.join(citations_year, on="year")
    share = with_totals["Citations"] / with_totals["Total Citations"]
    with_totals["Citations Norm"] = share
    # No key is given, so the join columns are chosen by the SFrame.join
    # default (TODO confirm which columns the two tables share).
    return with_totals.join(disease_names)

In [None]:
# Normalize each disease table by the yearly totals of its own corpus.
disease_citations_med = norm_disease_citations(disease_citations_med, citations_year_med)
disease_citations_viro = norm_disease_citations(disease_citations_viro, citations_year_viro)

In [None]:
def clean_disease_citations(disease_citations):
    """Rename the columns for plotting and return a pandas frame of pre-2019 rows.

    Renames ``year`` -> ``Year``, ``Citations Norm`` -> ``NCR`` and
    ``disease`` -> ``Disease``, joins the module-level ``disease_names`` table
    on ``id``, sorts by disease and year, converts to pandas and keeps only
    rows whose ``Year`` is present and below 2019.

    Returns:
        pandas DataFrame; the previous index is kept as an ``index`` column.
    """
    column_names = {"year": "Year", "Citations Norm": "NCR", "disease": "Disease"}
    renamed = disease_citations.rename(column_names)
    joined = renamed.join(disease_names, {"id": "id"})
    frame = joined.sort(["Disease", "Year"]).to_dataframe()
    # A missing Year never satisfies the comparison, so a single mask covers
    # both the not-null and the < 2019 condition.
    keep = frame["Year"].notna() & (frame["Year"] < 2019)
    return frame[keep].reset_index()

In [None]:
# From here on both citation tables are pandas DataFrames.
disease_citations_med = clean_disease_citations(disease_citations_med)
disease_citations_viro = clean_disease_citations(disease_citations_viro)

In [None]:
# Tag each table with its corpus and stack them into one DataFrame.
disease_citations_med["Type"] = "Medicine"
disease_citations_viro["Type"] = "Virology"

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The original row indices are kept, as append did.
disease_citations = pd.concat([disease_citations_med, disease_citations_viro])


In [None]:
# For every spotlight disease keep only the rows from its start year onward.
# The pieces are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop copies the frame on
# every iteration.
cite_parts = []
for d,y in zip(spothlight, years):
    cite_parts.append(disease_citations[(disease_citations["Disease"]==d)&(disease_citations["Year"]>=y)])
cite = pd.concat(cite_parts)

In [None]:
# Copy of NCR under the long name used as the y-axis column in the plots
# (the column name is spelled "Citaion" here).
cite["Normalized Citaion Rate"] = cite["NCR"]

In [None]:

sns.set(font_scale=1.3)
# sns.set(style="ticks")

plt.rc('text', usetex=False)

# One DataFrame per disease (Medicine corpus, 1980 onward), plotted in pages
# of up to 20 diseases. No plt.figure() call is needed here: create_gird
# builds its own figure through seaborn's FacetGrid.
des = list(get_data(cite[(cite["Year"]>=1980)&(cite["Type"]== "Medicine")]))
# Ceiling division, so the progress bar total equals the number of pages.
n_pages = math.ceil(len(des) / 20)
for i, curr_f in enumerate(tqdm(chunks(des, 20), total=n_pages)):
    create_gird(pd.concat(curr_f),"Disease","Type","Year", "Normalized Citaion Rate", False, legend=False)
    plt.savefig(f"output/Papers/Medicine_NCR_{i}.svg")
    # Figures are left open; presumably so the inline backend displays them.



In [None]:

sns.set(font_scale=1.3)

plt.rc('text', usetex=False)

# One DataFrame per disease (Virology corpus, 1980 onward), plotted in pages
# of up to 20 diseases. No plt.figure() call is needed here: create_gird
# builds its own figure through seaborn's FacetGrid.
des = list(get_data(cite[(cite["Year"]>=1980)&(cite["Type"]== "Virology")]))
# Ceiling division, so the progress bar total equals the number of pages.
n_pages = math.ceil(len(des) / 20)
for i, curr_f in enumerate(tqdm(chunks(des, 20), total=n_pages)):
    create_gird(pd.concat(curr_f),"Disease","Type","Year", "Normalized Citaion Rate", False, legend=False)
    plt.savefig(f"output/Papers/Virolgy_NCR_{i}.svg")
    # Figures are left open; presumably so the inline backend displays them.



In [None]:
# NOTE(review): looks like leftover scratch work; the value is not used later.
np.log(10)

In [None]:
# NOTE(review): looks like leftover scratch work. np.log is the natural log,
# so this does not give back 6 (np.exp(np.log(6)) would).
10 ** np.log(6)

In [None]:
# Natural log of NCR, stored in a NEW column spelled "Citation". The earlier
# "Normalized Citaion Rate" column (different spelling) still holds raw NCR.
cite["Normalized Citation Rate"] = np.log(cite["NCR"])

In [None]:
import plotly.express as px

# Plot the log-transformed rate computed just above. That column is spelled
# "Normalized Citation Rate"; the similarly named "Normalized Citaion Rate"
# column holds the raw NCR values, so using it here would silently skip the
# log transform.
fig = px.line(cite, x="Year", y="Normalized Citation Rate",color="Disease", width=1600, height=800)

fig.show()

In [None]:
# Virology NCR series for 1980-2018; rebinds `data` from the NPR analysis.
data = cite[(cite["Year"]>=1980)&(cite["Type"]== "Virology")&(cite["Year"]<2019)][["Disease","Year","NCR"]]

In [None]:
# Chronological order within each disease, so each group is a time series.
data = data.sort_values(["Disease","Year"])

In [None]:
from tslearn.metrics import dtw 

# DTW distance between the NCR series of every ordered pair of diseases
# (self-pairs and both directions included), stored in long format.
res= {"Disease1":[], "Disease2":[], "dtw":[]}
for d1, df1 in data.groupby("Disease"):
    for d2, df2 in data.groupby("Disease"):
        res["Disease1"].append(d1)
        res["Disease2"].append(d2)
        disease1 = df1["NCR"].values
        disease2 = df2["NCR"].values
        res["dtw"].append(dtw(disease1, disease2))


In [None]:
# One NCR array per disease, in groupby order; input for the clustering below.
piv_data = []
for d, df in data.groupby("Disease"):
    piv_data.append(df["NCR"].values)

In [None]:
sns.set( font_scale=2.0)

# Square disease-by-disease matrix of the DTW distances.
corr = pd.DataFrame(res).pivot(index='Disease1', columns='Disease2', values='dtw')
# Mask the upper triangle including the diagonal, so each pair is shown once.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    plt.figure(figsize=(40,20))
    # The colour scale is capped at 0.3.
    ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True, annot=True, fmt='0.3f', cmap=sns.light_palette("#cc0000" , reverse=True, as_cmap=True))
    plt.savefig("output/Papers/dtw-ncr.svg")

In [None]:
from collections import defaultdict

from tslearn.generators import random_walks
from tslearn.clustering import TimeSeriesKMeans
# Imported here as well, so this cell does not depend on the NPR clustering
# cell having been run first.
from tslearn.utils import to_time_series_dataset
# X = random_walks(n_ts=50, sz=32, d=1)

# Cluster the per-disease NCR series into two groups with DTW k-means.
# random_state is fixed because the centroid initialisation is random;
# without it the cluster assignment can change from run to run.
km = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=10, tol=1e-5, random_state=0).fit(to_time_series_dataset(piv_data))

# Cluster label -> disease names. This relies on `piv_data` and `corr.index`
# listing the diseases in the same order (both are derived from `data`).
clusters = defaultdict(list)
for d, c in zip(corr.index, km.labels_):
    clusters[c].append(d)
clusters

### Data and Code in research

In [None]:
from ScienceDynamics.datasets.microsoft_academic_graph import MicrosoftAcademicGraph
from ScienceDynamics.config.configs import DATASETS_BASE_DIR
# Handle on the MAG dataset rooted at DATASETS_BASE_DIR; used below for its
# `paper_resources` table.
mag = MicrosoftAcademicGraph(DATASETS_BASE_DIR)

In [None]:
# Attach the resource records to the disease papers, matched on PaperId.
resources = diseases_mag.join(mag.paper_resources, on="PaperId")

ResourceType: 1 = Project, 2 = Data, 4 = Code

In [None]:
# Rows per disease with ResourceType == 2.
resources[resources["ResourceType"]==2]["disease"].value_counts()

In [None]:
# Total number of rows with ResourceType == 2.
len(resources[resources["ResourceType"]==2]["disease"])

In [None]:
# Total number of rows with ResourceType == 4.
len(resources[resources["ResourceType"]==4]["disease"])

In [None]:
# Rows per disease with ResourceType == 4.
resources[resources["ResourceType"]==4]["disease"].value_counts()

In [None]:
# Rows per disease with ResourceType == 1.
resources[resources["ResourceType"]==1]["disease"].value_counts()

## Data Fusion

In [None]:
# Disease-specific PubMed records.
diseases_pubmed = load_sframe("Data/pubmed/diseases_pubmed.sframe")

In [None]:
# Disease-specific PubMed papers per year.
# NOTE(review): this name is rebound from the full PubMed table a few cells
# below, before it is read.
pubmed_papers_year = diseases_pubmed.groupby("year",{"PubMed":agg.COUNT()})

In [None]:
# Disease-specific MAG papers per year.
# NOTE(review): this name is rebound from med_mag a few cells below, before
# it is read.
mag_papers_year =  diseases_mag.groupby("Year",{"MAG":agg.COUNT()})

In [None]:
# Full PubMed table.
pubmed = load_sframe("Data/pubmed/pubmed.sframe")

In [None]:
# PubMed papers per year (column "PubMed").
pubmed_papers_year = pubmed.groupby("year",{"PubMed":agg.COUNT()})

In [None]:
# MAG Medicine papers per year (column "MAG"); med_mag has already been
# restricted to rows with more than `min_refs` references at this point.
mag_papers_year =  med_mag.groupby("Year",{"MAG":agg.COUNT()})

In [None]:
# Yearly counts of both datasets side by side, matched on the year.
df = pubmed_papers_year.join(mag_papers_year,{"year":"Year"}).sort("year")

In [None]:
# Capitalised column name, as used by the plot below.
df =df.rename({"year":"Year"})

In [None]:
# Wide -> long: pack the MAG and PubMed counts into one dict column, then
# stack it into (Dataset, Total Papers) rows.
df2 = df.pack_columns(column_names=["MAG","PubMed"], dtype=dict, new_column_name='Papers').stack("Papers", new_column_name=['Dataset', 'Total Papers'])

In [None]:
import plotly.express as px

# Yearly paper totals per dataset, for years before 2016.
fig = px.line(df2[df2["Year"]<2016].to_dataframe(), x="Year", y="Total Papers",color="Dataset", width=1600, height=800)
# Horizontal legend positioned at x=0, y=1.1; larger font.
fig.update_layout({"legend":{"x":0,"y":1.1}, "legend_orientation":"h"},  font=dict(
        size=20,
    ))


fig.show()

In [None]:
# fig.write_image("output/Papers/Total Papers.svg")
