In [None]:
%load_ext autoreload
%autoreload 2
%aimport
%matplotlib inline

In [None]:
import os
import sys
# Directory two levels above the current working directory; it is appended to
# sys.path once (presumably so the project package can be imported - confirm).
nb_dir = os.path.dirname(os.path.dirname(os.getcwd()))
if not any(entry == nb_dir for entry in sys.path):
    sys.path.append(nb_dir)

In [None]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from turicreate import SFrame, load_sframe
from pathlib import Path
import turicreate.aggregate as agg
import numpy as np

In [None]:
from ScienceDynamics.datasets import SJR

In [None]:
import json
import os
import matplotlib.pyplot as plt
import pandas as pd
import math
import glob
import ntpath
from tqdm import tqdm
import re

import datetime
from matplotlib.backends.backend_pdf import PdfPages
from  matplotlib.ticker import FuncFormatter

## Draw Graphs

In [None]:
import seaborn as sns
from  matplotlib.ticker import FuncFormatter

# Apply the seaborn theme exactly once, with enlarged fonts.
# A bare `sns.set()` after this call would reset font_scale to its default of 1,
# and a standalone `plt.figure(...)` is not needed because FacetGrid creates its
# own figure (the extra figure would only show up as an empty output).
sns.set(font_scale=1.3)
def create_gird(df, col, hue,x,y,sharey=True, legend=False):
    """Draw ``y`` against ``x`` as line plots on a seaborn FacetGrid.

    Parameters
    ----------
    df : data passed straight to ``sns.FacetGrid``.
    col : column whose values define the facets (wrapped after 3 per row).
    hue : column used to colour the lines.
    x, y : column names plotted with ``plt.plot`` on every facet.
    sharey : forwarded to ``FacetGrid``; whether facets share the y axis.
    legend : when true, a legend is added to the grid.

    Returns
    -------
    The ``FacetGrid`` that was drawn on.
    """
    hls_colors = sns.color_palette("hls", 4)
    facets = sns.FacetGrid(
        df,
        col=col,
        hue=hue,
        palette=hls_colors,
        sharey=sharey,
        col_wrap=3,
        height=4.5,
    )

    # Format x tick values through int() on the current axes.
    int_ticks = FuncFormatter(lambda tick, _pos: int(tick))
    plt.gca().xaxis.set_major_formatter(int_ticks)

    # Dotted grey reference line at y = 0 on every facet.
    facets.map(plt.axhline, y=0, ls=":", c=".5")

    # One line per hue level on every facet, titled with the facet value only.
    facets.map(plt.plot, x, y)
    facets.set_titles("{col_name}")

    if legend:
        facets.add_legend()

    # Tighten the layout, keeping a little horizontal padding between facets.
    facets.fig.tight_layout(w_pad=1)
    return facets

In [None]:
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(l), n)
    yield from (l[offset:offset + n] for offset in offsets)


## Analysis

### Load Data

In [None]:
# Diseases this analysis is restricted to; used further down with
# filter_by(spothlight, "Disease").
spothlight = ["SARS","MERS Coronavirus", "Avian Influenza","Ebola", "Influenza", "HIV/AIDS","Hepatitis B","Hepatitis C", "Swine Flu"]

In [None]:
diseases_id= load_sframe("Data/diseases_list.csv")
disease_names = SFrame.read_csv("Data/disease_names.csv")

In [None]:
disease_names = disease_names.rename({"disease":"Disease"})

##### SJR

In [None]:
sjr = SJR()

In [None]:
sjr_sf = sjr.data

Convert the SJR column from string to float

In [None]:
def _parse_sjr(raw):
    """Convert one SJR cell to a float.

    String values get "," replaced by "." before parsing (presumably the source
    table uses decimal commas - confirm); an empty string becomes the sentinel
    -1.0. Values that are already numeric are passed through unchanged, so this
    cell can be re-run after the column has been converted.
    """
    if isinstance(raw, str):
        return float(raw.replace(",", ".") or "-1")
    return float(raw)


# Single pass over the column; missing values are left to SArray.apply's
# default handling.
sjr_sf["SJR"] = sjr_sf["SJR"].apply(_parse_sjr, dtype=float)

##### Pubmed

In [None]:
diseases_pubmed = load_sframe("Data/pubmed/diseases_pubmed.sframe")

### Journals Current State

Strip the hyphens from the PubMed linking ISSN so that it can be joined with the ISSN column of the SJR dataset

In [None]:
diseases_pubmed["issn_linking"]

In [None]:
# Remove the hyphen from PubMed's linking ISSN and store it as "ISSN",
# the column later used as a join key against the SJR table.
diseases_pubmed["ISSN"] = diseases_pubmed["issn_linking"].apply(lambda x: x.replace("-", ""))

SJR is published with a delay of two years; to account for this, we shift each SJR year forward by two so that it matches the actual publication date.

In [None]:
# Shift every SJR year forward by 2.
# NOTE: "Year" is overwritten in place, so running this cell again adds another 2;
# rebuild sjr_sf before re-running it.
sjr_sf["Year"] = sjr_sf["Year"]+2

In [None]:
diseases_pubmed = diseases_pubmed.rename({"disease":"Disease"})

In [None]:
diseases_pubmed = diseases_pubmed.filter_by(spothlight,"Disease")

In [None]:
# Combine the SJR table with the PubMed records: rows are matched on "ISSN" and
# on the SJR "Year" (already shifted by 2 at this point) against the PubMed "year".
pubmed_sjr = sjr_sf.join(diseases_pubmed, on={"ISSN":"ISSN","Year":"year"})

In [None]:
pubmed_sjr = pubmed_sjr.filter_by(spothlight,"Disease")

In [None]:
pubmed_sjr=pubmed_sjr.unique()

In [None]:
# Split the "Categories" string on every " (...)" group, optionally followed by ";".
# For a value shaped like "Immunology (Q1); Virology (Q2)" this gives
# ["Immunology", " Virology", ""]; the [:-1] drops that last piece.
# NOTE(review): if a value does not end with a "(...)" group, [:-1] drops a real
# category instead of an empty string - confirm against the data.
pubmed_sjr["Categories Clean"] = pubmed_sjr["Categories"].apply(lambda x: re.compile(r"\s\(.*?\);{0,1}").split(x)[:-1])

In [None]:
pubmed_sjr["Categories Clean"] = pubmed_sjr["Categories Clean"].apply(lambda x: {c.strip() for c in x})

In [None]:
cat = {"Immunology", "Epidemiology", "Infectious Diseases", "Virology", "Microbiology"}

In [None]:
pubmed_sjr = pubmed_sjr[pubmed_sjr["Categories Clean"].apply(lambda x: len(set(x) & cat)>0)]

In [None]:
# Number of distinct PubMed ids (pmid) per journal ISSN, disease and year.
journal_disease = pubmed_sjr.groupby(["ISSN","Disease","Year"], operations={"Count": agg.COUNT_DISTINCT("pmid")})

In [None]:
journal_map = pubmed_sjr[["SJR" ,"Title", "ISSN","Year"]].unique()

In [None]:
journal_disease = journal_disease.join(journal_map,["ISSN","Year"])

In [None]:
journal_disease = journal_disease[journal_disease["SJR"]>0]

In [None]:
journal_disease_df = journal_disease.to_dataframe()

### Top Journals by Publications

In [None]:
journal_disease_sf = SFrame(journal_disease_df)

In [None]:
journal_disease_sf["Score"] = journal_disease_sf["Count"]*journal_disease_sf["SJR"]

In [None]:
journal_disease_score = journal_disease_sf.groupby("Disease",{"Count":agg.SUM("Count"), "Score":agg.SUM("Score")})

In [None]:
# JScore = sum(Count * SJR) / sum(Count) per disease, i.e. the mean SJR
# weighted by the number of papers.
journal_disease_score["JScore"] = journal_disease_score["Score"] /journal_disease_score["Count"] 

In [None]:
journal_disease_score.sort("JScore",False)

In [None]:
# This is the first write into output/Journals; create the folder if it does not
# exist yet so the notebook also runs on a fresh checkout.
os.makedirs("output/Journals", exist_ok=True)
# Pass the format explicitly instead of relying on the file extension.
journal_disease_score.save("output/Journals/avg_jscore.csv", "csv")

In [None]:
journal_disease_year_score = journal_disease_sf.groupby(["Disease","Year"],{"Count":agg.SUM("Count"), "Score":agg.SUM("Score")})

In [None]:
journal_disease_year_score["JScore"] = journal_disease_year_score["Score"] /journal_disease_year_score["Count"] 

In [None]:
journal_disease_year_score.save("output/Journals/journal_disease_year_score.csv","csv")

In [None]:
journal_disease_year_score = journal_disease_year_score.sort(["Disease", "Year"])

In [None]:

sns.set(font_scale=1.3)
plt.rc('text', usetex=False)

# Keep only disease/year rows whose summed paper count exceeds 5, then plot the
# yearly JScore with one facet per disease.
has_enough_papers = journal_disease_year_score["Count"]>5
jscore_plot_df = journal_disease_year_score[has_enough_papers].to_dataframe()
create_gird(jscore_plot_df,"Disease","Disease","Year", "JScore")
plt.savefig('output/Journals/jscore.svg')



In [None]:
# Split the SJR "Categories" string on every " (...)" group (optionally followed
# by ";") and drop the last piece of the split.
sjr_sf["Categories Clean"] = sjr_sf["Categories"].apply(lambda x: re.compile(r"\s\(.*?\);{0,1}").split(x)[:-1])
# Strip surrounding whitespace from each category name and de-duplicate them.
sjr_sf["Categories Clean"] = sjr_sf["Categories Clean"].apply(lambda x: {c.strip() for c in x})

In [None]:
sjr_sf = sjr_sf[sjr_sf["Categories Clean"].apply(lambda x: len(set(x) & cat)>0)]

In [None]:
# For every (shifted) SJR year from 2002 to 2018, collect the 10 and the 3
# journals that come first when sorted by "Rank".
jtop_10 = SFrame()
jtop_3 = SFrame()
for year in range(2002,2019):
    # Filter and sort once per year; both selections are prefixes of the same
    # ordering, so the top 3 are always the first three of the top 10.
    ranked_for_year = sjr_sf.filter_by(year, "Year").sort('Rank')
    jtop_10 = jtop_10.append(ranked_for_year[:10])
    jtop_3 = jtop_3.append(ranked_for_year[:3])

In [None]:
# Keep only the PubMed/SJR rows whose (Year, Sourceid) belongs to a top-10 journal.
# NOTE(review): the following cell starts with this exact same join, so this
# cell looks redundant.
papers_sjr_top_10 = pubmed_sjr.join( jtop_10[["Year","Sourceid"]], on=["Year","Sourceid"])


In [None]:
papers_sjr_top_10 = pubmed_sjr.join( jtop_10[["Year","Sourceid"]], on=["Year","Sourceid"])
papers_top_10= papers_sjr_top_10.groupby(["Disease","Year"], {"Count":agg.COUNT()})
papers_top_10 = papers_top_10.sort(["Disease","Year"])

In [None]:
papers_top_10["# of Papaers in Top-10"] = papers_top_10["Count"]

In [None]:

plt.rc('text', usetex=False)
top10_grid = create_gird(papers_top_10.to_dataframe(),"Disease","Disease","Year", "# of Papaers in Top-10", False, False)
# The plotted column name is misspelled ("Papaers") and would be used as the
# y-axis label; override the labels so the saved figure is spelled correctly.
top10_grid.set_axis_labels("Year", "# of Papers in Top-10")
plt.savefig('output/Journals/top10.svg')



In [None]:
papers_sjr_top_3 = pubmed_sjr.join( jtop_3[["Year","Sourceid"]], on=["Year","Sourceid"])
papers_top_3= papers_sjr_top_3.groupby(["Disease","Year"], {"Count":agg.COUNT()})
papers_top_3 = papers_top_3.sort(["Disease","Year"])

In [None]:


# LaTeX text rendering is switched off, so no TeX installation is needed.
plt.rc('text', usetex=False)
# No plt.figure() call here: FacetGrid opens its own figure, so an extra one
# would only be left behind empty.
create_gird(papers_top_3.to_dataframe(),"Disease","Disease","Year", "Count", True, False)
plt.savefig("output/Journals/top3.svg")



In [None]:
papaers_q = pubmed_sjr.groupby(["Year","Disease", "SJR Best Quartile"],{"Count": agg.COUNT()})

In [None]:
papaers_q = papaers_q.sort(["Disease", "Year"])

In [None]:
papaers_q=papaers_q[papaers_q["SJR Best Quartile"]!="-"]

In [None]:
plt.rc('text', usetex=False)
# Plot only the Q1 rows. No plt.figure() call here: FacetGrid opens its own
# figure, so an extra one would only be left behind empty.
create_gird(papaers_q[papaers_q["SJR Best Quartile"]=="Q1"].to_dataframe(),"Disease","Disease","Year", "Count", False)
plt.savefig('output/Journals/q1.svg')

In [None]:
papaers_q["# of Papaers in Quartile"] = papaers_q["Count"]

In [None]:
plt.rc('text', usetex=False)
# No plt.figure() call here: FacetGrid opens its own figure, so an extra one
# would only be left behind empty.
quartile_grid = create_gird(papaers_q.to_dataframe(),"Disease","SJR Best Quartile","Year", "# of Papaers in Quartile", False, True)
# The plotted column name is misspelled ("Papaers") and would be used as the
# y-axis label; override the labels so the saved figure is spelled correctly.
quartile_grid.set_axis_labels("Year", "# of Papers in Quartile")
plt.savefig('output/Journals/q.svg')