In [None]:
%load_ext autoreload
%autoreload 2
%aimport
%matplotlib inline

In [None]:
import os
import sys
nb_dir = os.path.dirname(os.path.split(os.getcwd())[0])
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [None]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from turicreate import SFrame, load_sframe
from pathlib import Path
import turicreate.aggregate as agg
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from  matplotlib.ticker import FuncFormatter
import re
import datetime
from matplotlib.backends.backend_pdf import PdfPages
import geopandas


In [None]:
from collections import defaultdict
from itertools import chain

In [None]:
def chunks(l, n):
    """Lazily split the sequence ``l`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

def get_d(sf_corr):
    """Yield every "Disease" group of ``sf_corr`` with more than 5 rows, sorted by "Year".

    ``sf_corr`` must offer ``groupby("Disease")`` yielding ``(key, frame)``
    pairs whose frames support ``sort_values`` (e.g. a pandas DataFrame).
    """
    for _disease, group in sf_corr.groupby("Disease"):
        if len(group) <= 5:
            continue
        yield group.sort_values("Year")

In [None]:

# NOTE(review): this opens a new, empty 20x10 figure; create_gird() builds its
# own figure through FacetGrid, so this one appears to go unused -- confirm.
plt.figure(figsize=(20, 10))

# Apply seaborn's "ticks" style.
sns.set(style="ticks")
def create_gird(df, col, hue,x,y):
    """Plot ``y`` against ``x`` as one line chart per distinct value of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form data holding the ``col``, ``hue``, ``x`` and ``y`` columns.
    col : str
        Column whose distinct values each get their own facet (three per row).
    hue : str
        Column that selects the line colour from the "tab20c" palette.
    x, y : str
        Columns plotted on the horizontal and vertical axis.

    Returns
    -------
    seaborn.FacetGrid
        The populated grid, so callers can tweak or save the figure.
    """
    grid = sns.FacetGrid(df, col=col, hue=hue, palette="tab20c",
                         col_wrap=3, height=4.5)

    # Label x ticks as whole numbers.  The formatter is attached to every
    # facet explicitly instead of to plt.gca(), which only names whichever
    # axes happens to be current, and it returns a string as FuncFormatter
    # expects.
    for ax in grid.axes.flat:
        ax.xaxis.set_major_formatter(FuncFormatter(lambda value, _pos: str(int(value))))

    # Dotted horizontal reference line at y = 0 in every facet.
    grid.map(plt.axhline, y=0, ls=":", c=".5")

    # One line per hue level in each facet.
    grid.map(plt.plot, x, y)

    # Adjust the arrangement of the plots.
    grid.fig.tight_layout(w_pad=1)
    return grid

In [None]:
# Load the disease id and disease name tables from the Data directory.
diseases_id= load_sframe("Data/diseases_id.csv")
disease_names= SFrame.read_csv("Data/disease_names.csv")

In [None]:
# Diseases analysed in this notebook; used to filter the papers and to drive
# the per-disease loops and plots.
spothlight = ["SARS","MERS Coronavirus", "Avian Influenza","Ebola", "Influenza", "HIV/AIDS","Hepatitis B","Hepatitis C", "Swine Flu"]

In [None]:
import networkx as nx

## Pre-Processing

In [None]:
from ScienceDynamics.datasets.microsoft_academic_graph import MicrosoftAcademicGraph
# Dataset handle; it supplies the paper-author-affiliation and affiliation
# tables used in the following cells.
mag = MicrosoftAcademicGraph()

In [None]:
# Papers tagged with a disease (presumably one row per paper/disease pair -- TODO confirm schema).
diseases_mag = load_sframe("Data/mag/diseases_med_mag.sframe")

In [None]:
# Keep only the spotlight diseases.
diseases_mag = diseases_mag.filter_by(spothlight,"disease")

In [None]:
# The group-bys and plots further down refer to the column as "Disease".
diseases_mag = diseases_mag.rename({"disease":"Disease"})

In [None]:
# Keep rows whose "Ref Number" exceeds 5 (presumably the paper's reference count -- TODO confirm).
diseases_mag= diseases_mag[diseases_mag["Ref Number"]>5]

In [None]:
paper_ids = diseases_mag["PaperId"]

In [None]:
paper_author_aff = mag.paper_author_affiliations

In [None]:
# Restrict the author-affiliation table to the selected papers.
paper_author_aff = paper_author_aff.filter_by(paper_ids,"PaperId")

Fill in affiliations that are missing from the affiliation table

In [None]:
import geonamescache

# Name sets used to spot cities and countries among affiliation tokens.
gc = geonamescache.GeonamesCache()
cities = {city['name'] for city in gc.get_cities().values()}
countries = {entry['name'] for entry in gc.get_countries().values()}

In [None]:
import re
from geonamescache.mappers import country
mapper = country(from_key='iso', to_key='name')

def add_location_from_original_affilation(paper_author_aff):
    """Guess a city and a country for each row from its free-text affiliation.

    The "OriginalAffiliation" string is split into alphanumeric tokens that
    are matched against the module-level ``cities`` and ``countries`` name
    sets.  When a city is recognised but no country is, and geonamescache
    knows exactly one city of that name, that city's country is used.

    Side effects: casts "AffiliationId" to int and adds the columns
    "NormalizedAffiliation", "City_A" and "Country_A" to ``paper_author_aff``
    (values without a match are stored as missing).

    NOTE: matching is per token and exact, so multi-word names such as
    "New York" or "United Kingdom" are never recognised.

    Returns the frame without the temporary "Country_Temp" column.
    """
    # Build the ISO-code -> country-name mapper locally: the notebook later
    # rebinds the global ``mapper`` with different keys, which would silently
    # change this function's result on a re-run.
    iso_to_name = country(from_key='iso', to_key='name')

    paper_author_aff["AffiliationId"] = paper_author_aff["AffiliationId"].astype(int)
    paper_author_aff["NormalizedAffiliation"] = paper_author_aff["OriginalAffiliation"].apply(lambda x: set(re.sub(r"[^a-zA-Z0-9]+", ' ', x).split(" ")))

    # First matching city token, or a missing value when there is none.
    paper_author_aff["City_A"] = paper_author_aff["NormalizedAffiliation"].apply(lambda x: set(x)&cities)
    paper_author_aff["City_A"] = paper_author_aff["City_A"].apply(lambda x: x[0] if x else "")
    paper_author_aff["City_A"] = paper_author_aff["City_A"].apply(lambda x: None if x=="" else x)

    # First matching country token; "" (not None) marks "no match" here.
    paper_author_aff["Country_A"] = paper_author_aff["NormalizedAffiliation"].apply(lambda x: set(x)&countries)
    paper_author_aff["Country_A"] = paper_author_aff["Country_A"].apply(lambda x:x[0] if x else "")

    # Candidate city records for rows that have a city but no country.
    paper_author_aff["Country_Temp"] = paper_author_aff.apply(lambda x: gc.get_cities_by_name(x["City_A"]) if x["City_A"] and not x["Country_A"] else [] )
    paper_author_aff["Country_Temp"] = paper_author_aff["Country_Temp"].apply(lambda x: [l.popitem()[1] for l in x] )

    def resolve_country(row):
        # Country_A is "" when nothing matched, so test truthiness; the
        # previous ``is not None`` test was always true and disabled the
        # city-based fallback.
        if row["Country_A"]:
            return row["Country_A"]
        candidates = row["Country_Temp"] or []
        if len(candidates) == 1:
            # The mapper yields None for an unknown code; normalise to "".
            return iso_to_name(candidates[0]["countrycode"]) or ""
        return ""

    paper_author_aff["Country_A"] = paper_author_aff.apply(resolve_country, dtype=str)
    paper_author_aff["Country_A"] = paper_author_aff["Country_A"].apply(lambda x: None if x=="" else x)
    return paper_author_aff.remove_column("Country_Temp")

In [None]:
paper_author_aff = add_location_from_original_affilation(paper_author_aff)

In [None]:
def merge_with_affilations(paper_author_aff, affiliations):
    """Left-join ``affiliations`` onto ``paper_author_aff`` and back-fill locations.

    After the join on "AffiliationId", a missing "Country" / "City" value is
    replaced by the row's "Country_A" / "City_A" value.
    """
    merged = paper_author_aff.join(affiliations, on={"AffiliationId":"AffiliationId"}, how="left")

    def pick_country(row):
        if row["Country"] is None:
            return row["Country_A"]
        return row["Country"]

    def pick_city(row):
        if row["City"] is None:
            return row["City_A"]
        return row["City"]

    merged["Country"] = merged.apply(pick_country)
    merged["City"] = merged.apply(pick_city)
    return merged


In [None]:
paper_author_aff = merge_with_affilations(paper_author_aff, mag.affiliations)
# Drop rows for which no country is available.
paper_author_aff = paper_author_aff[(paper_author_aff["Country"]!=None)]

## Author Country

In [None]:
# Keep only rows with a country (same filter as the one applied right after the merge).
paper_author_aff = paper_author_aff[paper_author_aff["Country"]!=None]
# One row per paper, with the authors' countries collected into the dict
# column "Dict of AuthorId_Country" (AuthorId -> Country).
paper_author_aff_country = paper_author_aff[["PaperId","AuthorId", "Country"]].unstack(["AuthorId", "Country"])
# Number of distinct, non-empty countries among a paper's authors.
paper_author_aff_country["num_countries"] = paper_author_aff_country["Dict of AuthorId_Country"].apply(lambda x: len({v for k,v in x.items() if v!=""  }))

In [None]:
# Remove the country columns if an earlier run of this cell already joined
# them in, so the cell can be re-run.  Checking the column names explicitly
# replaces a bare ``except: pass`` that hid every other error as well.
stale_columns = [name for name in ("Dict of AuthorId_Country", "num_countries")
                 if name in diseases_mag.column_names()]
if stale_columns:
    diseases_mag = diseases_mag.remove_columns(stale_columns)
diseases_mag = diseases_mag.join(paper_author_aff_country, on={"PaperId":"PaperId"})

In [None]:
# Keep papers with at least one known author country.
diseases_mag = diseases_mag[diseases_mag["num_countries"] > 0]

In [None]:
diseases_mag["Year"] = diseases_mag["Year"].astype(int)

In [None]:
# Per disease: mean number of distinct countries per paper and the row count,
# highest mean first, saved to disk.
diseases_mag.groupby("Disease",operations={"num_countries":agg.AVG("num_countries"), "count":agg.COUNT() }).sort("num_countries",False).save("output/Collab/author_counter.csv")

In [None]:
# The same average per publication year, across all spotlight diseases.
df = diseases_mag.groupby("Year",operations={"Average Distinct Countries":agg.AVG("num_countries"), "count":agg.COUNT() }).sort("Year",False)

In [None]:
df.to_dataframe().plot(x="Year", y="Average Distinct Countries", xlim=(2001,2018), figsize=(10,6))

In [None]:
# Average number of countries per (disease, year).
df2 = diseases_mag.groupby(["Disease", "Year"],operations={"Average Number of Countries":agg.AVG("num_countries"), "count":agg.COUNT() }).sort("Year",False)

In [None]:
df2 = df2.sort(["Disease", "Year"])

In [None]:
# Keep (disease, year) cells with more than 5 rows, for years after 2001.
df2 = df2[(df2["count"] >5)&(df2["Year"] >2001)]

In [None]:


plt.rc('text', usetex=False)

# One facet per disease: average number of countries over the years.
create_gird(df2.to_dataframe(),"Disease","Disease","Year", "Average Number of Countries")
# plt.savefig('output/Collab/countries.png')
# plt.close()



In [None]:
diseases_mag["num_countries"].value_counts()

In [None]:
# Per (disease, year): row count and a frequency table of num_countries
# ("Freq" maps a num_countries value to how many rows have it).
countries_p  = diseases_mag.groupby(["Disease", "Year"], {"Count": agg.COUNT(),"Freq":agg.FREQ_COUNT("num_countries")})

In [None]:
# Number of rows whose authors span more than one country.
countries_p["Colab"] = countries_p["Freq"].apply(lambda x: sum([v for k,v in x.items() if k>1]))

In [None]:
# Share of multi-country rows (a fraction in [0, 1], despite the "%" in the name).
countries_p["Colab%"] = countries_p["Colab"] / countries_p["Count"]

In [None]:
countries_p=countries_p.sort(["Disease","Year"])

In [None]:
# Restrict to the years 2002-2018.
countries_p = countries_p[(countries_p["Year"]>2001)&(countries_p["Year"]<2019)]

In [None]:
# NOTE(review): these four imports repeat imports made in an earlier cell of the notebook.
import datetime
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt


plt.rc('text', usetex=False)
# One facet per disease: share of multi-country rows over the years.
create_gird(countries_p.to_dataframe(),"Disease","Disease","Year", "Colab%")

### Pair Colab

In [None]:
from itertools import combinations
def get_research_cop(diseases_mag, field="Dict of AuthorId_Country", n=2):
    """Expand each row into one row per ``n``-combination of its distinct countries.

    Parameters
    ----------
    diseases_mag : SFrame
        Frame with a dict column ``field``; it is no longer modified.
    field : str
        Dict column whose values are the countries; empty strings are ignored.
    n : int
        Size of the combinations; rows with fewer than ``n`` distinct
        countries are dropped.

    Returns
    -------
    SFrame
        The input columns plus "temp" (the sorted distinct countries) and
        "con_pairs" (one sorted ``n``-combination per output row).
    """
    def distinct_countries(mapping):
        # Sorting makes every combination come out in one canonical order.
        return sorted(v for v in set(mapping.values()) if v != "")

    # Filter with a standalone mask so no helper column is left behind on the
    # caller's frame (the previous version added "temp" to ``diseases_mag``).
    has_enough = diseases_mag[field].apply(lambda x: len(distinct_countries(x)) >= n)
    papers = diseases_mag[has_enough]
    papers["temp"] = papers[field].apply(distinct_countries)
    papers = papers[papers["temp"] != None]
    papers["con_pairs"] = papers["temp"].apply(lambda x: list(combinations(x, n)))
    return papers.stack("con_pairs", new_column_name="con_pairs")


In [None]:
# One row per (paper, country pair).
diseases_mag_pairs = get_research_cop(diseases_mag,n=2)

In [None]:
# Count rows per disease, year and country pair (result column: "Count").
diseases_mag_pairs = diseases_mag_pairs.groupby(["Disease","Year","con_pairs"],agg.COUNT)

In [None]:
diseases_mag_pairs.sort("Count",False).save("output/Collab/pairs.csv")

In [None]:
# Country of the first entry of "Authors List Sorted", or "" when that author
# has no entry in the country dict.
# NOTE(review): indexing [0] assumes the author list is never empty -- confirm.
diseases_mag["first_atuhor_country"] = diseases_mag.apply(lambda x: x["Dict of AuthorId_Country"][int(x["Authors List Sorted"][0])] if int(x["Authors List Sorted"][0]) in x["Dict of AuthorId_Country"] else "")

In [None]:
# Same lookup for the last entry of "Authors List Sorted".
diseases_mag["last_atuhor_country"] = diseases_mag.apply(lambda x: x["Dict of AuthorId_Country"][int(x["Authors List Sorted"][-1])] if int(x["Authors List Sorted"][-1]) in x["Dict of AuthorId_Country"] else "")

In [None]:
# Row counts per disease, year and first / last author country.
first_atuhor_country = diseases_mag.groupby(["Disease","Year","first_atuhor_country"], {"Count":agg.COUNT()})
last_atuhor_country = diseases_mag.groupby(["Disease","Year","last_atuhor_country"], {"Count":agg.COUNT()})

In [None]:
import pycountry
import  functools
# @functools.lru_cache(maxsize=8048)
def get_iso(txt):
    """Return the ISO 3166-1 alpha-3 code of pycountry's best fuzzy match for ``txt``.

    Returns "" when ``txt`` is not a non-empty string or nothing matches.
    """
    if not isinstance(txt, str) or not txt.strip():
        return ""
    try:
        return pycountry.countries.search_fuzzy(txt)[0].alpha_3
    except LookupError:
        # No match found (IndexError on an empty result is a LookupError too);
        # this lookup is best effort, so report "no code" instead of failing.
        return ""


In [None]:
# Collapse the year dimension: total count per (country pair, disease).
diseases_mag_pairs = diseases_mag_pairs.groupby(["con_pairs", "Disease"],{ "Count":agg.SUM("Count")})

In [None]:
# These examples are inspired from http://geopandas.org/mapping.html


# World borders shapefile, downloaded over HTTP each time this cell runs.
world = geopandas.read_file("http://thematicmapping.org/downloads/TM_WORLD_BORDERS_SIMPL-0.3.zip")


In [None]:
df = diseases_mag_pairs.to_dataframe()

In [None]:
# Ten most frequent country pairs per disease.
sf = df.groupby("Disease").apply(lambda x: x.nlargest(10, 'Count')).reset_index(drop=True)  

In [None]:
sf =SFrame(sf)

In [None]:
# Replace "-99" placeholder ISO3 codes with a fuzzy lookup on the country name.
# The borders table names that column "NAME" (the merge with the per-country
# counts uses left_on="NAME"), so the lowercase "name" raised a KeyError
# whenever a placeholder row was actually hit.
world["ISO3"] = world.apply(lambda x: get_iso(x["NAME"]) if x["ISO3"] == "-99" else x["ISO3"], axis=1)

In [None]:
# Centroid of every country's geometry.
world["cord"] = world["geometry"].centroid


In [None]:
from geonamescache.mappers import country
# NOTE(review): this rebinds the global `mapper`, which was created earlier
# with from_key='iso'; code that reads the global sees the new mapper after
# this cell has run.
mapper = country(from_key='name', to_key='name')

In [None]:
import networkx as nx

In [None]:
g = nx.Graph()

In [None]:
# Total count per (country pair, disease).
pairs = diseases_mag_pairs.groupby(["con_pairs", "Disease"],{ "Count":agg.SUM("Count")})

In [None]:
# With n=1 every "combination" is a single country, so this counts rows per
# (disease, country), largest first.
inter = get_research_cop(diseases_mag ,"Dict of AuthorId_Country", n=1).groupby(["Disease","con_pairs"],agg.COUNT).sort("Count",False)

In [None]:
# Unwrap the one-element combination into a plain country name.
inter["Country"] = inter["con_pairs"].apply(lambda x: x[0])

In [None]:
diseases_mag_pairs

In [None]:
import json
import os

# A country becomes a node only when it accounts for more than this share of
# the disease's summed per-country counts.
MIN_COUNTRY_SHARE = 0.05
CYTO_DIR = "output/Collab/Cyto"
# The writes below fail with FileNotFoundError when the folder is missing.
os.makedirs(CYTO_DIR, exist_ok=True)

# One collaboration graph per spotlight disease: nodes are countries, edges
# are country pairs, both weighted by their counts.
graphs = {d: nx.Graph() for d in spothlight}
for d in tqdm(spothlight):
    graph = graphs[d]
    disease_countries = inter[inter["Disease"]==d]  # filtered once, reused below
    total_papers = disease_countries["Count"].sum()
    for line in disease_countries:
        if line['Count']/total_papers>MIN_COUNTRY_SHARE:
            graph.add_node(line["Country"], weight=line['Count'])
    nodes = set(graph.nodes)
    # Link two countries only when both passed the node threshold.
    for line in pairs[pairs["Disease"]==d]:
        if line["con_pairs"][0] in nodes and line["con_pairs"][1] in  nodes:
            graph.add_edge(*line["con_pairs"], weight=line['Count'])
    # "/" (as in "HIV/AIDS") is not allowed inside a file name.
    file_stem = f"{CYTO_DIR}/{d.replace('/','-')}"
    nx.write_gml(graph, f"{file_stem}.gml")
    with open(f"{file_stem}.json", "w") as f :
        json.dump(nx.cytoscape_data(graph), f)


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Border geometries joined with the per-country SARS counts (inner merge, so
# countries without a count are dropped).
sars = world.merge(inter[inter["Disease"]=="SARS"].to_dataframe(),left_on="NAME", right_on="Country")

In [None]:
!pip install descartes

In [None]:
import math
from mpl_toolkits.axes_grid1 import make_axes_locatable
fig, ax = plt.subplots(1,1,figsize=(20,20))

# ax.set_aspect('equal')
divider = make_axes_locatable(ax)

# Narrow axes to the right of the map, handed to the plot below for its legend.
cax = divider.append_axes("right", size="5%", pad=0.1)

# world['pop_est_log'] = world['pop_est'].apply(lambda i: math.log(i) if i >0 else 0)
# Colour each country by its SARS count.
s = sars.plot(ax=ax, column="Count",cmap="OrRd",  legend=True, cax=cax, zorder=2)
# Draw all country outlines on the same axes (no zorder given here, so
# presumably below the zorder=2 layer above -- confirm).
world.plot(color='white', edgecolor='black',ax=s)
ax.set_axis_off()
plt.savefig("output/Collab/map.svg")

In [None]:
# Attach each disease's summed count as "Total" (no `on` is given, so the
# join presumably uses the shared "Disease" column -- confirm).
inter = inter.join(inter.groupby("Disease",{ "Total":agg.SUM("Count")}))

In [None]:
# Share of the disease's total held by each country (a fraction, not a percentage).
inter["p%"] = inter["Count"]/ inter["Total"]

In [None]:
# Descending by disease, then by share.
inter = inter.sort(["Disease","p%"],False)

In [None]:
# Reverse-alphabetical order, meant to mirror the descending sort above.
spothlight = sorted(spothlight,reverse=True)

In [None]:
# Running share per disease, concatenated in spotlight order.
res = []
for d in spothlight:
    res+=list(inter[inter["Disease"]==d]["p%"].cumulative_sum())

In [None]:
# NOTE(review): assigning by position only lines up if `inter`'s row order
# matches the per-disease order used to build `res` -- confirm.
inter["cumulative_sum"]=res

In [None]:
# Countries below a 2% share are lumped together as "Others".
inter["Pie_Country"] = inter.apply(lambda x: "Others" if x["p%"]<0.02 else x["Country"])

In [None]:
inter2 = inter.groupby(["Disease","Pie_Country"],{"Count":agg.SUM("Count")})

In [None]:
# NOTE(review): this rebinds the global `countries` (earlier a set of
# geonames country names) to the list of pie labels.
countries = list(inter2["Pie_Country"].unique())

In [None]:
inter2 = inter2.sort(["Disease","Count"],False)

In [None]:
from matplotlib.colors import ListedColormap
import math

def generate_colormap(N):
    """Build an array of RGBA colours sampled from matplotlib's ``GnBu_r`` colormap.

    ``N`` evenly spaced positions in [0, 1) are zero-padded to the next
    multiple of 7 and interleaved, so neighbouring rows come from distant
    parts of the colormap.  The RGB channels of the first half of the rows
    are then scaled by a ramp rising from 0.2 towards 1, and the alpha of the
    second half by a ramp falling from 1 towards 0.1.

    Returns an array of shape ``(ceil(N / 7) * 7, 4)``, which can hold more
    than ``N`` rows; ``N == 0`` yields an empty ``(0, 4)`` array.
    """
    N_up = int(math.ceil(N/7)*7)
    # Zero-pad to a multiple of 7.  np.pad returns a new array, unlike the
    # in-place ndarray.resize, which raises when the array is referenced elsewhere.
    arr = np.pad(np.arange(N)/N, (0, N_up - N), mode="constant")
    arr = arr.reshape(7,N_up//7).T.reshape(-1)
    ret = plt.cm.GnBu_r(arr)
    n = ret[:,3].size
    a = n//2
    b = n-a
    # linspace returns exactly `a` / `b` factors.  A float-step arange can
    # return one element too many (which breaks the broadcast) and divides by
    # zero when the half is empty.
    ret[:a, :3] *= np.linspace(0.2, 1, a, endpoint=False)[:, np.newaxis]
    ret[a:, 3] *= np.linspace(1, 0.1, b, endpoint=False)
    return ret

In [None]:

# Fixed colour per pie label, so a country keeps its colour across all pies.
# zip stops at the shorter input, so surplus colormap rows are ignored.
colordict = dict(zip(countries, generate_colormap(len(countries))))

# fig = plt.figure(figsize=[10, 10])
# ax = fig.add_subplot(111)

# pie_wedge_collection = ax.pie(slices, labels=labels, labeldistance=1.05)#, autopct=make_autopct(slices))

# for pie_wedge in pie_wedge_collection[0]:
#     pie_wedge.set_edgecolor('white')
#     pie_wedge.set_facecolor(colordict[pie_wedge.get_label()])

In [None]:
# 3x3 grid: one pie per spotlight disease, in alphabetical order.
fig, axes = plt.subplots(3,3,figsize=(20,20))
axes = axes.flatten()

for d,ax in zip(sorted(spothlight),axes):

    # Title: disease name and the number of distinct countries recorded for it.
    ax.set_title(f'{d}  ({len(set(inter[inter["Disease"]==d]["Country"]))})', fontsize=18)
    data = inter2[inter2["Disease"]==d]
    pie_wedge_collection = ax.pie(data["Count"],  labels=data["Pie_Country"], textprops={'fontsize': 13},  labeldistance=1.15)
    
    # Element 0 of the pie() result holds the wedges; recolour each one by its label.
    for pie_wedge in pie_wedge_collection[0]:
        pie_wedge.set_edgecolor('white')
        pie_wedge.set_facecolor(colordict[pie_wedge.get_label()])
fig.tight_layout()
plt.savefig("output/Collab/inter.svg")

In [None]:
# Country table bundled with geopandas; its "name" and "continent" columns are used below.
world2 = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

In [None]:
# Presumably aligns the US spelling with the one used in the country columns -- confirm.
world2= world2.replace("United States of America", "United States")

In [None]:
# Attach each country's continent (left join: missing where the names do not match).
inter = inter.join(SFrame(world2[["name","continent"]]), on={"Country":"name"},how='left')

In [None]:
# Row counts per (disease, first-author country).
first_author =  diseases_mag.groupby(["Disease","first_atuhor_country"], agg.COUNT())

In [None]:
# Drop rows whose first-author country is unknown ("").
first_author = first_author[first_author["first_atuhor_country"]!=""]

In [None]:
# Attach each disease's summed count as "Total".
first_author = first_author.join(first_author.groupby("Disease",{ "Total":agg.SUM("Count")}))

In [None]:
# Share of the disease's total held by each first-author country (a fraction).
first_author["p%"] = first_author["Count"]/ first_author["Total"]

In [None]:
first_author = first_author.sort(["Disease","p%"],False)

In [None]:
spothlight = sorted(spothlight,reverse=True)

In [None]:
# Running share per disease, concatenated in spotlight order.
res = []
for d in spothlight:
    res+=list(first_author[first_author["Disease"]==d]["p%"].cumulative_sum())

In [None]:
# NOTE(review): assigning by position only lines up if `first_author`'s row
# order matches the per-disease order used to build `res` -- confirm.
first_author["cumulative_sum"]=res

In [None]:
# First six rows per disease with their running share.
for d in spothlight:
    print(first_author[first_author["Disease"]==d][:6][["Disease","first_atuhor_country","cumulative_sum"]])

In [None]:
first_author = first_author.join(SFrame(world2[["name","continent"]]), on={"first_atuhor_country":"name"},how='left')

In [None]:
# Counts and shares summed per (disease, continent).
first_author_continent = first_author.groupby(["Disease","continent"],{"Count":agg.SUM("Count"), "p%":agg.SUM("p%")})

In [None]:
# Rows without a continent (unmatched country names) are left out of the listing.
first_author_continent[first_author_continent["continent"]!=None].sort(["Disease","p%"]).print_rows(63)