In [None]:
%load_ext autoreload
%autoreload 2
%aimport
%matplotlib inline

In [None]:
import os
import sys
# Make the directory two levels above the current working directory importable
# (presumably the repository root that holds the project packages — TODO confirm).
nb_dir = os.path.dirname(os.path.split(os.getcwd())[0])
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [None]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from turicreate import SFrame, load_sframe
from pathlib import Path
import turicreate.aggregate as agg


In [None]:
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

## Disease List

In [None]:
diseases_list = SFrame.read_csv("Data/diseases_list.csv")

In [None]:
len(diseases_list)

In [None]:
# Trim surrounding whitespace from every cell of every column.
# Assumes all columns hold non-missing strings — TODO confirm against the CSV.
for c in diseases_list.column_names():
    diseases_list[c] = diseases_list[c].apply(lambda x: x.strip())

In [None]:
def get_diseases_by_datasource(diseases_list, source="GDB"):
    """Attach a numeric ``id`` to the rows, keyed by their name in one data source.

    Every distinct non-empty value of the ``source`` column receives a
    sequential integer id (0..k-1); the resulting lookup table is then joined
    back onto ``diseases_list``.

    Args:
        diseases_list: SFrame with one column per data source.
        source: name of the column whose distinct values define the ids.

    Returns:
        The result of joining ``diseases_list`` with the (``source``, ``id``)
        lookup table. No ``on`` argument is passed, so the join key is
        whatever column names the two frames share. Rows whose ``source``
        value is empty have no entry in the lookup table.
    """
    id_lookup = diseases_list[[source]].unique()
    is_named = id_lookup[source] != ""
    id_lookup = id_lookup[is_named]
    id_lookup["id"] = range(len(id_lookup))
    return diseases_list.join(id_lookup)

In [None]:
def normalize_diseases_list(sf):
    """Flatten the name columns into one lower-cased (id, name) row per alias.

    The ``GIDEON``, ``Alias`` and ``Alias2`` columns are packed into a single
    ``diseases`` column and stacked to one row per value; empty strings are
    dropped and the remaining names are lower-cased.

    Args:
        sf: SFrame with ``id``, ``GIDEON``, ``Alias`` and ``Alias2`` columns.

    Returns:
        SFrame holding the distinct (``id``, ``diseases``) pairs.
    """
    name_columns = ["GIDEON", "Alias", "Alias2"]
    packed = sf.pack_columns(name_columns, new_column_name="diseases")
    stacked = packed.stack('diseases', new_column_name='diseases')
    stacked = stacked[stacked["diseases"] != ""]
    stacked["diseases"] = stacked["diseases"].apply(lambda name: name.lower())
    return stacked[["id", "diseases"]].unique()

In [None]:
# Assign ids keyed by the GIDEON name, then expand to one lower-cased row per name/alias.
gideon_diseases = get_diseases_by_datasource(diseases_list, source="GIDEON")
diseases_id = normalize_diseases_list(gideon_diseases)

In [None]:
# Persist the id -> canonical (GIDEON) disease name table.
gideon_diseases[["id","GIDEON"]].rename({"GIDEON":"disease"}).save("Data/disease_names.csv")

In [None]:
# All lower-cased names and aliases, used for set intersection against title n-grams.
disease_set = set(diseases_id["diseases"])

In [None]:
# Persist the alias -> id table as CSV.
diseases_id.save("Data/diseases_id.csv","csv")

In [None]:
# Reload the alias -> id table from disk and rebuild the lookup set from it.
diseases_id = SFrame.read_csv("Data/diseases_id.csv")
disease_set = set(diseases_id["diseases"])

In [None]:
# In-memory id -> canonical name table (same expression as the one saved to
# Data/disease_names.csv above). It needs `gideon_diseases`, which is only
# computed in this session, not reloaded from disk.
disease_names = gideon_diseases[["id","GIDEON"]].rename({"GIDEON":"disease"})

## Bibliometric Datasets

In [None]:
import re

def ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens, joined by single spaces.

    Args:
        tokens: sequence of string tokens.
        n: number of tokens per n-gram.

    Returns:
        List of n-gram strings in order of appearance; empty when ``n`` is
        not positive or is larger than ``len(tokens)``.
    """
    if n <= 0:
        return []
    # Index of the last position at which a full window of n tokens still fits.
    last_start = len(tokens) - n
    return [" ".join(tokens[start:start + n]) for start in range(last_start + 1)]


def word_ngrams(s, max_len=None,  min_len=0):
    """Return the set of word n-grams contained in ``s``.

    The text is lower-cased, every character other than an ASCII letter,
    digit or whitespace is replaced by a space, and the result is split into
    tokens on whitespace.

    Args:
        s: input text.
        max_len: largest n-gram size to generate; ``None`` (or any value
            above the token count) means "up to the number of tokens".
        min_len: the smallest n-gram size generated is ``min_len + 1``
            (so the default 0 starts at unigrams).

    Returns:
        Set of n-gram strings with sizes ``min_len + 1`` .. ``max_len``.
    """
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s.lower())
    # The regex above keeps tabs/newlines, so split on *any* whitespace run;
    # splitting on " " alone would leave e.g. "a\tb" glued together as one token.
    tokens = s.split()
    longest = len(tokens) if max_len is None else min(max_len, len(tokens))
    return {gram for size in range(min_len, longest) for gram in ngrams(tokens, size + 1)}

In [None]:
def fiter_diseases(diseases_sf, disease_set, max_len, title_field="Normalized paper title"):
    """Keep rows whose title mentions a known disease, one output row per mention.

    Args:
        diseases_sf: SFrame with a text column named ``title_field``.
            NOTE: a ``diseases`` column is written onto this frame in place,
            so the caller's SFrame is modified.
        disease_set: set of disease names to look for.
        max_len: largest n-gram size generated from each title.
        title_field: name of the column holding the title text.

    Returns:
        SFrame restricted to rows with at least one match, with the matches
        stacked into a ``disease`` column (one row per matched name).
    """
    def matched_names(title):
        return disease_set & word_ngrams(title, max_len)

    diseases_sf["diseases"] = diseases_sf[title_field].apply(matched_names)
    has_match = diseases_sf["diseases"] != []
    matched_rows = diseases_sf[has_match]
    return matched_rows.stack("diseases", new_column_name="disease")

In [None]:
import random
def sf_unique(sf, fields):
    """Keep one row per distinct combination of ``fields``.

    A temporary row-number column is added, one row number is selected per
    group of ``fields``, and only the rows carrying a selected number are
    kept. The temporary column is dropped before returning.

    Args:
        sf: SFrame to de-duplicate; it is not modified.
        fields: column name or list of column names defining a duplicate.

    Returns:
        New SFrame with the original columns and one row per ``fields`` group.
    """
    # Random suffix so the helper column cannot clash with an existing column.
    index_field = f"index_{random.getrandbits(128)}"
    # add_row_number returns a new SFrame, so the caller's frame is not left
    # carrying the temporary column (assigning sf[index_field] did that).
    indexed = sf.add_row_number(index_field)
    keep = indexed.groupby(fields, {index_field: agg.SELECT_ONE(index_field)})[index_field]
    return indexed.filter_by(keep, index_field).remove_column(index_field)
           
def normalize_filtered_diseses(f_mag,  disease_set, max_len, title_field="PaperTitle"):
    """Match papers to diseases by title and return one row per (paper, disease id).

    Reads the notebook-level ``diseases_id`` (alias -> id) and
    ``disease_names`` (id -> canonical name) frames.

    Args:
        f_mag: SFrame of papers with ``PaperId`` and the ``title_field`` column.
            ``fiter_diseases`` writes a ``diseases`` column onto it.
        disease_set: set of lower-cased disease names/aliases to look for.
        max_len: largest n-gram size generated from each title.
        title_field: name of the title column. The argument used to be
            ignored in favour of a hard-coded "PaperTitle"; it is now
            honoured, and the default is "PaperTitle" so callers that omit it
            behave as before.

    Returns:
        SFrame with the matched alias replaced by the canonical ``disease``
        name, de-duplicated on (``PaperId``, ``id``).
    """
    diseases_f_mag = fiter_diseases(f_mag, disease_set, max_len, title_field)
    # Map the matched alias to its disease id, then swap the alias for the canonical name.
    diseases_f_mag = diseases_f_mag.join(diseases_id, on={"disease":"diseases"})
    diseases_f_mag = diseases_f_mag.remove_column("disease").join(disease_names, on="id")
    return sf_unique(diseases_f_mag,["PaperId", "id"])

def filter_by_cats(mag,cats, disease_set, max_len, title_field="PaperTitle"):
    """Restrict the papers to the given fields of study and match them to diseases.

    Args:
        mag: object exposing ``paper_fields_of_study``, ``fields_of_study``
            and ``extended_papers`` SFrames.
        cats: field-of-study names, compared against ``NormalizedName``.
        disease_set: set of lower-cased disease names/aliases to look for.
        max_len: largest n-gram size generated from each title.
        title_field: name of the title column, forwarded to
            ``normalize_filtered_diseses`` (default "PaperTitle", the column
            that was effectively always used).

    Returns:
        Tuple ``(f_mag, diseases_f_mag)``: the papers tagged with at least
        one of ``cats``, and their disease matches.
    """
    wanted_fields = mag.fields_of_study.filter_by(cats, "NormalizedName")[["FieldOfStudyId","DisplayName"]]
    paper_fields = mag.paper_fields_of_study.join(wanted_fields, on="FieldOfStudyId")
    f_mag = mag.extended_papers.filter_by(paper_fields["PaperId"], "PaperId")
    return f_mag, normalize_filtered_diseses(f_mag, disease_set, max_len, title_field)

In [None]:
# Word count of the longest disease name (split on single spaces); used as the n-gram size cap.
max_len = max(len(name.split(" ")) for name in disease_set)

### Microsoft Academic Graph (MAG)

In [None]:
from ScienceDynamics.datasets import MicrosoftAcademicGraph
mag = MicrosoftAcademicGraph()


In [None]:
len(mag.extended_papers)

In [None]:
diseases_mag = normalize_filtered_diseses(mag.extended_papers, disease_set, max_len, "PaperTitle")

In [None]:
diseases_mag.save("Data/mag/diseases_mag.sframe")

In [None]:
# NOTE(review): `cats` is not assigned in any cell visible in this notebook, so this
# raises NameError on a fresh kernel unless it is defined elsewhere — TODO define the
# list of field-of-study names explicitly before this call.
med_mag, diseases_med_mag = filter_by_cats(mag,cats, disease_set, max_len)

In [None]:
virology_mag, diseases_virology_mag = filter_by_cats(mag,["virology"], disease_set, max_len)

In [None]:
# Reload previously saved virology frames.
# NOTE(review): these paths end in "2" while the save cell below writes
# "viro_mag.sframe" / "diseases_viro_mag.sframe", and the cells that follow use
# `virology_mag` / `diseases_virology_mag` rather than these names — confirm which is intended.
viro_mag = load_sframe("Data/mag/viro_mag2.sframe")
diseases_viro_mag = load_sframe("Data/mag/diseases_viro_mag2.sframe")

In [None]:
# HIV/AIDS rows (Year > 2000, "Ref Number" > 5) as a fraction of all virology papers under the same two filters.
len(diseases_virology_mag[(diseases_virology_mag["disease"]=="HIV/AIDS")&(diseases_virology_mag["Year"]>2000)&(diseases_virology_mag["Ref Number"]>5)])/len(virology_mag[(virology_mag["Year"]>2000)&(virology_mag["Ref Number"]>5)])

In [None]:
# Rows matched to SARS or MERS Coronavirus with "Ref Number" > 5.
sars_cor =  diseases_virology_mag[((diseases_virology_mag["disease"]=="SARS")| (diseases_virology_mag["disease"]=="MERS Coronavirus"))&(diseases_virology_mag["Ref Number"]>5)]

In [None]:
# Distinct SARS/MERS PaperIds as a fraction of virology papers. PaperIds are de-duplicated
# with set() because one paper can match both diseases.
# NOTE(review): this cell uses Year >= 2000 whereas the HIV/AIDS cell uses Year > 2000 — confirm.
len(set(sars_cor[sars_cor["Year"]>=2000]["PaperId"])) / len(virology_mag[(virology_mag["Year"]>=2000)&(virology_mag["Ref Number"]>5)])

In [None]:
len(virology_mag)

In [None]:
virology_mag.save("Data/mag/viro_mag.sframe")
diseases_virology_mag.save("Data/mag/diseases_viro_mag.sframe")

In [None]:
med_mag.save("Data/mag/med_mag.sframe")
diseases_med_mag.save("Data/mag/diseases_med_mag.sframe")

## PubMed

In [None]:
from pathlib import Path
# Build the cleaned PubMed SFrame once and cache it on disk; later runs load the cache.
if not Path("Data/pubmed/pubmed.sframe").exists():
    pubmed = SFrame.read_json("Data/pubmed/pubmed.json")
    # Drop records without a publication date so the int cast below cannot hit "".
    pubmed = pubmed[pubmed["pubdate"]!=""]
    pubmed["Normalized paper title"] = pubmed["title"].apply(lambda x: x.lower().replace("[","").replace("].",""))
    pubmed["pubdate"] = pubmed["pubdate"].astype(int)
    pubmed = pubmed.rename({"pubdate":"year"})
    pubmed["pmid"] = pubmed["pmid"].astype(int)
    pubmed["mesh_terms"] = pubmed["mesh_terms"].apply(lambda x: x.split(";"))
    pubmed.save("Data/pubmed/pubmed.sframe")
else:
    pubmed = load_sframe("Data/pubmed/pubmed.sframe")

# Derive the title matches on both paths; previously this only ran when the
# cache already existed, leaving `diseases_pubmed` undefined after a fresh build.
diseases_pubmed = fiter_diseases(pubmed,  disease_set, max_len)
# The matched name is in "disease" and the lookup key is "diseases", so the key
# must be given explicitly (an implicit join relies on shared column names).
diseases_pubmed = diseases_pubmed.join(diseases_id, on={"disease":"diseases"})
diseases_pubmed.save("Data/pubmed/diseases_pubmed.sframe")

In [None]:
# Re-derive the title matches. There is no join with diseases_id here, so this
# result has no `id` column.
diseases_pubmed = fiter_diseases(pubmed,  disease_set, max_len)

In [None]:
# fiter_diseases wrote a `diseases` column onto `pubmed` in place, so this save
# overwrites the cached frame with that extra column included.
pubmed.save("Data/pubmed/pubmed.sframe")

In [None]:
pubmed = load_sframe("Data/pubmed/pubmed.sframe")


In [None]:
# Same cast the cache-building cell already applies to `pmid`.
pubmed["pmid"] = pubmed["pmid"].astype(int)

In [None]:
# Split the raw MeSH string on ";".
# NOTE(review): the cache-building cell stores `mesh_terms` already split into a list;
# if the loaded column holds lists, `.split` raises AttributeError — confirm the stored type.
pubmed["mesh_terms_norm"] = pubmed["mesh_terms"].apply(lambda terms: terms.split(";") )

In [None]:
# Strip the whitespace left around each term by the ";" split.
pubmed["mesh_terms_norm"] = pubmed["mesh_terms_norm"].apply(lambda terms: [t.strip() for t in terms] )

In [None]:
def fiter_mesh(diseases_sf, disease_set, title_field="mesh_terms_norm"):
    """Keep rows whose term collection contains a wanted term, one output row per hit.

    Args:
        diseases_sf: SFrame whose ``title_field`` column holds a collection
            of terms per row. NOTE: a ``diseases`` column is written onto
            this frame in place, so the caller's SFrame is modified.
        disease_set: set of terms to look for (exact membership, no
            normalisation is applied here).
        title_field: name of the column holding the term collections.

    Returns:
        SFrame restricted to rows with at least one hit, with the hits
        stacked into a ``disease`` column (one row per matched term).
    """
    def hits(terms):
        return disease_set & set(terms)

    diseases_sf["diseases"] = diseases_sf[title_field].apply(hits)
    has_hit = diseases_sf["diseases"] != []
    return diseases_sf[has_hit].stack("diseases", new_column_name="disease")

In [None]:
# Match the normalised MeSH term lists against the disease names.
# The join with diseases_id is done once, in the following cell. Running the
# same join here as well made the second one add a duplicate, suffixed id column.
diseases_pubmed_mesh = fiter_mesh(pubmed,  disease_set)

In [None]:
diseases_pubmed_mesh = diseases_pubmed_mesh.join(diseases_id, on={"disease":"diseases"})

In [None]:
diseases_pubmed_mesh = diseases_pubmed_mesh.remove_column("disease").join(disease_names, on={"id":"id"})

In [None]:
diseases_pubmed_mesh.save("Data/pubmed/diseases_pubmed_mesh.sframe")

In [None]:
# Drop exact duplicate rows.
diseases_pubmed = diseases_pubmed.unique()

In [None]:
# Split the MeSH string on ";".
# NOTE(review): fails if `mesh_terms` was already stored as a list by the
# cache-building cell — confirm the column type before running.
diseases_pubmed["mesh_terms"] = diseases_pubmed["mesh_terms"].apply(lambda x: x.split(";"))

In [None]:
# pubmed = load_sframe("Data/pubmed/pubmed.sframe")
# diseases_pubmed = fiter_diseases(pubmed,  disease_set, max_len)

# diseases_pubmed = diseases_pubmed.rename({"pubdate":"year"})
# Replace the matched alias with the canonical disease name via the `id` column.
# NOTE(review): requires `diseases_pubmed` to carry an `id` column, i.e. to have been
# joined with diseases_id beforehand — verify against the cell that produced it.
diseases_pubmed = diseases_pubmed.remove_column("disease").join(disease_names, on={"id":"id"})
diseases_pubmed.save("Data/pubmed/diseases_pubmed.sframe")

In [None]:
# Same save as at the end of the previous cell.
diseases_pubmed.save("Data/pubmed/diseases_pubmed.sframe")

In [None]:
diseases_pubmed = load_sframe("Data/pubmed/diseases_pubmed.sframe")

In [None]:
# Canonical disease names the remaining analysis is restricted to.
spothlight = ["SARS","MERS Coronavirus", "Avian Influenza","Ebola", "Influenza", "HIV/AIDS","Hepatitis B","Hepatitis C", "Swine Flu"]

In [None]:
# Keep only rows whose `disease` is one of the spotlight names (overwrites diseases_pubmed).
diseases_pubmed = diseases_pubmed.filter_by(spothlight, "disease")

In [None]:
# One row per (paper, MeSH term).
mesh_terms = diseases_pubmed.stack("mesh_terms", new_column_name="mesh_term")

In [None]:
# Inspect the row at index 7 of the MeSH term frequency table for Hepatitis B papers.
mesh_terms[mesh_terms["disease"]=="Hepatitis B"]["mesh_term"].value_counts()[7]

In [None]:
# MeSH terms, written as "<descriptor id>:<heading>", used to select papers by
# exact membership in their MeSH term list.
# Fix: "D007252:Influenza Vaccines" previously carried a stray trailing "0",
# so that entry could never match a term.
mesh = {
    "D045473:SARS Virus",
    "D045169:Severe Acute Respiratory Syndrome",
    "D065207:Middle East Respiratory Syndrome Coronavirus",
    "D005585:Influenza in Birds",
    "D053124:Influenza A Virus, H5N1 Subtype",
    "D029043:Ebolavirus",
    "D019142:Hemorrhagic Fever, Ebola",
    "D007251:Influenza, Human",
    "D007252:Influenza Vaccines",
    "D015658:HIV Infections",
    "D015497:HIV-1",
    "D006509:Hepatitis B",
    "D006515:Hepatitis B virus",
    "D006526:Hepatitis C",
    "D016174:Hepacivirus",
    "D053118:Influenza A Virus, H1N1 Subtype",
    "D019698:Hepatitis C, Chronic",
    "D019694:Hepatitis B, Chronic",
}

In [None]:
# Select papers by exact MeSH term membership (overwrites the earlier diseases_pubmed_mesh).
diseases_pubmed_mesh = fiter_mesh(pubmed,  mesh)


In [None]:
# Make sure the join key has the same integer type on both sides.
diseases_pubmed_mesh["pmid"] = diseases_pubmed_mesh["pmid"].astype(int)
diseases_pubmed["pmid"] = diseases_pubmed["pmid"].astype(int)

In [None]:
# Left join: every MeSH-selected row, with title-match columns where the pmid also matched by title.
diseases_pubmed_mesh_left = diseases_pubmed_mesh.join(diseases_pubmed, on="pmid", how="left")

In [None]:
# Fraction of MeSH-selected rows that also have a title match.
# `title.1` is presumably the right-hand frame's `title` column, renamed by the join
# because both frames have one; it is None where the left join found no match.
1- len(diseases_pubmed_mesh_left[diseases_pubmed_mesh_left["title.1"]==None])/len(diseases_pubmed_mesh_left)

In [None]:
# The reverse direction: every title-matched row, with MeSH-selection columns where available.
diseases_pubmed_left = diseases_pubmed.join(diseases_pubmed_mesh, on="pmid", how="left")

In [None]:
# Fraction of title-matched rows that were also selected by MeSH term.
1-len(diseases_pubmed_left[diseases_pubmed_left["title.1"]==None])/len(diseases_pubmed_left)

In [None]:
# Outer join keeps rows from both sides, matched or not.
diseases_pubmed_outer = diseases_pubmed.join(diseases_pubmed_mesh, on="pmid", how="outer")

In [None]:
# Inspect one example row (index 2) that has no value in `title.1`.
diseases_pubmed_outer[diseases_pubmed_outer["title.1"]==None][2]

In [None]:
# Split the MeSH string on ";".
# NOTE(review): fails if `mesh_terms` already holds lists (the cache-building
# cell stores it split) — confirm the column type.
pubmed["mesh_terms"] = pubmed["mesh_terms"].apply(lambda x: x.split(";"))

In [None]:
# Reshape to one row per (paper, MeSH term); `pubmed` is rebound to the stacked frame.
pubmed = pubmed.stack("mesh_terms", new_column_name="mesh_term")

In [None]:
# Strip the whitespace left around each term by the ";" split.
pubmed["mesh_term"] = pubmed["mesh_term"].apply(lambda x: x.strip())

In [None]:
# Reload the saved title-match frame from disk.
diseases_pubmed = load_sframe("Data/pubmed/diseases_pubmed.sframe")