# Biclustering of Dependency Paths for Biomedical Relationship Extraction

A global network of biomedical relationships derived from text

In [1]:
import pandas as pd

# Chemical-Disease

In [2]:
# Zenodo record 1495808, chemical-disease files. Judging by the file names,
# part i holds the path/theme distributions and part ii the dependency paths.
chemical_disease_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-i-chemical-disease-path-theme-distributions.txt.zip'
)
chemical_disease_paths_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-ii-dependency-paths-chemical-disease-sorted-with-themes.txt.zip'
)

In [3]:
# Read the tab-separated part i table directly from its URL; the column
# names come from the file's own header row.
chemical_disease_path_dist_df = pd.read_csv(chemical_disease_url, sep="\t")
chemical_disease_path_dist_df.head(2)

Unnamed: 0,path,T,T.ind,C,C.ind,Sa,Sa.ind,Pr,Pr.ind,Pa,Pa.ind,J,J.ind,Mp,Mp.ind
0,comparison|nmod|start_entity importance|dep|co...,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,correlated|nsubj|start_entity correlated|nmod|...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,1.0,0,0.0,0


In [4]:
# Read the tab-separated part ii file as headerless data, so every row is
# treated as a record and the column names are supplied here.
chemical_disease_paths_df = pd.read_csv(
    chemical_disease_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id",
        "sentence_num",
        "first_entity_name",
        "first_entity_location",
        "second_entity_name",
        "second_entity_location",
        "first_entity_name_raw",
        "second_entity_name_raw",
        "first_entity_db_id",
        "second_entity_db_id",
        "first_entity_type",
        "second_entity_type",
        "dep_path",
        "sentence",
    ],
)
chemical_disease_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,26832326,3,-,523524,ischemic_stroke,647662,-,ischemic stroke,,MESH:D002544,Chemical,Disease,effects|nmod|START_ENTITY assessed|dobj|effect...,This study assessed the neuroprotective effect...
1,26832326,3,-,523524,neuronal_injury,609624,-,neuronal injury,,MESH:D009410,Chemical,Disease,START_ENTITY|acl|induced induced|dobj|END_ENTITY,This study assessed the neuroprotective effect...


In [5]:
# Attach the theme-distribution columns to every sentence-level record.
# The part ii paths are lower-cased before joining on the distribution
# table's "path" column (renamed to "dep_path" so the key names agree).
# `.str.lower()` is vectorized and leaves missing values as missing instead
# of raising, unlike a per-element `x.lower()` call.
chemical_disease_merged_path_df = (
    chemical_disease_paths_df
    .assign(dep_path=lambda paths_df: paths_df["dep_path"].str.lower())
    .merge(
        # No index relabelling is needed: a column-on-column merge discards
        # both input indexes and returns a fresh one.
        chemical_disease_path_dist_df.rename(columns={"path": "dep_path"}),
        on="dep_path",
    )
)
chemical_disease_merged_path_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,...,Sa,Sa.ind,Pr,Pr.ind,Pa,Pa.ind,J,J.ind,Mp,Mp.ind
0,26832326,3,-,523524,ischemic_stroke,647662,-,ischemic stroke,,MESH:D002544,...,2.0,0,0.0,0,2.0,0,8.0,0,0.0,0
1,12752312,2,sirolimus,392401,cardiovascular_disease,532554,sirolimus,cardiovascular disease,MESH:D020123,MESH:D002318,...,2.0,0,0.0,0,2.0,0,8.0,0,0.0,0


In [6]:
# Write the merged table to an xz-compressed, tab-separated file in the
# working directory; the row index is not written.
chemical_disease_merged_path_df.to_csv(
    path_or_buf="chemical_disease_bicluster_results.tsv.xz",
    index=False,
    sep="\t",
    compression="xz",
)

# Chemical-Gene

In [7]:
# Zenodo record 1495808, chemical-gene files. Judging by the file names,
# part i holds the path/theme distributions and part ii the dependency paths.
chemical_gene_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-i-chemical-gene-path-theme-distributions.txt.zip'
)
chemical_gene_paths_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-ii-dependency-paths-chemical-gene-sorted-with-themes.txt.zip'
)

In [8]:
# Read the tab-separated part i table directly from its URL; the column
# names come from the file's own header row.
chemical_gene_path_dist_df = pd.read_csv(chemical_gene_url, sep="\t")
chemical_gene_path_dist_df.head(2)

Unnamed: 0,path,A+,A+.ind,A-,A-.ind,B,B.ind,E+,E+.ind,E-,...,E,E.ind,N,N.ind,O,O.ind,K,K.ind,Z,Z.ind
0,kinases|compound|start_entity participate|nsub...,2.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,2.0,0,0.0,0,0.0,0
1,treatment|nmod|start_entity caused|nsubj|treat...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,4.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [9]:
# Read the tab-separated part ii file as headerless data, so every row is
# treated as a record and the column names are supplied here.
chemical_gene_paths_df = pd.read_csv(
    chemical_gene_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id",
        "sentence_num",
        "first_entity_name",
        "first_entity_location",
        "second_entity_name",
        "second_entity_location",
        "first_entity_name_raw",
        "second_entity_name_raw",
        "first_entity_db_id",
        "second_entity_db_id",
        "first_entity_type",
        "second_entity_type",
        "dep_path",
        "sentence",
    ],
)
chemical_gene_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,25640386,11,-,16241625,hormone_receptor,15981614,-,hormone receptor,,3164,Chemical,Gene,sensitivity|appos|START_ENTITY sensitivity|nmo...,Similarly 18F-FDG-PET/CT had higher sensitivit...
1,28560459,1,+,244245,Sig-1R,227233,+,Sig-1R,,10280,Chemical,Gene,agonist|appos|START_ENTITY agonist|compound|re...,The purpose of the present study was to invest...


In [10]:
# Attach the theme-distribution columns to every sentence-level record.
# The part ii paths are lower-cased before joining on the distribution
# table's "path" column (renamed to "dep_path" so the key names agree).
# `.str.lower()` is vectorized and leaves missing values as missing instead
# of raising, unlike a per-element `x.lower()` call.
chemical_gene_merged_path_df = (
    chemical_gene_paths_df
    .assign(dep_path=lambda paths_df: paths_df["dep_path"].str.lower())
    .merge(
        # No index relabelling is needed: a column-on-column merge discards
        # both input indexes and returns a fresh one.
        chemical_gene_path_dist_df.rename(columns={"path": "dep_path"}),
        on="dep_path",
    )
)
chemical_gene_merged_path_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,...,E,E.ind,N,N.ind,O,O.ind,K,K.ind,Z,Z.ind
0,25640386,11,-,16241625,hormone_receptor,15981614,-,hormone receptor,,3164,...,2.0,0,0.0,0,0.0,0,1.0,0,0.0,0
1,11737778,4,ABA,550553,abi3,570574,ABA,abi3,CHEBI:2365,822061(Tax:3702),...,2.0,0,0.0,0,0.0,0,1.0,0,0.0,0


In [11]:
# Write the merged table to an xz-compressed, tab-separated file in the
# working directory; the row index is not written.
chemical_gene_merged_path_df.to_csv(
    path_or_buf="chemical_gene_bicluster_results.tsv.xz",
    index=False,
    sep="\t",
    compression="xz",
)

# Disease-Gene

In [12]:
# Zenodo record 1495808, gene-disease files (the upstream file names use
# "gene-disease" order). Judging by the file names, part i holds the
# path/theme distributions and part ii the dependency paths.
disease_gene_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-i-gene-disease-path-theme-distributions.txt.zip'
)
disease_gene_paths_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-ii-dependency-paths-gene-disease-sorted-with-themes.txt.zip'
)

In [13]:
# Read the tab-separated part i table directly from its URL; the column
# names come from the file's own header row.
disease_gene_path_dist_df = pd.read_csv(disease_gene_url, sep="\t")
disease_gene_path_dist_df.head(2)

Unnamed: 0,path,U,U.ind,Ud,Ud.ind,D,D.ind,J,J.ind,Te,...,Y,Y.ind,G,G.ind,Md,Md.ind,X,X.ind,L,L.ind
0,correlated|nsubj|start_entity correlated|nmod|...,0.0,0,1.0,0,0.0,0,31.0,0,5.0,...,4.0,0,2.0,0,6.0,0,11.0,0,1.0,0
1,vector|compound|start_entity transfected|nmod|...,1.0,0,0.0,0,2.0,0,1.0,0,0.0,...,1.0,0,0.0,0,1.0,0,1.0,0,14.0,0


In [14]:
# Read the tab-separated part ii file as headerless data, so every row is
# treated as a record and the column names are supplied here.
disease_gene_paths_df = pd.read_csv(
    disease_gene_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id",
        "sentence_num",
        "first_entity_name",
        "first_entity_location",
        "second_entity_name",
        "second_entity_location",
        "first_entity_name_raw",
        "second_entity_name_raw",
        "first_entity_db_id",
        "second_entity_db_id",
        "first_entity_type",
        "second_entity_type",
        "dep_path",
        "sentence",
    ],
)
disease_gene_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,27955536,7,101F6,12271232,lung_cancers,12041216,101F6,lung cancers,11068,MESH:D008175,Gene,Disease,expressed|dobj|START_ENTITY expressed|nsubj|EN...,All normal lung bronchial epithelial cells and...
1,11980673,2,101F6,483488,tumor,529534,101F6,tumor,11068,MESH:D009369,Gene,Disease,effects|dep|START_ENTITY studied|dobj|effects ...,We studied the effects of six of these 3p21 .3...


In [15]:
# Attach the theme-distribution columns to every sentence-level record.
# The part ii paths are lower-cased before joining on the distribution
# table's "path" column (renamed to "dep_path" so the key names agree).
# `.str.lower()` is vectorized and leaves missing values as missing instead
# of raising, unlike a per-element `x.lower()` call.
disease_gene_merged_path_df = (
    disease_gene_paths_df
    .assign(dep_path=lambda paths_df: paths_df["dep_path"].str.lower())
    .merge(
        # No index relabelling is needed: a column-on-column merge discards
        # both input indexes and returns a fresh one.
        disease_gene_path_dist_df.rename(columns={"path": "dep_path"}),
        on="dep_path",
    )
)
disease_gene_merged_path_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,...,Y,Y.ind,G,G.ind,Md,Md.ind,X,X.ind,L,L.ind
0,27955536,7,101F6,12271232,lung_cancers,12041216,101F6,lung cancers,11068,MESH:D008175,...,168.0,0,251.0,0,149.0,0,336.0,0,1148.0,0
1,1466150,7,7B2,12191222,tumours,11851192,7B2,tumours,6447,MESH:D009369,...,168.0,0,251.0,0,149.0,0,336.0,0,1148.0,0


In [16]:
# Write the merged table to an xz-compressed, tab-separated file in the
# working directory; the row index is not written.
disease_gene_merged_path_df.to_csv(
    path_or_buf="disease_gene_bicluster_results.tsv.xz",
    index=False,
    sep="\t",
    compression="xz",
)

# Gene-Gene

In [17]:
# Zenodo record 1495808, gene-gene files. Judging by the file names,
# part i holds the path/theme distributions and part ii the dependency paths.
gene_gene_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-i-gene-gene-path-theme-distributions.txt.zip'
)
gene_gene_paths_url = (
    'https://zenodo.org/record/1495808/files/'
    'part-ii-dependency-paths-gene-gene-sorted-with-themes.txt.zip'
)

In [18]:
# Read the tab-separated part i table directly from its URL; the column
# names come from the file's own header row.
gene_gene_path_dist_df = pd.read_csv(gene_gene_url, sep="\t")
gene_gene_path_dist_df.head(2)

Unnamed: 0,path,B,B.ind,W,W.ind,V+,V+.ind,E+,E+.ind,E,E.ind,I,I.ind,H,H.ind,Rg,Rg.ind,Q,Q.ind
0,6|appos|start_entity mediated|nmod|6 mediated|...,0.0,0,0.0,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,vector|compound|start_entity transfected|nmod|...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,1.0,0,0.0,0,0.0,0,2.0,0


In [19]:
# Read the tab-separated part ii file as headerless data, so every row is
# treated as a record and the column names are supplied here.
gene_gene_paths_df = pd.read_csv(
    gene_gene_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id",
        "sentence_num",
        "first_entity_name",
        "first_entity_location",
        "second_entity_name",
        "second_entity_location",
        "first_entity_name_raw",
        "second_entity_name_raw",
        "first_entity_db_id",
        "second_entity_db_id",
        "first_entity_type",
        "second_entity_type",
        "dep_path",
        "sentence",
    ],
)
gene_gene_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,29574423,5,0-C-reactive_protein,785805,CRP,807810,0-C-reactive protein,CRP,1401,1401,Gene,Gene,<|amod|START_ENTITY <|compound|END_ENTITY,Patient characteristics including Eastern Coop...
1,22583689,4,10E12Z-CLA,602612,COX-2,510515,10E12Z-CLA,COX-2,12720(Tax:10090),17709(Tax:10090),Gene,Gene,treatment|nmod|START_ENTITY induced|nmod|treat...,This work demonstrates that COX-2 is also indu...


In [20]:
# Attach the theme-distribution columns to every sentence-level record.
# The part ii paths are lower-cased before joining on the distribution
# table's "path" column (renamed to "dep_path" so the key names agree).
# `.str.lower()` is vectorized and leaves missing values as missing instead
# of raising, unlike a per-element `x.lower()` call.
gene_gene_merged_path_df = (
    gene_gene_paths_df
    .assign(dep_path=lambda paths_df: paths_df["dep_path"].str.lower())
    .merge(
        # No index relabelling is needed: a column-on-column merge discards
        # both input indexes and returns a fresh one.
        gene_gene_path_dist_df.rename(columns={"path": "dep_path"}),
        on="dep_path",
    )
)
gene_gene_merged_path_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,...,E,E.ind,I,I.ind,H,H.ind,Rg,Rg.ind,Q,Q.ind
0,29574423,5,0-C-reactive_protein,785805,CRP,807810,0-C-reactive protein,CRP,1401,1401,...,0.0,0,0.0,0,0.0,0,2.0,0,15.0,0
1,19779716,10,CD4,12381241,CXCR4,12531258,CD4,CXCR4,920,7852,...,0.0,0,0.0,0,0.0,0,2.0,0,15.0,0


In [21]:
# Write the merged table to an xz-compressed, tab-separated file in the
# working directory; the row index is not written.
gene_gene_merged_path_df.to_csv(
    path_or_buf="gene_gene_bicluster_results.tsv.xz",
    index=False,
    sep="\t",
    compression="xz",
)