# Biclustering of Dependency Paths for Biomedical Relationship Extraction

A global network of biomedical relationships derived from text

In [1]:
import networkx as nx
import pandas as pd
import spacy
from tqdm import tqdm

# Register tqdm with pandas so `progress_apply` (used by the merge cells) shows a progress bar
tqdm.pandas(desc="LF progressbar")

In [2]:
# Load the Spacy English pipeline once; convert_dep_path reads this module-level `nlp`
nlp = spacy.load("en_core_web_sm")

In [3]:
def convert_dep_path(dataframe_row):
    """
    This function converts dependency paths generated from Stanford's Core NLP parser into
    dependency paths generated from Spacy.
    This project uses Spacy to parse sentences which is different from the parser used in this paper:
    A global network of biomedical relationships derived from text

    The two entity mentions in the sentence are replaced by the placeholders
    "start_entity" / "end_entity" (whichever entity occurs first in the sentence is the
    start), the sentence is parsed with the module-level `nlp` pipeline and the shortest
    path between the two placeholders in the undirected dependency graph is reported.

    Args:
        dataframe_row - the row from the pandas dataframe; it must provide
            first_entity_name, first_entity_location, second_entity_name,
            second_entity_location ("start,end" strings) and sentence

    Returns:
        A space separated string with one "token|dep|previous_token" triple per edge on
        the path walked from start_entity to end_entity. An empty string is returned when
        a placeholder occurs more than once, when a placeholder is missing from the parse
        or when no path connects the two placeholders.
    """
    is_first_entity_first = int(
        dataframe_row["first_entity_location"].split(",")[1]
    ) < int(dataframe_row["second_entity_location"].split(",")[0])
    seen_first_entity = False
    build_string = []
    for token in dataframe_row.sentence.split(" "):
        if token == dataframe_row["second_entity_name"]:
            if is_first_entity_first and seen_first_entity:
                token = "end_entity"

            if not is_first_entity_first:
                token = "start_entity"

        if token == dataframe_row["first_entity_name"]:
            if is_first_entity_first:
                token = "start_entity"
                seen_first_entity = True
            else:
                token = "end_entity"

        build_string.append(token.lower())

    # Skip sentences with multiple entities (every token is already lower-cased above)
    sen_text = " ".join(build_string)
    if sen_text.count("start_entity") > 1 or sen_text.count("end_entity") > 1:
        return ""

    doc = nlp(sen_text)

    # Locate the placeholder tokens by position in the parsed document
    start_nodes = [token.i for token in doc if token.text == "start_entity"]
    end_nodes = [token.i for token in doc if token.text == "end_entity"]
    if not start_nodes or not end_nodes:
        return ""

    # Key the graph nodes by token index rather than token text: keying by text merges
    # every repeated word of the sentence into a single node, which creates shortcuts
    # that do not exist in the parse and therefore yields wrong shortest paths.
    dep_graph = nx.Graph()
    for token in doc:
        for child in token.children:
            dep_graph.add_edge(token.i, child.i, dep=child.dep_)

    try:
        path = nx.shortest_path(dep_graph, start_nodes[0], end_nodes[0])
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # A placeholder without any dependency edge is not a node of the graph
        return ""

    # Walk consecutive node pairs and translate the indices back into token text
    return " ".join(
        f"{doc[nxt].text}|{dep_graph.edges[prev, nxt]['dep']}|{doc[prev].text}"
        for prev, nxt in zip(path, path[1:])
    )

# Chemical-Disease

This section takes all dependency clusters designed for chemical and disease relationships and outputs them in a tab-separated format. The dependency clusters are split across two files: one for the path groups and the other for the exact matches of each path. The code below merges the two.

In [4]:
# Zenodo download for the chemical-disease "path theme distributions" file (part i)
chemical_disease_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-i-chemical-disease-path-theme-distributions.txt.zip"
)
# Zenodo download for the chemical-disease "dependency paths sorted with themes" file (part ii)
chemical_disease_paths_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-ii-dependency-paths-chemical-disease-sorted-with-themes.txt.zip"
)

In [5]:
# Read the tab-delimited theme distribution table (first row is the header) from the URL
chemical_disease_path_dist_df = pd.read_csv(chemical_disease_url, sep="\t")
chemical_disease_path_dist_df.head(2)

Unnamed: 0,path,T,T.ind,C,C.ind,Sa,Sa.ind,Pr,Pr.ind,Pa,Pa.ind,J,J.ind,Mp,Mp.ind
0,comparison|nmod|start_entity importance|dep|co...,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,correlated|nsubj|start_entity correlated|nmod|...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,1.0,0,0.0,0


In [6]:
# Read the tab-delimited dependency path table. Column names are supplied here, so the
# first row of the file is treated as data (header=None is what pandas infers when
# `names` is passed).
chemical_disease_paths_df = pd.read_csv(
    chemical_disease_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id", "sentence_num",
        "first_entity_name", "first_entity_location",
        "second_entity_name", "second_entity_location",
        "first_entity_name_raw", "second_entity_name_raw",
        "first_entity_db_id", "second_entity_db_id",
        "first_entity_type", "second_entity_type",
        "dep_path", "sentence",
    ],
)
chemical_disease_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,26832326,3,-,523524,ischemic_stroke,647662,-,ischemic stroke,,MESH:D002544,Chemical,Disease,effects|nmod|START_ENTITY assessed|dobj|effect...,This study assessed the neuroprotective effect...
1,26832326,3,-,523524,neuronal_injury,609624,-,neuronal injury,,MESH:D009410,Chemical,Disease,START_ENTITY|acl|induced induced|dobj|END_ENTITY,This study assessed the neuroprotective effect...


In [7]:
# Lower-case the published dependency paths so they can be joined on the `path` column of
# the theme table, add a Spacy-derived path for every sentence, then attach the theme scores.
chemical_disease_merged_path_df = chemical_disease_paths_df.assign(
    # Vectorised lower-casing instead of a per-element Python lambda
    dep_path=chemical_disease_paths_df.dep_path.str.lower(),
    spacy_dep_path=lambda x: x.progress_apply(convert_dep_path, axis=1),
).merge(
    # Joining on a column ignores both indexes, so only the key column needs renaming
    chemical_disease_path_dist_df.rename(columns={"path": "dep_path"}),
    on=["dep_path"],
)
chemical_disease_merged_path_df.head(2)

LF progressbar:  26%|██▌       | 1184771/4548801 [2:11:21<6:24:57, 145.64it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  31%|███       | 1392186/4548801 [2:34:36<6:20:28, 138.28it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  35%|███▌      | 1597993/4548801 [2:57:49<5:53:27, 139.14it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoi

LF progressbar:  83%|████████▎ | 3791150/4548801 [7:04:54<1:14:40, 169.11it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  84%|████████▍ | 3841523/4548801 [7:10:39<1:13:31, 160.31it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  88%|████████▊ | 3993098/4548801 [7:28:05<1:08:12, 135.77it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoi

In [8]:
# Persist the merged table as an xz-compressed, tab-separated file without the index column
chemical_disease_merged_path_df.to_csv(
    path_or_buf="chemical_disease_bicluster_results.tsv.xz",
    sep="\t",
    compression="xz",
    index=False,
)

# Chemical-Gene

This section takes all dependency clusters designed for chemical and gene relationships and outputs them in a tab-separated format. The dependency clusters are split across two files: one for the path groups and the other for the exact matches of each path. The code below merges the two.

In [9]:
# Zenodo download for the chemical-gene "path theme distributions" file (part i)
chemical_gene_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-i-chemical-gene-path-theme-distributions.txt.zip"
)
# Zenodo download for the chemical-gene "dependency paths sorted with themes" file (part ii)
chemical_gene_paths_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-ii-dependency-paths-chemical-gene-sorted-with-themes.txt.zip"
)

In [10]:
# Read the tab-delimited theme distribution table (first row is the header) from the URL
chemical_gene_path_dist_df = pd.read_csv(chemical_gene_url, sep="\t")
chemical_gene_path_dist_df.head(2)

Unnamed: 0,path,A+,A+.ind,A-,A-.ind,B,B.ind,E+,E+.ind,E-,...,E,E.ind,N,N.ind,O,O.ind,K,K.ind,Z,Z.ind
0,kinases|compound|start_entity participate|nsub...,2.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0.0,0,0.0,0,2.0,0,0.0,0,0.0,0
1,treatment|nmod|start_entity caused|nsubj|treat...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,4.0,0,0.0,0,0.0,0,0.0,0,0.0,0


In [11]:
# Read the tab-delimited dependency path table. Column names are supplied here, so the
# first row of the file is treated as data (header=None is what pandas infers when
# `names` is passed).
chemical_gene_paths_df = pd.read_csv(
    chemical_gene_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id", "sentence_num",
        "first_entity_name", "first_entity_location",
        "second_entity_name", "second_entity_location",
        "first_entity_name_raw", "second_entity_name_raw",
        "first_entity_db_id", "second_entity_db_id",
        "first_entity_type", "second_entity_type",
        "dep_path", "sentence",
    ],
)
chemical_gene_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,25640386,11,-,16241625,hormone_receptor,15981614,-,hormone receptor,,3164,Chemical,Gene,sensitivity|appos|START_ENTITY sensitivity|nmo...,Similarly 18F-FDG-PET/CT had higher sensitivit...
1,28560459,1,+,244245,Sig-1R,227233,+,Sig-1R,,10280,Chemical,Gene,agonist|appos|START_ENTITY agonist|compound|re...,The purpose of the present study was to invest...


In [12]:
# Lower-case the published dependency paths so they can be joined on the `path` column of
# the theme table, add a Spacy-derived path for every sentence, then attach the theme scores.
chemical_gene_merged_path_df = chemical_gene_paths_df.assign(
    # Vectorised lower-casing instead of a per-element Python lambda
    dep_path=chemical_gene_paths_df.dep_path.str.lower(),
    spacy_dep_path=lambda x: x.progress_apply(convert_dep_path, axis=1),
).merge(
    # Joining on a column ignores both indexes, so only the key column needs renaming
    chemical_gene_path_dist_df.rename(columns={"path": "dep_path"}),
    on=["dep_path"],
)
chemical_gene_merged_path_df.head(2)

LF progressbar:   3%|▎         | 49842/1669300 [05:34<2:57:18, 152.22it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  16%|█▌        | 270720/1669300 [29:22<2:48:13, 138.57it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  28%|██▊       | 463233/1669300 [49:08<2:25:57, 137.72it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing

In [13]:
# Persist the merged table as an xz-compressed, tab-separated file without the index column
chemical_gene_merged_path_df.to_csv(
    path_or_buf="chemical_gene_bicluster_results.tsv.xz",
    sep="\t",
    compression="xz",
    index=False,
)

# Disease-Gene

This section takes all dependency clusters designed for disease and gene relationships and outputs them in a tab-separated format. The dependency clusters are split across two files: one for the path groups and the other for the exact matches of each path. The code below merges the two.

In [14]:
# Zenodo download for the gene-disease "path theme distributions" file (part i)
disease_gene_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-i-gene-disease-path-theme-distributions.txt.zip"
)
# Zenodo download for the gene-disease "dependency paths sorted with themes" file (part ii)
disease_gene_paths_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-ii-dependency-paths-gene-disease-sorted-with-themes.txt.zip"
)

In [15]:
# Read the tab-delimited theme distribution table (first row is the header) from the URL
disease_gene_path_dist_df = pd.read_csv(disease_gene_url, sep="\t")
disease_gene_path_dist_df.head(2)

Unnamed: 0,path,U,U.ind,Ud,Ud.ind,D,D.ind,J,J.ind,Te,...,Y,Y.ind,G,G.ind,Md,Md.ind,X,X.ind,L,L.ind
0,correlated|nsubj|start_entity correlated|nmod|...,0.0,0,1.0,0,0.0,0,31.0,0,5.0,...,4.0,0,2.0,0,6.0,0,11.0,0,1.0,0
1,vector|compound|start_entity transfected|nmod|...,1.0,0,0.0,0,2.0,0,1.0,0,0.0,...,1.0,0,0.0,0,1.0,0,1.0,0,14.0,0


In [16]:
# Read the tab-delimited dependency path table. Column names are supplied here, so the
# first row of the file is treated as data (header=None is what pandas infers when
# `names` is passed).
disease_gene_paths_df = pd.read_csv(
    disease_gene_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id", "sentence_num",
        "first_entity_name", "first_entity_location",
        "second_entity_name", "second_entity_location",
        "first_entity_name_raw", "second_entity_name_raw",
        "first_entity_db_id", "second_entity_db_id",
        "first_entity_type", "second_entity_type",
        "dep_path", "sentence",
    ],
)
disease_gene_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,27955536,7,101F6,12271232,lung_cancers,12041216,101F6,lung cancers,11068,MESH:D008175,Gene,Disease,expressed|dobj|START_ENTITY expressed|nsubj|EN...,All normal lung bronchial epithelial cells and...
1,11980673,2,101F6,483488,tumor,529534,101F6,tumor,11068,MESH:D009369,Gene,Disease,effects|dep|START_ENTITY studied|dobj|effects ...,We studied the effects of six of these 3p21 .3...


In [17]:
# Lower-case the published dependency paths so they can be joined on the `path` column of
# the theme table, add a Spacy-derived path for every sentence, then attach the theme scores.
disease_gene_merged_path_df = disease_gene_paths_df.assign(
    # Vectorised lower-casing instead of a per-element Python lambda
    dep_path=disease_gene_paths_df.dep_path.str.lower(),
    spacy_dep_path=lambda x: x.progress_apply(convert_dep_path, axis=1),
).merge(
    # Joining on a column ignores both indexes, so only the key column needs renaming
    disease_gene_path_dist_df.rename(columns={"path": "dep_path"}),
    on=["dep_path"],
)
disease_gene_merged_path_df.head(2)

LF progressbar:   1%|▏         | 51364/3743004 [05:35<6:56:12, 147.83it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:   7%|▋         | 276222/3743004 [29:29<6:07:39, 157.15it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  13%|█▎        | 501938/3743004 [53:17<5:33:37, 161.91it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,...,Y,Y.ind,G,G.ind,Md,Md.ind,X,X.ind,L,L.ind
0,27955536,7,101F6,12271232,lung_cancers,12041216,101F6,lung cancers,11068,MESH:D008175,...,168.0,0,251.0,0,149.0,0,336.0,0,1148.0,0
1,1466150,7,7B2,12191222,tumours,11851192,7B2,tumours,6447,MESH:D009369,...,168.0,0,251.0,0,149.0,0,336.0,0,1148.0,0


In [18]:
# Persist the merged table as an xz-compressed, tab-separated file without the index column
disease_gene_merged_path_df.to_csv(
    path_or_buf="disease_gene_bicluster_results.tsv.xz",
    sep="\t",
    compression="xz",
    index=False,
)

# Gene-Gene

This section takes all dependency clusters designed for gene-gene relationships and outputs them in a tab-separated format. The dependency clusters are split across two files: one for the path groups and the other for the exact matches of each path. The code below merges the two.

In [19]:
# Zenodo download for the gene-gene "path theme distributions" file (part i)
gene_gene_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-i-gene-gene-path-theme-distributions.txt.zip"
)
# Zenodo download for the gene-gene "dependency paths sorted with themes" file (part ii)
gene_gene_paths_url = (
    "https://zenodo.org/record/1495808/files/"
    "part-ii-dependency-paths-gene-gene-sorted-with-themes.txt.zip"
)

In [20]:
# Read the tab-delimited theme distribution table (first row is the header) from the URL
gene_gene_path_dist_df = pd.read_csv(gene_gene_url, sep="\t")
gene_gene_path_dist_df.head(2)

Unnamed: 0,path,B,B.ind,W,W.ind,V+,V+.ind,E+,E+.ind,E,E.ind,I,I.ind,H,H.ind,Rg,Rg.ind,Q,Q.ind
0,6|appos|start_entity mediated|nmod|6 mediated|...,0.0,0,0.0,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,vector|compound|start_entity transfected|nmod|...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,1.0,0,0.0,0,0.0,0,2.0,0


In [21]:
# Read the tab-delimited dependency path table. Column names are supplied here, so the
# first row of the file is treated as data (header=None is what pandas infers when
# `names` is passed).
gene_gene_paths_df = pd.read_csv(
    gene_gene_paths_url,
    sep="\t",
    header=None,
    names=[
        "pubmed_id", "sentence_num",
        "first_entity_name", "first_entity_location",
        "second_entity_name", "second_entity_location",
        "first_entity_name_raw", "second_entity_name_raw",
        "first_entity_db_id", "second_entity_db_id",
        "first_entity_type", "second_entity_type",
        "dep_path", "sentence",
    ],
)
gene_gene_paths_df.head(2)

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,first_entity_type,second_entity_type,dep_path,sentence
0,29574423,5,0-C-reactive_protein,785805,CRP,807810,0-C-reactive protein,CRP,1401,1401,Gene,Gene,<|amod|START_ENTITY <|compound|END_ENTITY,Patient characteristics including Eastern Coop...
1,22583689,4,10E12Z-CLA,602612,COX-2,510515,10E12Z-CLA,COX-2,12720(Tax:10090),17709(Tax:10090),Gene,Gene,treatment|nmod|START_ENTITY induced|nmod|treat...,This work demonstrates that COX-2 is also indu...


In [22]:
# Lower-case the published dependency paths so they can be joined on the `path` column of
# the theme table, add a Spacy-derived path for every sentence, then attach the theme scores.
gene_gene_merged_path_df = gene_gene_paths_df.assign(
    # Vectorised lower-casing instead of a per-element Python lambda
    dep_path=gene_gene_paths_df.dep_path.str.lower(),
    spacy_dep_path=lambda x: x.progress_apply(convert_dep_path, axis=1),
).merge(
    # Joining on a column ignores both indexes, so only the key column needs renaming
    gene_gene_path_dist_df.rename(columns={"path": "dep_path"}),
    on=["dep_path"],
)
gene_gene_merged_path_df.head(2)

LF progressbar:  18%|█▊        | 816845/4571448 [1:28:24<5:48:25, 179.60it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  22%|██▏       | 1003761/4571448 [1:49:09<4:52:55, 202.99it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

LF progressbar:  26%|██▌       | 1185722/4571448 [2:09:40<7:58:50, 117.85it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoi

Unnamed: 0,pubmed_id,sentence_num,first_entity_name,first_entity_location,second_entity_name,second_entity_location,first_entity_name_raw,second_entity_name_raw,first_entity_db_id,second_entity_db_id,...,E,E.ind,I,I.ind,H,H.ind,Rg,Rg.ind,Q,Q.ind
0,29574423,5,0-C-reactive_protein,785805,CRP,807810,0-C-reactive protein,CRP,1401,1401,...,0.0,0,0.0,0,0.0,0,2.0,0,15.0,0
1,19779716,10,CD4,12381241,CXCR4,12531258,CD4,CXCR4,920,7852,...,0.0,0,0.0,0,0.0,0,2.0,0,15.0,0


In [23]:
# Persist the merged table as an xz-compressed, tab-separated file without the index column
gene_gene_merged_path_df.to_csv(
    path_or_buf="gene_gene_bicluster_results.tsv.xz",
    sep="\t",
    compression="xz",
    index=False,
)