# Extract All Sentences for Final Evaluation

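This notebook exports every candidate sentence found in a title or abstract, encoded as an entity-tagged lemma sequence, for BioBERT so that we can predict the existence of edge types downstream. One TSV file is written per candidate table: disease-gene (DaG), compound-disease (CtD), compound-gene (CbG), and gene-gene (GiG).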

In [1]:
import os
import warnings
from urllib.parse import quote_plus

import pandas as pd
import plydata as ply
import spacy
from sqlalchemy import create_engine

from snorkeling_helper.candidates_helper import encode_lemmas

warnings.filterwarnings("ignore")

In [2]:
# Connection settings are read from the standard libpq environment variables
# when they are set; the fallbacks reproduce this notebook's original values.
# NOTE(review): the fallback password is still committed with the notebook --
# prefer exporting PGPASSWORD and dropping the default.
username = os.environ.get("PGUSER", "danich1")
password = os.environ.get("PGPASSWORD", "snorkel")
dbname = os.environ.get("PGDATABASE", "pubmed_central_db")

# Percent-encode the credentials so characters such as "@", ":" or "/" cannot
# corrupt the URL. No network host is given; the `host` query parameter points
# at the unix socket directory instead.
database_str = (
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@/{dbname}?host=/var/run/postgresql"
)
conn = create_engine(database_str)

In [3]:
# Load the small English spaCy pipeline and keep its default stop-word
# collection, which is later handed to `encode_lemmas`.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)
stopwords = nlp.Defaults.stop_words

# DaG

In [4]:
# Select every `disease_gene` candidate whose section is 'title' or
# 'abstract', joined to `sentence` to pick up the owning document_id.
sql = """
select candidate_id, document_id, text, lemma, disease_start, disease_end, gene_start, gene_end
from disease_gene inner join (
    select sentence_id, document_id
    from sentence
) as sentence_map
on disease_gene.sentence_id = sentence_map.sentence_id
where section = 'title' or section ='abstract'
"""

# Query through the engine created in the connection cell (`conn`) instead of
# the raw URL string, so pandas does not build a fresh engine for this call.
# The lemma column arrives as one "|"-delimited string per row: strip single
# quotes, then split it into a list of lemmas.
candidate = pd.read_sql(sql, conn) >> ply.define(
    lemma=lambda x: x.lemma.apply(lambda y: y.replace("'", "").split("|"))
)
print(candidate.shape)
candidate.head()

(1539670, 8)


Unnamed: 0,candidate_id,document_id,text,lemma,disease_start,disease_end,gene_start,gene_end
0,121,29029454,"In conclusion, the findings demonstrated that ...","[in, conclusion, ,, the, finding, demonstrate,...",12,13,21,24
1,122,29029454,"In conclusion, the findings demonstrated that ...","[in, conclusion, ,, the, finding, demonstrate,...",21,24,42,43
2,124,29029454,Long noncoding RNA AB073614 promotes the malig...,"[long, noncode, rna, ab073614, promote, the, m...",8,9,13,16
3,148,19448163,"As a result, the methylenetetrahydrofolate red...","[as, a, result, ,, the, methylenetetrahydrofol...",5,7,46,49
4,190,23145009,The waist circumference cut points for diagnos...,"[the, waist, circumference, cut, point, for, d...",24,27,19,20


In [6]:
# Span-boundary columns for the two entities, in (start, end) order per entity.
fieldnames = [
    f"{entity}_{boundary}"
    for entity in ("disease", "gene")
    for boundary in ("start", "end")
]

In [7]:
# Keyword arguments for the encoder: which columns hold the entity spans and
# which label is passed for each of the two entities.
encode_kwargs = {
    "entity_fieldnames": fieldnames,
    "entity_one": "DISEASE_ENTITY",
    "entity_two": "GENE_ENTITY",
}

# The third positional argument is an empty dict; its role is defined by
# `encode_lemmas` (not shown in this notebook).
encoded_abstracts = pd.DataFrame.from_records(
    encode_lemmas(candidate, stopwords, {}, **encode_kwargs)
)

print(encoded_abstracts.shape)
encoded_abstracts.head()

1539670it [04:15, 6027.31it/s]


(1539670, 2)


Unnamed: 0,parsed_lemmas,candidate_id
0,"conclusion|,|finding|demonstrate|ab073614|prom...",121
1,"conclusion|,|finding|demonstrate|ab073614|prom...",122
2,long|noncode|rna|ab073614|promote|malignance|D...,124
3,"result|,|DISEASE_ENTITY|677tt|genotype|prevale...",148
4,waist|circumference|cut|point|diagnose|metabol...,190


In [9]:
# Write the encoded disease-gene lemmas as a tab-separated file without the index.
encoded_abstracts.to_csv(
    "output/all_dg_abstract_encoded_lemmas.tsv", sep="\t", index=False
)

# CtD

In [10]:
# Select every `compound_disease` candidate whose section is 'title' or
# 'abstract', joined to `sentence` to pick up the owning document_id.
sql = """
select candidate_id, document_id, text, lemma, compound_start, compound_end, disease_start, disease_end
from compound_disease inner join (
    select sentence_id, document_id
    from sentence
) as sentence_map
on compound_disease.sentence_id = sentence_map.sentence_id
where section = 'title' or section ='abstract'
"""

# Query through the engine created in the connection cell (`conn`) instead of
# the raw URL string, so pandas does not build a fresh engine for this call.
# The lemma column arrives as one "|"-delimited string per row: strip single
# quotes, then split it into a list of lemmas.
candidate = pd.read_sql(sql, conn) >> ply.define(
    lemma=lambda x: x.lemma.apply(lambda y: y.replace("'", "").split("|"))
)
print(candidate.shape)
candidate.head()

(971820, 8)


Unnamed: 0,candidate_id,document_id,text,lemma,compound_start,compound_end,disease_start,disease_end
0,52,18165652,Thalidomide in small-cell lung cancer: is it a...,"[thalidomide, in, small, -, cell, lung, cancer...",2,5,0,1
1,175,21623267,Patients with advanced nonsquamous non-small c...,"[patient, with, advanced, nonsquamous, non, -,...",4,10,17,18
2,176,21623267,Erlotinib has prolonged survival in unselected...,"[erlotinib, have, prolong, survival, in, unsel...",9,15,17,18
3,408,9392537,Although the influence of the higher LET depos...,"[although, the, influence, of, the, high, let,...",9,10,31,32
4,409,9392537,CONCLUSION: The development of NVG after heliu...,"[conclusion, :, the, development, of, nvg, aft...",7,8,5,6


In [11]:
# Span-boundary columns for the two entities, in (start, end) order per entity.
fieldnames = [
    f"{entity}_{boundary}"
    for entity in ("compound", "disease")
    for boundary in ("start", "end")
]

In [12]:
# Keyword arguments for the encoder: which columns hold the entity spans and
# which label is passed for each of the two entities.
encode_kwargs = {
    "entity_fieldnames": fieldnames,
    "entity_one": "COMPOUND_ENTITY",
    "entity_two": "DISEASE_ENTITY",
}

# The third positional argument is an empty dict; its role is defined by
# `encode_lemmas` (not shown in this notebook).
encoded_abstracts = pd.DataFrame.from_records(
    encode_lemmas(candidate, stopwords, {}, **encode_kwargs)
)

print(encoded_abstracts.shape)
encoded_abstracts.head()

971820it [02:42, 5962.68it/s]


(971820, 2)


Unnamed: 0,parsed_lemmas,candidate_id
0,DISEASE_ENTITY|COMPOUND_ENTITY|lung|cancer|:|-...,52
1,patient|advanced|nonsquamous|COMPOUND_ENTITY|t...,175
2,erlotinib|prolong|survival|unselected|patient|...,176
3,influence|high|let|deposition|COMPOUND_ENTITY|...,408
4,conclusion|:|development|DISEASE_ENTITY|COMPOU...,409


In [13]:
# Write the encoded compound-disease lemmas as a tab-separated file without the index.
encoded_abstracts.to_csv(
    "output/all_cd_abstract_encoded_lemmas.tsv", sep="\t", index=False
)

# CbG

In [14]:
# Select every `compound_gene` candidate whose section is 'title' or
# 'abstract', joined to `sentence` to pick up the owning document_id.
sql = """
select candidate_id, document_id, text, lemma, compound_start, compound_end, gene_start, gene_end
from compound_gene inner join (
    select sentence_id, document_id
    from sentence
) as sentence_map
on compound_gene.sentence_id = sentence_map.sentence_id
where section = 'title' or section ='abstract'
"""

# Query through the engine created in the connection cell (`conn`) instead of
# the raw URL string, so pandas does not build a fresh engine for this call.
# The lemma column arrives as one "|"-delimited string per row: strip single
# quotes, then split it into a list of lemmas.
candidate = pd.read_sql(sql, conn) >> ply.define(
    lemma=lambda x: x.lemma.apply(lambda y: y.replace("'", "").split("|"))
)
print(candidate.shape)
candidate.head()

(1292772, 8)


Unnamed: 0,candidate_id,document_id,text,lemma,compound_start,compound_end,gene_start,gene_end
0,186,29473265,34-kDa translocase of the outer mitochondrial ...,"[34-kda, translocase, of, the, outer, mitochon...",12,15,8,9
1,191,23145009,"Anthropometry and fasting lipid, glucose and i...","[anthropometry, and, fast, lipid, ,, glucose, ...",7,8,5,6
2,277,28077915,"Glucose, nonesterified free fatty acids (NEFAs...","[glucose, ,, nonesterifie, free, fatty, acid, ...",10,11,0,1
3,279,28077915,"Glucose, nonesterified free fatty acids (NEFAs...","[glucose, ,, nonesterifie, free, fatty, acid, ...",38,39,0,1
4,281,28077915,"Glucose, nonesterified free fatty acids (NEFAs...","[glucose, ,, nonesterifie, free, fatty, acid, ...",0,1,21,28


In [15]:
# Span-boundary columns for the two entities, in (start, end) order per entity.
fieldnames = [
    f"{entity}_{boundary}"
    for entity in ("compound", "gene")
    for boundary in ("start", "end")
]

In [16]:
# Keyword arguments for the encoder: which columns hold the entity spans and
# which label is passed for each of the two entities.
encode_kwargs = {
    "entity_fieldnames": fieldnames,
    "entity_one": "COMPOUND_ENTITY",
    "entity_two": "GENE_ENTITY",
}

# The third positional argument is an empty dict; its role is defined by
# `encode_lemmas` (not shown in this notebook).
encoded_abstracts = pd.DataFrame.from_records(
    encode_lemmas(candidate, stopwords, {}, **encode_kwargs)
)

print(encoded_abstracts.shape)
encoded_abstracts.head()

1292772it [03:32, 6080.27it/s]


(1292772, 2)


Unnamed: 0,parsed_lemmas,candidate_id
0,34-kda|translocase|outer|mitochondrial|membran...,186
1,"anthropometry|fast|lipid|,|GENE_ENTITY|COMPOUN...",191
2,"GENE_ENTITY|,|nonesterifie|free|fatty|acid|(|n...",277
3,"GENE_ENTITY|,|nonesterifie|free|fatty|acid|(|n...",279
4,"COMPOUND_ENTITY|,|nonesterifie|free|fatty|acid...",281


In [17]:
# Write the encoded compound-gene lemmas as a tab-separated file without the index.
encoded_abstracts.to_csv(
    "output/all_cg_abstract_encoded_lemmas.tsv", sep="\t", index=False
)

# GiG

In [18]:
# Select every `gene_gene` candidate whose section is 'title' or 'abstract',
# joined to `sentence` to pick up the owning document_id.
sql = """
select candidate_id, document_id, text, lemma, gene1_start, gene1_end, gene2_start, gene2_end
from gene_gene inner join (
    select sentence_id, document_id
    from sentence
) as sentence_map
on gene_gene.sentence_id = sentence_map.sentence_id
where section = 'title' or section ='abstract'
"""

# Query through the engine created in the connection cell (`conn`) instead of
# the raw URL string, so pandas does not build a fresh engine for this call.
# The lemma column arrives as one "|"-delimited string per row: strip single
# quotes, then split it into a list of lemmas.
candidate = pd.read_sql(sql, conn) >> ply.define(
    lemma=lambda x: x.lemma.apply(lambda y: y.replace("'", "").split("|"))
)
print(candidate.shape)
candidate.head()

(5420798, 8)


Unnamed: 0,candidate_id,document_id,text,lemma,gene1_start,gene1_end,gene2_start,gene2_end
0,38967952,18728748,This review examines the physiological role of...,"[this, review, examine, the, physiological, ro...",7,8,17,18
1,40608212,29615870,"Previously, we showed that superficially locat...","[previously, ,, -pron-, show, that, superficia...",16,21,22,23
2,16688148,18475569,Although the ultimate effect was often donor d...,"[although, the, ultimate, effect, be, often, d...",9,10,21,22
3,16517196,27196066,"Quaking (QKI), which belongs to the STAR famil...","[quake, (, qki, ), ,, which, belong, to, the, ...",0,1,2,3
4,16517195,27196066,QKI plays critical roles in myelinogenesis in ...,"[qki, play, critical, role, in, myelinogenesis...",0,1,34,35


In [19]:
# Span-boundary columns for the two gene mentions, in (start, end) order per mention.
fieldnames = [
    f"{entity}_{boundary}"
    for entity in ("gene1", "gene2")
    for boundary in ("start", "end")
]

In [20]:
# Keyword arguments for the encoder: which columns hold the entity spans and
# which label is passed for each of the two gene mentions.
encode_kwargs = {
    "entity_fieldnames": fieldnames,
    "entity_one": "GENE1_ENTITY",
    "entity_two": "GENE2_ENTITY",
}

# The third positional argument is an empty dict; its role is defined by
# `encode_lemmas` (not shown in this notebook).
encoded_abstracts = pd.DataFrame.from_records(
    encode_lemmas(candidate, stopwords, {}, **encode_kwargs)
)

print(encoded_abstracts.shape)
encoded_abstracts.head()

5420798it [14:48, 6101.26it/s]


(5420798, 2)


Unnamed: 0,parsed_lemmas,candidate_id
0,review|examine|physiological|role|GENE1_ENTITY...,38967952
1,"previously|,|-pron-|superficially|locate|micro...",40608212
2,"ultimate|effect|donor|dependent|,|GENE1_ENTITY...",16688148
3,"GENE1_ENTITY|(|GENE2_ENTITY|qki|)|,|belong|sta...",16517196
4,GENE1_ENTITY|play|critical|role|myelinogenesis...,16517195


In [21]:
# Write the encoded gene-gene lemmas as a tab-separated file without the index.
encoded_abstracts.to_csv(
    "output/all_gg_abstract_encoded_lemmas.tsv", sep="\t", index=False
)