In [None]:
# Objectives:
# - Try using scispacy, BioBERT, and saber for NER
# - Compare this to the GEO annotations

In [1]:
import scispacy
import spacy
import en_ner_bc5cdr_md
from spacy import displacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker

In [2]:
import pandas as pd

In [3]:
# Read the GSE metadata CSV into a DataFrame (path is relative to the notebook's working directory).
gse_data = pd.read_csv(
    filepath_or_buffer="../../../data/01_sample_lists/gse_metadata_all.csv"
)

In [4]:
# Preview the first five metadata rows.
gse_data.head(n=5)

Unnamed: 0,gse,gpl,organism,study_type,title,pubmed_id,submission_date,overall_design,summary
0,GSE3,"GPL9,GPL10",human,oligo,Renal Cell Carcinoma Differential Expression,11691851.0,2001-07-19,,We investigated the changes in gene expression...
1,GSE11,GPL24,mouse,oligo,NOD model of type 1 diabetes,11827943.0,2001-11-19,,We used high density oligonucleotide arrays to...
2,GSE12,GPL24,mouse,oligo,Group1,,2001-11-19,,Replicate group 1 for GSE11. All samples were...
3,GSE15,GPL24,mouse,oligo,Group2,,2002-01-03,,Replicate group 2 for GSE11. All samples were ...
4,GSE51,GPL81,mouse,oligo,Hippocampus replicate samples,,2002-05-28,,Hippocampus gene expression experiments;\tKeyw...


In [13]:
# Shape of the first 50 rows (rows, columns) — a quick sanity check on the loaded table.
gse_data.iloc[:50].shape

(50, 9)

In [12]:
def _load_with_abbreviations(model_name):
    """Load the spaCy model ``model_name`` and attach an AbbreviationDetector.

    Returns a ``(pipeline, detector)`` pair; the detector has already been
    added to the pipeline via ``add_pipe``.
    """
    pipeline = spacy.load(model_name)
    detector = AbbreviationDetector(pipeline)
    pipeline.add_pipe(detector)
    return pipeline, detector


# General scientific model.
nlp, abbreviation_pipe = _load_with_abbreviations("en_core_sci_lg")

# organism, cancer, cell
nlp1, abbreviation_pipe1 = _load_with_abbreviations("en_ner_bionlp13cg_md")

# protein, dna, cell type
nlp2, abbreviation_pipe2 = _load_with_abbreviations("en_ner_jnlpba_md")

# chemical, disease
nlp3, abbreviation_pipe3 = _load_with_abbreviations("en_ner_bc5cdr_md")

In [19]:

# Build one row of NER annotations per GSE: the title, overall design and
# summary are concatenated and run through three NER pipelines (nlp1, nlp2,
# nlp3); the recognised entity strings are grouped into tissue / chemical /
# disease / cell sets.

# Labels emitted by nlp1 that are collected into the "tissue" column.
TISSUE_LABELS = {
    "ORGAN",
    "TISSUE",
    "MULTI-TISSUE_STRUCTURE",
    "ANATOMICAL_SYSTEM",
    "IMMATERIAL_ANATOMICAL_ENTITY",
    "ORGANISM_SUBDIVISION",
}
# Cell-related labels, split by the pipeline that emits them.
CELL_LABELS_NLP2 = {"CELL_LINE", "CELL_TYPE"}
CELL_LABELS_NLP1 = {"CELL"}

ANNOTATION_COLUMNS = ["GSE", "tissue", "chemical", "disease", "cell"]


def _entity_texts(doc, labels):
    """Return the set of entity texts in ``doc`` whose label is in ``labels``."""
    return {ent.text for ent in doc.ents if ent.label_ in labels}


def _join_present(*fields):
    """Join the non-missing ``fields`` with single spaces.

    Missing values (NaN/None) are skipped; formatting them with ``%s`` would
    put the literal token "nan" into the text handed to the NER models.
    """
    return " ".join(str(field) for field in fields if pd.notna(field))


# Collect plain dict records and build the frame once at the end: growing a
# DataFrame with ``append`` inside the loop copies the whole frame on every
# iteration, and ``DataFrame.append`` no longer exists in pandas >= 2.0.
records = []
text_columns = gse_data[["gse", "title", "overall_design", "summary"]]
for gse, title, overall_design, summary in text_columns.itertuples(index=False, name=None):
    text = _join_present(title, overall_design, summary)
    doc1 = nlp1(text)
    doc2 = nlp2(text)
    doc3 = nlp3(text)

    records.append({
        "GSE": gse,
        "tissue": _entity_texts(doc1, TISSUE_LABELS),
        "chemical": _entity_texts(doc3, {"CHEMICAL"}),
        "disease": _entity_texts(doc3, {"DISEASE"}),
        # Cell mentions come from two pipelines; merge them into one set.
        "cell": _entity_texts(doc2, CELL_LABELS_NLP2) | _entity_texts(doc1, CELL_LABELS_NLP1),
    })

# Passing ``columns`` keeps the column order stable and yields a correctly
# shaped empty frame when there are no rows. The resulting index is 0..n-1
# rather than a repeated 0 for every row.
my_df = pd.DataFrame(records, columns=ANNOTATION_COLUMNS)


In [20]:
# Preview the first five annotated rows.
my_df.head(n=5)

Unnamed: 0,GSE,tissue,chemical,disease,cell
0,GSE3,"{non-cancerous renal epithelium samples, kidne...","{oxygen, nucleotide}","{renal clear cell carcinoma, renal cell carcin...","{kidney tumor cells, Cell, Renal Cell}"
0,GSE11,"{spleens, spleen B10.H2g7_S1, thymus, spleen, ...",{},"{diabetes, T1D}",{T cells}
0,GSE12,{},{},{},{}
0,GSE15,{},{},{},{}
0,GSE51,{},{},{},{}


In [21]:
# Write the NER annotations to CSV (the DataFrame index is written as the first column).
# NOTE(review): this path climbs two directories ("../../data/...") while the
# input metadata is read from "../../../data/..." — confirm which depth is intended.
my_df.to_csv(path_or_buf="../../data/02_labeled_data/ner_annot.csv")