### Import Necessary Packages

In [15]:
from string import punctuation, whitespace, ascii_letters

import pandas as pd 
import numpy as np 
import seaborn as sns

from textblob import TextBlob, Blobber, WordList # POS tagging
from textblob import Blobber

import re

## Tagging Sentences with `B`, `I`, or `O` Terminology (Disease Entity Extraction)

In [3]:
def convert_corpus_dict(FILENAME):
    """HELPER METHOD: given the FILENAME for the NCBI corpus file, convert to pandas dataframe"""
    df = {}
    new_entry = True # tracks if we're evaluating a new pubmed article
    for LINE in open(FILENAME,'r'):
        if LINE == '\n': # articles are separated by newline characters
            new_entry = True
        else:
            if new_entry == True:
                # if LINE contains text & new_entry is True, we have a new article to input w/ title
                new_entry = False 
                PubMed_ID = int(LINE.strip().split("|")[0])
                df[PubMed_ID] = {} # define the new pubmed entry, then extract title 
                df[PubMed_ID]["Title"] = LINE.strip().split("|")[2]
            elif new_entry == False:
                # if LINE contains text & new_entry is False, we have more information to input
                if "|a|" in LINE: # abstract information 
                    df[PubMed_ID]["Abstract"] = LINE.strip().split("|")[2][0:-1]
                elif "\t" in LINE: # identified mentions of diseases, etc.
                    if "Mentions" not in df[PubMed_ID].keys(): 
                        # there might be multiple lines... add first occurence properly
                        df[PubMed_ID]["Mentions"] = []
                    df[PubMed_ID]["Mentions"].append(LINE.strip().split("\t")[1:])
    
    # we've gone through the whole file, so output as proper pandas dataframe
    return(pd.DataFrame(df).T[["Title","Abstract","Mentions"]])

In [4]:
#######################################################
##### Import the NCBI Disease Corpus TEST Set !!! #####
#######################################################
testa_set = 'NCBItestset_corpus.txt'
df_testa = convert_corpus_dict(testa_set)

devel_set = 'NCBIdevelopset_corpus.txt'
df_devel = convert_corpus_dict(devel_set)

train_set = 'NCBItrainset_corpus.txt'
df_train = convert_corpus_dict(train_set)
#######################################################
df_all = pd.concat([df_devel,df_train,df_testa])

Combine all of the datasets provided by the NCBI Disease Corpus (the Development, Test, and Train datasets) to tag with the correct `BIO` terminology per word.

In [5]:
# combine the title and abstract information
df_all["Text"] = df_all.apply(lambda row: row["Title"] + " " + row["Abstract"], axis=1)
df_all.head()

Unnamed: 0,Title,Abstract,Mentions,Text
8808605,Somatic-cell selection is a major determinant ...,X-chromosome inactivation in mammals is regard...,"[[154, 171, enzyme deficiency, DiseaseClass, ...",Somatic-cell selection is a major determinant ...
9050866,"The ataxia-telangiectasia gene product, a cons...",The product of the ataxia-telangiectasia gene ...,"[[4, 25, ataxia-telangiectasia, Modifier, D001...","The ataxia-telangiectasia gene product, a cons..."
9012409,Molecular basis for Duarte and Los Angeles var...,Human orythrocytes that are homozygous for the...,"[[20, 63, Duarte and Los Angeles variant galac...",Molecular basis for Duarte and Los Angeles var...
8755645,An intronic mutation in a lariat branchpoint s...,The first step in the splicing of an intron fr...,"[[78, 102, inherited human disorder, DiseaseCl...",An intronic mutation in a lariat branchpoint s...
8751855,Genetic heterogeneity in hereditary breast can...,The common hereditary forms of breast cancer h...,"[[25, 49, hereditary breast cancer, SpecificDi...",Genetic heterogeneity in hereditary breast can...


Go through each text snippet within the corpus and tag each article's combined Title and Abstract with `BIO` terms.

In [32]:
BIOs = []
for LINE in df_all.iterrows():
    TEXT = LINE[1]["Text"]
    sentence_building = ""
    prev_index = 0
    for MENTION in LINE[1]["Mentions"]:
        # go through each manual annotation and convert those phrases to BI depending
        # on the length of the text; essentially, replace the words within the text with BI
        s_pos = int(MENTION[0]) 
        e_pos = int(MENTION[1])
        # gather the phrase and add in the BIO terms
        disease_phrase = TEXT[s_pos:e_pos] 
        bio_disease = ['B*' if i == 0 else 'I*' for i in range(len(disease_phrase.split(" ")))]
        # append to the build of the sentence and continue going through the mentions made
        sentence_building += TEXT[prev_index:s_pos] + ' '.join(bio_disease) 
        prev_index = e_pos
    # add the rest of the sentence 
    sentence_building += TEXT[prev_index:]
    
    BIO_translation = []
    for WORD in sentence_building.split():
        # split the sentence by WHITESPACE, and then convert the sentence to either B I or O 
        # depending on the length of the disease phrase and the number of extra words in the sentence
        if WORD == 'B*':
            BIO_translation.append('B')
        elif WORD == 'I*':
            BIO_translation.append('I')
        else:
            BIO_translation.append('O')
    
    # collect all of the sentences to add to the dataframe later
    BIOs.append(BIO_translation)

df_all["BIO Translation"] = BIOs
df_all.head()

Unnamed: 0,Title,Abstract,Mentions,Text,BIO Translation
8808605,Somatic-cell selection is a major determinant ...,X-chromosome inactivation in mammals is regard...,"[[154, 171, enzyme deficiency, DiseaseClass, ...",Somatic-cell selection is a major determinant ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
9050866,"The ataxia-telangiectasia gene product, a cons...",The product of the ataxia-telangiectasia gene ...,"[[4, 25, ataxia-telangiectasia, Modifier, D001...","The ataxia-telangiectasia gene product, a cons...","[O, B, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
9012409,Molecular basis for Duarte and Los Angeles var...,Human orythrocytes that are homozygous for the...,"[[20, 63, Duarte and Los Angeles variant galac...",Molecular basis for Duarte and Los Angeles var...,"[O, O, O, B, I, I, I, I, O, O, O, O, O, O, O, ..."
8755645,An intronic mutation in a lariat branchpoint s...,The first step in the splicing of an intron fr...,"[[78, 102, inherited human disorder, DiseaseCl...",An intronic mutation in a lariat branchpoint s...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, ..."
8751855,Genetic heterogeneity in hereditary breast can...,The common hereditary forms of breast cancer h...,"[[25, 49, hereditary breast cancer, SpecificDi...",Genetic heterogeneity in hereditary breast can...,"[O, O, O, B, I, O, O, O, O, O, O, O, O, O, O, ..."


### Output Files for Further Processing

In [45]:
#### OUTPUT TO FILE FOR CLARENCE TO PROCESS
lines = []
for ROW in df_all.iterrows():
    ID = ROW[0]
    tags = ROW[1]["BIO Translation"]
    
    tags = [t.lstrip(whitespace) for t in tags]
    tags = [t.rstrip(whitespace) for t in tags]
    
    TRANSLATION = '\t'.join(tags)
    lines.append("{0}\t{1}".format(ID,TRANSLATION))

f = open("20181204_MR_ncbi-disease-corpus_bio-translation.txt", "w")
f.write('\n'.join(lines))
f.close()

In [59]:
#### OUTPUT JUST TEXT FOR CLARENCE 
corpus = []
for ROW in df_all.iterrows():
    ID = ROW[0]
    document = ROW[1]["Text"].split()
    # Remove leading/trailing whitespace and punctuation
#     document = [word.lstrip(punctuation) for word in document if len(word.lstrip(punctuation)) != 0]
#     document = [word.rstrip(punctuation) for word in document if len(word.rstrip(punctuation)) != 0]
    
    # Lemmatize each word
    textb = WordList(document)
    textb = [word.stem() for word in textb]
    
    # Text 
    TEXT = '\t'.join(textb)
    corpus.append("{0}\t{1}".format(ID,TEXT))

f = open("20181204_MR_ncbi-disease-corpus_text.txt", "w")
f.write('\n'.join(corpus))
f.close()

In [35]:
df_train.shape

(592, 3)