In [0]:
#for creation of labelled_data.txt
import re
import pandas as pd  #for preparing training data
from collections import Counter
import itertools
from keras.preprocessing.text import text_to_word_sequence

In [0]:
# Entity annotations of the corpus, one row per tagged mention.
# The TSV is read with explicit column names and any row that has a missing
# value in one of those columns is discarded.
annotation_columns = ["pmid", "start", "end", "word", "type", "umls_id"]
tagged_corpus = pd.read_csv(
    "data/corpus_tags.tsv",
    sep="\t",
    names=annotation_columns,
).dropna()
tagged_corpus.head()

Unnamed: 0,pmid,start,end,word,type,umls_id
0,25763772,0,5,DCTN4,"T116,T123",C4308010
1,25763772,23,63,chronic Pseudomonas aeruginosa infection,T047,C0854135
2,25763772,67,82,cystic fibrosis,T047,C0010674
3,25763772,83,120,Pseudomonas aeruginosa (Pa) infection,T047,C0854135
4,25763772,124,139,cystic fibrosis,T047,C0010674


In [0]:
# Class labels to tag, one per line of the vocabulary file.
with open("data/vocab_class.txt") as ftags:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    # Blank lines are skipped so an empty string never becomes a class label.
    classes = [label for label in (line.strip() for line in ftags) if label]
print(classes)

['T016']


In [0]:
# for each sentence the words and the tag of each word are returned
def get_words_tags(text, pmid,start=0):
    """Tokenize ``text`` and assign one tag per token.

    The annotations considered are the rows of the module-level
    ``tagged_corpus`` whose ``pmid`` matches and whose ``[start, end]`` span
    lies inside ``[start, start + len(text)]``. Only annotations whose
    comma-separated ``type`` shares at least one entry with the module-level
    ``classes`` contribute a label.

    NOTE: a token is tagged when it occurs in the tokenized text of a
    qualifying annotation; the token's own position inside ``text`` is not
    checked. When several annotations contain the same token, the first one
    in ``tagged_corpus`` order wins.

    Parameters
    ----------
    text : str
        Title or sentence to tokenize.
    pmid : int or str
        Document identifier; converted with ``int`` before comparison.
    start : int, optional
        Offset of ``text`` inside the document (default 0).

    Returns
    -------
    (list of str, list of str)
        The tokens and, aligned with them, either ``"O"`` or the matched
        class labels joined with ``","`` in sorted order.
    """
    # Same filter set for the sentence and the annotations. '.' and "'" are not
    # part of it, so the tokenizer leaves them attached to tokens.
    token_filters = '!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n'

    span_end = start + len(text)
    in_span = tagged_corpus[
        (tagged_corpus["pmid"] == int(pmid))
        & (tagged_corpus["end"] <= span_end)
        & (tagged_corpus["start"] >= start)
    ]

    # Build the token -> label lookup once, instead of re-tokenizing every
    # annotation for every word of the sentence.
    wanted = set(classes)
    label_for_token = {}
    for mention, type_field in zip(in_span["word"], in_span["type"]):
        matched = wanted.intersection(type_field.split(','))
        if not matched:
            continue
        # Sorted so a multi-class label is spelled the same way on every run
        # (plain set iteration order is not stable across interpreter runs).
        label = ','.join(sorted(matched))
        for token in text_to_word_sequence(mention, filters=token_filters):
            # setdefault keeps the label of the first annotation containing the token.
            label_for_token.setdefault(token, label)

    words = text_to_word_sequence(text, filters=token_filters)
    tags = [label_for_token.get(word, "O") for word in words]
    return words,tags

In [0]:
# Input corpus and output file. Both handles are closed at the end of the
# processing loop that consumes them.
fr = open("data/corpus_pubtator.txt")
fw = open("data/labelled_data_T016.txt", "w")

# Line prefixes "<digits>|t|" (title) and "<digits>|a|" (abstract).
# Raw strings keep `\d` and `\|` as regex escapes instead of invalid string escapes.
title_regex = re.compile(r"^\d+\|t\|")
abstract_regex = re.compile(r"^\d+\|a\|")

In [0]:
from tqdm.notebook import tqdm

# Walk the corpus file line by line. A title line is remembered; when the
# following abstract line arrives, the title and then every abstract sentence
# are passed to get_words_tags and written as two tab-separated lines
# (words, then tags).
print('Classes being used: ',classes)

abstract_count = 0
try:
    fr.seek(0)                      # always start from the top of the corpus
    corpus_lines = fr.readlines()   # read once; tqdm takes the total from the list
    for raw_line in tqdm(corpus_lines, desc='Abstract', ncols='700px'):
        line = raw_line.strip()
        title_search = title_regex.search(line)
        abstract_search = abstract_regex.search(line)
        if title_search:
            title_line = title_regex.sub('', line)
        elif abstract_search:
            abstract = abstract_regex.sub('', line)
            pmid = int(abstract_search.group(0)[:-3])    # drop the trailing "|a|"
            words, tags = get_words_tags(title_line, pmid, 0)
            fw.write("\t".join(words)+"\n")
            fw.write("\t".join(tags)+"\n")
            title_size = len(title_line) + 1             # 1 for \n
            # Track each sentence's offset with a running counter. Looking the
            # sentence up with abstract.index() returns the FIRST occurrence,
            # which is wrong for repeated sentences or for a sentence that is a
            # substring of earlier text.
            sentence_start = title_size
            for sentence in re.split(r"\. ", abstract):
                if sentence:
                    words, tags = get_words_tags(sentence, pmid, sentence_start)
                    fw.write("\t".join(words)+"\n")
                    fw.write("\t".join(tags)+"\n")
                sentence_start += len(sentence) + 2      # 2 for the ". " separator
            abstract_count += 1
            if abstract_count%100 == 0:
                print("PMID {} written to file. Abstract Count: {}".format(pmid, abstract_count))
finally:
    # Close (and flush) both files even if labelling fails part-way through.
    fw.close()
    fr.close()

Classes being used:  ['T016']


HBox(children=(IntProgress(value=0, description='Abstract', layout=Layout(flex='2'), max=365672, style=Progres…

PMID 27255600 written to file. Abstract Count: 100
PMID 27288403 written to file. Abstract Count: 200
PMID 27322653 written to file. Abstract Count: 300
PMID 27355434 written to file. Abstract Count: 400
PMID 27390849 written to file. Abstract Count: 500
PMID 27430240 written to file. Abstract Count: 600
PMID 27454637 written to file. Abstract Count: 700
PMID 27482869 written to file. Abstract Count: 800
PMID 27510784 written to file. Abstract Count: 900
PMID 27543896 written to file. Abstract Count: 1000
PMID 27575503 written to file. Abstract Count: 1100
PMID 27609696 written to file. Abstract Count: 1200
PMID 27634799 written to file. Abstract Count: 1300
PMID 27660632 written to file. Abstract Count: 1400
PMID 27692572 written to file. Abstract Count: 1500
PMID 27724977 written to file. Abstract Count: 1600
PMID 27760426 written to file. Abstract Count: 1700
PMID 27787956 written to file. Abstract Count: 1800
PMID 27812858 written to file. Abstract Count: 1900
PMID 27846331 written

In [0]:
# "DONE" banner. Raw strings are used because the art contains backslashes
# followed by ' ', '_' and '|', which are invalid escape sequences in a normal
# string literal; the printed text is unchanged.
print(r'__     ______    ___    __   _________')
print(r'| \   /  __  \  | \ \   | |  |   _____|')
print(r'|  |  | |  | |  | |\ \  | |  |  |_____')
print(r'|  |  | |__| |  | | \ \ | |  |  |_____')
print(r'|_/   \______/  |_|  \_\|_|  |________|')

__     ______    ___    __   _________
| \   /  __  \  | \ \   | |  |   _____|
|  |  | |  | |  | |\ \  | |  |  |_____
|  |  | |__| |  | | \ \ | |  |  |_____
|_/   \______/  |_|  \_\|_|  |________|
