In [2]:
#for creation of labelled_data.txt
import re
import pandas as pd  #for preparing training data
from collections import Counter
import itertools

In [3]:
# tags from corpus
tagged_corpus = pd.read_csv("data/corpus_tags.tsv", sep="\t", names=["pmid", "start", "end", "word", "type", "umls_id"])
tagged_corpus = tagged_corpus.dropna()
tagged_corpus.head()

Unnamed: 0,pmid,start,end,word,type,umls_id
0,25763772,0,5,DCTN4,"T116,T123",C4308010
1,25763772,23,63,chronic Pseudomonas aeruginosa infection,T047,C0854135
2,25763772,67,82,cystic fibrosis,T047,C0010674
3,25763772,83,120,Pseudomonas aeruginosa (Pa) infection,T047,C0854135
4,25763772,124,139,cystic fibrosis,T047,C0010674


In [4]:
classes = []
with open("data/vocab_class.txt") as ftags:
    classes = [i.strip() for i in ftags.readlines()]
    ftags.close()
classes

['T047', 'T005', 'T007', 'T016']

In [5]:
# for each sentence the words and tags for each words is returned
def get_words_tags(text, pmid,start=0):
    words = []
    tags = []  
    typ = "0"
    df = tagged_corpus[(tagged_corpus["pmid"]==int(pmid)) & (tagged_corpus["end"] <= start + len(text)) & (tagged_corpus["start"] >= start)]
    indices = df.index.values
    for word in text.split():
        words.append(word)
        for i in indices:
            text2 = df.loc[i]["word"]
            if word in text2.split() and set(df.loc[i]["type"].split(',')).intersection(classes):
                typ = df.loc[i]["type"]
                break
            else:
                typ = "0"
        tags.append(typ)
    return words,tags

In [6]:
fr = open("data/corpus_pubtator.txt")
#fr = open("para.txt")
fw = open("data/labelled_data.txt", "w")

title_regex = re.compile("^\d+\|t\|")
abstract_regex = re.compile("^\d+\|a\|") 

In [7]:
#opens the file and sends the title and abstract line by line to get_words_tags function
abstract_count = 0
for i in fr.readlines():
    line = i.strip()
    title_search = title_regex.search(line)
    abstract_search = abstract_regex.search(line)
    if title_search:
        title_line = title_regex.sub('',line)
    elif abstract_search:
        abstract = abstract_regex.sub('',line)
        pmid = int(abstract_search.group(0)[:-3])
        words,tags = get_words_tags(title_line,pmid,0)
        fw.write("\t".join(words)+"\n")
        fw.write("\t".join(tags)+"\n")
        title_size = len(title_line) + 1                     # 1 for \n        
        for sentence in re.split("\.", abstract):
            if len(sentence) == 0:
                continue
            _size = abstract.index(sentence) + title_size
            words, tags = get_words_tags(sentence, pmid)
            fw.write("\t".join(words)+"\n")
            fw.write("\t".join(tags)+"\n") 
        abstract_count += 1
        if abstract_count%10 == 0:
            print("PMID {} written to file. Abstract Count: {}".format(pmid, abstract_count))
fw.close()
fr.close()

PMID 26868132 written to file. Abstract Count: 10
PMID 27113387 written to file. Abstract Count: 20


KeyboardInterrupt: 

Training Data

In [9]:
#Sorting the data from labelled_data.txt to a df (data frame)
fr = open("data/labelled_data.txt")
data =[]
tags = []
text = True
for line in fr.readlines():
    if len(line.strip()) == 0:
        continue
    else:
        if text:
            line_txt = line
            text = not text
        else:
            line_typ = line
            tags_typ = line_typ.split()
            unique = set(tags_typ)
            unique.discard('0')
            data.append([Counter(tags_typ), line_txt, line_typ, len(unique)])
            tags.extend(unique)
            text = not text
fr.close()

df = pd.DataFrame(data, columns=["count", "line_word", "line_tag","unique_tags"])
tags = list(set(tags))
for i in tags:
    df[i] = df["count"].apply(lambda x: x[i] if i in x.keys() else 0)
df.head()

Unnamed: 0,count,line_word,line_tag,unique_tags,T047,"T019,T047",T007
0,"{'0': 6, 'T047': 6}",DCTN4\tas\ta\tmodifier\tof\tchronic\tPseudomon...,0\t0\t0\t0\t0\tT047\tT047\tT047\tT047\t0\tT047...,1,6,0,0
1,"{'T047': 11, '0': 31}",Pseudomonas\taeruginosa\t(Pa)\tinfection\tin\t...,T047\tT047\tT047\tT047\t0\tT047\tT047\t0\t0\t0...,1,11,0,0
2,"{'0': 28, 'T047': 1}",By\tusing\texome\tsequencing\tand\textreme\tph...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,1,1,0,0
3,"{'0': 34, 'T047': 5}",The\tpurpose\tof\tthis\tstudy\twas\tto\tinvest...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0...,1,5,0,0
4,{'0': 15},Polymerase\tchain\treaction\tand\tdirect\tsequ...,0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\n,0,0,0,0


In [None]:
training = []
testing = []
included_index = []
for i in tags:
    df_tag = df[df[i] > 0]
    df_tag = df_tag[~df_tag.index.isin(included_index)]
    included_index.extend(df_tag.index.tolist())
    random_sample = df_tag.sample(frac=0.75, random_state = 112358)
    training.append(random_sample)
    testing.append(df_tag[~df_tag.index.isin(random_sample.index)])

df_training = pd.concat(training)
df_training.drop_duplicates(subset=["line_word", "line_tag"])

df_testing = pd.concat(testing)
df_testing.drop_duplicates(subset=["line_word", "line_tag"])

# write training
fout = open("data/training_data.txt", "w")
for i in df_training.index:
    fout.write(df_training.ix[i]["line_word"]+"\n"+df_training.ix[i]["line_tag"]+"\n")

fout.close()

# write testing
fout = open("data/testing_data.txt", "w")
for i in df_testing.index:
    fout.write(df_testing.ix[i]["line_word"]+"\n"+df_testing.ix[i]["line_tag"]+"\n")

fout.close()

In [None]:
fr = open("data/corpus_pubtator.txt")
#fr = open("para.txt")
fw = open("data/abstract.txt","w")

title_regex = re.compile("^\d+\|t\|")
abstract_regex = re.compile("^\d+\|a\|") 

In [None]:
#opens the file and sends the title and abstract line by line to get_words_tags function

abstract_count = 0
for i in fr.readlines():
    line = i.strip()
    title_search = title_regex.search(line)
    abstract_search = abstract_regex.search(line)
    if title_search:
        title_line = title_regex.sub('',line)
    elif abstract_search:
        abstract = abstract_regex.sub('',line)
        pmid = int(abstract_search.group(0)[:-3])
        fw.write(title_line+"\n")
        fw.write(abstract+"\n")
        abstract_count += 1
        if abstract_count%100 == 0:
            print("PMID {} written to file. Abstract Count: {}".format(pmid, abstract_count))
fw.close()
fr.close()