In [1]:
import random
from random import shuffle

from tqdm import tqdm

In [2]:
class Token:
    """One token of a sentence: a list of string features and an optional label.

    Attributes:
        features: list of strings, written space-separated by ``get_string``.
        label: the token's label string, or ``None`` when it has none.
    """

    def __init__(self, features=None, label=None):
        """Create a token.

        Args:
            features: list of feature strings; a fresh empty list is used
                when omitted, so instances never share a default list.
            label: label string, or ``None`` for an unlabelled token.
        """
        # Identity test on purpose: `== None` would dispatch to the argument's
        # own __eq__, which may answer True (or not return a bool at all).
        self.features = [] if features is None else features
        self.label = label

    def add_feature(self, feature):
        """Append one feature string to this token."""
        self.features.append(feature)

    def get_string(self, label=True):
        """Serialize the token as a single newline-terminated line.

        Args:
            label: when truthy, append the label as the last column.

        Returns:
            The features joined by single spaces, followed by the label if
            requested and present, followed by ``"\\n"``.
        """
        s = " ".join(self.features)
        # An unlabelled token has nothing to append; emitting the features
        # alone avoids a TypeError from concatenating None to a str.
        if label and self.label is not None:
            s = s + " " + self.label
        return s + "\n"

class Sentence:
    """An ordered collection of tokens that together form one sentence.

    Attributes:
        tokens: list of token objects; each must provide
            ``get_string(label=...)`` for ``get_in_format`` to work.
    """

    def __init__(self, tokens=None):
        """Create a sentence.

        Args:
            tokens: initial list of tokens; a fresh empty list is used when
                omitted.
        """
        # Identity test on purpose: `== None` would dispatch to the argument's
        # own __eq__ instead of checking whether an argument was supplied.
        self.tokens = [] if tokens is None else tokens

    def add_token(self, token):
        """Append a token to the end of the sentence."""
        self.tokens.append(token)

    def get_num_tokens(self):
        """Return the number of tokens currently in the sentence."""
        return len(self.tokens)

    def get_in_format(self, label=True):
        """Serialize the sentence as a list of lines.

        Args:
            label: forwarded to each token's ``get_string``.

        Returns:
            One string per token (as produced by ``get_string``) followed by
            a single ``'\\n'`` entry that separates this sentence from the next.
        """
        sent_list = [token.get_string(label=label) for token in self.tokens]
        sent_list.append('\n')
        return sent_list

In [3]:
def read_train_file(path,label=True):
    """Read a token-per-line file into a list of ``Sentence`` objects.

    Each non-blank line is split on whitespace into columns and becomes one
    ``Token``; a blank line ends the current sentence.

    Args:
        path: path of the file to read (decoded as latin1).
        label: when truthy, the last column of every line is taken as the
            token's label and the remaining columns as its features;
            otherwise all columns are features and the token has no label.

    Returns:
        List of ``Sentence`` objects in file order. Every blank line closes a
        sentence (so consecutive blank lines yield empty sentences), and a
        final sentence that is not followed by a blank line is included too.
    """
    data = []
    sentence = Sentence()
    with open(path,'r',encoding='latin1') as f:
        for line in f:
            contents = line.split()
            if not contents:
                # Blank or whitespace-only line: sentence boundary. Testing
                # the split result (rather than `line == '\n'`) keeps a line
                # of stray spaces from reaching `contents[-1]` below.
                data.append(sentence)
                sentence = Sentence()
                continue

            if label:
                token = Token(contents[:-1],contents[-1])
            else:
                token = Token(contents)
            sentence.add_token(token)

    # A file that does not end with a blank line leaves its last sentence
    # still open here; keep it instead of silently dropping it.
    if sentence.get_num_tokens() > 0:
        data.append(sentence)

    print("Number of sentences: ",len(data))
    return data

In [4]:
def write_file(data,path,label=True):
    """Write every sentence in ``data`` to ``path``, showing a progress bar.

    Args:
        data: iterable of objects providing ``get_in_format(label)``, which
            must return the already newline-terminated lines of one sentence.
        path: output file; created or truncated, encoded as latin1.
        label: passed through to ``get_in_format``.
    """
    with open(path,'w+',encoding='latin1') as out:
        for sent in tqdm(data):
            for row in sent.get_in_format(label):
                out.write(row)
        

In [5]:
# Load the whole corpus; read_train_file defaults to label=True, so the last
# column of every line is read as the token's label.
all_data = read_train_file("ner.txt")

Number of sentences:  3655


In [6]:
# Sanity check: show the serialized lines of the third sentence read.
all_data[2].get_in_format()

['Abnormal O\n',
 'presentation O\n',
 'was O\n',
 'the O\n',
 'most O\n',
 'common O\n',
 'indication O\n',
 '( O\n',
 '25.6 O\n',
 '% O\n',
 ', O\n',
 '88 O\n',
 'of O\n',
 '344 O\n',
 ') O\n',
 '\n']

In [7]:
# Number of sentences loaded from the corpus file.
len(all_data)

3655

In [8]:
# Fraction of the sentences that goes into the training split; the remainder
# becomes the test split.
train_size = 0.8

In [9]:
# Seed the module-level RNG before shuffling so the sentence order -- and with
# it the train/test split written to disk -- is the same on every
# "Restart & Run All". Without a seed each run produces different files.
SEED = 42
random.seed(SEED)
# In-place shuffle: all_data keeps the same sentences in a new order.
shuffle(all_data)

In [10]:
# Cut the sentence list at the train_size fraction (rounded down): everything
# before the cut is training data, everything from the cut onwards is test data.
data_len = len(all_data)
train_index = int(train_size * data_len)
training_data, testing_data = all_data[:train_index], all_data[train_index:]

In [11]:
# Report how many sentences ended up in each split.
print("train data size: ",len(training_data))
print("test data size: ",len(testing_data))

train data size:  2924
test data size:  731


In [12]:
# Write the training split with labels (write_file defaults to label=True).
write_file(training_data,"train.txt")

100%|██████████| 2924/2924 [00:00<00:00, 35926.25it/s]


In [13]:
# Write the test split with labels; presumably this is the gold reference
# that predictions are scored against -- confirm against the consumer.
write_file(testing_data,"test_gold.txt")

100%|██████████| 731/731 [00:00<00:00, 29340.62it/s]


In [14]:
# Write the same test split again, this time without the label column.
write_file(testing_data,"test.txt",label=False)

100%|██████████| 731/731 [00:00<00:00, 34351.81it/s]
