In [1]:
import os
import re
import string
from collections import defaultdict
from copy import deepcopy
from pprint import pprint
from random import shuffle

import dill as pickle
from nltk import conlltags2tree
from nltk import pos_tag,pos_tag_sents 
from nltk import tree2conlltags
from nltk.chunk import ChunkParserI
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm

### NLTK resource downloads:

import nltk
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

In [2]:
# Toggle for the GloVe embedding features: gates both the unpickling of the
# embedding table and the embedding part of the per-token feature list.
LOAD_GLOVE = True
# Directory holding the pickled GloVe table. The historical absolute path is
# kept as the default, but can be overridden without editing the notebook by
# exporting GLOVE_DIR before starting the kernel.
GLOVE_DIR = os.environ.get('GLOVE_DIR', '/home/gpsbhargav/NLU/assignment3/project/resources/')
GLOVE_NAME = 'glove.6B.50d.float.pkl'
# Length of the all-zero vector substituted for words missing from the table.
GLOVE_SIZE = 50

In [3]:
def pickler(path,pkl_name,obj):
    """Serialise ``obj`` into the file ``pkl_name`` inside directory ``path``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest available pickle protocol is used.
    """
    target = os.path.join(path, pkl_name)
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def unpickler(path,pkl_name):
    """Load and return the object pickled in file ``pkl_name`` under ``path``.

    NOTE: unpickling can execute arbitrary code; only use this on files you
    produced yourself or otherwise trust.
    """
    source = os.path.join(path, pkl_name)
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [4]:
# Load the pickled GloVe table. Downstream code looks words up by their
# lower-cased form and calls .tolist() on the stored value.
if LOAD_GLOVE:
    glove = unpickler(GLOVE_DIR,GLOVE_NAME)
else:
    # Keep the name defined so that later references to `glove` do not raise
    # NameError when the embeddings are switched off.
    glove = {}

In [5]:
class Token:
    """A single token: an ordered list of features plus an optional label.

    ``features[0]`` is treated as the surface word (see ``get_string`` with
    ``only_word=True``); further features are appended after it.
    """

    def __init__(self, features=None,label=None):
        """
        features = initial feature values (any iterable); copied, so the
                   caller's list is never mutated by add_feature(s)
        label    = gold label of the token, or None when unlabelled
        """
        # `is None` instead of `== None`: identity is the correct test and
        # does not trigger element-wise comparison on array-like arguments.
        self.features = [] if features is None else list(features)
        self.label = label
    
    def add_feature(self,feature):
        """Append one feature value to the end of the feature list."""
        self.features.append(feature)
    
    def add_features(self,features):
        """Append every value of the iterable ``features`` in order."""
        self.features.extend(features)
    
    def get_string(self,only_word=False,label=False,new_line=False):
        """
        Render the token as text.

        only_word = return just features[0]; label and new_line are ignored
        label     = append the label after the features (skipped when the
                    token has no label, instead of raising TypeError)
        new_line  = terminate the string with a newline character

        Raises IndexError when only_word is set and there are no features.
        """
        if only_word:
            return self.features[0]
        s = " ".join(map(str, self.features))
        if label and self.label is not None:
            # str() so that non-string labels (e.g. ints) can be written too
            s = s + " " + str(self.label)
        if new_line:
            s = s + "\n"
        return s

class Sentence:
    """An ordered collection of token objects forming one sentence.

    Tokens are expected to expose a ``features`` list (word at index 0) and a
    ``get_string(only_word, label, new_line)`` method.
    """

    def __init__(self,tokens=None):
        """tokens = optional initial tokens (any iterable); copied into a new list."""
        self.tokens = [] if tokens is None else list(tokens)

    def add_token(self,token):
        """Append ``token`` to the end of the sentence."""
        self.tokens.append(token)

    def get_num_tokens(self):
        """Return the number of tokens in the sentence."""
        return len(self.tokens)
    
    def get_token_list(self):
        """Return the surface word (features[0]) of every token, in order."""
        return [token.features[0] for token in self.tokens]
    
    def get_in_format(self,only_word=False,label=False,new_line=False):
        """
        Return one string per token, as produced by the token's get_string.

        only_word = emit only the surface word of each token
        label     = append each token's label
        new_line  = terminate every token string with a newline and add a
                    final bare newline entry as sentence separator
        """
        # only_word is forwarded to the token; it used to be hard-coded to
        # False here, which silently ignored the caller's argument.
        sent_list = [
            token.get_string(only_word=only_word,label=label,new_line=new_line)
            for token in self.tokens
        ]
        if new_line:
            sent_list.append('\n')
        return sent_list

In [6]:
def read_file(path,label=True):
    """
    Read a one-token-per-line file into a list of Sentence objects.

    path  = file to read (decoded as latin1)
    label = when True the last whitespace-separated column of each line is
            the token label and the preceding columns are its features;
            when False every column is a feature

    A blank (or whitespace-only) line ends the current sentence. Every such
    line closes a sentence, so consecutive blank lines yield empty Sentence
    objects. A final sentence that is not followed by a blank line is kept
    as well. Prints the number of sentences and returns the list.
    """
    data = []
    sentence = Sentence()
    with open(path,'r',encoding='latin1') as f:
        for line in f:
            contents = line.split()
            if not contents:
                # Sentence boundary. Testing the split result (rather than
                # line == '\n') also covers whitespace-only lines, which
                # would otherwise crash on contents[-1].
                data.append(sentence)
                sentence = Sentence()
            elif label:
                sentence.add_token(Token(contents[:-1],contents[-1]))
            else:
                sentence.add_token(Token(contents))

    # File did not end with a blank line: do not lose the last sentence.
    if sentence.get_num_tokens() > 0:
        data.append(sentence)

    print("Number of sentences: ",len(data))
    return data

In [7]:
def write_file(data,path,label=True,new_line=True):
    """
    Write every sentence of ``data`` to ``path`` (latin1 encoded).

    data     = iterable of sentences exposing get_in_format(label=, new_line=)
    label    = forwarded to get_in_format
    new_line = forwarded to get_in_format

    Progress over the sentences is reported through tqdm.
    """
    with open(path,'w+',encoding='latin1') as out:
        for sent in tqdm(data):
            rows = sent.get_in_format(label=label,new_line=new_line)
            out.writelines(rows)
        

In [8]:
# Parse the labelled training file and the unlabelled test file
# (label=False: every column is kept as a feature) into lists of sentences.
original_data = read_file("../train.txt")
test_data_original = read_file("../test.txt",label=False)

Number of sentences:  2924
Number of sentences:  731


In [9]:
def shape(word):
    """
    Classify the orthographic shape of ``word``.

    Returns the label of the first matching pattern, tried in this order:
    number, punct, capitalized, uppercase, lowercase, camelcase, mixedcase,
    wildcard, ending-dot, abbreviation, contains-hyphen; 'other' when
    nothing matches (including the empty string).
    """
    # Raw strings avoid invalid-escape warnings for \. \W and \-.
    # The number pattern groups both alternatives before the `$` anchor;
    # without the group the anchor applied only to the second alternative,
    # so any word that merely started with a digit was labelled 'number'.
    shape_patterns = (
        (r'(?:[0-9]+(?:\.[0-9]*)?|[0-9]*\.[0-9]+)$', 'number'),
        (r'\W+$', 'punct'),
        (r'[A-Z][a-z]+$', 'capitalized'),
        (r'[A-Z]+$', 'uppercase'),
        (r'[a-z]+$', 'lowercase'),
        (r'[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', 'camelcase'),
        (r'[A-Za-z]+$', 'mixedcase'),
        (r'__.+__$', 'wildcard'),
        (r'[A-Za-z0-9]+\.$', 'ending-dot'),
        (r'[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', 'abbreviation'),
        (r'[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', 'contains-hyphen'),
    )
    for pattern, word_shape in shape_patterns:
        if re.match(pattern, word):
            return word_shape
    return 'other'

In [10]:
# Directory and file name of the pickled term table that is loaded into
# `categorized_mesh_terms` (presumably MeSH terms grouped by category).
resource_path = "../resources/"
mesh_cat_pkl = "categorized_terms_ABCDEJN_BOW.pkl"

In [11]:
# Used by get_mesh_categories as a mapping: category key -> container that is
# membership-tested with the lower-cased word.
categorized_mesh_terms = unpickler(resource_path,mesh_cat_pkl)

In [12]:
def get_mesh_categories(word):
    """
    Return one membership flag per category of ``categorized_mesh_terms``.

    The word is lower-cased, then for each category (in the mapping's
    iteration order) the flag is "y_<cat>" when the word is among that
    category's terms and "n_<cat>" otherwise.
    """
    lowered = word.lower()
    return [
        ("y_" if lowered in terms else "n_") + cat
        for cat, terms in categorized_mesh_terms.items()
    ]

In [13]:
# Spot check: exactly one y_/n_ flag is returned per category.
get_mesh_categories("sarcoidosis")

['n_D', 'n_J', 'n_A', 'y_C', 'n_B', 'n_N', 'n_E']

In [14]:
def get_prefixes_and_suffixes(word):
    """
    Return the lower-cased prefixes of length 1-4 followed by the lower-cased
    suffixes of length 1-4 (eight strings in total).

    For words shorter than the requested length the slice simply yields the
    whole word, so entries may repeat.
    """
    lowered = word.lower()
    lengths = range(1, 5)
    prefixes = [lowered[:n] for n in lengths]
    suffixes = [lowered[-n:] for n in lengths]
    return prefixes + suffixes

In [15]:
# Shared NLP helpers. ner_features below only calls the lemmatizer; the
# stemmer is referenced by the feature-dictionary template further down.
stemmer = SnowballStemmer('english')
lmtzr = WordNetLemmatizer() 
 
def ner_features(tokens, index, history=None):
    """
    Build the flat feature list for one token of a POS-tagged sentence.

    tokens  = a POS-tagged sentence [(w1, t1), ...]
    index   = the index of the token we want to extract features for
    history = the previous predicted IOB tags (accepted for interface
              compatibility; currently unused)

    Returns, in order: the token's POS tag and lemma; word, POS tag and
    lemma of the next token; word, POS tag and lemma of the previous token;
    the eight prefix/suffix strings; the per-category y_/n_ flags; and,
    when LOAD_GLOVE is set, the GloVe vector of the lower-cased word
    (GLOVE_SIZE zeros for unknown words). With LOAD_GLOVE unset no
    embedding values are appended.

    Raises IndexError when ``index`` is not a valid position in ``tokens``.
    """
    tokens = list(tokens)
    # Without this check an out-of-range or negative index would silently
    # return the features of a padding placeholder.
    if not 0 <= index < len(tokens):
        raise IndexError("token index out of range")

    # Pad the sequence with placeholders so neighbours exist at both ends
    padded = [('__START2__', '__START2__'), ('__START1__', '__START1__')] + tokens + [('__END1__', '__END1__'), ('__END2__', '__END2__')]

    # shift the index with 2, to accommodate the padding
    index += 2

    word, pos = padded[index]
    prevword, prevpos = padded[index - 1]
    nextword, nextpos = padded[index + 1]

    features = [
        pos,
        lmtzr.lemmatize(word),

        nextword,
        nextpos,
        lmtzr.lemmatize(nextword),

        prevword,
        prevpos,
        lmtzr.lemmatize(prevword),
    ]

    if LOAD_GLOVE:
        glove_key = word.lower()
        if glove_key in glove:
            g_features = glove[glove_key].tolist()
        else:
            g_features = [0] * GLOVE_SIZE
    else:
        # Previously g_features stayed undefined here and the return
        # statement raised NameError.
        g_features = []

    return features + get_prefixes_and_suffixes(word) + get_mesh_categories(word) + g_features

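### Features

Reference template of the full feature set as a named dictionary. It is not meant to be run on its own: the names it uses (`word`, `pos`, `nextword`, ...) are local variables of `ner_features`, which returns a positional list rather than this dictionary.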
feat_dict = {
        #'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'shape': shape(word),
 
        'next-word': nextword,
        'next-pos': nextpos,
        'next-lemma': stemmer.stem(nextword),
        'next-shape': shape(nextword),
 
        'next-next-word': nextnextword,
        'next-next-pos': nextnextpos,
        'next-next-lemma': stemmer.stem(nextnextword),
        'next-next-shape': shape(nextnextword),
 
        'prev-word': prevword,
        'prev-pos': prevpos,
        'prev-lemma': stemmer.stem(prevword),
        #'prev-iob': previob,
        'prev-shape': shape(prevword),
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
        'prev-prev-lemma': stemmer.stem(prevprevword),
        #'prev-prev-iob': prevpreviob,
        'prev-prev-shape': shape(prevprevword),
    }

In [16]:
# Deep copy: list.copy() is shallow, so the Token objects would be shared
# with original_data and re-running this cell would append the features a
# second time to the very same tokens.
new_data = deepcopy(original_data)
for sentence in tqdm(new_data):
    # get_token_list() yields only the surface words, which is what the POS
    # tagger needs regardless of how many features a token already carries.
    tokens_str = pos_tag(sentence.get_token_list())
    for i,token in enumerate(sentence.tokens):
        features = ner_features(tokens_str,i)
        token.add_features(features)

100%|██████████| 2924/2924 [00:07<00:00, 385.83it/s]


In [17]:
# Deep copy: list.copy() is shallow, so the Token objects would be shared
# with test_data_original and re-running this cell would append the
# features a second time to the very same tokens.
new_test_data = deepcopy(test_data_original)
for sentence in tqdm(new_test_data):
    # get_token_list() yields only the surface words, which is what the POS
    # tagger needs regardless of how many features a token already carries.
    tokens_str = pos_tag(sentence.get_token_list())
    for i,token in enumerate(sentence.tokens):
        features = ner_features(tokens_str,i)
        token.add_features(features)

100%|██████████| 731/731 [00:01<00:00, 570.18it/s]


In [18]:
# Inspect the first test sentence after feature extraction: one
# newline-terminated string per token, without labels.
new_test_data[0].get_in_format(label=False,new_line=True)

['Does NNP Does blinding NN blinding __START1__ __START1__ __START1__ d do doe does s es oes does n_D n_J n_A n_C n_B n_N n_E 0.22930000722408295 0.34231001138687134 0.05981700122356415 0.08300299942493439 0.5768499970436096 0.28852999210357666 -0.011265999637544155 -0.1784600019454956 0.16947999596595764 0.327349990606308 0.14047999680042267 0.8287000060081482 -0.2763200104236603 -0.14559000730514526 0.8733000159263611 1.0743999481201172 0.5294100046157837 0.006656699813902378 0.414249986410141 -0.7607600092887878 -0.4442799985408783 0.03714999929070473 0.01876699924468994 0.3484399914741516 0.48342999815940857 -2.188199996948242 -0.7182199954986572 0.11967000365257263 0.7330099940299988 -0.7412199974060059 3.1152000427246094 0.2626500129699707 -0.8239200115203857 -0.4830799996852875 -0.26423001289367676 -0.4987199902534485 0.13840000331401825 -0.1821800023317337 0.24401000142097473 -0.5337700247764587 0.05577300116419792 0.36048001050949097 -0.10412999987602234 0.3241199851036072 -0.

In [19]:
# Inspect the first training sentence with labels. Keyword arguments are
# used because positional (True, True) binds to only_word and label, not to
# label and new_line as in the test-set inspection cell.
new_data[0].get_in_format(label=True,new_line=True)

['Sarcoidosis NN Sarcoidosis involving VBG involving __START1__ __START1__ __START1__ s sa sar sarc s is sis osis n_D n_J n_A y_C n_B n_N n_E 1.3918999433517456 0.23127999901771545 -0.16245000064373016 0.19147999584674835 -1.142300009727478 1.1601999998092651 0.6109700202941895 0.5369200110435486 -0.018831999972462654 0.5237600207328796 0.19789999723434448 -0.016954999417066574 0.9602699875831604 -0.071322001516819 -0.35558998584747314 -0.45969000458717346 -0.579200029373169 0.06953699886798859 0.0550290010869503 0.2514599859714508 -1.1691999435424805 -0.3417600095272064 0.7171099781990051 -0.26747000217437744 0.5116400122642517 0.47450000047683716 0.16333000361919403 0.7372400164604187 0.2549700140953064 -0.025102000683546066 -0.4036499857902527 -0.2348800003528595 0.7593100070953369 -0.7135599851608276 -0.5720999836921692 -0.3852599859237671 0.2422800064086914 -0.011851999908685684 0.773829996585846 0.23239000141620636 -0.41391000151634216 -0.2872599959373474 0.26923999190330505 0.19

In [20]:
# Write the feature-augmented training sentences; labels are included
# because write_file defaults to label=True.
write_file(new_data,"../input_data/temp.txt")

100%|██████████| 2924/2924 [00:02<00:00, 1140.30it/s]


In [21]:
# Write the feature-augmented test sentences without a label column.
write_file(new_test_data,"../test_data/temp.txt",label=False)

100%|██████████| 731/731 [00:00<00:00, 1161.05it/s]
