# Create vocabularies of words and tags from datasets

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os, sys
from collections import Counter
import json

In [3]:
# Extra word for PAD (padding) and UNK (unrecognized word)
PAD_WORD = '<PAD>'
PAD_TAG = 'O'
UNK_WORD = '<UNK>'

## I/O helper functions

In [4]:
def write_dict_to_json(d, out_path):
    """
    Writes a dictionary object to a json file
    - d (dict)
    - output (str): path to the output filename
    """
    with open(out_path, 'w') as f:
        json.dump(d, f, indent=4)
        
def write_list_to_file(myIterable, file_path):
    """
    Write each element in the input to a new line
    Args:
    - myIterable (iterable)
    - file_path (str): path to the file
    """
    with open(file_path, 'w') as f:
        for ele in myIterable:
            f.write( str(ele)+"\n")

## Process datasets

In [5]:
# print(sys.path)
if ".." not in sys.path:
    sys.path.insert(0, "..")

In [8]:
from nlp_utils import data_converter, conlleval

- Load datasets

In [9]:
# train_data_iob = data_converter.read_conll('../data/eng.train')[1:] #ignore header
train_data_bio = data_converter.read_conll('../data/train.bio')[1:] #ignore header
testa_data_bio = data_converter.read_conll('../data/testa.bio')[1:]
testb_data = data_converter.read_conll('../data/eng.testb')[1:]

In [10]:
all_data = train_data_bio.copy()
all_data.extend(testa_data_bio)
all_data.extend(testb_data)

In [8]:
datasets = [train_data_bio, testa_data_bio, testb_data, all_data]
print("Dataset sizes")
print(" train, testa, testb, all")
print(list(map(lambda data: len(data), datasets)))

Dataset sizes
 train, dev, test, all
[14985, 3464, 3683, 22132]


- Collect all words as we encounter in the three datasets

In [25]:
all_words = []
for dataset in datasets:
    for sent in dataset:
        for (w, *_) in sent:
            all_words.append(w)   

In [26]:
print("Number of token occurances: ", len(all_words))
word_counter = Counter(all_words)
print("Number of unique words: ", len(word_counter))

Number of token occurances:  605600
Number of unique words:  30289


In [27]:
import matplotlib.pyplot as plt
# plt.hist(all_words, bins=50)

- Add PAD and UNK as extra words

In [28]:
# hyperparameter for thresholding the most common N words to keep in our vocab
N_MOST_COMMON = 15000 #len(word_counter)/2
common_words = word_counter.most_common(N_MOST_COMMON)
plt.hist(common_words.values(),bins=50)
# OR, filter by MIN_COUNT and MAX_COUNT
# MIN_COUNT = 5
# MAX_COUNT = 
for i,(w,c) in enumerate(common_words[::-1]):
    if i<10:
        print(w,c)

Montoya 4
Tacambaro 4
convoy 4
SANTIAGO 4
gift 4
Woman 4
Sleeping 4
Fidel 4
novel 4
Granma 4


In [29]:
words = [ w for w,_ in common_words]
print(len(words) == len(common_words))

True


In [30]:
# Add PAD and UNK as extra words
words.extend([PAD_WORD, UNK_WORD])
write_list_to_file(words, '../data/words.txt')

In [31]:
word2idx = { w:i for i,w in enumerate(words) }
print(len(word2idx))

15002


In [36]:
write_dict_to_json(word2idx, "../data/word2idx.json")

In [41]:
# Tag to idx mapping 
tag2idx = {}
it = 0 #conunters
for sent in train_data_bio:
    for (*_, tag) in sent:
        if tag in tag2idx:
            continue
        else:
            tag2idx[tag] = it
            it += 1
            

In [46]:
# Add START and STOP tags
START_TAG = '<START>'
STOP_TAG = '<STOP>'
# Add START_TAG and STOP_TAG to tag2idx
temp_n = len(tag2idx)
tag2idx.update( {START_TAG:temp_n,
                 STOP_TAG:temp_n+1} )
print(tag2idx)

{'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8, '<START>': 9, '<STOP>': 10}


In [47]:
write_dict_to_json(tag2idx, '../data/tag2idx.json')

## Sentences proprocessing
We need to map each word in a sentence to an index in our `word2idx` map. Similarly, we need to map each tag in a tag sequence to an index in `tag2idx`. We perform this transformation on each dataset. Two subtle, yet important details are:

1.  If we encounter a word that doesn't exist in our `word2idx` map (because the word was not one of the most N_MOST_COMMON words), then we mark the word as UNK word by assigning the index=`word2idx['<UNK>']`.  

2. Our RNN model takes in the sequence of sentences of same length.  Therefore, we fill in a shorter sentences to SEQ_LENGTH (50 by default) using PAD_WORD. 



In [54]:
train_sentences = []
train_labels = []
c_unks = 0
for sent in train_data_bio:
    sent_i = []
    tag_i = []
    for (w,*_,t) in sent:
        idx = word2idx[w] if w in word2idx else word2idx[UNK_WORD]
        sent_i.append(idx)
        tag_i.append(tag2idx[t])
        
        # count how many unknown words in train dataset
        if w not in word2idx:
            c_unks += 1
            
    train_sentences.append(sent_i)
print("Num. of unknown words: ", c_unks)


Num. of unknown words:  9682


In [57]:
print("Total word occurances:\n",
      len([w for sent in train_data_bio for w,*_ in sent]))

Total word occurances:
 204562


In [58]:
write_list_to_file(train_sentences, '../data/train_sentences.txt')

In [66]:
def process_labelled_dataset(dataset, word2idx, tag2idx):
    """
    dataset: a list of sentence. Sentence is a list of word_infos.
            word_info is a tuple of (word, POS, ..., tag)
    Returns:
    train_sentences and train_labels after mapping to indices space.
    """
    train_sentences = []
    train_labels = []
    c_unks = 0
    for sent in dataset:
        sent_i = []
        tag_i = []
        for (w,*_,t) in sent:
            idx = word2idx[w] if w in word2idx else word2idx[UNK_WORD]
            sent_i.append(idx)
            tag_i.append(tag2idx[t])

            # count how many unknown words in train dataset
            if w not in word2idx:
                c_unks += 1

        train_sentences.append(sent_i)
        train_labels.append(tag_i)
    print("Num. of unknown words: ", c_unks)
    return train_sentences, train_labels

def process_unlabelled_dataset(dataset, word2idx, tag2idx):
    """
    dataset: a list of sentence. Sentence is a list of word_infos.
            word_info is a tuple of (word, POS). Notice this does
            not include tag (label)
    Returns:
    - sentences: a list of sentence after mapping each word to indices
    """
    sentences = []
    c_unks = 0
    for sent in dataset:
        sent_i = []
        for (w,*_) in sent:
            idx = word2idx[w] if w in word2idx else word2idx[UNK_WORD]
            sent_i.append(idx)

            # count how many unknown words in train dataset
            if w not in word2idx:
                c_unks += 1

        sentences.append(sent_i)
    print("Num. of unknown words: ", c_unks)
    return sentences
    

In [71]:
train_sentences, train_labels = process_labelled_dataset(train_data_bio, word2idx, tag2idx)
testa_sentences, testa_labels = process_labelled_dataset(testa_data_bio, word2idx, tag2idx)
test_sentences = process_unlabelled_dataset(test_data, word2idx, tag2idx)

Num. of unknown words:  9682
Num. of unknown words:  2547
Num. of unknown words:  3549


In [65]:
# write_list_to_file(train_sentences, '../data/train_sentences.txt')
# write_list_to_file(train_labels, '../data/train_labels.txt')

# write_list_to_file(dev_sentences, '../data/dev_sentences.txt')
# write_list_to_file(dev_labels, '../data/dev_labels.txt')

# write_list_to_file(test_sentences, '../data/test_sentences.txt')

In [83]:
import joblib
import numpy as np
all_train_sentences = np.array(train_sentences)
all_train_labels = np.array(train_labels)
joblib.dump(all_train_sentences, '../data/all_train_sentences.sav')
joblib.dump(all_train_labels, '../data/all_train_labels.sav')

['../data/all_train_labels.sav']

In [72]:
joblib.dump(testa_sentences, '../data/testa_sentences.sav')
joblib.dump(testa_labels, '../data/testa_labels.sav')

joblib.dump(testb_sentences, '../data/testb_sentences.sav')

['../data/test_sentences.sav']

In [73]:
joblib.dump(word2idx, '../data/word2idx.sav')
joblib.dump(tag2idx, '../data/tag2idx.sav')

['../data/tag2idx.sav']

In [74]:
# Prepare some dev set
len(train_sentences)

14985

In [75]:
len(train_labels)

14985

In [76]:
n_dev = int(len(train_sentences)*0.3)

In [77]:
print(n_dev)

4495


In [86]:
indices = [i for i in range(len(all_train_sentences))]
print(indices[:10])
shuffle(indices)
print("shuffled: ", indices[:10])

# get indices for dev and train data
dev_indices = indices[:n_dev]
train_indices = indices[n_dev:]

# split to train and dev data
train_sentences = all_train_sentences[train_indices]
train_labels = all_train_labels[train_indices]

dev_sentences = all_train_sentences[dev_indices]
dev_labels = all_train_labels[dev_indices]


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
shuffled:  [9140, 4747, 7470, 3488, 4291, 188, 2335, 7385, 10654, 1722]


10490

In [88]:
len(dev_sentences)

4495

In [89]:
joblib.dump(train_sentences, '../data/train_sentences.sav')
joblib.dump(train_labels, '../data/train_labels.sav')

joblib.dump(dev_sentences, '../data/dev_sentences.sav')
joblib.dump(dev_labels, '../data/dev_labels.sav')

['../data/dev_labels.sav']