In [84]:
import numpy as np
from itertools import product
import h5py

In [18]:
# Tags mapping
tag2index = {}

with open('../data/tags.txt', 'r') as f:
    for line in f:
        line_split = line[:-1].split(' ')
        tag2index[line_split[0]] = int(line_split[1])

# Adding tags for end/start of sentence
tag2index['<t>'] = 8
tag2index['<\t>'] = 9
print(tag2index)

{'I-LOC': 3, '<\t>': 9, 'I-PER': 2, '<t>': 8, 'O': 1, 'I-MISC': 5, 'B-MISC': 6, 'I-ORG': 4, 'B-LOC': 7}


In [77]:

def count_elements(filename):
    # Counting the number of elements to stored (ie num_words + 2*num_sentences)
    num_words = 0
    num_sentences = 0
    with open(filename, 'r') as f:
        for line in f:
            line_split = line[:-1].split('\t')
            # Case blank
            if len(line_split) == 1:
                num_sentences += 1
            else:
                num_words += 1

    return num_words, num_sentences

def build_input_matrix(filename ,num_rows, tag2index, tags=True, word2index=None):
    # Building input matrix with columns: (id, id_in_sentence, id_word, id_tag)
    input_matrix = np.zeros((num_rows, 4), dtype=int)
    input_matrix[0] = [1,1,1,8]
    row = 1
    # Boolean to indicate if a sentence is starting
    starting = False
    # Boolean if a mapping is defined (last element of the mapping is for unknown words)
    if word2index==None:
        test = False
        word2index = {'<s>': 1, '<\s>': 2}
        id_word = 3
    else:
        test = True
    with open('../data/train.num.txt', 'r') as f:
        for line in f:
            line_split = line[:-1].split('\t')
            if starting == True:
                # Start of sentence
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = 1
                input_matrix[row, 2] = 1
                input_matrix[row, 3] = 8
                row+=1
                starting = False
            if len(line_split) == 1:
                # End of sentence
                input_matrix[row, :2] = input_matrix[row-1, :2] + 1
                input_matrix[row, 2] = 2
                input_matrix[row, 3] = 9
                row+=1
                starting = True
            else:
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = int(line_split[1]) + 1
                word_clean = line_split[2].lower()
                if not test:
                    if word_clean not in word2index:
                        word2index[word_clean] = id_word
                        id_word += 1
                    input_matrix[row, 2] = word2index[word_clean]
                else:
                    # Unseen word during train
                    if word_clean not in word2index:
                        input_matrix[row, 2] = len(word2index)
                    else:
                        input_matrix[row, 2] = word2index[word_clean]
                if tags:
                    input_matrix[row, 3] = tag2index[line_split[3]]
                row += 1
    # Add special word if training
    if not test:
        word2index['<unk>'] = len(word2index)+1
    if tags:
        return input_matrix, word2index
    else:
        return input_matrix[:,:3], word2index

In [98]:
def train_hmm(input_matrix, num_features, num_tags):
    # Emission matrix:
    # size (num_features, num_tags)
    # row: observation / colum: tag
    # (un-normalized if smoothing required)
    emission = np.zeros((num_features, num_tags))
    for r in input_matrix:
        emission[r[2]-1, r[3]-1] += 1

    # Transition matrix
    # size (num_tags, num_tags)
    # row: to / colum: from
    # (un-normalized if smoothing required)
    transition = np.zeros((num_tags, num_tags))
    for i in xrange(input_matrix.shape[0] - 1):
        transition[input_matrix[i+1,3]-1, input_matrix[i,3]-1] += 1
        
    return emission, transition


In [99]:
# Train
num_words, num_sentences = count_elements('../data/train.num.txt')
num_rows = num_words + 2*num_sentences
input_matrix_train, word2index = build_input_matrix('../data/train.num.txt', num_rows, tag2index)

# Building the count matrix
num_tags = len(tag2index)
num_features = len(word2index)
emission, transition = train_hmm(input_matrix_train, num_features, num_tags)

# Dev & test
input_matrix_dev, word2index = build_input_matrix('../data/dev.num.txt', num_rows, tag2index, word2index=word2index)
input_matrix_test, word2index = build_input_matrix('../data/test.num.txt', num_rows, tag2index, tags=False, word2index=word2index)

In [101]:
# Should be 0, ie p(<\t>|<t>)
print transition[9-1, 8-1]
print input_matrix_dev.shape
print input_matrix_test.shape

0.0
(55961, 4)
(55961, 3)


In [103]:
# Saving pre-processing
filename = '../data/words_feature.hdf5'
with h5py.File(filename, "w") as f:
    # Model
    f['emission'] = emission
    f['transition'] = transition
    
    f['input_matrix_train'] = input_matrix_train
    f['input_matrix_dev'] = input_matrix_dev
    f['input_matrix_test'] = input_matrix_test

In [104]:
input_matrix_dev[:30,:]

array([[ 1,  1,  1,  8],
       [ 2,  2,  3,  4],
       [ 3,  3,  4,  1],
       [ 4,  4,  5,  5],
       [ 5,  5,  6,  1],
       [ 6,  6,  7,  1],
       [ 7,  7,  8,  1],
       [ 8,  8,  9,  5],
       [ 9,  9, 10,  1],
       [10, 10, 11,  1],
       [11, 11,  2,  9],
       [12,  1,  1,  8],
       [13,  2, 12,  2],
       [14,  3, 13,  2],
       [15,  4,  2,  9],
       [16,  1,  1,  8],
       [17,  2, 14,  3],
       [18,  3, 15,  1],
       [19,  4,  2,  9],
       [20,  1,  1,  8],
       [21,  2, 16,  1],
       [22,  3, 17,  4],
       [23,  4, 18,  4],
       [24,  5, 19,  1],
       [25,  6, 20,  1],
       [26,  7, 21,  1],
       [27,  8, 22,  1],
       [28,  9, 23,  1],
       [29, 10, 24,  1],
       [30, 11,  5,  5]])

In [105]:
print(tag2index)
print 'Transision to <t>'
print transition[8-1]
print 'Transision to <\\t>'
print transition[9-1]

{'I-LOC': 3, '<\t>': 9, 'I-PER': 2, '<t>': 8, 'O': 1, 'I-MISC': 5, 'B-MISC': 6, 'I-ORG': 4, 'B-LOC': 7}
Transision to <t>
[    0.     0.     0.     0.     0.     0.     0.     0.  3344.]
Transision to <\t>
[ 3159.    84.    43.    27.    32.     0.     0.     0.     0.]


In [None]:
# TODO: implement a function to evaluate the predicted sequence ()