In [189]:
import numpy as np
from itertools import product
import h5py
import re

In [18]:
# Tags mapping: tag string -> integer id, read from '../data/tags.txt'
# (one "<tag> <id>" pair per line).
tag2index = {}

with open('../data/tags.txt', 'r') as f:
    for line in f:
        # split() without argument drops the line terminator and tolerates
        # repeated spaces, so the last line still parses when the file does
        # not end with a newline (line[:-1] would drop the last character
        # of its id).
        line_split = line.split()
        if not line_split:
            # Skip blank lines instead of failing on them
            continue
        tag2index[line_split[0]] = int(line_split[1])

# Adding tags for end/start of sentence
# NOTE: the end-of-sentence key is written with the escape '\t', i.e. it
# contains a literal TAB character between '<' and '>'.
tag2index['<t>'] = 8
tag2index['<\t>'] = 9
print(tag2index)

{'I-LOC': 3, '<\t>': 9, 'I-PER': 2, '<t>': 8, 'O': 1, 'I-MISC': 5, 'B-MISC': 6, 'I-ORG': 4, 'B-LOC': 7}


In [233]:

def count_elements(filename, tags=True):
    """Count the word lines and the sentence separators of a corpus file.

    Every line loses its last character (the line terminator) and is then
    split on a tab when ``tags`` is true, on a single space otherwise.
    A line that yields exactly one field (e.g. a blank line) is counted as
    a sentence separator, any other line as a word.

    Returns the tuple (num_words, num_sentences); the number of elements to
    store is then num_words + 2*num_sentences.
    """
    separator = '\t' if tags else ' '
    word_total = 0
    sentence_total = 0
    with open(filename, 'r') as handle:
        for raw_line in handle:
            fields = raw_line[:-1].split(separator)
            if len(fields) > 1:
                # Several fields: a regular token line
                word_total += 1
            else:
                # Single field: blank line closing a sentence
                sentence_total += 1
    return word_total, sentence_total

def get_cap_feature(word):
    """Return the capitalisation feature id of ``word``.

    The checks are applied in this order and the first match wins:

    1 - empty token, all-lowercase token, or any token containing one of
        the characters . ? - " , (whatever its casing)
    2 - all cased characters are uppercase
    3 - first character is uppercase
    4 - at least one uppercase character after the first position
    5 - anything else (e.g. a token made of digits only)

    word: the token, as a string.
    Returns an int between 1 and 5.
    """
    # Raw string: same regex text as before, but without relying on the
    # invalid '\-' escape sequence of a plain string literal.
    if len(word) == 0 or word.islower() or re.search(r'[.?\-",]+', word):
        return 1
    if word.isupper():
        return 2
    # The empty token was handled above, so indexing is safe here
    if word[0].isupper():
        return 3
    if any(char.isupper() for char in word):
        return 4
    return 5
    

def build_input_matrix(filename, num_rows, tag2index, tags=True, word2index=None):
    """Build the integer feature matrix of a tokenised corpus file.

    Output columns: (id, id_in_sentence, id_word, id_caps, id_tag) where
    id_caps is the value returned by get_cap_feature
    (1 - low caps; 2 - all caps; 3 - first cap; 4 - one cap; 5 - other).
    A start row (word 1, caps 1, tag 8) opens every sentence and an end row
    (word 2, caps 1, tag 9) is written for every blank line.

    filename: file with one token per line and a blank line after a
        sentence. Fields are tab-separated when ``tags`` is true and
        space-separated otherwise; field 1 is the position in the sentence,
        field 2 the word and, with tags, field 3 the tag.
    num_rows: number of rows to allocate (words plus start/end rows).
    tag2index: mapping tag -> id, only read when ``tags`` is true.
    tags: whether the file carries the tag column.
    word2index: previously built word -> id mapping. When None, a new
        mapping is built from this file and a final '<unk>' entry is added
        to it. Otherwise the mapping is left untouched and unseen words
        receive the id len(word2index), which is the '<unk>' id of a
        mapping built by this function.

    Returns (input_matrix, word2index); the matrix has 5 columns with tags
    and only the first 4 without.
    Raises IndexError when the file needs more than num_rows rows and
    KeyError when a tag is missing from tag2index.
    """
    # Features for starting/ending of sentence (3 last columns)
    start = [1, 1, 8]
    end = [2, 1, 9]

    # initialization: the first row opens the first sentence
    input_matrix = np.zeros((num_rows, 5), dtype=int)
    input_matrix[0] = [1, 1, 1, 1, 8]
    row = 1

    # Boolean if a mapping is defined (its last id is for unknown words)
    test = word2index is not None
    if not test:
        # Same key text as before, spelled with a valid escape sequence
        word2index = {'<s>': 1, '<\\s>': 2}
        id_word = 3
    separator = '\t' if tags else ' '

    # Boolean to indicate if a sentence is starting (set by a blank line)
    starting = False
    with open(filename, 'r') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would truncate the last field (the tag) of a final line that
            # has no trailing newline, and would keep the '\r' of CRLF files.
            line_split = line.rstrip('\r\n').split(separator)
            if starting:
                # Start of sentence
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = 1
                input_matrix[row, 2:] = start
                row += 1
                starting = False
            if len(line_split) == 1:
                # End of sentence
                input_matrix[row, :2] = input_matrix[row-1, :2] + 1
                input_matrix[row, 2:] = end
                row += 1
                starting = True
            else:
                # Indexing (+1 because the start row holds position 1)
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = int(line_split[1]) + 1
                # Build cap feature
                word = line_split[2]
                input_matrix[row, 3] = get_cap_feature(word)
                # Build word id feature (case-insensitive)
                word_clean = word.lower()
                if word_clean in word2index:
                    input_matrix[row, 2] = word2index[word_clean]
                elif test:
                    # Unseen word during train
                    input_matrix[row, 2] = len(word2index)
                else:
                    word2index[word_clean] = id_word
                    input_matrix[row, 2] = id_word
                    id_word += 1
                if tags:
                    input_matrix[row, 4] = tag2index[line_split[3]]
                row += 1
    # Add special word if training
    if not test:
        word2index['<unk>'] = len(word2index)+1
    if tags:
        return input_matrix, word2index
    return input_matrix[:, :4], word2index

In [234]:
def train_hmm(input_matrix, num_features, num_tags, num_caps=5):
    """Count the HMM emission and transition statistics of a tagged matrix.

    input_matrix: 2-D integer array whose columns 2, 3 and 4 hold the
        1-based word id, caps id and tag id of each row (the layout
        produced by build_input_matrix with tags).
    num_features: number of word ids (rows of the word emission matrix).
    num_tags: number of tag ids.
    num_caps: number of caps classes (rows of the caps emission matrix).

    Returns (emission_w, emission_c, transition), all raw integer counts
    (un-normalized if smoothing required):
        emission_w: size (num_features, num_tags), row: word / column: tag
        emission_c: size (num_caps, num_tags), row: caps / column: tag
        transition: size (num_tags, num_tags), row: to / column: from,
            counted over every pair of consecutive rows
    """
    input_matrix = np.asarray(input_matrix)
    # Shift the 1-based ids to 0-based array indices
    word_idx = input_matrix[:, 2] - 1
    caps_idx = input_matrix[:, 3] - 1
    tag_idx = input_matrix[:, 4] - 1

    emission_w = np.zeros((num_features, num_tags), dtype=int)
    emission_c = np.zeros((num_caps, num_tags), dtype=int)
    transition = np.zeros((num_tags, num_tags), dtype=int)

    # np.add.at accumulates repeated index pairs; a plain fancy-indexed
    # "+= 1" would count each distinct pair only once.
    np.add.at(emission_w, (word_idx, tag_idx), 1)
    np.add.at(emission_c, (caps_idx, tag_idx), 1)
    # Row i+1 is the "to" tag, row i the "from" tag
    np.add.at(transition, (tag_idx[1:], tag_idx[:-1]), 1)

    return emission_w, emission_c, transition


In [235]:
# Train
# Rows to allocate: one per word plus two marker rows (start/end) per sentence
num_words, num_sentences = count_elements('../data/train.num.txt')
num_rows = num_words + 2*num_sentences
# No word2index is passed, so the vocabulary mapping is built from this file
input_matrix_train, word2index = build_input_matrix('../data/train.num.txt', num_rows, tag2index)

In [236]:
# Building the count matrix
# num_tags includes the two sentence-boundary tags added to tag2index;
# num_features is the vocabulary size, including the final '<unk>' entry
num_tags = len(tag2index)
num_features = len(word2index)
emission_w, emission_c, transition = train_hmm(input_matrix_train, num_features, num_tags)

In [239]:
# Preview of the first 30 rows of the test matrix
# (columns: id, id_in_sentence, id_word, id_caps).
# NOTE(review): input_matrix_test is only created in the "Dev & test" cell
# further down, so that cell has to be run before this one.
input_matrix_test[:30]

array([[   1,    1,    1,    1],
       [   2,    2, 9137,    3],
       [   3,    3, 9137,    3],
       [   4,    4,  638,    1],
       [   5,    5, 9137,    1],
       [   6,    6, 5750,    1],
       [   7,    7, 5190,    1],
       [   8,    8,   11,    1],
       [   9,    9,    2,    1],
       [  10,    1,    1,    1],
       [  11,    2, 5748,    2],
       [  12,    3, 9137,    1],
       [  13,    4,    2,    1],
       [  14,    1,    1,    1],
       [  15,    2,  994,    3],
       [  16,    3,  991,    1],
       [  17,    4,    7,    1],
       [  18,    5, 1587,    3],
       [  19,    6, 1315,    5],
       [  20,    7,   71,    1],
       [  21,    8,  456,    5],
       [  22,    9,    2,    1],
       [  23,    1,    1,    1],
       [  24,    2,  130,    5],
       [  25,    3,  231,    1],
       [  26,    4, 5751,    1],
       [  27,    5,  161,    1],
       [  28,    6, 5752,    1],
       [  29,    7, 1666,    1],
       [  30,    8, 5753,    1]])

In [207]:
# Display the tag -> id mapping (including the two sentence-boundary tags)
tag2index

{'<\t>': 9,
 '<t>': 8,
 'B-LOC': 7,
 'B-MISC': 6,
 'I-LOC': 3,
 'I-MISC': 5,
 'I-ORG': 4,
 'I-PER': 2,
 'O': 1}

In [238]:
# Dev & test
# Both files reuse the word2index built on the training data, so words not
# seen during training are mapped to the unknown-word id.
num_words, num_sentences = count_elements('../data/dev.num.txt')
# Miss 1 blank line at the end of the file for the dev set
# (count_elements counts one sentence per blank line; without the final
# blank line every blank line is followed by another sentence, which needs
# one more start row than num_words + 2*num_sentences provides)
num_rows = num_words + 2*num_sentences + 1
input_matrix_dev, word2index = build_input_matrix('../data/dev.num.txt', num_rows, tag2index, word2index=word2index)

# The test file is space-separated and has no tag column (tags=False), so
# the returned matrix only has the first 4 columns
num_words, num_sentences = count_elements('../data/test.num.txt', tags=False)
num_rows = num_words + 2*num_sentences
input_matrix_test, word2index = build_input_matrix('../data/test.num.txt', num_rows, tag2index, tags=False, word2index=word2index)

In [166]:
# Should be 0, ie p(<\t>|<t>): number of transitions going straight from
# the start tag (id 8) to the end tag (id 9); rows of transition are the
# "to" tag and columns the "from" tag, both shifted to 0-based indices.
# print(...) with a single argument gives the same output under Python 2
# and 3, and matches the call style used when printing tag2index.
print(transition[9-1, 8-1])
print(input_matrix_train.shape)
print(input_matrix_dev.shape)
print(input_matrix_test.shape)

0.0
(55961, 4)
(29105, 4)
(28677, 3)


In [230]:
# Saving pre-processing
# Stores the three count matrices and the three input matrices as named
# entries of a single HDF5 file.
filename = '../data/words_caps_feature.hdf5'
with h5py.File(filename, "w") as f:
    # Model
    f['emission_w'] = emission_w
    f['emission_c'] = emission_c
    f['transition'] = transition
    
    # Input matrices (train, dev, test)
    f['input_matrix_train'] = input_matrix_train
    f['input_matrix_dev'] = input_matrix_dev
    f['input_matrix_test'] = input_matrix_test

In [106]:
# Print, for every tag, its row of the transition matrix, i.e. the counts
# of transitions into that tag (columns: 0-based id of the previous tag).
# items() and single-argument print(...) behave the same under Python 2
# and 3, unlike iteritems() and the print statement.
for k, v in tag2index.items():
    print('Transision to '+k)
    print(transition[v-1])

Transision to I-LOC
[ 1654.     0.   271.     0.     2.     0.     0.   406.     0.]
Transision to <	>
[ 3159.    84.    43.    27.    32.     0.     0.     0.     0.]
Transision to I-PER
[ 1379.  1309.     0.     4.    20.     0.     0.   351.     0.]
Transision to <t>
[    0.     0.     0.     0.     0.     0.     0.     0.  3344.]
Transision to O
[  3.28530000e+04   1.66900000e+03   2.00400000e+03   1.16000000e+03
   8.49000000e+02   3.00000000e+00   1.00000000e+00   2.11100000e+03
   0.00000000e+00]
Transision to I-MISC
[ 783.    1.   10.    3.  309.    5.    0.  120.    0.]
Transision to B-MISC
[ 0.  0.  0.  0.  8.  0.  0.  0.  0.]
Transision to I-ORG
[ 822.    0.    4.  791.   11.    0.    0.  357.    0.]
Transision to B-LOC
[ 0.  0.  1.  0.  0.  0.  0.  0.  0.]
