In [1]:
import numpy as np
import h5py

In [48]:
# Load the predicted tag sequences (datasets 'v_seq_test' and 'v_seq_dev')
# from the HDF5 file and cast them to integer arrays.
filename = '../submission/v_seq_test_hmm'
with h5py.File(filename, mode='r') as f:
    v_seq_dev = np.array(f.get('v_seq_dev'), dtype=int)
    v_seq = np.array(f.get('v_seq_test'), dtype=int)

In [49]:
# Load the input matrices (datasets 'input_matrix_test' and
# 'input_matrix_dev') from the HDF5 file and cast them to integer arrays.
filename = '../data/words_feature.hdf5'
with h5py.File(filename, mode='r') as f:
    input_matrix_dev = np.array(f.get('input_matrix_dev'), dtype=int)
    input_matrix_test = np.array(f.get('input_matrix_test'), dtype=int)

In [35]:
# Build the tag-name -> integer-index mapping from the tags file, which holds
# one whitespace-separated "<tag> <index>" pair per line.
tag2index = {}
with open('../data/tags.txt', 'r') as f:
    for line in f:
        # split() instead of line[:-1].split(' '): slicing off the last
        # character corrupts the index when the final line has no trailing
        # newline, and blank lines would raise an IndexError.
        line_split = line.split()
        if not line_split:
            continue
        tag2index[line_split[0]] = int(line_split[1])

print(tag2index)

# Reverse lookup: integer index -> tag name.
# .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only).
index2tag = {v: k for k, v in tag2index.items()}
print(index2tag)

{'I-LOC': 3, 'I-PER': 2, 'O': 1, 'I-MISC': 5, 'B-MISC': 6, 'I-ORG': 4, 'B-LOC': 7}
{1: 'O', 2: 'I-PER', 3: 'I-LOC', 4: 'I-ORG', 5: 'I-MISC', 6: 'B-MISC', 7: 'B-LOC'}


In [65]:
# Formatting the output
def get_kaggle_output(pred_seq, input_matrix, index2tag):
    """Turn per-token tag predictions into one label string per line.

    Rows of ``input_matrix`` and entries of ``pred_seq`` are walked in
    parallel. A row whose column 1 equals 1 opens a new line and is never
    labelled itself; for every other row ``row[1] - 1`` is reported as the
    token position. Only predictions strictly between 1 and 8 are kept.

    Within a line, tokens at consecutive positions that share the same tag
    name (``index2tag[...]`` with its first two characters removed) are
    merged into a single ``TAG-p1-p2-...`` group. Groups are separated by a
    single space.

    Rows that appear before the first line-start row are treated as an
    implicit first line instead of being silently discarded.

    Args:
        pred_seq: iterable of integer tag indices, one per row of
            ``input_matrix``.
        input_matrix: iterable of indexable rows; only column 1 is read.
        index2tag: dict mapping a tag index to a tag name such as ``'I-PER'``.

    Returns:
        List of ``(line_number, labels)`` tuples, numbered from 1, where
        ``labels`` is ``''`` for a line without any kept prediction.
    """
    # #### First pass: collect the (position, tag_index) pairs of each line
    lines = []
    current_line = None  # stays None until the first row has been seen
    for r_input, r_pred in zip(input_matrix, pred_seq):
        if r_input[1] == 1:
            # Start of a new line: flush the previous one, if there was one
            if current_line is not None:
                lines.append(current_line)
            current_line = []
            continue
        if current_line is None:
            # Data did not begin with a line-start row: open line 1 implicitly
            current_line = []
        # Index 1 is skipped (it is 'O' in the mapping printed by this
        # notebook); indices >= 8 are skipped too -- presumably the
        # start/end-of-sentence tags, TODO confirm.
        if 1 < r_pred < 8:
            current_line.append((r_input[1] - 1, r_pred))
    if current_line is not None:
        lines.append(current_line)

    # #### Second pass: format each line
    kaggle_output_new = []
    for line_number, entries in enumerate(lines, 1):
        groups = []
        prev_tag = ''
        prev_ind = -1
        for ind, tag_ind in entries:
            # Drop the two-character prefix ('I-' / 'B-').
            # NOTE(review): because the prefix is removed before comparing, a
            # 'B-X' token right after an 'X' token extends the current group
            # rather than starting a new one -- confirm this is intended.
            tag = index2tag[tag_ind][2:]
            if groups and tag == prev_tag and ind == prev_ind + 1:
                # Same tag at the next position: grow the current group
                groups[-1] += '-' + str(ind)
            else:
                # Different tag or a gap in positions: open a new group
                groups.append(tag + '-' + str(ind))
                prev_tag = tag
            prev_ind = ind
        kaggle_output_new.append((line_number, ' '.join(groups)))

    # Output: list of tuple (line_index, line_output)
    return kaggle_output_new

In [71]:
# Format the test-set predictions into (line_number, labels) tuples
test = get_kaggle_output(v_seq, input_matrix_test, index2tag)

In [64]:
# Write the submission file: a header row, then one "ID,Labels" row per line
with open('../submission/v_seq_test_hmm.txt', 'w') as f:
    f.write('ID,Labels\n')
    for line_id, labels in test:
        f.write('{0},{1}\n'.format(line_id, labels))

In [72]:
# Display the formatted (line_number, labels) tuples for inspection
test

[(1, ''),
 (2, 'LOC-1'),
 (3, ''),
 (4, ''),
 (5, ''),
 (6, ''),
 (7, ''),
 (8, ''),
 (9, ''),
 (10, 'ORG-5'),
 (11, 'LOC-6'),
 (12, 'LOC-1'),
 (13, 'LOC-4'),
 (14, ''),
 (15, ''),
 (16, ''),
 (17, ''),
 (18, ''),
 (19, ''),
 (20, 'LOC-1'),
 (21, 'LOC-1'),
 (22, 'LOC-1'),
 (23, ''),
 (24, 'LOC-1'),
 (25, ''),
 (26, 'PER-1-2'),
 (27, 'LOC-1'),
 (28, 'ORG-1 PER-3-4'),
 (29, ''),
 (30, 'LOC-39'),
 (31, ''),
 (32, 'LOC-1'),
 (33, 'PER-1 LOC-10'),
 (34, 'ORG-1-2'),
 (35, 'LOC-1'),
 (36, 'LOC-9'),
 (37, 'ORG-8-9-10 LOC-19 LOC-30 PER-35-36'),
 (38, ''),
 (39, ''),
 (40, 'LOC-18 LOC-40 ORG-43-44-45'),
 (41, ''),
 (42, ''),
 (43, ''),
 (44, 'LOC-1'),
 (45, ''),
 (46, 'ORG-1-2 LOC-4'),
 (47, 'PER-7-8-9'),
 (48, ''),
 (49, ''),
 (50, ''),
 (51, ''),
 (52, 'PER-1-2-3-4'),
 (53, ''),
 (54, 'LOC-35'),
 (55, 'PER-6-7'),
 (56, ''),
 (57, 'LOC-3'),
 (58, 'MISC-3-4-5-6'),
 (59, ''),
 (60, 'LOC-3 MISC-7'),
 (61, 'LOC-3'),
 (62, 'MISC-3'),
 (63, 'LOC-1'),
 (64, ''),
 (65, ''),
 (66, ''),
 (67, 'LOC-2 LOC-