In [2]:
import numpy as np
import h5py as p

In [3]:
# Vocabulary: the first whitespace-separated token of each GloVe line is the word.
words = []
with open('../data/glove.6B.50d.txt') as glove_file:
    words.extend(glove_line.split()[0] for glove_line in glove_file)

In [7]:
# Embedding table: row k receives the numeric fields found on line k of the
# GloVe file, i.e. the same line order as `words`.
embeddings = np.zeros((len(words), 50))
with open('../data/glove.6B.50d.txt') as glove_file:
    for row_idx, glove_line in enumerate(glove_file):
        # The leading token is the word itself; everything after it is a value.
        embeddings[row_idx] = list(map(float, glove_line.split()[1:]))

In [31]:
# Word <-> index mappings. Ids are 1-based: the k-th GloVe word gets id k.
word_ind = {word: position for position, word in enumerate(words, 1)}

# Special tokens are appended after the GloVe vocabulary.
# '<\\s>' is the very same runtime string that used to be spelled '<\s>'
# (backslash followed by 's'); the backslash is doubled only so that Python
# does not warn about an invalid escape sequence.
word_ind['<s>'] = len(words) + 1
word_ind['<\\s>'] = len(words) + 2

# build_input_matrix() encodes an out-of-vocabulary word as len(word2index).
# Without a dedicated last slot that value is len(words) + 2, i.e. the id of
# the end-of-sentence token, so unknown words were silently encoded as
# sentence ends. Reserving '<unk>' as the last entry makes len(word_ind)
# point at it instead.
# NOTE(review): this grows the vocabulary by one id (len(words) + 3) and
# assumes the GloVe file has no '<unk>' token of its own - confirm that
# whatever consumes these ids sizes its lookup table accordingly.
word_ind['<unk>'] = len(words) + 3

# Reverse lookup: index -> word.
ind_word = {index: word for word, index in word_ind.items()}

In [16]:
# Tags mapping: every non-empty line of tags.txt holds a tag name followed by
# its integer index.
tag2index = {}

with open('../data/tags.txt', 'r') as f:
    for line in f:
        # split() instead of line[:-1].split(' '): slicing off the last
        # character corrupts the index when the final line has no trailing
        # newline, and a blank line used to raise IndexError.
        line_split = line.split()
        if not line_split:
            continue
        tag2index[line_split[0]] = int(line_split[1])

# Adding tags for end/start of sentence
# NOTE: inside '<\t>' the sequence \t is a tab escape, so that key is really
# '<' + TAB + '>'. It is kept unchanged here; build_input_matrix() refers to
# these two tags by their ids (8 and 9), not by key.
tag2index['<t>'] = 8
tag2index['<\t>'] = 9
print(tag2index)

{'I-LOC': 3, '<\t>': 9, 'I-PER': 2, '<t>': 8, 'O': 1, 'I-MISC': 5, 'B-MISC': 6, 'I-ORG': 4, 'B-LOC': 7}


In [23]:
# Spot check: display the index that word_ind assigns to the word 'strawberry'.
word_ind['strawberry']

13217

In [51]:
def count_elements(filename, tags=True):
    """Count token lines and sentence-break lines in a data file.

    A line (minus its last character) that contains the field separator is
    counted as a word; any other line is counted as a sentence break. The
    separator is a tab when `tags` is true and a single space otherwise.

    Callers size the input matrix as num_words + 2*num_sentences.

    Returns:
        (num_words, num_sentences)
    """
    separator = '\t' if tags else ' '
    word_total = 0
    sentence_total = 0
    with open(filename, 'r') as handle:
        for raw_line in handle:
            if separator in raw_line[:-1]:
                word_total += 1
            else:
                # No separator at all: blank line closing a sentence
                sentence_total += 1

    return word_total, sentence_total

def build_input_matrix(filename ,num_rows, tag2index, tags=True, word2index=None):
    """Encode a token file as an integer matrix, one row per token.

    Columns are (id, id_in_sentence, id_word, id_tag). Every sentence is
    wrapped in a start-of-sentence row (tag id 8) and an end-of-sentence row
    (tag id 9); a line holding no field separator closes the current sentence.

    Args:
        filename: path of the file to encode. Fields are tab-separated when
            `tags` is true and space-separated otherwise; field 1 is the
            position inside the sentence, field 2 the word and, with tags,
            field 3 the tag.
        num_rows: number of rows to allocate (num_words + 2*num_sentences as
            computed from count_elements).
        tag2index: mapping tag -> integer id, only read when `tags` is true.
        tags: whether the file carries the correct tag (4th column).
        word2index: previously built word -> id mapping. When None, a new
            mapping is built from the file and an '<unk>' entry is appended
            to it. When given, it is not modified and a word missing from it
            receives the id of its '<unk>' entry, or len(word2index) when the
            mapping has no such entry.

    Returns:
        (input_matrix, word2index). The matrix has 4 columns when `tags` is
        true and only the first 3 otherwise.
    """
    # Tag ids of the sentence boundary rows.
    start_tag, end_tag = 8, 9
    separator = '\t' if tags else ' '

    # Resolve the vocabulary BEFORE reading from it: the first row needs
    # word2index['<s>'], which used to be evaluated while word2index could
    # still be None (TypeError on the default path).
    if word2index is None:
        test = False
        # '<\\s>' is backslash + 's', the same key as the legacy '<\s>' spelling.
        word2index = {'<s>': 1, '<\\s>': 2}
        id_word = 3
    else:
        test = True
        # By convention the last element of a mapping built here is '<unk>',
        # so len(word2index) is its id; prefer the explicit entry if present.
        unknown_id = word2index.get('<unk>', len(word2index))

    input_matrix = np.zeros((num_rows, 4), dtype=int)
    input_matrix[0] = [1, 1, word2index['<s>'], start_tag]
    row = 1
    # True right after an end-of-sentence row: the next line opens a sentence.
    starting = False
    with open(filename, 'r') as f:
        for line in f:
            # rstrip('\n') instead of line[:-1]: a final line without a
            # trailing newline must not lose its last character.
            line_split = line.rstrip('\n').split(separator)
            if starting:
                # Start of sentence
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = 1
                input_matrix[row, 2] = word2index['<s>']
                input_matrix[row, 3] = start_tag
                row += 1
                starting = False
            if len(line_split) == 1:
                # End of sentence
                input_matrix[row, :2] = input_matrix[row-1, :2] + 1
                input_matrix[row, 2] = word2index['<\\s>']
                input_matrix[row, 3] = end_tag
                row += 1
                starting = True
            else:
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = int(line_split[1]) + 1
                word_clean = line_split[2].lower()
                if test:
                    # Fixed vocabulary: unseen words fall back to the unknown id
                    input_matrix[row, 2] = word2index.get(word_clean, unknown_id)
                else:
                    # Growing vocabulary: first occurrence gets the next free id
                    if word_clean not in word2index:
                        word2index[word_clean] = id_word
                        id_word += 1
                    input_matrix[row, 2] = word2index[word_clean]
                if tags:
                    input_matrix[row, 3] = tag2index[line_split[3]]
                row += 1
    # Add special word if training
    if not test:
        word2index['<unk>'] = len(word2index)+1
    if tags:
        return input_matrix, word2index
    return input_matrix[:, :3], word2index

In [70]:
def input_mm(matrix):
    """Re-encode an input matrix as word id followed by a one-hot tag.

    Reads column 2 (word id) and column 3 (tag id) of `matrix` and returns an
    integer array with 10 columns: column 0 is the word id, columns 1..9 are
    the one-hot encoding of the tag, where tag t sets column t.
    """
    n_rows = matrix.shape[0]

    encoded = np.zeros((n_rows, 10), dtype=int)
    encoded[:, 0] = matrix[:, 2]

    # One-hot over the 9 tag slots, all rows at once; tag t lights slot t-1.
    tag_slots = np.zeros((n_rows, 9))
    tag_slots[np.arange(n_rows), matrix[:, 3] - 1] = 1
    encoded[:, 1:] = tag_slots

    return encoded

In [69]:
# Training set: one row per word plus a start row and an end row per sentence.
num_words, num_sentences = count_elements(filename='../data/train.num.txt', tags=True)
num_rows = num_words + 2 * num_sentences
# word_ind is passed in, so the vocabulary is fixed and is not extended here.
input_matrix_train, word2index = build_input_matrix(
    filename='../data/train.num.txt',
    num_rows=num_rows,
    tag2index=tag2index,
    tags=True,
    word2index=word_ind
)

In [71]:
# Convert the training matrix to the layout produced by input_mm:
# word id followed by the one-hot encoded tag.
input_matrix_train_ = input_mm(input_matrix_train)

In [75]:
# Dev & test
num_words, num_sentences = count_elements(filename='../data/dev.num.txt', tags=True)
# The dev file is missing the blank line at its very end, so its last sentence
# is not counted in num_sentences; one extra row is reserved to make up for it.
num_rows = num_words + 2 * num_sentences + 1
input_matrix_dev, word2index = build_input_matrix(
    filename='../data/dev.num.txt',
    num_rows=num_rows,
    tag2index=tag2index,
    tags=True,
    word2index=word_ind
)

# The test file has no tag column and is space-separated (tags=False).
num_words, num_sentences = count_elements(filename='../data/test.num.txt', tags=False)
num_rows = num_words + 2 * num_sentences
input_matrix_test, word2index = build_input_matrix(
    filename='../data/test.num.txt',
    num_rows=num_rows,
    tag2index=tag2index,
    tags=False,
    word2index=word_ind
)

In [81]:
# The dev matrix carries tags, so it gets the word id + one-hot tag layout.
input_matrix_dev_ = input_mm(input_matrix_dev)
# The test matrix was built with tags=False; keep only column 2 (the word ids).
input_matrix_test_ = input_matrix_test[:,2]

In [83]:
# Saving pre-processing
filename = '../data/MM_data.hdf5'
# (dataset name, array) pairs, written to the file in this order.
model_inputs = (
    ('input_matrix_train', input_matrix_train_),
    ('input_matrix_dev', input_matrix_dev_),
    ('input_matrix_test', input_matrix_test_),
)
# Mode "w" creates the file, replacing any existing one.
with p.File(filename, "w") as h5_file:
    # Model
    for dataset_name, dataset_values in model_inputs:
        h5_file[dataset_name] = dataset_values