In [43]:
import numpy as np
import h5py as p
import re

In [2]:
words = []
with open('../data/glove.6B.50d.txt') as f:
    for l in f:
        words.append(l.split()[0])

In [3]:
word_ind = dict(zip(words, [x+1 for x in range(len(words))]))
word_ind['<s>'] = len(words) + 1
word_ind['<\s>'] = len(words) + 2
ind_word = {}
for k,v in word_ind.items():
    ind_word[v] = k

In [4]:
embeddings = np.zeros((len(words),50))
i = 0
with open('../data/glove.6B.50d.txt') as f:
    for l in f:
        emb = [float(c) for c in l.split()[1:]]
        embeddings[i] = emb
        i+=1

In [5]:
embeddings_ = np.zeros((len(words)+2,50))
embeddings_[:len(words),:] = embeddings
embeddings_[400000,:] = embeddings[word_ind['.']] + np.random.normal(0,0.001,50)
embeddings_[400001,:] = embeddings[word_ind['.']] + np.random.normal(0,0.001,50)

In [6]:
# Tags mapping
tag2index = {}

with open('../data/tags.txt', 'r') as f:
    for line in f:
        line_split = line[:-1].split(' ')
        tag2index[line_split[0]] = int(line_split[1])

# Adding tags for end/start of sentence
tag2index['<t>'] = 8
tag2index['<\t>'] = 9
print(tag2index)

{'I-LOC': 3, '<\t>': 9, 'I-PER': 2, '<t>': 8, 'O': 1, 'I-MISC': 5, 'B-MISC': 6, 'I-ORG': 4, 'B-LOC': 7}


In [94]:
embeddings_[word_ind['<\s>']-1]

array([ 0.70808726,  0.570312  , -0.4699188 ,  0.17873327,  0.54282193,
        0.72641014,  0.18071378, -0.52278846,  0.10434391, -0.17651015,
        0.07872373, -0.36301821, -0.11876748, -0.83267948,  0.11833908,
       -0.16743652,  0.06052769, -0.01320265, -0.56591652,  0.01239596,
        0.2286612 , -0.14386772, -0.06802176, -0.38141981, -0.23626355,
       -1.70160089, -0.86701506, -0.26599092, -0.25720381,  0.17526465,
        3.86777287, -0.16281838, -0.13381515, -0.6882982 ,  0.18437706,
        0.00619479, -0.33774413, -0.07871775,  0.24368503,  0.36629811,
       -0.34840807,  0.28469391,  0.07581127, -0.060944  , -0.39140224,
        0.22796707, -0.21601652, -0.22596162, -0.09260529, -0.80184728])

In [27]:
def count_elements(filename, tags=True):
    # Counting the number of elements to stored (ie num_words + 2*num_sentences)
    num_words = 0
    num_sentences = 0
    with open(filename, 'r') as f:
        for line in f:
            if tags:
                line_split = line[:-1].split('\t')
            else:
                line_split = line[:-1].split(' ')
            # Case blank
            if len(line_split) == 1:
                num_sentences += 1
            else:
                num_words += 1

    return num_words, num_sentences

def build_input_matrix(filename ,num_rows, tag2index, tags=True, word2index=None):
    # Building input matrix with columns: (id, id_in_sentence, id_word, id_tag)
    # Tags: if correct solution given (ie 4th column)
    # word2index: if use of previously built word2index mapping
    input_matrix = np.zeros((num_rows, 4), dtype=int)
    input_matrix[0] = [1,1,word2index['<s>'],1,8]
    row = 1
    
    start = [word2index['<s>'],8]
    end = [word2index['</s>'],9]
    
    # Boolean to indicate if a sentence is starting
    starting = False
    # Boolean if a mapping is defined (last element of the mapping is for unknown words)
    if word2index==None:
        test = False
        word2index = {'<s>': 1, '<\s>': 2}
        id_word = 3
    else:
        test = True
    with open(filename, 'r') as f:
        for line in f:
            if tags:
                line_split = line[:-1].split('\t')
            else:
                line_split = line[:-1].split(' ')
            if starting == True:
                # Start of sentence
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = 1
                input_matrix[row, 2:] = start
                row+=1
                starting = False
            if len(line_split) == 1:
                # End of sentence
                input_matrix[row, :2] = input_matrix[row-1, :2] + 1
                input_matrix[row, 2:] = end
                row+=1
                starting = True
            else:
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = int(line_split[1]) + 1
                # Build cap feature
                word = line_split[2]
                input_matrix[row, 3] = get_cap_feature(word)
                word_clean = line_split[2].lower()
                if not test:
                    if word_clean not in word2index:
                        word2index[word_clean] = id_word
                        id_word += 1
                    input_matrix[row, 2] = word2index[word_clean]
                else:
                    # Unseen word during train
                    if word_clean not in word2index:
                        input_matrix[row, 2] = len(word2index)
                    else:
                        input_matrix[row, 2] = word2index[word_clean]
                if tags:
                    input_matrix[row, 3] = tag2index[line_split[3]]
                row += 1
    # Add special word if training
    if not test:
        word2index['<unk>'] = len(word2index)+1
    if tags:
        return input_matrix, word2index
    else:
        return input_matrix[:,:3], word2index

In [8]:
def input_mm(matrix):
    
    nwords = matrix.shape[0]
    
    res = np.zeros((nwords,10),dtype = int)
    
    res[:,0] = matrix[:,2]
    
    for i in range(nwords):
        tag_1_hot = np.zeros(9)
        tag_1_hot[matrix[i,3]-1] = 1
        res[i,1:] = tag_1_hot
    
    return res

def input_mm_embed(matrix, embed):
    
    nwords = matrix.shape[0]
    
    res = np.zeros((nwords,50+9+1))
    
    res[:,0] = matrix[:,2]
    
    for i in range(nwords):
        res[i,:50] = embed[matrix[i,2]-1,:]
        tag_1_hot = np.zeros(9)
        tag_1_hot[matrix[i,3]-1] = 1
        res[i,50:59] = tag_1_hot
    
    res[:,59] = matrix[:,3]
    return res

In [9]:
2+2

4

In [10]:
num_words, num_sentences = count_elements('../data/train.num.txt')
num_rows = num_words + 2*num_sentences
input_matrix_train, word2index = build_input_matrix('../data/train.num.txt', num_rows, tag2index, word2index = word_ind)

In [11]:
input_matrix_train_ = input_mm(input_matrix_train)
input_matrix_train_embed = input_mm_embed(input_matrix_train, embeddings_)

In [53]:
input_matrix_train_[0]

array([400001,      0,      0,      0,      0,      0,      0,      0,
            1,      0])

In [16]:
# Dev & test
num_words, num_sentences = count_elements('../data/dev.num.txt')
# Miss 1 blank line at the end of the file for the dev set
num_rows = num_words + 2*num_sentences + 1
input_matrix_dev, word2index = build_input_matrix('../data/dev.num.txt', num_rows, tag2index, word2index = word_ind)

num_words, num_sentences = count_elements('../data/test.num.txt', tags=False)
num_rows = num_words + 2*num_sentences
input_matrix_test, word2index = build_input_matrix('../data/test.num.txt', num_rows, tag2index, tags=False, word2index = word_ind)

In [17]:
input_matrix_dev_ = input_mm(input_matrix_dev)
input_matrix_dev_embed = input_mm_embed(input_matrix_dev, embeddings_)
input_matrix_test_ = input_matrix_test[:,2]
input_matrix_test_embed = np.zeros((input_matrix_test_.shape[0],50))
for i in range(input_matrix_test_.shape[0]):
    input_matrix_test_embed[i,:] = embeddings_[input_matrix_test_[i]-1,:]

In [19]:
input_matrix_dev_embed.shape

(29104, 60)

In [125]:
# Saving pre-processing
filename = '../data/MM_data.hdf5'
with p.File(filename, "w") as f:
    # Model
    f['input_matrix_train'] = input_matrix_train_
    f['input_matrix_train_embed'] = input_matrix_train_embed
    f['input_matrix_dev'] = input_matrix_dev_
    f['input_matrix_dev_embed'] = input_matrix_dev_embed
    f['input_matrix_test'] = input_matrix_test_

In [84]:
# Saving pre-processing
filename = '../data/embeddings.hdf5'
with p.File(filename, "w") as f:
    # Model
    f['embeddings'] = embeddings

In [111]:
input_matrix_train_[:2]

array([[400001,      0,      0,      0,      0,      0,      0,      0,
             1,      0],
       [   645,      0,      0,      0,      1,      0,      0,      0,
             0,      0]])

### Preprocess with caps:

In [78]:

def count_elements(filename, tags=True):
    # Counting the number of elements to stored (ie num_words + 2*num_sentences)
    num_words = 0
    num_sentences = 0
    with open(filename, 'r') as f:
        for line in f:
            if tags:
                line_split = line[:-1].split('\t')
            else:
                line_split = line[:-1].split(' ')
            # Case blank
            if len(line_split) == 1:
                num_sentences += 1
            else:
                num_words += 1

    return num_words, num_sentences

def get_cap_feature(word):
    # Return the caps feature for the given word
    # 1 - low caps; 2 - all caps; 3 - first cap; 4 - one cap; 5 - other
    if len(word) == 0 or word.islower() or re.search('[.?\-",]+', word):
        feature = 1
    elif word.isupper():
        feature = 2
    elif len(word) and word[0].isupper():
        feature = 3
    elif sum([w.isupper() for w in word]):
        feature = 4
    else:
        feature = 5
    return feature
    

def build_input_matrix_cap(filename ,num_rows, tag2index, tags=True, word2index=None):
    # Building input matrix with columns: (id, id_in_sentence, id_word, id_caps, id_tag)
    # caps feature:
    # 1 - low caps; 2 - all caps; 3 - first cap; 4 - one cap; 5 - other
    # Tags: if correct solution given (ie 4th column)
    # word2index: if use of previously built word2index mapping
        
    # initialization
    input_matrix = np.zeros((num_rows, 5), dtype=int)
    input_matrix[0] = [1,1,word2index['<s>'],1,8]
    row = 1
    
    # Features for starting/ending of sentence (3 last columns)
    start = [word2index['<s>'],1,8]
    end = [word2index['<\s>'],1,9]
    
    # Boolean to indicate if a sentence is starting
    starting = False
    # Boolean if a mapping is defined (last element of the mapping is for unknown words)
    if word2index==None:
        test = False
        word2index = {'<s>': 1, '<\s>': 2}
        id_word = 3
    else:
        test = True
    with open(filename, 'r') as f:
        for line in f:
            if tags:
                line_split = line[:-1].split('\t')
            else:
                line_split = line[:-1].split(' ')
            if starting == True:
                # Start of sentence
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = 1
                input_matrix[row, 2:] = start
                row+=1
                starting = False
            if len(line_split) == 1:
                # End of sentence
                input_matrix[row, :2] = input_matrix[row-1, :2] + 1
                input_matrix[row, 2:] = end
                row+=1
                starting = True
            else:
                # Indexing
                input_matrix[row, 0] = input_matrix[row-1, 0] + 1
                input_matrix[row, 1] = int(line_split[1]) + 1
                # Build cap feature
                word = line_split[2]
                input_matrix[row, 3] = get_cap_feature(word)
                # Build word count feature
                word_clean = word.lower()
                if not test:
                    if word_clean not in word2index:
                        word2index[word_clean] = id_word
                        id_word += 1
                    input_matrix[row, 2] = word2index[word_clean]
                else:
                    # Unseen word during train
                    if word_clean not in word2index:
                        input_matrix[row, 2] = len(word2index)
                    else:
                        input_matrix[row, 2] = word2index[word_clean]
                if tags:
                    input_matrix[row, 4] = tag2index[line_split[3]]
                row += 1
    # Add special word if training
    if not test:
        word2index['<unk>'] = len(word2index)+1
    if tags:
        return input_matrix, word2index
    else:
        return input_matrix[:,:4], word2index

In [29]:
tttt = np.array([1,2,3,4,5,6,7])

In [57]:
tttt[1:3]

array([2, 3])

In [58]:
def input_mm_cap(matrix):
    
    nwords = matrix.shape[0]
    
    res = np.zeros((nwords,16),dtype = int)
    
    res[:,0] = matrix[:,2]
    
    for i in range(nwords):
        tag_1_hot = np.zeros(9)
        tag_1_hot[matrix[i,4]-1] = 1
        tag_1_hot_cap = np.zeros(5)
        tag_1_hot_cap[matrix[i,3]-1] = 1
        res[i,1:10] = tag_1_hot
        res[i,10:15] = tag_1_hot_cap
    res[:,15] = matrix[:,4]
    return res

def input_mm_embed_cap(matrix, embed):
    
    nwords = matrix.shape[0]
    
    res = np.zeros((nwords,50+9+1))
    
    res[:,0] = matrix[:,2]
    
    for i in range(nwords):
        res[i,:50] = embed[matrix[i,2]-1,:]
        tag_1_hot = np.zeros(9)
        tag_1_hot[matrix[i,3]-1] = 1
        res[i,50:59] = tag_1_hot
    
    res[:,59] = matrix[:,3]
    return res

In [59]:
num_words, num_sentences = count_elements('../data/train.num.txt')
num_rows = num_words + 2*num_sentences
input_matrix_train_cap_, word2index = build_input_matrix_cap('../data/train.num.txt', num_rows, tag2index, word2index = word_ind)
input_matrix_train_cap = input_mm_cap(input_matrix_train_cap_)

In [79]:
# Dev & test
num_words, num_sentences = count_elements('../data/dev.num.txt')
# Miss 1 blank line at the end of the file for the dev set
num_rows = num_words + 2*num_sentences + 1
input_matrix_dev_cap_, word2index = build_input_matrix_cap('../data/dev.num.txt', num_rows, tag2index, word2index = word_ind)

num_words, num_sentences = count_elements('../data/test.num.txt', tags=False)
num_rows = num_words + 2*num_sentences
input_matrix_test_cap_, word2index = build_input_matrix_cap('../data/test.num.txt', num_rows, tag2index, tags=False, word2index = word_ind)

In [80]:
input_matrix_dev_cap = input_mm_cap(input_matrix_dev_cap_)

In [86]:
input_matrix_train_cap[:10,:]

array([[400001,      0,      0,      0,      0,      0,      0,      0,
             1,      0,      1,      0,      0,      0,      0,      8],
       [   645,      0,      0,      0,      1,      0,      0,      0,
             0,      0,      0,      1,      0,      0,      0,      4],
       [  7579,      1,      0,      0,      0,      0,      0,      0,
             0,      0,      1,      0,      0,      0,      0,      1],
       [   515,      0,      0,      0,      0,      1,      0,      0,
             0,      0,      0,      0,      1,      0,      0,      5],
       [   581,      1,      0,      0,      0,      0,      0,      0,
             0,      0,      1,      0,      0,      0,      0,      1],
       [     5,      1,      0,      0,      0,      0,      0,      0,
             0,      0,      1,      0,      0,      0,      0,      1],
       [  5261,      1,      0,      0,      0,      0,      0,      0,
             0,      0,      1,      0,      0,      0,   

In [84]:
input_matrix_test_cap = np.zeros((input_matrix_test_cap_.shape[0],6))
input_matrix_test_cap[:,0] = input_matrix_test_cap_[:,2]
for i in range(input_matrix_test_cap_.shape[0]):
    tag_1_hot_cap = np.zeros(5)
    tag_1_hot_cap[input_matrix_test_cap_[i,3]-1] = 1
    input_matrix_test_cap[i,1:] = tag_1_hot_cap

In [87]:
# Saving pre-processing
filename = '../data/MM_data_cap.hdf5'
with p.File(filename, "w") as f:
    # Model
    f['input_matrix_train_cap'] = input_matrix_train_cap
    f['input_matrix_dev_cap'] = input_matrix_dev_cap
    f['input_matrix_test_cap'] = input_matrix_test_cap

In [121]:
def sentences(matrix):
    res = []
    start  = 1
    length = 2
    
    for i in range(1,len(matrix)):
        if matrix[i,1] == 1:
            res.append([start,length-1])
            start = matrix[i,0]
            length = 1
        if i == (len(matrix) - 1):
            res.append([start,length])
        length += 1
    
    return np.array(res)

In [123]:
sent = sentences(input_matrix_train_cap_)

In [124]:
# Saving pre-processing
filename = '../data/sent_start.hdf5'
with p.File(filename, "w") as f:
    # Model
    f['sent_start'] = sent

In [125]:
sent[:5,]

array([[ 1, 11],
       [12,  4],
       [16,  4],
       [20, 32],
       [52, 33]])

In [126]:
input_matrix_train_cap_[:20,]

array([[     1,      1, 400001,      1,      8],
       [     2,      2,    645,      2,      4],
       [     3,      3,   7579,      1,      1],
       [     4,      4,    515,      3,      5],
       [     5,      5,    581,      1,      1],
       [     6,      6,      5,      1,      1],
       [     7,      7,   5261,      1,      1],
       [     8,      8,    298,      3,      5],
       [     9,      9,  10239,      1,      1],
       [    10,     10,      3,      1,      1],
       [    11,     11, 400002,      1,      9],
       [    12,      1, 400001,      1,      8],
       [    13,      2,   1295,      3,      2],
       [    14,      3,   9004,      3,      2],
       [    15,      4, 400002,      1,      9],
       [    16,      1, 400001,      1,      8],
       [    17,      2,   3880,      2,      3],
       [    18,      3, 400002,      1,      1],
       [    19,      4, 400002,      1,      9],
       [    20,      1, 400001,      1,      8]])