In [29]:
import re
import numpy as np

In [87]:
def get_word_embeddings(filename, line_count, dimension):
    word_embeddings = np.zeros((line_count+2, dimension - 1))
    word_embeddings[0,:] = 2 * np.random.random(dimension - 1) - 1
    word_embeddings[1,:] = 2 * np.random.random(dimension - 1) - 1
    with open(filename) as f:
        for i, line in enumerate(f):
            vector = line.split()
            word_embeddings[i+2, :] = vector[1:]
    return word_embeddings
filename = 'data/glove.6B.50d.txt'
line_count, dimension = get_number_elements(filename)
word_embeddings = get_word_embeddings(filename, line_count, dimension)

In [88]:
word_embeddings.shape

(400002, 50)

In [85]:
# Loading the tags to index mapping
def get_tags2index(filename):
    tags2index = {}
    with open(filename) as f:
        for line in f:
           (key, val) = line.split()
           tags2index[key] = int(val)
    return tags2index
tags2index = get_tags2index('data/tags.dict')
print len(tags2index)

45


In [81]:
# Loading the dictionary (words2index mapping)
def get_words2index(filename):
    words2index = {'PADDING':0, 'RARE':1}
    with open(filename) as f:
        for i, line in enumerate(f):
            # Restricing to the first 100 000 words
            if i == 100000:
                break
            vect = line.split()
            # Shifting of two for padding
            words2index[vect[0]] = i + 2
    return words2index

In [82]:
words2index = get_words2index('data/glove.6B.50d.txt')

In [65]:

# Function to pre-process the words
# Return (feature_1, feature_2)
def pre_process(word, words2index):
    # Removing number if present
    word = re.sub("\d", "", word)
    # Case if only digits
    if not len(word):
        word = 'NUMBER'
    # Building feature 1
    word_lower = word.lower()
    if word_lower in words2index:
        feature1 = words2index[word_lower]
    else:
        word = 'RARE'
        feature1 = 1
    # Building feature 2   
    if word.islower() or re.search('[.?\-",]+', word):
        feature2 = 0
    elif word.isupper():
        feature2 = 1
    elif word[0].isupper():
        feature2 = 2
    else:
        feature2 = 3
    return feature1, feature2

In [86]:
# Counting number of elements
def get_number_elements(filename):
    line_count = 0
    dimension = 0
    with open(filename) as f:
        for line in f:
            sp = line.split()
            if sp:
                if not dimension:
                    dimension = len(sp)
                line_count += 1
    return line_count, dimension

In [114]:
# Step 1: build the array (id_in_sentence, word_feature, cap_feature) and the output array
def build_processed_input(filename, line_count, words2index, tags2index, test=False):
    output = np.zeros(line_count, dtype=int)
    # Contains: id_in_sentence, word_feature, cap_feature
    processed_input = np.zeros((line_count, 3), dtype=int)
    i = 0
    with open(filename) as f:
        for line in f:
            sp = line.split()
            # Check if blanck
            if sp:
                idword, id_in_sentence, word, tag = sp
                word_feature, cap_feature = pre_process(word, words2index)
                if test:
                    output[i] = tags2index[tag]
                processed_input[i, :] = [id_in_sentence, word_feature, cap_feature]
                i += 1
    return processed_input, output


In [115]:
# Step 2: building the two arrays for word_feature and cap_feature using window of dim 5 and the output vector
def build_feature_array(filename, line_count, processed_input):
    # Initialization
    input_word = np.zeros((line_count, 5), dtype=int)
    input_cap = np.zeros((line_count, 5), dtype=int)

    for i in xrange(line_count - 2):
        # Last element of the window
        id_in_sentence_cur, feature1_cur, feature2_cur = tuple(processed_input[i, :])
        id_in_sentence_next1, feature1_next1, feature2_next1 = tuple(processed_input[i+1, :])
        id_in_sentence_next2, feature1_next2, feature2_next2 = tuple(processed_input[i+2, :])
        # Case current word is the first one of a sentence
        if id_in_sentence_cur == 1:
            input_word[i,:2] = 0
            input_cap[i,:2] = 1
            input_word[i,2] = feature1_cur
            input_cap[i,2] = feature2_cur
            input_word[i,3] = feature1_next1
            input_cap[i,3] = feature2_next1
            input_word[i,4] = feature1_next2
            input_cap[i,4] = feature2_next2
        else:
            input_word[i,:4] = input_word[i-1,1:5]
            input_cap[i,:4] = input_cap[i-1,1:5]
            # Case current word is within one position to the last one of a sentence
            if id_in_sentence_next2 == 1:
                input_word[i,4] = 0
                input_cap[i,4] = 1
            # Case current word is the last one of a sentence
            elif id_in_sentence_next1 == 1:
                input_word[i,3] = 0
                input_cap[i,3] = 1
                input_word[i,4] = 0
                input_cap[i,4] = 1
            else:
                input_word[i,4] = feature1_next2
                input_cap[i,4] = feature2_next2
    # Corner Case: two last rows
    i = line_count - 2
    # Case one to last word at a beginning of a sentence
    id_in_sentence_last1, feature1_last1, feature2_last1 = tuple(processed_input[i + 1, :])
    id_in_sentence_last2, feature1_last2, feature2_last2 = tuple(processed_input[i, :])
    if id_in_sentence_last2 == 1:
        input_word[i,:2] = 0
        input_cap[i,:2] = 1
        input_word[i,2] = feature1_last2
        input_cap[i,2] = feature2_last2
        input_word[i,3] = feature1_last1
        input_cap[i,3] = feature2_last1
        input_word[i,4] = 0
        input_cap[i,4] = 1
    else:
        input_word[i,:4] = input_word[i-1,1:5]
        input_cap[i,:4] = input_cap[i-1,1:5]
        input_word[i,4] = 0
        input_cap[i,4] = 1
    # Last word case
    input_word[i+1,:4] = input_word[i,1:5]
    input_cap[i+1,:4] = input_cap[i,1:5]
    input_word[i+1,4] = 0
    input_cap[i+1,4] = 1
    
    return input_cap.astype(int), input_word.astype(int)

In [106]:
%%time
# Test
filename = 'data/train.tags.txt'
line_count, dimension = get_number_elements(filename)
processed_input, output = build_processed_input(filename, line_count, words2index, tags2index)
input_cap, input_word = build_feature_array(filename, line_count, processed_input)

CPU times: user 13.1 s, sys: 255 ms, total: 13.4 s
Wall time: 14 s


In [91]:
train = "data/train.tags.txt"
valid = "data/dev.tags.txt"
test = "data/test.tags.txt"
tag_dict = "data/tags.dict"
embedding = "data/glove.6B.50d.txt"

In [116]:
tags2index = get_tags2index(tag_dict)
print 'tags2index size', len(tags2index)
C = len(tags2index)
words2index = get_words2index(embedding)
print 'words2index size', len(words2index)
line_count_dict, dimension_dict = get_number_elements(embedding)
word_embeddings = get_word_embeddings(embedding, line_count_dict,
                                      dimension_dict)

input_features = {}
for name, filename in zip(['train', 'valid', 'test'], [train, valid, test]):
        if name=='test':
            test_bool = False
        else:
            test_bool = True
        line_count, dimension = get_number_elements(filename)
        processed_input, output = build_processed_input(filename, line_count,
                                                        words2index,
                                                        tags2index, test=test_bool)
        input_cap, input_word = build_feature_array(filename, line_count, processed_input)
        input_features[name] = input_word, input_cap, output

tags2index size 45
words2index size 100002


In [117]:
input_features['train']

(array([[   0,    0, 5031,    1,    3],
        [   0, 5031,    1,    3,  225],
        [5031,    1,    3,  225,   84],
        ..., 
        [   6,  616, 1520,  775,    4],
        [ 616, 1520,  775,    4,    0],
        [1520,  775,    4,    0,    0]]), array([[1, 1, 2, 1, 0],
        [1, 2, 1, 0, 1],
        [2, 1, 0, 1, 0],
        ..., 
        [0, 2, 2, 0, 0],
        [2, 2, 0, 0, 1],
        [2, 0, 0, 1, 1]]), array([ 1,  1,  2, ...,  1, 17, 11]))

In [121]:
import h5py

with h5py.File('PTB.hdf5', "r") as f:
    test = f['train_input_word_windows']
    

In [123]:
test.dtype

ValueError: Not a dataset (Not a dataset)