In [1]:
# Create a word2vec (w2v) index from a filtered version of the NLTK Brown corpus.
# These vectors will be used as the initial input to our deep learning models.

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from __future__ import division, print_function
import codecs
import os
import cPickle
import logging
from collections import Counter, defaultdict

import nltk
import numpy as np
import matplotlib.pyplot as plt
import pylab
import pandas as pd
from scipy.stats import norm
from gensim.models import Word2Vec
from fuel.datasets import H5PYDataset

# log at INFO level; every record is rendered as "<timestamp> : <level name> : <message>"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# locations of our version of the Brown dataset and of its pickled index mappings
DATASET_LOCATION = '../../datasets/'
DATASET_NAME = 'brown_pos_dataset.hdf5'
DATASET_PATH = os.path.join(DATASET_LOCATION, DATASET_NAME)

# Load the index mappings that accompany the dataset (the cells below read the 'word2idx' entry).
# Pickles are binary data, so the file is opened in 'rb' mode: text mode can corrupt
# binary-protocol pickles on platforms that translate line endings.
# NOTE(review): unpickling can execute arbitrary code -- only load index files from a trusted source.
with open(os.path.join(DATASET_LOCATION, 'brown_pos_dataset.indices'), 'rb') as indices_file:
    corpus_indices = cPickle.load(indices_file)
    
# OK, let's load the Brown corpus, use our index to replace out-of-vocabulary words
# with the unknown token, then build the W2V index over this corpus
# placeholder substituted for every token that is missing from the index
UNKNOWN_TOKEN = u'_UNK_'

def map_to_unknown(tok, index):
    """Return `tok` unchanged when it is a member of `index`, otherwise UNKNOWN_TOKEN."""
    return tok if tok in index else UNKNOWN_TOKEN

# each Brown paragraph is a list of sentences; flatten every paragraph into one token list
brown_documents = []
for paragraph in nltk.corpus.brown.paras():
    brown_documents.append([token for sentence in paragraph for token in sentence])
print(len(brown_documents))

# Gensim is trained on string tokens, so we keep words as strings here and map them back
# to indices afterwards in order to use our vectors consistently.
# Words missing from our index become the unknown token, and every document is padded
# with start and end markers.
known_words = corpus_indices['word2idx']
brown_documents = [
    [u'_START_'] + [map_to_unknown(token, known_words) for token in document] + [u'_END_']
    for document in brown_documents
]

15667


In [4]:
def train_word2vec(text_iterator, model_file='w2v_model', workers=6, vec_size=100, min_count=1, n_iter=1):
    """
    Train a word2vec model on the corpus in text_iterator and save it to disk.

    Parameters:
        text_iterator: the corpus, handed to Word2Vec unchanged as its first argument
            (presumably an iterable of token lists; it should be re-iterable, e.g. a list,
            if the gensim version in use makes more than one pass -- confirm)
        model_file: path the trained model is saved to
        workers: controls the number of processors Word2Vec can use
        vec_size: dimensionality of the learned vectors (default 100)
        min_count: minimum number of occurrences for a word to be included in the model
        n_iter: number of training passes over the corpus (default 1, the value that
            used to be hard-coded)

    Returns:
        The trained Word2Vec model, which has also been saved to <model_file>
    """
    # `size` and `iter` are the keyword names used by the gensim release this notebook targets
    model = Word2Vec(text_iterator, size=vec_size, workers=workers, iter=n_iter, min_count=min_count)
    model.save(model_file)
    return model

In [5]:
# dimensionality of the word vectors we train
EMBEDDING_SIZE = 50

# train on the padded Brown documents and save the model next to the dataset
w2v_model_path = os.path.join(DATASET_LOCATION, 'brown_w2v_model')
w2v_model = train_word2vec(brown_documents, model_file=w2v_model_path, vec_size=EMBEDDING_SIZE)

# the model's vector matrix, still in the model's own row order
orig_w2v_vectors = w2v_model.syn0



In [6]:
# sanity check: words in our index that have no entry in the trained model's vocabulary
# (the displayed set should be empty)
set(corpus_indices['word2idx']) - set(w2v_model.vocab)

set()

In [10]:
# reindex w2v_vectors to correspond to our index:
# walk our words in ascending order of their index and look up each word's row in the model
word2idx = corpus_indices['word2idx']
words_by_index = sorted(word2idx, key=word2idx.get)
w2v_index_order = [w2v_model.vocab[word].index for word in words_by_index]

# fancy-index the model's matrix so that the rows follow our ordering
final_w2v_vectors = orig_w2v_vectors[w2v_index_order]

# now persist this version for later
vectors_path = os.path.join(DATASET_LOCATION, 'brown_w2v_vectors.npy')
with open(vectors_path, 'wb') as outfile:
    np.save(outfile, final_w2v_vectors)


### Challenges: 
    
(1)
- Try using the word2vec index in some of the experiments from day 1 (there is a pre-trained index in `dl4mt_exercises/datasets`).
- How does the performance compare with the simple embedding we were previously using?