In [1]:
import re

import emoji
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

from utils import get_dict

# Data preparation

In the data preparation phase, starting with a corpus of text, you will:

- Clean and tokenize the corpus.

- Extract the pairs of context words and center word that will make up the training data set for the CBOW model. The context words are the features that will be fed into the model, and the center words are the target values that the model will learn to predict.

- Create simple vector representations of the context words (features) and center words (targets) that can be used by the neural network of the CBOW model.

## Cleaning and tokenization

To demonstrate the cleaning and tokenization process, consider a corpus that contains emojis and various punctuation marks.

In [2]:
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

In [3]:
print(f'Corpus: {corpus}')
# Replace every run of one or more of the characters , ! ? ; - with a single
# '.', so that e.g. '!!!' collapses into one full stop.
data = re.sub(r'[,!?;-]+', '.', corpus)

print(f'After cleaning punctuation: {data}')

Corpus: Who ❤️ "word embeddings" in 2020? I do!!!
After cleaning punctuation: Who ❤️ "word embeddings" in 2020. I do.


In [4]:
print(f'Initial string: {data}')
# `data` is rebound here: it goes from the cleaned string to a list of tokens.
data = nltk.word_tokenize(data)
print(f'After tokenization: {data}')

Initial string: Who ❤️ "word embeddings" in 2020. I do.
After tokenization: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']


In [5]:
print(f'Initial list of tokens: {data}')
# Keep a token only if it is purely alphabetic, is a full stop, or contains at
# least one emoji; every kept token is lowercased. Numbers and quote tokens are
# therefore dropped.
# `emoji.emoji_count` is used instead of `emoji.get_emoji_regexp()`, which was
# removed in emoji 2.0 and raises AttributeError on current installs.
data = [token.lower() for token in data
        if token.isalpha()
        or token == '.'
        or emoji.emoji_count(token) > 0]
print(f'After cleaning:  {data}')

Initial list of tokens: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']
After cleaning:  ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [6]:
# Wrap in a function
def tokenize(corpus):
    """Clean a raw text corpus and split it into lowercase tokens.

    Steps:
      1. Every run of the punctuation characters ``, ! ? ; -`` is replaced
         by a single ``'.'``.
      2. The cleaned string is tokenized with ``nltk.word_tokenize``.
      3. A token is kept only if it is purely alphabetic, is ``'.'``, or
         contains at least one emoji; kept tokens are lowercased.

    Args:
        corpus: The text to tokenize (a string).

    Returns:
        A list of lowercase string tokens.
    """
    data = re.sub(r'[,!?;-]+', '.', corpus)
    tokens = nltk.word_tokenize(data)
    # `emoji.emoji_count` replaces `emoji.get_emoji_regexp()`, which was
    # removed in emoji 2.0; "count > 0" keeps the same "contains an emoji"
    # meaning as the old regex search.
    return [token.lower() for token in tokens
            if token.isalpha()
            or token == '.'
            or emoji.emoji_count(token) > 0]

In [7]:
corpus = 'I am happy because I am learning'
print(f'Corpus:  {corpus}')
# `words` is the tokenized corpus that the later cells of this notebook reuse.
words = tokenize(corpus)
print(f'Words (tokens):  {words}')

Corpus:  I am happy because I am learning
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [8]:
tokenize("A man, a plan, a canal: 'Panama!'")

['a', 'man', '.', 'a', 'plan', '.', 'a', 'canal', '.']

## Sliding window of words

Now that you have transformed the corpus into a list of clean tokens, you can slide a window of words across this list. For each window you can extract a center word and the context words.

The `get_windows` function in the next cell was introduced in the lecture.

In [9]:
def get_windows(words, context_halfsize):
    """Slide a window over `words`, yielding one training pair per position.

    A position is used as a center only if it has `context_halfsize` words
    available on both sides, so the first and last `context_halfsize` words
    are never centers.

    Args:
        words: Sliceable sequence of tokens (e.g. a list of strings).
        context_halfsize: Number of context words taken on each side of the
            center word.

    Yields:
        Tuples ``(context_words, center_word)`` where `context_words` is the
        left context followed by the right context, center word excluded.
    """
    stop = len(words) - context_halfsize
    for center in range(context_halfsize, stop):
        left = words[center - context_halfsize:center]
        right = words[center + 1:center + context_halfsize + 1]
        yield left + right, words[center]

The first argument of this function is a list of words (or tokens). The second argument, `context_halfsize` (denoted `C` here), is the context half-size. Recall that for a given center word, the context words are made of `C` words to the left and `C` words to the right of the center word.

Here is how you can use this function to extract context words and center words from a list of tokens. These context and center words will make up the training set that you will use to train the CBOW model.

In [10]:
# `x` is the list of context words and `y` the center word of each window;
# a context half-size of 2 gives two context words on either side.
for x, y in get_windows(
    ['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


The first example of the training set is made of:

- the context words "i", "am", "because", "i",

- and the center word to be predicted: "happy".

**Now try it out yourself. In the next cell, you can change both the sentence and the context half-size.**

In [11]:
s = 'I like the ukelele lady, and the ukelele lady likes me'
window = 7  # full window width: the center word plus the context on both sides
context_halfsize = (window - 1) // 2  # here: 3 context words on each side
for x, y in get_windows(tokenize(s), context_halfsize):
    print(f'{x}\t{y}')

['i', 'like', 'the', 'lady', '.', 'and']	ukelele
['like', 'the', 'ukelele', '.', 'and', 'the']	lady
['the', 'ukelele', 'lady', 'and', 'the', 'ukelele']	.
['ukelele', 'lady', '.', 'the', 'ukelele', 'lady']	and
['lady', '.', 'and', 'ukelele', 'lady', 'likes']	the
['.', 'and', 'the', 'lady', 'likes', 'me']	ukelele


## Transforming words into vectors for the training set

To finish preparing the training set, you need to transform the context words and center words into vectors.

### Mapping words to indices and indices to words

The center words will be represented as one-hot vectors, and the vectors that represent context words are also based on one-hot vectors.

To create one-hot word vectors, you can start by mapping each unique word to a unique integer (or index). We have provided a helper function, `get_dict`, that creates two Python dictionaries: one that maps words to integers, and one that maps those integers back to words.

In [12]:
word2ind, ind2word = get_dict(words)

In [13]:
word2ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [14]:
print("Index of the word 'happy':  ", word2ind['happy'])

Index of the word 'happy':   2


In [15]:
ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [16]:
print('Word at index 2:', ind2word[2] )

Word at index 2: happy


In [17]:
V = len(word2ind)
print('Size of vocabulary:', V)

Size of vocabulary: 5


### Getting one-hot word vectors

Recall from the lecture that you can easily convert an integer, $n$, into a one-hot vector.

Consider the word "happy". First, retrieve its numeric index.

In [18]:
n = word2ind['happy']
n

2

In [19]:
center_word_vector = np.zeros(V)
center_word_vector

array([0., 0., 0., 0., 0.])

In [20]:
assert len(center_word_vector) == V

In [21]:
center_word_vector[n] = 1
center_word_vector

array([0., 0., 1., 0., 0.])

In [22]:
# Wrap the steps above in a function
def word_to_one_hot_vector(word, word2ind, vocab_size):
    """Build the one-hot vector that represents `word`.

    Args:
        word: The word to encode; it must be a key of `word2ind`.
        word2ind: Mapping from each word to its integer index.
        vocab_size: Length of the returned vector.

    Returns:
        A NumPy array of length `vocab_size` that is all zeros except for a
        1 at position ``word2ind[word]``.

    Raises:
        KeyError: If `word` is not in `word2ind`.
    """
    vector = np.zeros(vocab_size)
    position = word2ind[word]
    vector[position] = 1
    return vector

In [23]:
word_to_one_hot_vector('happy', word2ind, V)

array([0., 0., 1., 0., 0.])

In [24]:
word_to_one_hot_vector('learning', word2ind, V)

array([0., 0., 0., 0., 1.])

### Getting context word vectors
To create the vectors that represent context words, you will calculate the average of the one-hot vectors representing the individual words.

Let's start with a list of context words.

In [25]:
context_words = ['i', 'am', 'because', 'i']

In [26]:
# One one-hot vector per context word, in the same order as `context_words`.
context_words_vectors = [word_to_one_hot_vector(w, word2ind, V) 
                         for w in context_words]
context_words_vectors

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [27]:
np.mean(context_words_vectors, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

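Note the `axis=0` parameter, which tells `mean` to average across the rows, i.e. to average the vectors element by element and return one value per column. With `axis=1` you would instead get the average of the elements within each row, i.e. one value per vector.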

**Now create the `context_words_to_vector` function that takes in a list of context words, a word-to-index dictionary, and a vocabulary size, and outputs the vector representation of the context words.**

In [28]:
def context_words_to_vector(context_words, word2ind, vocab_size):
    """Return the average of the one-hot vectors of the context words.

    The result equals the mean of the individual one-hot vectors, but it is
    computed by counting occurrences directly in a single length-`vocab_size`
    vector, so no per-word one-hot vector is allocated.

    Args:
        context_words: Iterable of context words; each must be a key of
            `word2ind`. Repeated words are counted once per occurrence.
        word2ind: Mapping from each word to its integer index.
        vocab_size: Length of the returned vector.

    Returns:
        A NumPy float array of length `vocab_size` whose entry ``i`` is the
        fraction of context words that have index ``i``. For an empty
        context an all-zero vector is returned, so the result always has
        the same shape.

    Raises:
        KeyError: If a context word is not in `word2ind`.
    """
    context_vector = np.zeros(vocab_size)
    n_words = 0
    for word in context_words:
        context_vector[word2ind[word]] += 1
        n_words += 1
    if n_words == 0:
        # No context words: avoid 0/0 and keep the output shape stable.
        return context_vector
    return context_vector / n_words

In [29]:
context_words_to_vector(['i', 'am', 'because', 'i'], word2ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [30]:
context_words_to_vector(['am', 'happy', 'i', 'am'], word2ind, V)

array([0.5 , 0.  , 0.25, 0.25, 0.  ])

## Building the training set
You can now combine the functions that you created in the previous sections, to build a training set for the CBOW model, starting from the following tokenized corpus.

In [31]:
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [32]:
# For every window, show the context words next to their averaged vector and
# the center word next to its one-hot vector.
# NOTE: this loop rebinds the notebook-level `context_words` defined in an
# earlier cell.
for context_words, center_word in get_windows(words, 2):
    print(f'Context words:  {context_words} -> ' 
          f'{context_words_to_vector(context_words, word2ind, V)}')
    print(f'Center word:  {center_word} -> '
          f'{word_to_one_hot_vector(center_word, word2ind, V)}')
    print()

Context words:  ['i', 'am', 'because', 'i'] -> [0.25 0.25 0.   0.5  0.  ]
Center word:  happy -> [0. 0. 1. 0. 0.]

Context words:  ['am', 'happy', 'i', 'am'] -> [0.5  0.   0.25 0.25 0.  ]
Center word:  because -> [0. 1. 0. 0. 0.]

Context words:  ['happy', 'because', 'am', 'learning'] -> [0.25 0.25 0.25 0.   0.25]
Center word:  i -> [0. 0. 0. 1. 0.]



In [36]:
def get_training_example(words, context_halfsize, word2ind, vocab_size):
    """Generate vectorized CBOW training examples from a token list.

    Args:
        words: Sequence of tokens to slide the window over.
        context_halfsize: Number of context words on each side of a center
            word.
        word2ind: Mapping from each word to its integer index.
        vocab_size: Length of every produced vector.

    Yields:
        Tuples ``(context_vector, center_vector)``: the averaged context
        vector and the one-hot vector of the center word, one tuple per
        window produced by `get_windows`.
    """
    windows = get_windows(words, context_halfsize)
    for context, target in windows:
        features = context_words_to_vector(context, word2ind, vocab_size)
        label = word_to_one_hot_vector(target, word2ind, vocab_size)
        yield features, label

In [37]:
# Each yielded pair is (averaged context vector, one-hot center word vector).
for ctx_words_vec, center_word_vec in get_training_example(
        words, 2, word2ind, V):
    print(f'Context words vector:  {ctx_words_vec}')
    print(f'Center word vector:  {center_word_vec}')
    print()

Context words vector:  [0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 1. 0. 0.]

Context words vector:  [0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 1. 0. 0. 0.]

Context words vector:  [0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 1. 0.]

