In [1]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize
from collections import defaultdict
from utils2 import get_dict

In [2]:
# Sample text mixing an emoji, quoted words, a number and repeated punctuation;
# it is the input for the cleaning and tokenization steps below.
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

In [3]:
# Print original corpus
print(f'Corpus:  {corpus}')

# Do the substitution: every run of one or more ',', '!', '?', ';' or '-'
# characters is collapsed into a single '.' (so '!!!' becomes one period).
data = re.sub(r'[,!?;-]+', '.', corpus)

# Print cleaned corpus
print(f'After cleaning punctuation:  {data}')

Corpus:  Who ❤️ "word embeddings" in 2020? I do!!!
After cleaning punctuation:  Who ❤️ "word embeddings" in 2020. I do.


In [4]:
# Print cleaned corpus (still a single string at this point)
print(f'Initial string:  {data}')

# Tokenize the cleaned corpus.
# NOTE: `data` is rebound here from the cleaned string to the tokenizer's
# result, so re-running this cell on its own feeds that result back into
# word_tokenize instead of the original string.
# NOTE(review): nltk.word_tokenize presumably needs NLTK's tokenizer data
# ('punkt') to be installed - confirm it is available in the environment.
data = nltk.word_tokenize(data)

# Print the tokenized version of the corpus
print(f'After tokenization:  {data}')

Initial string:  Who ❤️ "word embeddings" in 2020. I do.
After tokenization:  ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']


In [5]:
# Print the tokenized version of the corpus
print(f'Initial list of tokens: {data}')

# Filter tokenized corpus using list comprehension: keep alphabetic tokens,
# the '.' token and tokens that contain an emoji, lowercasing each kept token.
# emoji.emoji_count() is used instead of emoji.get_emoji_regexp(), which was
# removed in emoji 2.0 and raises AttributeError there.
data = [token.lower() for token in data
        if token.isalpha() or token == '.' or emoji.emoji_count(token) > 0]

# Print the tokenized and filtered version of the corpus
print(f'After cleanin: {data}')

Initial list of tokens: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']
After cleanin: ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [None]:
# Run this with any sentence
# NOTE(review): `tokenize` is defined in the following cell, so on a fresh
# kernel (Restart & Run All) this call raises NameError unless that cell has
# been executed first.
tokenize("Now it's your turn: try with your own sentence!")

In [6]:
def tokenize(corpus):
    """Clean a raw text string and split it into lowercase tokens.

    Steps:
      1. Replace every run of ',', '!', '?', ';' or '-' with a single '.'.
      2. Split the result into tokens with ``nltk.word_tokenize``.
      3. Keep only alphabetic tokens, the '.' token and tokens containing
         an emoji; every kept token is lowercased.

    Args:
        corpus (str): Raw text to process.

    Returns:
        list[str]: The surviving tokens, lowercased, in their original order.
    """
    data = re.sub(r'[,!?;-]+', '.', corpus)
    tokens = nltk.word_tokenize(data)  # tokenize string to words
    # emoji.emoji_count() replaces emoji.get_emoji_regexp(), which was removed
    # in emoji 2.0 and would raise AttributeError on current releases.
    return [
        token.lower() for token in tokens
        if token.isalpha() or token == '.' or emoji.emoji_count(token) > 0
    ]

In [7]:
# Define new corpus (rebinds the `corpus` variable used earlier)
corpus = 'I am happy because I am learning'

# Print new corpus
print(f'Corpus:  {corpus}')

# Save tokenized version of corpus into 'words' variable;
# `words` is the token list the later cells build the vocabulary and
# training examples from.
words = tokenize(corpus)

# Print the tokenized version of the corpus
print(f'Words (tokens):  {words}')

Corpus:  I am happy because I am learning
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [8]:
# Try the tokenizer on an arbitrary sentence; tokens that are neither
# alphabetic, '.', nor emoji are dropped by the filter inside tokenize().
tokenize("Now it's your turn: try with your own sentence!")

['now', 'it', 'your', 'turn', 'try', 'with', 'your', 'own', 'sentence', '.']

### Sliding window of words

In [10]:
def get_windows(words, C):
    """Yield (context_words, center_word) pairs from a sliding window.

    Every position that has C words on each side is used as a center word, so
    the first and last C words of ``words`` never appear as centers. If
    ``words`` has fewer than 2*C + 1 items, nothing is yielded.

    Args:
        words (list): Sequence of tokens (must support slicing and ``+``).
        C (int): Context half-size, i.e. number of words taken on each side.

    Yields:
        tuple: ``(context_words, center_word)`` where ``context_words`` is the
        C words before the center followed by the C words after it.

    Raises:
        ValueError: If C is negative (raised when iteration starts). A negative
            half-size would otherwise yield meaningless windows before
            failing with IndexError.
    """
    if C < 0:
        raise ValueError(f'C must be non-negative, got {C}')
    for i in range(C, len(words) - C):
        center_word = words[i]
        # Left context (C words before i) followed by right context (C words after i)
        context_words = words[i - C:i] + words[i + 1:i + 1 + C]
        yield context_words, center_word

In [11]:
# Print 'context_words' and 'center_word' for the new corpus with a 'context half-size' of 2
# (x receives the list of context words, y the center word yielded by get_windows)
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [12]:
# Print 'context_words' and 'center_word' for any sentence with a 'context half-size' of 1
# (the sentence is tokenized first; x is the context list, y the center word)
for x, y in get_windows(tokenize("Now it's your turn: try with your own sentence!"), 1):
    print(f'{x}\t{y}')

['now', 'your']	it
['it', 'turn']	your
['your', 'try']	turn
['turn', 'with']	try
['try', 'your']	with
['with', 'own']	your
['your', 'sentence']	own
['own', '.']	sentence


### Transforming words into vectors for the training set

In [13]:
# Build the two lookup tables from the corpus tokens and display the first.
# get_dict is imported from utils2 and its implementation is not shown here;
# judging by the displayed results, word2Ind maps word -> index and Ind2word
# maps index -> word.
word2Ind, Ind2word = get_dict(words)
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [14]:
# Print value for the key 'i' within word2Ind dictionary (the index assigned to that word)
print("Index of the word 'i':  ",word2Ind['i'])

Index of the word 'i':   3


In [15]:
# Display the 'Ind2word' dictionary (shown as the cell's output value)
Ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [16]:
# Print value for the integer key 2 within Ind2word dictionary
print("Word which has index 2:  ",Ind2word[2] )

Word which has index 2:   happy


In [17]:
# Save length of word2Ind dictionary into the 'V' variable;
# V is used below as the length of every one-hot vector.
V = len(word2Ind)

# Print length of word2Ind dictionary
print("Size of vocabulary: ", V)

Size of vocabulary:  5


### Getting 1-hot word vectors

In [18]:
# Look up the index of 'happy'; this is the position that gets set to 1
# in its one-hot vector a few cells below.
n = word2Ind['happy']
n

2

In [19]:
# Create vector with the same length as the vocabulary, filled with zeros
# (np.zeros produces floats, hence the '0.' entries in the output)
center_word_vector = np.zeros(V)
center_word_vector

array([0., 0., 0., 0., 0.])

In [20]:
# Check that the length of the vector is the same as the size of the vocabulary.
# This only displays the boolean result; it is not enforced with an `assert` statement.
len(center_word_vector) == V

True

In [22]:
# Replace element number 'n' with a 1 (modifies center_word_vector in place)
center_word_vector[n] = 1
center_word_vector

array([0., 0., 1., 0., 0.])

In [23]:
def word_to_one_hot_vector(word, word2Ind, V):
    """Return the one-hot encoding of ``word``.

    Args:
        word: Key to look up in ``word2Ind``.
        word2Ind (dict): Mapping from word to its integer index.
        V (int): Length of the returned vector (the vocabulary size).

    Returns:
        numpy.ndarray: Float vector of length V that is 0 everywhere except
        for a 1 at position ``word2Ind[word]``.

    Raises:
        KeyError: If ``word`` is not a key of ``word2Ind``.
    """
    encoding = np.zeros(V)
    encoding[word2Ind[word]] = 1
    return encoding

In [24]:
# One-hot vector for 'happy'; should match the vector built step by step above
word_to_one_hot_vector('happy', word2Ind, V)

array([0., 0., 1., 0., 0.])

In [25]:
# One-hot vector for another vocabulary word, 'learning'
word_to_one_hot_vector('learning', word2Ind, V)

array([0., 0., 0., 0., 1.])

### Getting context word vectors

In [26]:
# Define list containing context words
# (these are the context words of the center word 'happy' for a half-size of 2)
context_words = ['i', 'am', 'because', 'i']

In [27]:
# Create one-hot vectors for each context word using list comprehension
# (the result is a plain Python list holding one length-V array per context word)

context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
context_words_vectors

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [28]:
# Average the one-hot vectors element-wise (axis=0 averages across the list),
# giving a single vector of length V; a word that occurs twice in the context
# contributes twice to its position.
np.mean(context_words_vectors, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [35]:
# Define the context words to vector function that will include the steps previously seen
def context_words_to_vector(context_words, word2Ind, V):
    """Encode a group of context words as the average of their one-hot vectors.

    Args:
        context_words (iterable): Words to encode; each must be a key of
            ``word2Ind``.
        word2Ind (dict): Mapping from word to its integer index.
        V (int): Vocabulary size, i.e. the length of the returned vector.

    Returns:
        numpy.ndarray: Vector of length V holding the element-wise mean of the
        one-hot vectors of ``context_words``. For an empty ``context_words``
        the all-zero vector of length V is returned.

    Raises:
        KeyError: If a context word is not a key of ``word2Ind``.
    """
    one_hot_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    # np.mean of an empty list is a nan scalar (with a RuntimeWarning) rather
    # than a length-V vector, so an empty context maps to the zero vector.
    if not one_hot_vectors:
        return np.zeros(V)
    return np.mean(one_hot_vectors, axis=0)

In [36]:
# Display output of 'context_words_to_vector' function for context words: 'i', 'am', 'because', 'i'
# (should equal the mean computed manually with np.mean above)
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [37]:
# Display output of 'context_words_to_vector' function for context words: 'am', 'happy', 'i', 'am'
context_words_to_vector(['am', 'happy', 'i', 'am'], word2Ind, V)

array([0.5 , 0.  , 0.25, 0.25, 0.  ])

### Building the training set

In [39]:
# Display the tokenized corpus that the training examples below are built from
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [40]:
# Print vectors associated to center and context words for corpus
# NOTE: the loop variables rebind the notebook-level names `context_words`
# and `center_word`, so `context_words` no longer holds the list defined in
# the earlier cell after this loop has run.
for context_words, center_word in get_windows(words, 2):
    print(f'Context words: {context_words} -> {context_words_to_vector(context_words, word2Ind, V)}')
    print(f'Center word: {center_word} ->{word_to_one_hot_vector(center_word, word2Ind, V)}')
    print()

Context words: ['i', 'am', 'because', 'i'] -> [0.25 0.25 0.   0.5  0.  ]
Center word: happy ->[0. 0. 1. 0. 0.]

Context words: ['am', 'happy', 'i', 'am'] -> [0.5  0.   0.25 0.25 0.  ]
Center word: because ->[0. 1. 0. 0. 0.]

Context words: ['happy', 'because', 'am', 'learning'] -> [0.25 0.25 0.25 0.   0.25]
Center word: i ->[0. 0. 0. 1. 0.]



In [41]:
# Define the generator function 'get_training_example'
def get_training_example(words, C, word2Ind, V):
    """Generate one (input, target) vector pair per sliding window.

    Args:
        words: Token sequence passed on to ``get_windows``.
        C (int): Context half-size passed on to ``get_windows``.
        word2Ind (dict): Mapping from word to its integer index.
        V (int): Vocabulary size, i.e. the length of both vectors.

    Yields:
        tuple: ``(context_vector, center_vector)`` where ``context_vector`` is
        the result of ``context_words_to_vector`` for the window's context
        words and ``center_vector`` is the one-hot vector of its center word.
    """
    for window_context, window_center in get_windows(words, C):
        context_vector = context_words_to_vector(window_context, word2Ind, V)
        center_vector = word_to_one_hot_vector(window_center, word2Ind, V)
        yield context_vector, center_vector

In [45]:
# Print vectors associated to center and context words for corpus using the generator function
# NOTE: the loop variable `center_word_vector` rebinds the notebook-level
# array of the same name created in an earlier cell.
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, V):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

Context words vector:  [0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 1. 0. 0.]

Context words vector:  [0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 1. 0. 0. 0.]

Context words vector:  [0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 1. 0.]

