<img style="float: left;;" src='Figures/alinco.png' /></a>

# Modulo II: Vectores Palabra (Word Embeddings) y CBOW 01

In [2]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize


In [3]:
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

In [4]:
# Print original corpus
print(f'Corpus:  {corpus}')

# Do the substitution
data = re.sub(r'[,!?;-]+', '.', corpus)

# Print cleaned corpus
print(f'After cleaning punctuation:  {data}')

Corpus:  Who ❤️ "word embeddings" in 2020? I do!!!
After cleaning punctuation:  Who ❤️ "word embeddings" in 2020. I do.


In [5]:
# Print cleaned corpus
print(f'Initial string:  {data}')

# Tokenize the cleaned corpus
data = nltk.word_tokenize(data)

# Print the tokenized version of the corpus
print(f'After tokenization:  {data}')

Initial string:  Who ❤️ "word embeddings" in 2020. I do.
After tokenization:  ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']


In [6]:
# Print the tokenized version of the corpus
print(f'Initial list of tokens:  {data}')

# Filter tokenized corpus using list comprehension
data = [ ch.lower() for ch in data
         if ch.isalpha()
         or ch == '.'
         or emoji.get_emoji_regexp().search(ch)
       ]

# Print the tokenized and filtered version of the corpus
print(f'After cleaning:  {data}')

Initial list of tokens:  ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']
After cleaning:  ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [7]:
# Define the 'tokenize' function that will include the steps previously seen
def tokenize(corpus):
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)  # tokenize string to words
    data = [ ch.lower() for ch in data
             if ch.isalpha()
             or ch == '.'
             or emoji.get_emoji_regexp().search(ch)
           ]
    return data

In [8]:
# Define new corpus
corpus = 'I am happy because I am learning'

# Print new corpus
print(f'Corpus:  {corpus}')

# Save tokenized version of corpus into 'words' variable
words = tokenize(corpus)

# Print the tokenized version of the corpus
print(f'Words (tokens):  {words}')

Corpus:  I am happy because I am learning
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [9]:
# Run this with any sentence
tokenize("Now it's your turn: try with your own sentence!")

['now', 'it', 'your', 'turn', 'try', 'with', 'your', 'own', 'sentence', '.']

In [10]:
# Define the 'get_windows' function
def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

In [12]:
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [19]:
get_windows(words,2)

<generator object get_windows at 0x7f96817e62e0>

In [20]:
for x, y in get_windows(words, 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [11]:
# Print 'context_words' and 'center_word' for the new corpus with a 'context half-size' of 2
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


# Transformación de palabras a vectores para el conjunto de entrenamiento


In [21]:
def get_dict(data):
    
    words = sorted(list(set(data)))
    n = len(words)
    
    idx = 0
    
    word2Ind = {}
    Ind2word = {}
    
    for k in words:
        word2Ind[k] = idx
        Ind2word[idx] = k
        idx +=1
    
    return word2Ind, Ind2word

In [22]:
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [23]:
word2Ind, Ind2word = get_dict(words)

In [None]:
a={key:value}

In [24]:
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [25]:
Ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [27]:
V = len(word2Ind)
V

5

In [29]:
center_word_vector = np.zeros(V)

center_word_vector

array([0., 0., 0., 0., 0.])

In [30]:
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [31]:
idx = word2Ind['happy']

In [32]:
idx

2

In [33]:
center_word_vector[idx] = 1

In [34]:
center_word_vector

array([0., 0., 1., 0., 0.])

In [39]:
V = len(word2Ind)

In [35]:
def word_to_one_hot_vector(word, word2Ind, V):
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    return one_hot_vector
    

In [36]:
word_to_one_hot_vector('happy', word2Ind, V)

array([0., 0., 1., 0., 0.])

In [38]:
for w in words:
    print(w)
    print(word_to_one_hot_vector(w, word2Ind, V))

i
[0. 0. 0. 1. 0.]
am
[1. 0. 0. 0. 0.]
happy
[0. 0. 1. 0. 0.]
because
[0. 1. 0. 0. 0.]
i
[0. 0. 0. 1. 0.]
am
[1. 0. 0. 0. 0.]
learning
[0. 0. 0. 0. 1.]


In [40]:
context_words= ['i', 'am', 'because', 'i']


In [41]:
contex_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]

In [42]:
contex_words_vectors

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [43]:
np.mean(contex_words_vectors, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [44]:
def context_words_to_vector(context_words, word2Ind, V):
    contex_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    contex_words_vectors = np.mean(contex_words_vectors, axis=0)
    return contex_words_vectors
    
    

In [46]:
context_words_to_vector(context_words, word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [47]:
context2=['am', 'happy', 'i', 'am']
context_words_to_vector(context2, word2Ind, V)


array([0.5 , 0.  , 0.25, 0.25, 0.  ])

## Creación de conjunto de entrenamiento

In [48]:
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [50]:
for contex_words, center_word in get_windows(words, 2):
    print(f'context words: {contex_words} ---> {context_words_to_vector(context_words, word2Ind, V)}')
    print(f'center word: {center_word} ---> {word_to_one_hot_vector(center_word, word2Ind, V)}')
    print()
    

context words: ['i', 'am', 'because', 'i'] ---> [0.25 0.25 0.   0.5  0.  ]
center word: happy ---> [0. 0. 1. 0. 0.]

context words: ['am', 'happy', 'i', 'am'] ---> [0.25 0.25 0.   0.5  0.  ]
center word: because ---> [0. 1. 0. 0. 0.]

context words: ['happy', 'because', 'am', 'learning'] ---> [0.25 0.25 0.   0.5  0.  ]
center word: i ---> [0. 0. 0. 1. 0.]



In [51]:
def get_training_example(words, C, word2Ind, V):
    for contex_words, center_word in get_windows(words, 2):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)
        

In [53]:
for contex_words, center_word in get_training_example(words, 2, word2Ind, V):
    print(f'context words:---> {contex_words}')
    print(f'center word: ---> {center_word}')
    print()
    

context words:---> [0.25 0.25 0.   0.5  0.  ]
center word: ---> [0. 0. 1. 0. 0.]

context words:---> [0.25 0.25 0.   0.5  0.  ]
center word: ---> [0. 1. 0. 0. 0.]

context words:---> [0.25 0.25 0.   0.5  0.  ]
center word: ---> [0. 0. 0. 1. 0.]

