<a href="https://colab.research.google.com/github/dsogden/NLP-Specialization/blob/main/Chap2_W4_Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install emoji==1.4.1 --upgrade

import re
import nltk
nltk.download('punkt')

import emoji
import numpy as np
from nltk.tokenize import word_tokenize

def get_dict(data):
    """
    Build forward and reverse vocabulary lookups for a collection of tokens.

    Input:
        data: an iterable of hashable, mutually comparable tokens (for
              example a list of word strings); duplicates are allowed
    Output:
        word2Ind: dictionary mapping each unique word to its index
        Ind2word: dictionary mapping each index back to its word

    Indices are assigned following the sorted order of the unique tokens,
    so the mapping does not depend on the order of tokens in `data`.
    """
    # Deduplicate first, then sort, so every unique token gets exactly one
    # index and the assignment is deterministic.
    vocabulary = sorted(set(data))
    word2Ind = {word: idx for idx, word in enumerate(vocabulary)}
    Ind2word = dict(enumerate(vocabulary))
    return word2Ind, Ind2word



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Preparation

In [2]:
# Sample corpus mixing an emoji, quotation marks, digits and punctuation.
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'
print(f'Corpus: {corpus}')

# Replace every run of , ! ? ; or - with a single '.', so all of these
# separators are represented by one period.
data = re.sub(r'[,!?;-]+', '.', corpus)
print(f'data after cleaning: {data}')

Corpus: Who ❤️ "word embeddings" in 2020? I do!!!
data after cleaning: Who ❤️ "word embeddings" in 2020. I do.


In [3]:
# Split the cleaned string into a list of tokens with NLTK's word_tokenize.
# Note that `data` is rebound here from a string to a list of strings.
data = word_tokenize(data)
print(f'after tokenization: {data}')

after tokenization: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']


In [4]:
print(f'Initial list of data: {data}')

# Fetch the emoji pattern once instead of once per token.
emoji_pattern = emoji.get_emoji_regexp()

# Keep a token only if it is purely alphabetic, is the '.' separator, or
# contains an emoji; every kept token is lowercased.
data = [
    token.lower() for token in data
    if token.isalpha()
    or token == '.'
    or emoji_pattern.search(token)
]

print(f'After cleaning: {data}')

Initial list of data: ['Who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'I', 'do', '.']
After cleaning: ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [5]:
def tokenize(corpus):
    """
    Clean a raw text string and split it into lowercase tokens.

    Input:
        corpus: the raw text to tokenize (a string)
    Output:
        data: list of lowercased tokens; a token is kept only if it is
              purely alphabetic, is the '.' separator, or contains an emoji
    """
    # Collapse every run of , ! ? ; or - into a single '.'.
    data = re.sub(r'[,!?;-]+', '.', corpus)
    tokens = word_tokenize(data)
    # Fetch the emoji pattern once instead of once per token.
    emoji_pattern = emoji.get_emoji_regexp()
    return [
        token.lower() for token in tokens
        if token.isalpha()
        or token == '.'
        or emoji_pattern.search(token)
    ]

In [6]:
# A second, simpler corpus, this time run through the tokenize() helper.
# `corpus` is rebound here, replacing the earlier sample string.
corpus = 'I am happy because I am learning'
print(f'Corpus: {corpus}')
words = tokenize(corpus)
print(f'Words (tokens): {words}')

Corpus: I am happy because I am learning
Words (tokens): ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


### Sliding window of words

In [7]:
def get_windows(words, C):
    """
    Slide a window across `words`, yielding one training pair per position.

    Input:
        words: a sliceable sequence of tokens (e.g. a list of strings)
        C: the context half-size, i.e. how many tokens are taken on each
           side of the center word
    Output:
        a generator of (context_words, center_word) pairs, where
        context_words is the C tokens before the center concatenated with
        the C tokens after it

    Only positions that have C tokens available on both sides are used, so
    the first and last C tokens never appear as a center word.
    """
    for center in range(C, len(words) - C):
        target = words[center]
        before = words[center - C:center]
        after = words[center + 1:center + C + 1]
        yield before + after, target

In [11]:
# With C = 2, each context x holds the two words before and the two words
# after the center word y.
for x, y in get_windows(words, 2):
    print(f'x = {x} \t y = {y}')

x = ['i', 'am', 'because', 'i'] 	 y = happy
x = ['am', 'happy', 'i', 'am'] 	 y = because
x = ['happy', 'because', 'am', 'learning'] 	 y = i


### Transforming words into vectors

In [12]:
# Build the word -> index and index -> word lookups from the tokenized corpus.
word_2_ind, ind_2_word = get_dict(words)
word_2_ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

### Getting one-hot word vectors

In [13]:
# V is the number of entries in the vocabulary; start from an all-zero
# vector with one slot per vocabulary word.
V = len(word_2_ind)
center_word_vector = np.zeros(V)
center_word_vector

array([0., 0., 0., 0., 0.])

In [14]:
# Set the slot at the index of 'happy' to 1, turning the zero vector into
# the one-hot vector for that word. This modifies center_word_vector in place.
n = word_2_ind['happy']
center_word_vector[n] = 1
center_word_vector

array([0., 0., 1., 0., 0.])

In [15]:
def one_hot_encoder(word, word_2_ind, V):
    """
    Encode a single word as a one-hot vector.

    Input:
        word: the word to encode; must be a key of `word_2_ind`
        word_2_ind: dictionary mapping each word to its index
        V: length of the returned vector
    Output:
        a NumPy array of length V that is 0 everywhere except for a 1 at
        the index that `word_2_ind` assigns to `word`
    """
    encoded = np.zeros(V)
    position = word_2_ind[word]
    encoded[position] = 1
    return encoded

In [16]:
# Encode 'happy' through the helper function.
one_hot_encoder('happy', word_2_ind, V)

array([0., 0., 1., 0., 0.])

### Getting context words

In [17]:
# One-hot encode each context word; 'i' appears twice, so its vector is
# included twice in the list.
context_words = ['i', 'am', 'because', 'i']
context_word_vectors = [
    one_hot_encoder(word, word_2_ind, V) for word in context_words
]

In [18]:
# Show the list of one-hot arrays, one per context word.
print(f'Context word vectors: {context_word_vectors}')

Context word vectors: [array([0., 0., 0., 1., 0.]), array([1., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0.]), array([0., 0., 0., 1., 0.])]


In [20]:
# Average the context one-hot vectors element-wise (axis=0) into a single
# vector of length V.
np.mean(context_word_vectors, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])