Uploading pre-trained word vectors:

In [1]:
from urllib.request import urlretrieve
import os
if not os.path.isfile('datasets/mini.h5'):
    print("Downloading Conceptnet Numberbatch word embeddings...") 
    conceptnet_url = "http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5"
    urlretrieve(conceptnet_url, "mini.h5")

Downloading Conceptnet Numberbatch word embeddings...
let's go
working...
done


In [3]:
pip install h5py

Collecting h5py
  Downloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)
Downloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: h5py
Successfully installed h5py-3.11.0
Note: you may need to restart the kernel to use updated packages.


Opening the mini.h5 file and extracting uft-8 words:

In [5]:
import h5py

with h5py.File('mini.h5', 'r') as f:
    all_words = [word.decode('utf-8') for word in f['mat']['axis1'][:]]
    all_embeddings = f['mat']['block0_values'][:]
    
print("all_words dimensions: {}".format(len(all_words)))
print("all_embeddings dimensions: {}".format(all_embeddings.shape))

print("Random example word: {}".format(all_words[1337]))

all_words dimensions: 362891
all_embeddings dimensions: (362891, 300)
Random example word: /c/de/aufmachung


In [15]:
all_embeddings

array([[ -2,  -2, -15, ...,  -2,  -3,   0],
       [ -1,  -1,  -2, ...,  -2,  -2,   2],
       [  0,   0,  -2, ...,   0,   0,  -2],
       ...,
       [  0,   3,   4, ...,   0,   1,   2],
       [ -2,   3,   3, ...,   0,  -7,   1],
       [  1,   0,   2, ...,  -7,   0,   0]], dtype=int8)

In [8]:
all_words

['/c/de/###er',
 '/c/de/##jahre',
 '/c/de/0',
 '/c/de/1',
 '/c/de/2',
 '/c/de/2d',
 '/c/de/3',
 '/c/de/3d',
 '/c/de/4',
 '/c/de/5',
 '/c/de/6',
 '/c/de/7',
 '/c/de/8',
 '/c/de/9',
 '/c/de/a',
 '/c/de/a.d',
 '/c/de/aa',
 '/c/de/aaa',
 '/c/de/aachen',
 '/c/de/aachener',
 '/c/de/aal',
 '/c/de/aalen',
 '/c/de/aarau',
 '/c/de/aargau',
 '/c/de/aaron',
 '/c/de/ab',
 '/c/de/aba',
 '/c/de/abarbeiten',
 '/c/de/abartig',
 '/c/de/abb',
 '/c/de/abba',
 '/c/de/abbas',
 '/c/de/abbau',
 '/c/de/abbauen',
 '/c/de/abbekommen',
 '/c/de/abbiegen',
 '/c/de/abbild',
 '/c/de/abbilden',
 '/c/de/abbildung',
 '/c/de/abbrechen',
 '/c/de/abbringen',
 '/c/de/abbruch',
 '/c/de/abc',
 '/c/de/abdecken',
 '/c/de/abdeckung',
 '/c/de/abdruck',
 '/c/de/abe',
 '/c/de/abel',
 '/c/de/abend',
 '/c/de/abendblatt',
 '/c/de/abendbrot',
 '/c/de/abendessen',
 '/c/de/abendland',
 '/c/de/abendmahl',
 '/c/de/abends',
 '/c/de/abendzeitung',
 '/c/de/abenteuer',
 '/c/de/abenteuern',
 '/c/de/abenteurer',
 '/c/de/aber',
 '/c/de/aberglaube

Picking just the english words:

In [9]:
english_words = [word[6:] for word in all_words if word.startswith('/c/en/')]
english_word_indices = [i for i, word in enumerate(all_words) if word.startswith('/c/en/')]
english_embeddings = all_embeddings[english_word_indices]

print("Number of English words in all_words: {0}".format(len(english_words)))
print("english_embeddings dimensions: {0}".format(english_embeddings.shape))

Number of English words in all_words: 150875
english_embeddings dimensions: (150875, 300)


In [16]:
english_words

['#####ish',
 '####ed',
 '####s',
 '####th',
 '###.##e',
 '###.##i',
 '###er',
 '###s',
 '###st',
 '###th',
 '###z',
 '##base#',
 '##er',
 '##mo',
 '##nd',
 '##rd',
 '##s',
 '##st',
 '##th',
 '##x',
 '##º',
 '0',
 '0f',
 '0ff',
 '0h',
 '0k',
 '0ld',
 '0n',
 '0ne',
 '0r',
 '0s',
 '0th',
 '0ver',
 '1',
 '1_corinthians',
 '1a',
 '1am',
 '1b',
 '1d',
 '1g',
 '1m',
 '1mm',
 '1o',
 '1pm',
 '1pp',
 '1s',
 '1st',
 '1th',
 '2',
 '2.0',
 '2_2_2',
 '2a',
 '2am',
 '2b',
 '2c',
 '2cv',
 '2d',
 '2dr',
 '2f',
 '2g',
 '2k',
 '2km',
 '2l',
 '2lt',
 '2m',
 '2nd',
 '2o',
 '2ot',
 '2pac',
 '2pm',
 '2s',
 '2th',
 '2wd',
 '2x',
 '2x4',
 '3',
 '3_d',
 '3a',
 '3am',
 '3b',
 '3c',
 '3d',
 '3d_printer',
 '3f',
 '3g',
 '3k',
 '3m',
 '3o',
 '3ot',
 '3p',
 '3pm',
 '3po',
 '3rd',
 '3s',
 '3th',
 '3w',
 '4',
 '4_aminopyridine',
 '4_h_er',
 '4a',
 '4am',
 '4b',
 '4c',
 '4chan',
 '4d',
 '4d_ultrasound',
 '4dr',
 '4eva',
 '4ever',
 '4f',
 '4g',
 '4gl',
 '4h',
 '4k',
 '4o',
 '4pm',
 '4s',
 '4th',
 '4to',
 '4wd',
 '4x2',

Now, we are going to normalize the vectors. Here, the focus is not on the length of them (which represents frequency of use), but in the direction and proximity of them. So, normalizing them and using the dot product as a form of calculating the similarity, as it is proportional to the cossine of the angle between the vectors, is the strategy:

In [17]:
import numpy as np

norms = np.linalg.norm(english_embeddings, axis=1)
normalized_embeddings = english_embeddings.astype('float32') / norms.astype('float32').reshape([-1, 1])

In [19]:
# Dic that maps each word to its index
index = {word: i for i, word in enumerate(english_words)}

In [22]:
# Defining the similarity between words:
def similarity_score(w1, w2):
    score = np.dot(normalized_embeddings[index[w1], :], normalized_embeddings[index[w2], :])
    return score

In [24]:
similarity_score("blue", "red")

0.67598516

In [25]:
similarity_score("cat", "feline")

0.81995475

In [26]:
similarity_score("soccer", "steak")

0.013479051

In [27]:
similarity_score("cat", "cat")

1.0

In [28]:
# Most similar words to a given word
def closest_to_vector(v, n):
    all_scores = np.dot(normalized_embeddings, v)
    best_words = list(map(lambda i: english_words[i], reversed(np.argsort(all_scores))))
    return best_words[:n]

def most_similar(w, n):
    return closest_to_vector(normalized_embeddings[index[w], :], n)

In [29]:
most_similar("cat", 10)

['cat',
 'humane_society',
 'kitten',
 'feline',
 'colocolo',
 'cats',
 'kitty',
 'maine_coon',
 'housecat',
 'sharp_teeth']

In [30]:
most_similar("soccer", 10)

['soccer',
 'fa_cup',
 'football',
 'soccerball',
 'white_hart_lane',
 'footballs',
 'toe_poke',
 'footgolf',
 'bafana_bafana',
 'footballing']

In [31]:
most_similar("love", 5)

['love', 'unconditional_love', 'loves', 'unlove', 'loved']

In [37]:
# Finding words that are close to tuple of words
def closest_to_vector_ndim(a1, b1, a2):
    b2 = normalized_embeddings[index[b1], :] - normalized_embeddings[index[a1], :] + normalized_embeddings[index[a2], :]
    return closest_to_vector(b2, 1)

In [38]:
print(closest_to_vector_ndim("man", "brother", "woman"))

['sister']


In [41]:
closest_to_vector_ndim("spain", "madrid", "france")

['paris']

Now, we are going to perform sentiment analysis based on a txt that contains movie reviews and classifications (0 for negative and 1 for positive). Here, SWEM (Simple Word Embedding Model) is used, that is, we are going to take the mean of each word vector in the document and used it in a logistic regression. Let's read the txt 

In [43]:
import string
remove_punct=str.maketrans('','',string.punctuation)

# This function converts a line of our data file into
# a tuple (x, y), where x is 300-dimensional representation
# of the words in a review, and y is its label.
def convert_line_to_example(line):
    # Pull out the first character: that's our label (0 or 1)
    y = int(line[0])
    
    # Split the line into words using Python's split() function
    words = line[2:].translate(remove_punct).lower().split()
    
    # Look up the embeddings of each word, ignoring words not
    # in our pretrained vocabulary.
    embeddings = [normalized_embeddings[index[w]] for w in words
                  if w in index]
    
    # Take the mean of the embeddings
    x = np.mean(np.vstack(embeddings), axis=0)
    return x, y

# Apply the function to each line in the file.
xs = []
ys = []
with open("movie-simple.txt", "r", encoding='utf-8', errors='ignore') as f:
    for l in f.readlines():
        x, y = convert_line_to_example(l)
        xs.append(x)
        ys.append(y)

# Concatenate all examples into a numpy array
xs = np.vstack(xs)
ys = np.vstack(ys)

In [48]:
xs.shape

(1411, 300)

In [49]:
ys.shape

(1411, 1)

So, we have 1411 reviews...

Now, let's make train-test split. First, we have to shuffle the dataset:

In [51]:
shuffle_idx = np.random.permutation(xs.shape[0])
xs = xs[shuffle_idx, :]
ys = ys[shuffle_idx, :]

Converting the data to PyTorch tensors:

In [53]:
import torch

num_train = 4*xs.shape[0] // 5

x_train = torch.tensor(xs[:num_train])
y_train = torch.tensor(ys[:num_train], dtype=torch.float32)

x_test = torch.tensor(xs[num_train:])
y_test = torch.tensor(ys[num_train:], dtype=torch.float32)

In [54]:
reviews_train = torch.utils.data.TensorDataset(x_train, y_train)
reviews_test = torch.utils.data.TensorDataset(x_test, y_test)

train_loader = torch.utils.data.DataLoader(reviews_train, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(reviews_test, batch_size=100, shuffle=False)

Let's build SWEM model:

In [55]:
import torch.nn as nn
import torch.nn.functional as F

In [56]:
class SWEM(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(300, 64)
        self.fc2 = nn.Linear(64, 1) # One prediction on the final. Passing on a sigmoid function, 0.5 will be the threshold

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

Training the model and printing the accuracy:

In [57]:
## Training
# Instantiate model
model = SWEM()

# Binary cross-entropy (BCE) Loss and Adam Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Iterate through train set minibatchs 
for epoch in range(250):
    correct = 0
    num_examples = 0
    for inputs, labels in train_loader:
        # Zero out the gradients
        optimizer.zero_grad()
        
        # Forward pass
        y = model(inputs)
        loss = criterion(y, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        predictions = torch.round(torch.sigmoid(y))
        correct += torch.sum((predictions == labels).float())
        num_examples += len(inputs)
    
    # Print training progress
    if epoch % 25 == 0:
        acc = correct/num_examples
        print("Epoch: {0} \t Train Loss: {1} \t Train Acc: {2}".format(epoch, loss, acc))

## Testing
correct = 0
num_test = 0

with torch.no_grad():
    # Iterate through test set minibatchs 
    for inputs, labels in test_loader:
        # Forward pass
        y = model(inputs)
        
        predictions = torch.round(torch.sigmoid(y))
        correct += torch.sum((predictions == labels).float())
        num_test += len(inputs)
    
print('Test accuracy: {}'.format(correct/num_test))

Epoch: 0 	 Train Loss: 0.6902092099189758 	 Train Acc: 0.5647163391113281
Epoch: 25 	 Train Loss: 0.273639440536499 	 Train Acc: 0.9468085169792175
Epoch: 50 	 Train Loss: 0.14503586292266846 	 Train Acc: 0.9689716100692749
Epoch: 75 	 Train Loss: 0.11983732134103775 	 Train Acc: 0.9778369069099426
Epoch: 100 	 Train Loss: 0.030818801373243332 	 Train Acc: 0.9804964661598206
Epoch: 125 	 Train Loss: 0.033565063029527664 	 Train Acc: 0.9831560254096985
Epoch: 150 	 Train Loss: 0.06276717036962509 	 Train Acc: 0.9858155846595764
Epoch: 175 	 Train Loss: 0.014347316697239876 	 Train Acc: 0.9902482032775879
Epoch: 200 	 Train Loss: 0.05792086198925972 	 Train Acc: 0.9929078221321106
Epoch: 225 	 Train Loss: 0.06382538378238678 	 Train Acc: 0.9946808218955994
Test accuracy: 0.9575971961021423


Checking what our model has learned:

In [58]:
# Check some words
words_to_test = ["exciting", "hated", "boring", "loved"]

for word in words_to_test:
    x = torch.tensor(normalized_embeddings[index[word]].reshape(1, 300))
    print("Sentiment of the word '{0}': {1}".format(word, torch.sigmoid(model(x))))

Sentiment of the word 'exciting': tensor([[1.]], grad_fn=<SigmoidBackward0>)
Sentiment of the word 'hated': tensor([[8.6474e-20]], grad_fn=<SigmoidBackward0>)
Sentiment of the word 'boring': tensor([[4.6554e-14]], grad_fn=<SigmoidBackward0>)
Sentiment of the word 'loved': tensor([[1.]], grad_fn=<SigmoidBackward0>)


# Learning word vectors

Now, we are going to make the word vectors parameters of our model, and we are going to learn than in the training phase. Training the word vectors makes them more specific to the task in focus:

In [62]:
VOCAB_SIZE = 5000
EMBED_DIM = 300

embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
embedding.weight.size() # Matrix with 5000 word vectors with 300 dimensions

torch.Size([5000, 300])

SWEM model in this case:

In [63]:
class SWEMWithEmbeddings(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_dim, num_outputs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_outputs)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=0)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

Specifying the model:

In [64]:
model = SWEMWithEmbeddings(
    vocab_size = 5000,
    embedding_size = 300, 
    hidden_dim = 64, 
    num_outputs = 1,
)
print(model)

SWEMWithEmbeddings(
  (embedding): Embedding(5000, 300)
  (fc1): Linear(in_features=300, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


## RNN's (Recurrent Neural Networks)

RNN's are used in the case of predicting and infering sequential data, which is the case, in most of time, of NLP. So, let's do it with the phrase: "Recurrent neural networks are great"

In [65]:
mb = 1
x_dim = 300 
sentence = ["recurrent", "neural", "networks", "are", "great"]

xs = []
for word in sentence:
    xs.append(torch.tensor(normalized_embeddings[index[word]]).view(1, x_dim))
    
xs = torch.stack(xs, dim=0)
print("xs shape: {}".format(xs.shape))

xs shape: torch.Size([5, 1, 300])


In [66]:
xs

tensor([[[ 0.0000,  0.0000,  0.0527,  ...,  0.0176,  0.0351,  0.0176]],

        [[ 0.0175,  0.0000,  0.0350,  ..., -0.0525,  0.0175,  0.0350]],

        [[ 0.0174,  0.0000,  0.0000,  ...,  0.0174,  0.0174, -0.0348]],

        [[ 0.0000, -0.0347,  0.0521,  ..., -0.0868,  0.0174, -0.0347]],

        [[ 0.0000, -0.0172,  0.0345,  ...,  0.0000, -0.1552, -0.1207]]])

One vector for each word in the phrase with 300 dimensions

Building a RNN in PyTorch:

In [67]:
import numpy as np
import torch

h_dim = 128

# For projecting the input
Wx = torch.randn(x_dim, h_dim)/np.sqrt(x_dim)
Wx.requires_grad_()
bx = torch.zeros(h_dim, requires_grad=True)

# For projecting the previous state
Wh = torch.randn(h_dim, h_dim)/np.sqrt(h_dim)
Wh.requires_grad_()
bh = torch.zeros(h_dim, requires_grad=True)

h_dim = 128

# For projecting the input
Wx = torch.randn(x_dim, h_dim)/np.sqrt(x_dim)
Wx.requires_grad_()
bx = torch.zeros(h_dim, requires_grad=True)

# For projecting the previous state
Wh = torch.randn(h_dim, h_dim)/np.sqrt(h_dim)
Wh.requires_grad_()
bh = torch.zeros(h_dim, requires_grad=True)

Let's make a function that will do the time step in the RNN:

In [68]:
def RNN_step(x, h):
    h_next = torch.tanh((torch.matmul(x, Wx) + bx) + (torch.matmul(h, Wh) + bh))

    return h_next

First step, that is, beggining of sentence:

In [69]:
# Word embedding for first word
x1 = xs[0, :, :]

# Initialize hidden state to 0
h0 = torch.zeros([mb, h_dim])

Forward:

In [70]:
# Forward pass of one RNN step for time step t=1
h1 = RNN_step(x1, h0)

print("Hidden state h1 dimensions: {0}".format(h1.shape))

Hidden state h1 dimensions: torch.Size([1, 128])


Forward again:

In [71]:
# Word embedding for second word
x2 = xs[1, :, :]

# Forward pass of one RNN step for time step t=2
h2 = RNN_step(x2, h1)

print("Hidden state h2 dimensions: {0}".format(h2.shape))

Hidden state h2 dimensions: torch.Size([1, 128])


So, we can continue, feeding the word vector input of the moment, as well as the hidden unit from the last phase

We can use high level API's presented in PyTorch, that already have RNN's implemented:

In [72]:
import torch.nn

rnn = nn.RNN(x_dim, h_dim)
print("RNN parameter shapes: {}".format([p.shape for p in rnn.parameters()]))

RNN parameter shapes: [torch.Size([128, 300]), torch.Size([128, 128]), torch.Size([128]), torch.Size([128])]


Forward:

In [73]:
hs, h_T = rnn(xs)

print("Hidden states shape: {}".format(hs.shape))
print("Final hidden state shape: {}".format(h_T.shape))

Hidden states shape: torch.Size([5, 1, 128])
Final hidden state shape: torch.Size([1, 1, 128])


Example of LSTM using the torch.nn API:

In [74]:
lstm = nn.LSTM(x_dim, h_dim)
print("LSTM parameters: {}".format([p.shape for p in lstm.parameters()]))

LSTM parameters: [torch.Size([512, 300]), torch.Size([512, 128]), torch.Size([512]), torch.Size([512])]
