# Text Processing and Word Embeddings

## Imports

In [1]:
import os
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

from sentiment_analysis.rnn.sentiment_dataset import (
    create_dummy_data,
    download_data
)

%matplotlib inline
# Notebook-wide matplotlib defaults, applied to every figure created below.
# Default figure size, given as (width, height).
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# Show image pixels as-is instead of smoothing between neighbouring pixels.
plt.rcParams['image.interpolation'] = 'nearest'
# Render images with the grayscale colormap unless a cell overrides it.
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

# 1. Preprocessing a Text Classification Dataset

In [2]:
# The dataset folder lives next to the notebook directory:
#   <parent of the working directory>/datasets/SentimentData
path = os.path.dirname(os.path.abspath(os.getcwd()))
data_root = os.path.join(path, "datasets", "SentimentData")

# `path` is rebound here to whatever download_data returns for the fetched data.
path = download_data(data_root)
data = create_dummy_data(path)

# Show every (text, label) sample, each followed by one blank line.
for text, label in data:
    print('Text: {}'.format(text), 'Label: {}'.format(label), '', sep='\n')

Text: Smallville episode Justice is the best episode of Smallville ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! It's my favorite episode of Smallville! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !
Label: 1

Text: I don't know why I like this movie so well, but I never get tired of watching it.
Label: 1

Text: Smallville episode Justice is the best episode of Smallville ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! 

## 1.1 Tokenizing Data

In [3]:
import re

def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    A token is a maximal run of word characters (regex ``\\w``); everything
    else acts as a separator and is discarded. As a consequence, contractions
    are split at the apostrophe ("don't" -> "don", "t").

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance; empty when ``text``
        contains no word characters.
    """
    # Lowercase each token after matching so token boundaries are decided on
    # the original characters.
    return [word.lower() for word in re.findall(r'\w+', text)]

# Tokenize every sample up front, keeping each label paired with its tokens.
tokenized_data = [(tokenize(text), label) for text, label in data]

# Echo each (tokens, label) pair, separated by blank lines, for inspection.
for tokenized_sample in tokenized_data:
    print(tokenized_sample, '\n')

(['smallville', 'episode', 'justice', 'is', 'the', 'best', 'episode', 'of', 'smallville', 'it', 's', 'my', 'favorite', 'episode', 'of', 'smallville'], 1) 

(['i', 'don', 't', 'know', 'why', 'i', 'like', 'this', 'movie', 'so', 'well', 'but', 'i', 'never', 'get', 'tired', 'of', 'watching', 'it'], 1) 

(['smallville', 'episode', 'justice', 'is', 'the', 'best', 'episode', 'of', 'smallville', 'it', 's', 'my', 'favorite', 'episode', 'of', 'smallville'], 1) 

(['a', 'rating', 'of', '1', 'does', 'not', 'begin', 'to', 'express', 'how', 'dull', 'depressing', 'and', 'relentlessly', 'bad', 'this', 'movie', 'is'], 0) 

(['comment', 'this', 'movie', 'is', 'impossible', 'is', 'terrible', 'very', 'improbable', 'bad', 'interpretation', 'e', 'direction', 'not', 'look'], 0) 

(['i', 'wouldn', 't', 'rent', 'this', 'one', 'even', 'on', 'dollar', 'rental', 'night'], 0) 



## 1.2 Creating a Vocabulary

In [4]:
from collections import Counter

# Count how often each token occurs across all samples; labels are ignored.
freqs = Counter(
    token
    for tokens, _ in tokenized_data
    for token in tokens
)

# Display the resulting token -> count mapping as the cell output.
freqs

Counter({'smallville': 6,
         'episode': 6,
         'justice': 2,
         'is': 5,
         'the': 2,
         'best': 2,
         'of': 6,
         'it': 3,
         's': 2,
         'my': 2,
         'favorite': 2,
         'i': 4,
         'don': 1,
         't': 2,
         'know': 1,
         'why': 1,
         'like': 1,
         'this': 4,
         'movie': 3,
         'so': 1,
         'well': 1,
         'but': 1,
         'never': 1,
         'get': 1,
         'tired': 1,
         'watching': 1,
         'a': 1,
         'rating': 1,
         '1': 1,
         'does': 1,
         'not': 2,
         'begin': 1,
         'to': 1,
         'express': 1,
         'how': 1,
         'dull': 1,
         'depressing': 1,
         'and': 1,
         'relentlessly': 1,
         'bad': 2,
         'comment': 1,
         'impossible': 1,
         'terrible': 1,
         'very': 1,
         'improbable': 1,
         'interpretation': 1,
         'e': 1,
         'direction': 1,
  

In [5]:
# The special tokens are inserted first so they own ids 0 and 1.
vocab = {'<eos>': 0, '<unk>': 1}

# Keep only the 20 most frequent tokens; the counts themselves are not needed.
top_tokens = [token for token, _count in freqs.most_common(20)]
for token in top_tokens:
    # The next free id is simply the current vocabulary size.
    vocab[token] = len(vocab)

# Display the token -> id mapping as the cell output.
vocab

{'<eos>': 0,
 '<unk>': 1,
 'smallville': 2,
 'episode': 3,
 'of': 4,
 'is': 5,
 'i': 6,
 'this': 7,
 'it': 8,
 'movie': 9,
 'justice': 10,
 'the': 11,
 'best': 12,
 's': 13,
 'my': 14,
 'favorite': 15,
 't': 16,
 'not': 17,
 'bad': 18,
 'don': 19,
 'know': 20,
 'why': 21}

## 1.3 Creating the Dataset

In [6]:
# Tokens that did not make it into the vocabulary all map to the '<unk>' id.
unk_index = vocab['<unk>']

# Replace each token by its vocabulary id, keeping the label alongside.
indexed_data = [
    ([vocab.get(token, unk_index) for token in tokens], label)
    for tokens, label in tokenized_data
]

# Print each id sequence with its label, followed by a blank line.
for indices, label in indexed_data:
    print(indices, ' -> ', label, end='\n\n')

[2, 3, 10, 5, 11, 12, 3, 4, 2, 8, 13, 14, 15, 3, 4, 2]  ->  1

[6, 19, 16, 20, 21, 6, 1, 7, 9, 1, 1, 1, 6, 1, 1, 1, 4, 1, 8]  ->  1

[2, 3, 10, 5, 11, 12, 3, 4, 2, 8, 13, 14, 15, 3, 4, 2]  ->  1

[1, 1, 4, 1, 1, 17, 1, 1, 1, 1, 1, 1, 1, 1, 18, 7, 9, 5]  ->  0

[1, 7, 9, 5, 1, 5, 1, 1, 1, 18, 1, 1, 1, 17, 1]  ->  0

[6, 1, 16, 1, 7, 1, 1, 1, 1, 1, 1]  ->  0



In [7]:
from sentiment_analysis.rnn.sentiment_dataset import SentimentDataset

# Merge the three parallel views of each sample (raw text, tokens, ids) into
# one record of the form (raw_text, tokens, indices, label). The label is taken
# from the raw data; the copies carried by the other two lists are dropped.
combined_data = []
for (raw_text, label), (tokens, _), (indices, _) in zip(
    data, tokenized_data, indexed_data
):
    combined_data.append((raw_text, tokens, indices, label))

dataset = SentimentDataset(combined_data)

# NOTE(review): in the recorded output the samples come back in a different
# order than they were passed in (longest sequence first), so SentimentDataset
# presumably reorders them -- confirm against its implementation.
for elem in dataset:
    print(elem, end='\n\n')

{'data': tensor([ 6, 19, 16, 20, 21,  6,  1,  7,  9,  1,  1,  1,  6,  1,  1,  1,  4,  1,
         8]), 'label': tensor(1.)}

{'data': tensor([ 1,  1,  4,  1,  1, 17,  1,  1,  1,  1,  1,  1,  1,  1, 18,  7,  9,  5]), 'label': tensor(0.)}

{'data': tensor([ 2,  3, 10,  5, 11, 12,  3,  4,  2,  8, 13, 14, 15,  3,  4,  2]), 'label': tensor(1.)}

{'data': tensor([ 2,  3, 10,  5, 11, 12,  3,  4,  2,  8, 13, 14, 15,  3,  4,  2]), 'label': tensor(1.)}

{'data': tensor([ 1,  7,  9,  5,  1,  5,  1,  1,  1, 18,  1,  1,  1, 17,  1]), 'label': tensor(0.)}

{'data': tensor([ 6,  1, 16,  1,  7,  1,  1,  1,  1,  1,  1]), 'label': tensor(0.)}



## 1.4 Minibatching

In [17]:
from torch.nn.utils.rnn import pad_sequence

def collate(batch):
    """Merge a list of dataset samples into one padded minibatch.

    Args:
        batch: A list of dicts, each holding a 1-D ``'data'`` tensor of token
            ids and a scalar ``'label'`` tensor.

    Returns:
        A dict with three entries:
            ``'data'``: the sequences padded with zeros to the longest one and
                stacked time-major, i.e. shape ``(max_len, batch_size)``.
            ``'label'``: the labels stacked into a 1-D tensor.
            ``'lengths'``: the original (unpadded) length of every sequence.
    """
    assert isinstance(batch, list)

    # Gather the per-sample pieces in a single pass over the batch.
    sequences, labels = [], []
    for sample in batch:
        sequences.append(sample['data'])
        labels.append(sample['label'])

    # pad_sequence defaults to zero padding and a (time, batch) layout.
    padded = pad_sequence(sequences)
    seq_lengths = torch.tensor([len(seq) for seq in sequences])
    stacked_labels = torch.stack(labels)

    return {
        'data': padded,
        'label': stacked_labels,
        'lengths': seq_lengths
    }

# Draw minibatches of three samples; `collate` pads them to a common length.
loader = DataLoader(dataset, batch_size=3, collate_fn=collate)

# Heading printed before each entry of the batch dict, in display order.
batch_sections = (
    ('Data: \n', 'data'),
    ('\nLabels: \n', 'label'),
    ('\nSequence Lengths: \n', 'lengths'),
)
for batch in loader:
    for heading, key in batch_sections:
        print(heading, batch[key])
    print('\n')

Data: 
 tensor([[ 6,  1,  2],
        [19,  1,  3],
        [16,  4, 10],
        [20,  1,  5],
        [21,  1, 11],
        [ 6, 17, 12],
        [ 1,  1,  3],
        [ 7,  1,  4],
        [ 9,  1,  2],
        [ 1,  1,  8],
        [ 1,  1, 13],
        [ 1,  1, 14],
        [ 6,  1, 15],
        [ 1,  1,  3],
        [ 1, 18,  4],
        [ 1,  7,  2],
        [ 4,  9,  0],
        [ 1,  5,  0],
        [ 8,  0,  0]])

Labels: 
 tensor([1., 0., 1.])

Sequence Lengths: 
 tensor([19, 18, 16])


Data: 
 tensor([[ 2,  1,  6],
        [ 3,  7,  1],
        [10,  9, 16],
        [ 5,  5,  1],
        [11,  1,  7],
        [12,  5,  1],
        [ 3,  1,  1],
        [ 4,  1,  1],
        [ 2,  1,  1],
        [ 8, 18,  1],
        [13,  1,  1],
        [14,  1,  0],
        [15,  1,  0],
        [ 3, 17,  0],
        [ 4,  1,  0],
        [ 2,  0,  0]])

Labels: 
 tensor([1., 0., 0.])

Sequence Lengths: 
 tensor([16, 15, 11])




# 2. Embeddings

<img src='https://developers.google.com/machine-learning/crash-course/images/linear-relationships.svg' alt='Linear relationships between word embeddings' width='80%' height='80%' />

In [18]:
import torch.nn as nn

from sentiment_analysis.rnn.rnn_nn import Embedding

# Build the project's Embedding layer and PyTorch's nn.Embedding with the same
# arguments: one row per vocabulary entry, 16-dimensional vectors, and index 0
# as padding_idx (0 is also the value `collate` pads sequences with).
i2dl_embedding = Embedding(len(vocab), 16, padding_idx=0)
pytorch_embedding = nn.Embedding(len(vocab), 16, padding_idx=0)

# batch_size=len(dataset) makes the loader yield the whole dataset as one batch.
# Note that this rebinds `loader` from the minibatching cell above.
loader = DataLoader(dataset, batch_size=len(dataset), collate_fn=collate)
# Keep the padded token-id tensor of that single batch in `x`.
# NOTE(review): the recorded output below reports a difference check between
# the two layers, but the code producing it is not visible in this cell as
# shown -- confirm the cell is complete.
for batch in loader:
    x = batch['data']


Difference between outputs: 0.0
Test passed :)!


True