In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import display

## Text Preprocessing

### Bag of Word

- Text

```
  John likes ice cream
  John hates chocolate.
  ```
  
  - Create index table
  {'John': 0, 'chocolate': 1, 'cream': 2, 'hates': 3, 'ice': 4, 'likes': 5}
  
  - Tokenization: Count number of words
  
  ```
    ['John': 1, 'chocolate': 0, 'cream': 1, 'hates': 0, 'ice': 1, 'likes': 1]
    ['John': 1, 'chocolate': 1, 'cream': 0, 'hates': 1, 'ice': 0, 'likes': 0]
  ```
  

In [2]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the sentences
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(sentences)

# Vocabulary serves as an index of each word
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [4]:
# Tokenization: Transform into number-of-word array
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

### One hot Encoding

In [5]:
# sklearn - One hot encoding
cities = ['London', 'Berlin', 'Berlin', 'New York', 'London']
display(cities)

# Label Encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
city_labels = label_encoder.fit_transform(cities)
display(city_labels)

# one hot Encoding
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(sparse=False)
cities_onehot = onehot_encoder.fit_transform(
    city_labels.reshape((len(city_labels), 1)))
display(cities_onehot)

# Invert
from numpy import argmax

inverted = np.array([argmax(city) for city in cities_onehot])
display(inverted)

['London', 'Berlin', 'Berlin', 'New York', 'London']

array([1, 0, 0, 2, 1])

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

array([1, 0, 0, 2, 1])

In [6]:
# Keras - One hot encoding
cities = ['London', 'Berlin', 'Berlin', 'New York', 'London']
display(cities)

# Label Encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
city_labels = label_encoder.fit_transform(cities)
display(city_labels)

# one hot Encoding
from keras.utils import to_categorical

cities_onehot = to_categorical(city_labels)
display(cities_onehot)

# Invert
from numpy import argmax

inverted = np.array([argmax(city) for city in cities_onehot])
display(inverted)

['London', 'Berlin', 'Berlin', 'New York', 'London']

array([1, 0, 0, 2, 1])

Using TensorFlow backend.


array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

array([1, 0, 0, 2, 1])

### Word Embedding
- Unknown words (words that are not in the vocabulary) are denoted in Keras with word_count + 1
#### Word Embedding vs BoW
- Bag of words: sequence of words = a single feature vector.
- Word Embedding:
    + Words represented by each word as a vector
    + Characters represented by each character as a vector
    + N-grams of words/characters represented as a vector (N-grams are overlapping groups of multiple succeeding words/characters in the text)

#### Word Embedding vs Onehot
- one-hot encoding = hardcoded
- Word Embedding = softcoded
    + collect more information into fewer dimensions
    + map semantic meaning into a geometric space(**embedding space**)

#### Train Word Embedding
- Train during Neural Net
- Pre-trained

In [7]:
sentences = [
    'John likes ice cream',
    'John hates chocolate',
    'Kathy likes Google',
    'Kathy hates Apple']

#### Tokenize
- vectorize a text corpus into a list of integers
- Each integer maps to a value in a dictionary that encodes the entire corpus
- index 0 is reserved and is not assigned to any word
- Indexing is ordered after the most common words in the text (1 = most common)

num_words: responsible for setting the size of the vocabulary


In [8]:
from keras.preprocessing.text import Tokenizer

# Fit tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences)

# Word index
display(tokenizer.word_index)

# Vocab size
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
print('\nVocab size: {}'.format(vocab_size))

# Transform
sentences_trained = tokenizer.texts_to_sequences(sentences)

for i in range(len(sentences)):
    print('')
    print(sentences[i])
    print(sentences_trained[i])

{'john': 1,
 'likes': 2,
 'hates': 3,
 'kathy': 4,
 'ice': 5,
 'cream': 6,
 'chocolate': 7,
 'google': 8,
 'apple': 9}


Vocab size: 10

John likes ice cream
[1, 2, 5, 6]

John hates chocolate
[1, 3, 7]

Kathy likes Google
[4, 2, 8]

Kathy hates Apple
[4, 3, 9]


#### pad_sequences
text sequence has in most cases different length of words

    -> pads the sequence of words with zeros

In [9]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 20

X_train = pad_sequences(sentences_trained, padding='post', maxlen=maxlen)
display(X_train.shape)
display(sentences_trained[0])
display(X_train[0])

(4, 20)

[1, 2, 5, 6]

array([1, 2, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)

#### Train during Neural Net
weights of the embedding layer are initialized with random weights and are then adjusted through backpropagation during training

- `input_dim`: the size of the vocabulary
- `output_dim`: the size of the dense vector
- `input_length`: the length of the sequence

In [10]:
from keras.models import Sequential
from keras import layers

embedding_dim = 50

model = Sequential()

# Embedding layer
model.add(layers.Embedding(
    input_dim=vocab_size, 
    output_dim=embedding_dim, 
    input_length=maxlen))

# using a MaxPooling1D/AveragePooling1D 
#     or a GlobalMaxPooling1D/GlobalAveragePooling1D
#     after the embedding layer
model.add(layers.GlobalMaxPool1D())

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            500       
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
Total params: 500
Trainable params: 500
Non-trainable params: 0
_________________________________________________________________


#### Use Pretrained dataset

In [11]:
# Download pretrained Glove dataset
! wget http://nlp.stanford.edu/data/glove.6B.zip -O glove.zip
! unzip glove.zip -d ./glove
! rm -rf glove.zip
! ls glove

--2018-10-27 11:56:38--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-10-27 11:56:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.zip’


2018-10-27 12:05:18 (1.60 MB/s) - ‘glove.zip’ saved [862182613/862182613]

Archive:  glove.zip
  inflating: ./glove/glove.6B.50d.txt  
  inflating: ./glove/glove.6B.100d.txt  
  inflating: ./glove/glove.6B.200d.txt  
  inflating: ./glove/glove.6B.300d.txt  
glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.300d.txt  glove.6B.50d.txt


In [12]:
# Exam pretrained dataset
! head -n 1 glove/glove.6B.50d.txt | cut -c-50

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.04445


In [13]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [14]:
# Extract embedding matrix
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
    'glove/glove.6B.50d.txt',
    tokenizer.word_index, embedding_dim)

display(embedding_matrix.shape)

# Check non-zero words (already exist in the pretrained dataset)
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
print('Percentage words covered {}%'.format(
    float(nonzero_elements) / vocab_size * 100.0))

(10, 50)

Percentage words covered 90.0%


In [15]:
from keras.models import Sequential
from keras.layers import Embedding

embedding_dim = 50

model = Sequential()

# Embedding layer
model.add(Embedding(
    input_dim=vocab_size, 
    output_dim=embedding_dim,
    weights=[embedding_matrix], 
    input_length=maxlen,
    trainable=False))
        ### trainable
        ###    False: Not allow continous trainning Embedding
        ###    True: Allow continous trainning Embedding

# using a MaxPooling1D/AveragePooling1D 
#     or a GlobalMaxPooling1D/GlobalAveragePooling1D
#     after the embedding layer
model.add(layers.GlobalMaxPool1D())

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 50)            500       
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 50)                0         
Total params: 500
Trainable params: 0
Non-trainable params: 500
_________________________________________________________________
