<a href="https://colab.research.google.com/github/zerotodeeplearning/ztdl-masterclasses/blob/master/solutions_do_not_open/Word_Embeddings_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Learn with us: www.zerotodeeplearning.com

Copyright © 2021: Zero to Deep Learning ® Catalit LLC.

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Word Embeddings

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import gzip
import os

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Base URL of the course data folder; the data file names are appended to it
# when the review files are downloaded.
url = "https://raw.githubusercontent.com/zerotodeeplearning/ztdl-masterclasses/master/data/"

In [None]:
def read_reviews(path):
  """Return the lines of a gzip-compressed UTF-8 text file as a list of str.

  Args:
    path: location of the gzip file on disk.

  Returns:
    One string per line of the file; each string keeps its trailing newline.
  """
  with gzip.open(path) as fin:
    return [line.decode('utf-8') for line in fin.readlines()]


# Each download is a single gzip-compressed text file, not a tar/zip archive,
# so `extract=True` is deliberately not passed to get_file: there is nothing
# for it to unpack, and the data is decompressed by gzip.open in read_reviews.
# The cached copy is saved under the '.txt' name but still holds gzip data.
pos_path = tf.keras.utils.get_file(
    'rotten_tomatoes_positive_reviews.txt',
    url + 'rotten_tomatoes_positive_reviews.txt.gz')
neg_path = tf.keras.utils.get_file(
    'rotten_tomatoes_negative_reviews.txt',
    url + 'rotten_tomatoes_negative_reviews.txt.gz')

pos_rev = read_reviews(pos_path)
neg_rev = read_reviews(neg_path)

# Positive reviews come first, so the labels are a run of 1s followed by 0s.
docs = np.array(pos_rev + neg_rev)
y = np.array([1]*len(pos_rev) + [0]*len(neg_rev))

# Hold out 15% of the reviews for testing; the fixed random_state keeps the
# split the same on every run.
docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size=0.15, random_state=0)

### Sequence encoding with Keras Tokenizer

In [None]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used as the
# `input_dim` of the Embedding layers.
max_features = 20000

In [None]:
# Characters filtered out of the text before it is split into words
# (the apostrophe is part of the list).
punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`\'{|}~\t\n'

tokenizer = Tokenizer(num_words=max_features, filters=punctuation,
                      lower=True, split=" ", char_level=False,
                      oov_token=None, document_count=0)

# The vocabulary is built from the training documents only.
tokenizer.fit_on_texts(docs_train)

In [None]:
# Encode every document as a list of integer word indices with the fitted tokenizer.
seq_train, seq_test = [tokenizer.texts_to_sequences(texts)
                       for texts in (docs_train, docs_test)]

In [None]:
# First training review after encoding (a list of word indices).
seq_train[0]

In [None]:
# Raw text of the same review, for comparison with its encoded form.
docs_train[0]

In [None]:
# Map the indices back to words to see what the tokenizer kept of the review.
' '.join(tokenizer.index_word[idx] for idx in seq_train[0])

In [None]:
# Length of the longest encoded training review.
max(map(len, seq_train))

In [None]:
# Length of the longest encoded test review.
max(map(len, seq_test))

In [None]:
# Fixed length every encoded review is brought to before it is fed to a model.
maxlen = 58

X_train, X_test = (pad_sequences(seqs, maxlen=maxlen)
                   for seqs in (seq_train, seq_test))

In [None]:
# Largest word index present in the padded training matrix.
X_train.max()

In [None]:
# Largest word index present in the padded test matrix.
X_test.max()

### Bag-of-words model with Embeddings

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout

In [None]:
embedding_dim = 16

# Embed every token, average the embeddings over the sequence and classify
# the averaged vector with a small dense network.
bow_layers = [
  Embedding(input_dim=max_features,
            output_dim=embedding_dim,
            input_length=maxlen,
            name='bow_embeddings'),
  Dropout(rate=0.3),
  GlobalAveragePooling1D(),
  Dense(units=24, activation='relu'),
  Dense(units=1, activation='sigmoid'),
]
model = Sequential(bow_layers)

model.summary()

In [None]:
# Binary classification setup: sigmoid output trained with cross-entropy.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# The last 10% of the training data is held out for validation; the returned
# history object is kept in `h` for plotting.
h = model.fit(x=X_train,
              y=y_train,
              epochs=4,
              batch_size=128,
              validation_split=0.1)

In [None]:
# One curve per metric recorded during training, indexed by epoch.
pd.DataFrame.from_dict(h.history).plot();

### Exercise 1

The model above is still a bag of words model, despite the use of embeddings. Let's improve it using 1D convolutional layers.

- Define a new `Sequential` model that uses `Conv1D` layers after the `Embedding` layer
- Start with the simplest model possible and gradually increase the complexity
- Train the model as above and compare the performance of this model with the previous one

Your code will look like:

```python
model = Sequential([
  Embedding(# YOUR CODE HERE
  # YOUR CODE HERE
])
```

In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten

In [None]:
# Two stacked 1D convolutions over the embedded sequence, flattened and fed
# to a small dense classifier.
model = Sequential()
model.add(Embedding(input_dim=max_features,
                    output_dim=embedding_dim,
                    input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(units=24, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(units=1, activation='sigmoid'))

model.summary()

In [None]:
# Same training setup as the bag-of-words model so the two can be compared.
compile_options = dict(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
model.compile(**compile_options)

fit_options = dict(batch_size=128, epochs=4, validation_split=0.1)
h = model.fit(X_train, y_train, **fit_options)

In [None]:
# Training curves of the convolutional model.
pd.DataFrame(data=h.history).plot();

### Gensim and pre-trained embeddings

In [None]:
import gensim
import gensim.downloader as api

In [None]:
# Metadata returned by the gensim downloader; its keys and its 'models' entry
# are inspected in the following cells.
info = api.info()

In [None]:
# Top-level sections of the downloader metadata.
info.keys()

In [None]:
# Names listed under the 'models' section of the metadata.
info['models'].keys()

In [None]:
# Load the 'glove-wiki-gigaword-50' vectors through the gensim downloader.
# NOTE(review): presumably this triggers a download on first use — confirm.
glove_model = api.load('glove-wiki-gigaword-50')

In [None]:
# The five entries the model ranks as most similar to 'good'.
glove_model.most_similar(positive=['good'], topn=5)

In [None]:
# The five entries the model ranks as most similar to 'two'.
glove_model.most_similar(positive=['two'], topn=5)

In [None]:
# Analogy query: 'king' and 'woman' contribute positively, 'man' negatively;
# show the top three matches.
glove_model.most_similar(positive=['king', 'woman'],
                         negative=['man'], topn=3)

In [None]:
# Dimensionality of the pre-trained vectors, measured on one sample word.
sample_vector = glove_model['cat']
glove_size = len(sample_vector)
glove_size

In [None]:
# Embedding matrix for the tokenizer vocabulary: row i holds the GloVe vector
# of the word the tokenizer maps to index i. Rows are left as zeros for index 0
# (the tokenizer's word indices start at 1) and for words GloVe does not contain.
glove_weights = np.zeros(shape=(max_features, glove_size))
for w, i in tokenizer.word_index.items():
  # word_index lists the whole fitted vocabulary, which can be larger or
  # smaller than max_features; only indices below max_features have a row.
  # The explicit membership test replaces a bare `except: pass`, so unrelated
  # errors (and KeyboardInterrupt) are no longer silently swallowed.
  if i < max_features and w in glove_model:
    glove_weights[i] = glove_model[w]

In [None]:
# Two stacked panels, each overlaying the embedding vectors of related words
# on a shared y-range so the panels can be compared by eye.
word_groups = {
  "A few numbers": ['two', 'three', 'four'],
  "A few animals": ['cat', 'dog', 'rabbit'],
}

fig, axes = plt.subplots(nrows=2, ncols=1)
for ax, (title, words) in zip(axes, word_groups.items()):
  for word in words:
    ax.plot(glove_model[word])
  ax.set_title(title)
  ax.set_ylim(-2, 5)

fig.tight_layout()

### Exercise 2

Let's use the GloVe pre-trained embeddings as our input layer.

- Modify the Embedding layer in your model using a `Constant` initializer that sets the weights to be `glove_weights`
- Adapt the `output_dim` to correspond to the size of the GloVe embeddings
- Set the Embedding layer to be frozen (`trainable=False`)
- Re-train the model and compare the performance

Your code will look like:
```python
model = Sequential([
  Embedding(# YOUR CODE HERE
  # YOUR CODE HERE
])
```

In [None]:
from tensorflow.keras.initializers import Constant

In [None]:
# Embedding layer initialised from the GloVe matrix and frozen, so the
# pre-trained vectors are not updated during training.
frozen_glove_embedding = Embedding(
    input_dim=max_features,
    output_dim=glove_size,
    embeddings_initializer=Constant(glove_weights),
    input_length=maxlen,
    mask_zero=False,
    trainable=False)

# Convolution / pooling stack on top of the fixed embeddings, followed by a
# dense classifier with a sigmoid output.
model = Sequential([
  frozen_glove_embedding,
  Conv1D(filters=128, kernel_size=5, activation='relu'),
  MaxPooling1D(pool_size=5),
  Conv1D(filters=128, kernel_size=5, activation='relu'),
  GlobalMaxPooling1D(),
  Dense(units=128, activation='relu'),
  Dropout(rate=0.3),
  Dense(units=1, activation='sigmoid'),
])

model.summary()

In [None]:
# Same loss, optimizer and metric as the earlier models.
model.compile(metrics=['accuracy'],
              loss='binary_crossentropy',
              optimizer='adam')

# Trained for 8 epochs, with 10% of the training data held out for validation.
h = model.fit(x=X_train,
              y=y_train,
              validation_split=0.1,
              epochs=8,
              batch_size=128)