# Week 2: https://www.coursera.org/learn/natural-language-processing-tensorflow/home/week/2

In [1]:
# Embeddings: each word is mapped to a vector in a multi-dimensional space,
# so that words used in similar contexts end up clustered close together

In [2]:
# IMDB classification

In [2]:
import tensorflow as tf

In [3]:
# Report the installed TensorFlow version (the recorded run used 2.1.0).
tf.__version__

'2.1.0'

In [6]:
# !pip install -q tensorflow-datasets

In [5]:
import tensorflow_datasets as tfds

## Load in data

In [6]:
# Load the IMDB reviews dataset through TensorFlow Datasets.
# with_info=True makes the call return a (datasets, info) pair;
# as_supervised=True makes each example a (text, label) pair, which the
# extraction loops further down unpack.
# NOTE(review): `imbd` looks like a typo for "imdb"; kept because later cells use this name.
imbd, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [None]:
# train test split

In [7]:
# Pick the two predefined splits out of the loaded dataset dictionary.
train = imbd['train']
test = imbd['test']

In [8]:
# Materialise the training split into plain Python lists.
# Each element of `train` is a (text, label) pair of tensors. `.numpy()` on the
# text tensor yields `bytes`, so it is decoded to a real string here; calling
# `str()` on it would store the repr ("b'...'") and leak the b-prefix and
# backslash escapes into the tokenizer's vocabulary.
train_sentences = []
train_labels = []

for sentence, label in train:
    train_sentences.append(sentence.numpy().decode('utf-8'))
    # Labels are kept as strings here; they are cast to int in a later cell.
    train_labels.append(str(label.numpy()))

In [9]:
# Materialise the test split into plain Python lists.
# Each element of `test` is a (text, label) pair of tensors. `.numpy()` on the
# text tensor yields `bytes`, so it is decoded to a real string here; calling
# `str()` on it would store the repr ("b'...'") instead of the review text.
test_sentences = []
test_labels = []

for sentence, label in test:
    test_sentences.append(sentence.numpy().decode('utf-8'))
    # Labels are kept as strings here; they are cast to int in a later cell.
    test_labels.append(str(label.numpy()))

In [10]:
import numpy as np
# String-typed label arrays (the labels are still str at this point).
# NOTE(review): not referenced again in the visible notebook; training uses
# the integer arrays built later.
train_labels_array, test_labels_array = np.array(train_labels), np.array(test_labels)

### convert str label to int

In [11]:
# Lookup table from the string labels to their integer class ids.
# NOTE(review): not referenced anywhere else in the visible notebook; the
# conversion below uses int() directly.
map_str_labels_to_int = {'1':1, '0':0}

In [12]:
# Cast the string labels ('0' / '1') to Python ints.
train_labels_int = [int(label) for label in train_labels]
test_labels_int = [int(label) for label in test_labels]

In [13]:
# NumPy versions of the integer labels; these are what model.fit receives.
train_labels_array_int, test_labels_array_int = (
    np.array(train_labels_int),
    np.array(test_labels_int),
)

## Create tokenizer and apply padding

In [14]:
# Preprocessing / model hyper-parameters.
VOCAB_SIZE = 10 ** 4   # vocabulary size passed to the tokenizer and the embedding layer
OOV_TOK = "<OOV>"      # placeholder token for out-of-vocabulary words
MAX_LEN = 120          # length every sequence is padded / truncated to
TRUNC_TYPE = "post"    # which end pad_sequences cuts when a sequence is too long
EMBED_DIM = 16         # size of each embedding vector (embedding output dimension)

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer,text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
# Tokenizer limited to VOCAB_SIZE word ids; unknown words map to OOV_TOK.
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token=OOV_TOK,
)

In [17]:
# Build the vocabulary from the training sentences only; the test sentences
# are encoded later with this same fitted tokenizer.
tokenizer.fit_on_texts(train_sentences)

In [18]:
# word -> integer id mapping learned by the tokenizer.
# It held 86539 entries in the recorded run, i.e. more than VOCAB_SIZE:
# presumably num_words only limits which ids texts_to_sequences emits — confirm in the Keras docs.
word_index = tokenizer.word_index # 86539 length

In [20]:
# Encode every training sentence as a list of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [21]:
# Bring every training sequence to exactly MAX_LEN ids: shorter ones are
# padded, longer ones are truncated on the side given by TRUNC_TYPE.
train_padded = pad_sequences(train_sequences, maxlen=MAX_LEN, truncating=TRUNC_TYPE)

In [22]:
# Sanity check: one row per training review, MAX_LEN columns.
train_padded.shape

(25000, 120)

In [23]:
# apply on test

In [24]:
# Encode the test sentences with the tokenizer fitted on the training data.
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [25]:
# Pad / truncate the test sequences exactly like the training ones: same
# MAX_LEN and the same truncation side. Without `truncating=TRUNC_TYPE`,
# pad_sequences falls back to its default ('pre'), so long test reviews would
# keep their last MAX_LEN tokens while training reviews keep their first.
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, truncating=TRUNC_TYPE)

In [26]:
# Sanity check: one row per test review, MAX_LEN columns.
test_padded.shape

(25000, 120)

### Reverse word index (index -> word dictionary)

In [89]:
# Invert word_index (word -> id) into id -> word, for looking words up by id.
word_index_reversed = {index: word for word, index in word_index.items()}

## Model

### With flatten layer

In [28]:
# Baseline classifier: embedding -> flatten -> small dense head.
# Note: `model` is re-assigned by the global-average-pooling variant further
# down before any training happens, so this version is only summarised.
model = tf.keras.Sequential()

# Maps every token id to an EMBED_DIM vector; per review the result is a
# 2D array of sentence length x embedding dimension.
model.add(
    tf.keras.layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMBED_DIM,
        input_length=MAX_LEN,
    )
)

# Flatten turns that 2D array into one long vector; because of its size,
# NLP models usually use a different layer type here.
model.add(tf.keras.layers.Flatten())

# Dense network for the classification itself.
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [29]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss,
# accuracy as the reported metric. Then print the layer overview.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


### With global average pooling layer

In [30]:
# Variant with global average pooling instead of Flatten; this is the model
# that gets compiled and trained below.
model = tf.keras.Sequential()

model.add(
    tf.keras.layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMBED_DIM,
        input_length=MAX_LEN,
    )
)

# Averages across the word vectors to flatten the output (a bit faster).
model.add(tf.keras.layers.GlobalAveragePooling1D())

model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [31]:
# Same training configuration as the flatten variant: Adam, binary
# cross-entropy, accuracy. Then print the layer overview.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


### Fit model and validate

In [32]:
# Number of full passes over the training data.
NUM_EPOCHS = 10

In [33]:
# Train for NUM_EPOCHS on the padded training set, using the padded test set
# as validation data. Kept as the cell's last expression so the returned
# History object is displayed.
model.fit(
    x=train_padded,
    y=train_labels_array_int,
    epochs=NUM_EPOCHS,
    validation_data=(test_padded, test_labels_array_int),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10


Epoch 3/10


Epoch 4/10


Epoch 5/10


Epoch 6/10


Epoch 7/10


Epoch 8/10


Epoch 9/10


Epoch 10/10




<tensorflow.python.keras.callbacks.History at 0x2514ef14e48>

In [59]:
# Print the layer overview of the trained model again.
model.summary()
# Params in the embedding layer: VOCAB_SIZE * EMBED_DIM (10,000 * 16 = 160,000)
# Embedding layer input length: MAX_LEN (120 ids per review)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [57]:
# The first layer of the model is the Embedding layer; its first (only used)
# weight array is the learned embedding matrix.
e_0 = model.layers[0]
weight_0 = e_0.get_weights()[0]

In [58]:
# One EMBED_DIM-sized row per vocabulary id.
weight_0.shape # shape: [VOCAB_SIZE, EMBED_DIM]

(10000, 16)

In [60]:
import io

In [61]:
# Output files for the embedding export: the vectors and their word labels.
# Both stay open until the export cell closes them.
out_v = io.open('vecs.tsv', mode='w', encoding='utf-8')
out_m = io.open('meta.tsv', mode='w', encoding='utf-8')

In [91]:
# Skip the '<OOV>'; is it always on place 1?
for word_num in range(1,VOCAB_SIZE):
    word = word_index_reversed[word_num]
    embeddings = weight_0[word_num]  # embedding layer
    out_m.write(word + "\n")
    out_v.write("\t".join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()