# Week 2: https://www.coursera.org/learn/natural-language-processing-tensorflow/home/week/2

In [1]:
# Embeddings: each word is mapped to a vector in a multi-dimensional space,
# so that words with related meanings end up clustered close together

In [2]:
# IMDB classification

In [5]:
import tensorflow as tf

In [4]:
# Display the TensorFlow version this notebook was run with.
tf.__version__

'2.1.0'

In [6]:
# !pip install -q tensorflow-datasets

In [7]:
import tensorflow_datasets as tfds

## Load in data

In [124]:
# Load the IMDB reviews dataset through TensorFlow Datasets.
# The call returns two objects here (dataset, metadata) because with_info=True;
# as_supervised=True makes each example a (text, label) pair.
imbd, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)

In [None]:
# train test split

In [9]:
# Pick the two splits out of the loaded dataset by key.
train = imbd['train']
test = imbd['test']

In [10]:
train_sentences = []
train_labels = []

# Each element of `train` is a (text, label) pair of scalar tensors.
for text_tensor, label_tensor in train:
    # .numpy() on a string tensor returns a bytes object. Decode it to get the
    # actual review text; str(bytes) would store the repr instead, i.e.
    # "b'...'" with escaped quotes/newlines, which pollutes the vocabulary.
    train_sentences.append(text_tensor.numpy().decode('utf-8'))
    # Labels are kept as strings ('0'/'1'); a later cell converts them to int.
    train_labels.append(str(label_tensor.numpy()))

In [11]:
test_sentences = []
test_labels = []

# Each element of `test` is a (text, label) pair of scalar tensors.
for text_tensor, label_tensor in test:
    # .numpy() on a string tensor returns a bytes object. Decode it to get the
    # actual review text; str(bytes) would store the repr instead, i.e.
    # "b'...'" with escaped quotes/newlines, which pollutes the token stream.
    test_sentences.append(text_tensor.numpy().decode('utf-8'))
    # Labels are kept as strings ('0'/'1'); a later cell converts them to int.
    test_labels.append(str(label_tensor.numpy()))

In [12]:
import numpy as np
# NumPy array versions of the string label lists (train first, then test).
train_labels_array, test_labels_array = map(np.array, (train_labels, test_labels))

### convert str label to int

In [105]:
# Lookup table from the string form of a label to its integer value.
# NOTE(review): not referenced by any of the cells visible in this notebook.
map_str_labels_to_int = {
    '1': 1,
    '0': 0,
}

In [120]:
# Parse the string labels into plain Python ints.
train_labels_int = [int(label) for label in train_labels]
test_labels_int = [int(label) for label in test_labels]

In [121]:
# Integer label arrays; these are what model.fit receives as targets.
train_labels_array_int, test_labels_array_int = map(
    np.array, (train_labels_int, test_labels_int)
)

## Create tokenizer and apply padding

In [61]:
# Settings shared by the tokenizer, the padding step and the embedding layer.
VOCAB_SIZE = 10 ** 4   # passed as num_words to the tokenizer and input_dim to the embedding
OOV_TOK = "<OOV>"      # token given to the tokenizer as oov_token
MAX_LEN = 120          # length every sequence is padded/truncated to
TRUNC_TYPE = 'post'    # value passed as `truncating` to pad_sequences
EMBED_DIM = 16         # size of each embedding vector (embedding output_dim)

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer,text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Tokenizer configured with the vocabulary size and out-of-vocabulary token
# defined in the settings cell.
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token=OOV_TOK,
)

In [16]:
# Build the vocabulary from the training reviews only (the test set is not used here).
tokenizer.fit_on_texts(texts=train_sentences)

In [17]:
# Word -> integer index mapping held by the fitted tokenizer.
# NOTE(review): the recorded length (86539) exceeds VOCAB_SIZE, so num_words
# presumably caps only the texts_to_sequences output, not this dict — confirm.
word_index = tokenizer.word_index  # 86539 entries in the recorded run

In [18]:
# Encode every training review as a list of integer token ids.
sequences = tokenizer.texts_to_sequences(texts=train_sentences)

In [78]:
# Bring every training sequence to exactly MAX_LEN entries; sequences that are
# too long are cut according to TRUNC_TYPE.
padded = pad_sequences(
    sequences,
    truncating=TRUNC_TYPE,
    maxlen=MAX_LEN,
)

In [79]:
# Sanity check: one row per training review, MAX_LEN columns.
padded.shape

(25000, 120)

In [75]:
# apply on test

In [20]:
# Encode the test reviews with the tokenizer that was fitted on the training set.
test_sequences = tokenizer.texts_to_sequences(texts=test_sentences)

In [70]:
# Pad/truncate the test sequences exactly like the training ones. Passing
# truncating=TRUNC_TYPE matters: without it pad_sequences falls back to its
# default ('pre'), so over-long test reviews would keep their last MAX_LEN
# tokens while over-long training reviews keep their first MAX_LEN tokens.
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, truncating=TRUNC_TYPE)

In [71]:
# Sanity check: one row per test review, MAX_LEN columns.
test_padded.shape

(25000, 120)

### Reverse word index tuple

In [72]:
# List of (index, word) pairs: the word_index entries with key and value swapped.
word_index_reversed = [(idx, word) for word, idx in word_index.items()]

## Model

### With flatten layer

In [84]:
# Variant 1: Embedding -> Flatten -> small dense classifier.
model = tf.keras.Sequential()

# Turns each of the MAX_LEN token ids into an EMBED_DIM-dimensional vector,
# i.e. one (MAX_LEN, EMBED_DIM) matrix per review.
model.add(tf.keras.layers.Embedding(input_dim=VOCAB_SIZE,
                                    output_dim=EMBED_DIM,
                                    input_length=MAX_LEN))

# Flatten unrolls that matrix into a single MAX_LEN * EMBED_DIM vector.
# Because this vector is large, NLP models often use a pooling layer instead.
model.add(tf.keras.layers.Flatten())

# Dense head: 6 ReLU units followed by one sigmoid unit for the binary label.
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [85]:
# Configure training for binary classification, then print the layer/parameter table.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 6)                 11526     
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


### With global average pooling layer

In [68]:
# Variant 2: Embedding -> GlobalAveragePooling1D -> small dense classifier.
# NOTE: this rebinds `model`, replacing the Flatten variant defined earlier;
# on a top-to-bottom run this is the model that the fit cell trains.
model = tf.keras.Sequential()

model.add(tf.keras.layers.Embedding(input_dim=VOCAB_SIZE,
                                    output_dim=EMBED_DIM,
                                    input_length=MAX_LEN))

# Averages the embedding vectors over the sequence axis, leaving one
# EMBED_DIM-sized vector per review (smaller and a bit faster than Flatten).
model.add(tf.keras.layers.GlobalAveragePooling1D())

# Dense head: 6 ReLU units followed by one sigmoid unit for the binary label.
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [69]:
# Configure training for binary classification, then print the layer/parameter table.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 6)                 102       
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


### Fit model and validate

In [86]:
# Number of epochs passed to model.fit in the next cell.
# NOTE(review): the saved fit output shows "Epoch 1/2", so it was produced with
# a different value than the one set here — re-run to refresh the output.
NUM_EPOCHS = 10

In [123]:
# Train on the padded training sequences and evaluate on the padded test
# sequences after every epoch. The returned History object is the cell output.
model.fit(
    padded,
    train_labels_array_int,
    validation_data=(test_padded, test_labels_array_int),
    epochs=NUM_EPOCHS,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x25485d93148>