# import libraries

In [0]:
import tensorflow as tf
from tensorflow import keras 
print(tf.__version__)

import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


import os
import numpy as np

2.2.0-rc3


# define helper functions

In [0]:
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])


# get dataset

In [0]:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…







HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteC3V8OZ/imdb_reviews-train.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteC3V8OZ/imdb_reviews-test.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteC3V8OZ/imdb_reviews-unsupervised.tfrecord


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


# split dataset into training & testing subsets

In [0]:
train_data, test_data = imdb['train'], imdb['test']

In [0]:
# define sentences & labels per dataset
train_sentences = []
train_labels = []

test_sentences = []
test_labels = []

# iterate over datasets and populate respective lists
for sentence, label in train_data:
  # convert data into numpy format for model processing
  train_sentences.append(str(sentence.numpy()))
  train_labels.append(label.numpy())

for sentence, label in test_data:
  test_sentences.append(str(sentence.numpy()))
  test_labels.append(label.numpy())


In [0]:
# convert label lists into numpy array for model processing
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)


# define hyperparameters

In [0]:
vocab_size = 10000
max_length = 120
trunc_type = 'post' # if sentence exceeds max_length, cut at the end
pad_type = 'post'
oov_token = '<OOV>'
num_epochs = 100

as an nlp neural network trains, it learns vectors and associates vectors with labels to come up with an EMBEDDING
- embedding= vector for each word with associated label
- embedding layer returns 2D array= (sentence_length, embedding_size)

In [0]:
embedding_dimension = 16

# preprocess

**tokenize sentences**

tokenize means to split corpus dataset into encoded words (words in numeric representation)


In [0]:
# instantiate tokenizer to generate word index dictionary
  # num_words= max num_words tracked by frequency in corpus
  # oov_token= token substitute for out_of_vocabulary words
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

# .fit_on_texts() to tokenize training sentences
tokenizer.fit_on_texts(train_sentences)

# get word_index dictionary= word:index
word_index = tokenizer.word_index
print('word_index: \n{}'.format(word_index))


In [0]:
# texts_to_sequences() transforms list of sentences into lists of numeric representation
  # uses training word_index
sequences = tokenizer.texts_to_sequences(train_sentences)

In [0]:
# pad_sequences() transforms sentence/sequence into a uniform input_shape 
  # padding= 'post' because default padding adds 0's to beginning 
  # maxlen= max length of sequence
train_padded = pad_sequences(sequences, maxlen=max_length, padding=pad_type, truncating=trunc_type)

# tokenize & pad test sequences
# word_index is derived from training set, testing will probably generate more <OOV>
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences)


# reverse word_index for plotting

`"hello":1 ==> 1: "hello"`

In [0]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

print(decode_review(train_padded[1]))
print(train_sentences[1])


# define the model

**embedding process**

words are mapped in higher dimensional space, and semantics of the words are then learned when those words are labeled with similar meaning. 

*movie review examples:*
- movie reviews with positive sentiment had the dimensionality of their words ended up 'pointing' in a particular direction

- movie reviews with negative sentiment 'pointed' in a different direction

---

after model training, words in future sentences could have their direction established as positive or negative (inferred sentiment)

In [0]:
model = keras.Sequential([
  # over time, words cluster together due to the training labels (word meaning)
    # embedding= words found together are given similar vectors (shape & direction)
  # embedding output_shape=(sentence_length, embedding_size)
    # embedding_dimension=num_neurons
  keras.layers.Embedding(vocab_size, embedding_dimension, input_length=max_length),
  # flatten input_vector into a 1D array for dense layers
    # for nlp, use global average pooling 1D
  keras.layers.GlobalAveragePooling1D(),
  # use fully-connected dense layers to map inputs to outputs
  keras.layers.Dense(units=16, activation='relu'),
  # output layer
  # since binary classification, positive or negative, use sigmoid for single prediction
  keras.layers.Dense(units=1, activation='sigmoid')
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


# compile the model

build the model by compiling it with a loss, optimizer, and objective metrics
- loss= prediction accuracy
- the optimizer uses the loss to adjust & improve prediction performance per epoch
- metrics= target

In [0]:
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])

# define callbacks

In [0]:
# enable early_stopping to prevent overfitting
class myCallback(keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if logs.get('accuracy') >= .95:
      self.model.stop_training = True

# instantiate callbacks
callbacks = myCallback()   

# train the model

fit the model to train & learn the optimal weights/relationships

In [0]:
model.fit(train_padded, train_labels, epochs=num_epochs, validation_data=(test_padded, test_labels), verbose=1)

# get embeddings

over time, words cluster together due to the training labels (word meaning)

words found together are given similar vectors (shape & direction)

In [0]:
# embedding layer = layer[0]
embeddings = model.layers[0]

# get weights & shape
weights = embeddings.get_weights()[0]

# shape=(10k words in corpus, 16 labels)
  # shape: (vocab_size, embedding_dim)
print('weights.shape: {}'.format(weights.shape)) 

weights.shape: (10000, 16)


# iterate over array to pull out 16 dimensions

In [0]:
import io 

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  # metadata array, write out words
  out_m.write(word+'\n')
  # write out each value in array of embeddings(coefficient of each vector)
  out_v.write('\t'.join([str(x) for x in embeddings])+'\n')
out_v.close()
out_m.close()

# download files via colab

In [0]:
 try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

# evaluate embeddings

https://projector.tensorflow.org/

- load vecs and meta tsv files

on load, select "Spherize data"

In [0]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[11], [], [1431], [966], [4], [1537], [1537], [4715], [], [790], [2019], [11], [2929], [2184], [], [790], [2019], [11], [579], [], [11], [579], [], [4], [1782], [4], [4517], [11], [2929], [1275], [], [], [2019], [1003], [2929], [966], [579], [790], []]


# clean up

In [0]:
import os, signal

os.kill(os.getpid())