In [1]:
import tensorflow as tf
# tf.config.run_functions_eagerly(True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn
import datetime, pathlib, io, os, time, random, re, string
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense, Flatten, InputLayer, BatchNormalization, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dropout, Conv1D
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorboard.plugins import projector

In [2]:
# Load Dataset

In [3]:
# Load IMDB reviews: the 'train' split for training, and the 'test' split cut
# in half -- first 50% for validation, last 50% for the final test set.
train_ds, val_ds, test_ds = tfds.load('imdb_reviews', split=['train', 'test[:50%]', 'test[50%:]'], as_supervised=True)

2023-10-24 15:13:02.275144: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-10-24 15:13:02.275164: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2023-10-24 15:13:02.275168: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2023-10-24 15:13:02.275195: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-24 15:13:02.275212: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Text Standardization

In [4]:
def standardization(input_data):
    """Normalize raw review text before tokenization.

    Lowercases the text, replaces HTML tags with a single space, then removes
    every character listed in ``string.punctuation``.

    Tags are replaced with a space rather than deleted so that words separated
    only by markup (e.g. ``"great.<br /><br />The"``) are not fused into a
    single token once the punctuation is stripped.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor holding the cleaned text.
    """
    lowercase = tf.strings.lower(input_data)
    # Substitute a space (not "") so neighbouring words stay separate tokens.
    no_tag = tf.strings.regex_replace(lowercase, "<[^>]+>", " ")
    output = tf.strings.regex_replace(no_tag, "[%s]" % re.escape(string.punctuation), "")
    return output

# Tokenization

In [5]:
# Hyperparameters shared by the text pipeline and the models below.
VOCAB_SIZE = 10_000      # max_tokens given to the TextVectorization layer
SEQUENCE_LENGTH = 250    # every review is padded/truncated to this many tokens
EMBEDDING_DIM = 300      # width of each embedding vector
BATCH_SIZE = 64          # examples per batch in the tf.data pipelines

In [6]:
# Text -> integer-id layer: applies `standardization`, keeps at most
# VOCAB_SIZE tokens and emits sequences of exactly SEQUENCE_LENGTH ids.
vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize=standardization,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
)

### vocabulary

In [11]:
# Drop the labels and give each review a leading axis of size 1; this
# text-only dataset is what the vectorizer is adapted on.
training_data = train_ds.map(lambda text, _label: tf.expand_dims(text, 0))

In [12]:
# Peek at a single element to check what the text-only dataset yields.
for sample in training_data.take(1):
    print(sample)

tf.Tensor([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."], shape=(1,), dtype=string)


2023-10-24 15:16:14.428859: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [13]:
# Fit the vectorizer on the label-free training text.
vectorize_layer.adapt(training_data)

In [None]:
# with tf.device('/cpu:0'):
#     vectorize_layer.adapt(training_data)

### vectorized dataset

In [None]:
def vectorizer(review, label):
    """Map a ``(review, label)`` pair to ``(token ids, label)``.

    The review is passed through ``vectorize_layer``; the label is returned
    unchanged.
    """
    token_ids = vectorize_layer(review)
    return token_ids, label

In [None]:
# Vectorize the review text of the training and validation splits.
train_dataset, val_dataset = (
    split.map(vectorizer) for split in (train_ds, val_ds)
)

In [None]:
# Build the batched pipelines straight from the raw splits so this cell is
# idempotent: re-running it no longer batches an already-batched dataset.
train_dataset = train_ds.map(vectorizer).batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
val_dataset = val_ds.map(vectorizer).batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

# Pretrained Word2Vec(Gensim)

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors via gensim's downloader API.
word2vec = api.load('word2vec-google-news-300')

In [None]:
# Inspect the shape of the raw vector matrix (presumably (n_words, 300) -- confirm from the output).
word2vec.vectors.shape

In [None]:
# Look up the vector stored under the key 'The'.
word2vec['The']

In [None]:
# Sanity check: list the entries word2vec considers most similar to 'Man'.
word2vec.most_similar('Man')

In [None]:
def first_caps(word):
    """Return ``word`` with its first character upper-cased.

    The rest of the string is left untouched. An empty string is returned
    unchanged: slicing is used instead of ``word[0]``, which raises
    ``IndexError`` on ``""``.

    Args:
        word: The string to capitalize.

    Returns:
        A new string whose first character (if any) is upper-cased.
    """
    return word[:1].upper() + word[1:]

In [None]:
# Build the embedding rows in vocabulary order, so row i belongs to token id i.
# The vocabulary is fetched once instead of calling get_vocabulary() twice on
# every iteration.
vocabulary = vectorize_layer.get_vocabulary()
pretrained_embeddings = []

for i, word in enumerate(vocabulary):
    # Try the token as-is, then with a leading capital. An empty token has no
    # first letter to capitalize, so only the raw form is tried for it.
    candidates = (word, first_caps(word)) if word else (word,)
    for candidate in candidates:
        try:
            vector = word2vec[candidate]
        except KeyError:
            # A missing key means "not in word2vec"; any other exception is a
            # real error and is allowed to surface instead of being swallowed.
            continue
        break
    else:
        # No pretrained vector found: fall back to a random N(0, 1) vector.
        vector = np.random.normal(loc=0, scale=1, size=(EMBEDDING_DIM))
    pretrained_embeddings.append(vector)

    if i%1000 == 0:
        print(f"====> i is {i}")

In [None]:
# Stack the per-token vectors into a single 2-D matrix and display its shape.
pretrained_embeddings_array = np.array(pretrained_embeddings)
pretrained_embeddings_array.shape

In [None]:
# np.save('data/pretraining_embeddings.npy', pretrained_embeddings_array)

In [None]:
# Reuse the cached matrix when it exists. When it does not (the np.save cell is
# commented out), keep the array already in memory instead of failing on a
# missing file.
embeddings_cache_path = 'data/pretraining_embeddings.npy'
if os.path.exists(embeddings_cache_path):
    pretrained_embeddings_array = np.load(embeddings_cache_path)

# Modeling

In [None]:
# Embedding initialised from the pretrained matrix and frozen (trainable=False).
frozen_word2vec_embedding = Embedding(
    VOCAB_SIZE,
    EMBEDDING_DIM,
    embeddings_initializer=tf.keras.initializers.Constant(pretrained_embeddings_array),
    trainable=False,
)

# Binary classifier: frozen embeddings -> Conv1D -> dense head with a sigmoid output.
model = tf.keras.models.Sequential([
    Input(shape=(SEQUENCE_LENGTH,)),
    frozen_word2vec_embedding,
    Conv1D(32, 3, activation='relu'),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Save the model only when validation accuracy reaches a new maximum.
checkpoint_filepath = 'localdata/Section6/conv1d_word2vec.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
)

In [None]:
# Binary cross-entropy objective, Adam with a 1e-4 learning rate, accuracy as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs with the checkpoint callback attached.
fit_callbacks = [model_checkpoint_callback]
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=fit_callbacks,
)

In [None]:
# Training vs. validation loss per epoch.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('model_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
for series in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[series])
plt.title('model_accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# Evaluation

In [None]:
# Restore the weights stored at the checkpoint path before evaluating.
model.load_weights(checkpoint_filepath)

In [None]:
# Vectorize and batch the held-out test split, then evaluate the model on it.
test_dataset = test_ds.map(vectorizer).batch(BATCH_SIZE)
model.evaluate(test_dataset)

# Testing

In [None]:
# Two raw review strings, each wrapped in its own single-element list, turned
# into a dataset for ad-hoc prediction.
# NOTE: this rebinds `test_dataset`, replacing the evaluation dataset built in the Evaluation section.
test_dataset = tf.data.Dataset.from_tensor_slices([['''This movie has always been a favorite of mine. I never like holiday movies, because i always find them to be full to bursting with slapstick comedy, or way too sugary-sweet and dramatic. both of these things are okay in moderation, but most Christmas movies seem to go to one side of the spectrum or the other. this wonderful fairy tale is perfect for someone like me, who likes a little bit of a darker movie, but expects a Christmas movie to have a good message. the darkness in the movie is not without cause-it shows the joy of Christmas in great contrast to the scariness of Halloween, and it made me love both holidays all the more for that reason. i don't know, maybe that's just because Halloween and Christmas are my favorite holidays, but i really feel that this movie is great for older children and adults. younger children (up to 5 or 6 years) may find this simply frightening, but older children would find it wonderful.'''],
                                                  ['''`Ballistic: Ecks vs. Sever' has been saddled with not only one of the worst movie titles in recent memory, but one of the worst screenplays as well. The film's third-rate espionage plot makes no sense at all and serves basically as a lame excuse for endless explosions, shootouts and double-flipping car chases, which have become the standard accoutrements for virtually every action picture since `Bullitt' in 1968. The problem with `Ballistic' is that the viewer can never tell who is doing what to whom or why  and we never care. The film is really all about style anyway. How else to account for the rather ludicrous image of Lucy Liu - looking more like a fashion model out on a shoot than a trained killer doing the shooting herself - strolling in elegant slow motion through the streets of Vancouver, wiping out what seems to be an entire hit squad with a combination of superhuman marksmanship and Matrix-like kickboxing moves? With her ankle-length designer coat and her icy-cool demeanor, she looks like Calvin Klein's idea of what the well-dressed assassin should be wearing this season. It's enough to reduce the whole enterprise to the level of comic absurdity  and, indeed, I often found myself laughing out loud at many of the ostensibly serious shenanigans occurring in the film. The flashbacks, which are obviously intended to clarify the characters' relationships, are so poorly done that they actually end up making the whole story more muddled and confusing. (And, although the child-kidnapping scenario is never as offensive in this film as it is in `Trapped,' one can still question the propriety of filmmakers running to this theme with the kind of frequency they seem to have been doing of late).Antonio Banderas makes up the other half of the film's title (he is Ecks, she Sever), and one only wonders what he could have been thinking about when he signed on to co-star in this particular project. 
`Ballistic' is utterly dispensable moviemaking: here today, forgotten tomorrow, a film utterly without distinction, conviction or purpose.''']])

In [None]:
def vectorizer_test(review):
    """Vectorize an unlabeled review with ``vectorize_layer``."""
    token_ids = vectorize_layer(review)
    return token_ids

# Convert the sample reviews from raw strings to token ids.
test_dataset = test_dataset.map(vectorizer_test)

In [None]:
# Sigmoid outputs for the sample reviews, one prediction per review.
model.predict(test_dataset)

### Inference ready testing

In [None]:
# Wrap the vectorizer and the trained classifier in one model that accepts raw strings.
inputs = Input(shape=(1,), dtype="string")
vectorized_inputs = vectorize_layer(inputs)
outputs = model(vectorized_inputs)
interence_ready_model = tf.keras.Model(inputs=inputs, outputs=outputs)
interence_ready_model.summary()

In [None]:
# End-to-end check: feed a raw string directly to the string-input model.
interence_ready_model.predict(["This movie has always been a favorite of mine."])

# Visualizing embeddings

In [None]:
# Same Conv1D architecture, but with an Embedding layer that has no pretrained
# initializer and is left trainable (the Keras default).
# NOTE: this rebinds `model`, replacing the word2vec-based classifier above.
EMBEDDING_DIM = 300
layer_stack = [
    Input(shape=(SEQUENCE_LENGTH,)),
    Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    Conv1D(32, 3, activation='relu'),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]
model = tf.keras.models.Sequential(layer_stack)
model.summary()

In [None]:
# Binary cross-entropy objective, Adam with a 1e-4 learning rate, accuracy as the metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# One timestamped log directory per run, with a trailing slash.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'localdata/logs/imbd/fit/' + run_stamp + "/"

In [None]:
# TensorBoard logging for this run, with histograms computed every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [None]:
# Train for 5 epochs while logging to TensorBoard.
fit_callbacks = [tensorboard_callback]
history = model.fit(
    train_dataset,
    epochs=5,
    validation_data=val_dataset,
    callbacks=fit_callbacks,
)

In [None]:
# Write one label line per token id ("<id> <token>") to metadata.tsv in log_dir.
# The vocabulary is fetched once up front rather than calling get_vocabulary()
# again for every row.
vocabulary = vectorize_layer.get_vocabulary()
with open(os.path.join(log_dir, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    for i in range(VOCAB_SIZE):
        f.write(f"{i} {vocabulary[i]}\n")

In [None]:
# Copy the first weight array of the model's first layer into a tf.Variable
# so it can be checkpointed on its own.
embedding_layer = model.layers[0]
embedding_matrix = embedding_layer.get_weights()[0]
embedding_weights = tf.Variable(embedding_matrix)
embedding_weights.shape

In [None]:
# Save the embedding variable as a checkpoint inside the log directory.
checkpoint = tf.train.Checkpoint(embedding=embedding_weights)
checkpoint_prefix = os.path.join(log_dir, "embedding.ckpt")
checkpoint.save(checkpoint_prefix)

# Start a projector configuration with a single embedding entry.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

In [None]:
# Point the projector entry at the checkpointed variable explicitly. A variable
# saved as `embedding=` in tf.train.Checkpoint is stored under this name.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Labels file, relative to log_dir.
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
# Enable the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
# Launch TensorBoard on the parent directory that holds the timestamped run folders.
%tensorboard --logdir localdata/logs/imbd/fit