In [None]:
# This is from TF learning boards

In [2]:
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
# Print the installed TensorFlow version so the environment is recorded next to the outputs.
print(tf.__version__)

2.2.0


In [4]:
# Read every tab-separated input file into a single (comments, sentiment) DataFrame.
# glob() returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# concatenated row order -- and therefore the seeded train/test split -- reproducible.
data_file = sorted(glob('/tf/deep_learning/sentiment_analysis/data/*.txt'))
model_file = '/tf/deep_learning/sentiment_analysis/lib/model'
header_list = ["comments", "sentiment"]
if not data_file:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        'no *.txt input files found under /tf/deep_learning/sentiment_analysis/data/')
# Column names are supplied explicitly via `names`. The ISO-8859-1 encoding and the
# python parser engine are used because the files were produced on Windows and
# otherwise fail to parse.
# NOTE: the list keeps the name `l` because a later cell deletes it by that name.
l = [
    pd.read_csv(f, sep='\t', names=header_list, encoding="ISO-8859-1", engine='python')
    for f in data_file
]
data = pd.concat(l, axis=0)
print('total length of the training data %s'%(len(data)))
data.dtypes

total length of the training data 3000


comments     object
sentiment     int64
dtype: object

In [4]:
# Stage 1: stratified 80/20 split -- 80% of the rows become the training set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    data["comments"],
    data["sentiment"],
    test_size=0.20,
    random_state=42,
    stratify=data['sentiment'],
)
# Stage 2: stratified split of the 20% hold-out -- 75% test, 25% validation.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.25,
    random_state=42,
    stratify=y_holdout,
)
# Report the size of every split, features first and labels second.
for split in (X_train, y_train, X_test, y_test, X_val, y_val):
    print(split.shape)
# The raw frames and the intermediate hold-out are no longer needed; release them.
del l, data, X_holdout, y_holdout
gc.collect()


(2400,)
(2400,)
(450,)
(450,)
(150,)
(150,)


75

In [5]:
# Peek at the first training review and its label (positional slice of length 1).
print(X_train.iloc[:1])
print(y_train.iloc[:1])



96    The scenes are often funny and occasionally to...
Name: comments, dtype: object
96    1
Name: sentiment, dtype: int64


In [6]:
# Confirm that the stratified splits kept both classes balanced in every subset.
for labels in (y_train, y_test, y_val):
    print(labels.value_counts())

1    1200
0    1200
Name: sentiment, dtype: int64
1    225
0    225
Name: sentiment, dtype: int64
1    75
0    75
Name: sentiment, dtype: int64


In [7]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Lowercases the input, replaces every literal ``<br />`` with a single
    space, then removes every character listed in ``string.punctuation``.

    Args:
        input_data: string tensor (or tensor-like) of raw text.

    Returns:
        The result of the three ``tf.strings`` operations applied in order.
    """
    # Character class matching any one ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [8]:
# Hyper-parameters shared by the vectorizer and the models built further down.
max_features = 5000   # upper bound on the vocabulary size
max_len = 100         # every review is padded / truncated to this many token ids
embedding_dims = 16   # width of the embedding vectors used by the models

# TextVectorization layer: standardizes text with custom_standardization
# (lowercase, strip punctuation), splits on whitespace and emits integer
# vocabulary indices.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=max_len,
)

# Learn the vocabulary from the training text only.
vectorize_layer.adapt(np.array(X_train))


In [9]:
# Check how many tokens the adapted layer actually learned.
vocabulary = vectorize_layer.get_vocabulary()
print('Vocabulary size: {}'.format(len(vocabulary)))

Vocabulary size: 4728


In [10]:
def vectorize_text(text):
    """Convert raw review strings into integer token ids via ``vectorize_layer``.

    A trailing axis is appended to ``text`` before the layer is called.

    Args:
        text: raw strings (tensor-like) to vectorize.

    Returns:
        Whatever ``vectorize_layer`` returns for the expanded input.
    """
    return vectorize_layer(tf.expand_dims(text, axis=-1))

In [11]:
# Show one complete review, its label and its vectorized form.
pd.set_option('display.max_colwidth', None)  # do not truncate long text when printing
first_review = X_train.iloc[:1]
first_label = y_train.iloc[:1]
print("Review", first_review)
print("Label", first_label)
print("Vectorized review", vectorize_text(first_review))

Review 96    The scenes are often funny and occasionally touching as the characters evaluate their lives and where they are going.  
Name: comments, dtype: object
Label 96    1
Name: sentiment, dtype: int64
Vectorized review tf.Tensor(
[[   2  261   25  671  203    3 1046 1284   27    2  145 3870   90 1522
     3  220   50   25  155    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]], shape=(1, 100), dtype=int64)


In [12]:
# Map token ids from the vectorized review back to words.
# In this run the list returned by get_vocabulary() is offset by 2 relative to the
# ids the layer emits (id 2 corresponds to vocabulary[0]).
# From the documentation: "Note that this vocabulary contains 1 OOV token,
# so the effective number of tokens is (max_tokens - 1 - (1 if output == "int" else 0))."
# NOTE(review): presumably ids 0 and 1 are reserved for padding and OOV -- confirm
# for the TensorFlow version in use, since the offset may differ between versions.
vocabulary = vectorize_layer.get_vocabulary()
print("2 ---> ", vocabulary[2 - 2])
print("261 ---> ", vocabulary[261 - 2])


2 --->  b'the'
261 --->  b'scenes'


In [13]:
# Vectorize each split once up front and keep the results as NumPy arrays.
X_train_v, X_test_v, X_val_v = (
    np.array(vectorize_text(split)) for split in (X_train, X_test, X_val)
)

In [14]:
# Display the vectorized training data (NumPy array built from vectorize_text(X_train)).
X_train_v

array([[   2,  261,   25, ...,    0,    0,    0],
       [  49,  266, 1020, ...,    0,    0,    0],
       [  13,  255,   15, ...,    0,    0,    0],
       ...,
       [ 151,  675,    0, ...,    0,    0,    0],
       [  51,  325,    0, ...,    0,    0,    0],
       [1734,  340,    0, ...,    0,    0,    0]])

In [35]:
# Baseline classifier over pre-vectorized token ids:
# embedding -> dropout -> average over the sequence axis -> dropout -> one linear output.
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dims))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
# Single unit with linear activation: the model emits a raw score, not a probability.
model.add(layers.Dense(1, activation='linear'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 16)          80016     
_________________________________________________________________
dropout_6 (Dropout)          (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d_3 ( (None, 16)                0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 80,033
Trainable params: 80,033
Non-trainable params: 0
_________________________________________________________________


In [36]:
# The loss is told the model output is a logit (from_logits=True), and the
# accuracy metric thresholds that output at 0.0.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=tf.keras.metrics.BinaryAccuracy(threshold=0.0),
)

In [37]:
# NOTE(review): neither callback defined in this cell is passed to a model.fit call
# visible in this notebook; they are kept for reference.
# Checkpoint the full model whenever validation accuracy improves.
# save_freq='epoch' replaces save_freq=1, which triggered the checkpoint logic after
# every single batch; the monitored metric is now the validation one so that it is
# consistent with the early-stopping criterion below.
saveBestModel = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tf/deep_learning/sentiment_analysis/my_model.h5',
    monitor='val_binary_accuracy',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch',
)
# Stop training once validation loss has not improved for 3 consecutive epochs.
earlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
)

In [38]:
# Callbacks passed to model.fit below.
# Keep only the best model seen so far on disk.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tf/deep_learning/sentiment_analysis/my_model.h5',
    save_best_only=True,
)
# Stop after 10 epochs without improvement and roll back to the best weights.
earlystopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)

In [39]:
# Train the baseline model on the pre-vectorized data, validating after each epoch.
epochs = 20
training_batch = 1  # batch size handed to model.fit
history = model.fit(
    x=X_train_v,
    y=y_train,
    batch_size=training_batch,
    epochs=epochs,
    validation_data=(X_val_v, y_val),
    callbacks=[checkpoint_cb, earlystopping_cb],
    verbose=2,
)


Epoch 1/20
2400/2400 - 9s - loss: 0.6918 - binary_accuracy: 0.5254 - val_loss: 0.6874 - val_binary_accuracy: 0.6467
Epoch 2/20
2400/2400 - 9s - loss: 0.6771 - binary_accuracy: 0.6275 - val_loss: 0.6694 - val_binary_accuracy: 0.7467
Epoch 3/20
2400/2400 - 9s - loss: 0.6400 - binary_accuracy: 0.7371 - val_loss: 0.6391 - val_binary_accuracy: 0.6667
Epoch 4/20
2400/2400 - 9s - loss: 0.5852 - binary_accuracy: 0.7804 - val_loss: 0.5931 - val_binary_accuracy: 0.7467
Epoch 5/20
2400/2400 - 9s - loss: 0.5223 - binary_accuracy: 0.8179 - val_loss: 0.5533 - val_binary_accuracy: 0.7667
Epoch 6/20
2400/2400 - 9s - loss: 0.4647 - binary_accuracy: 0.8392 - val_loss: 0.5250 - val_binary_accuracy: 0.7067
Epoch 7/20
2400/2400 - 9s - loss: 0.4111 - binary_accuracy: 0.8604 - val_loss: 0.4901 - val_binary_accuracy: 0.8133
Epoch 8/20
2400/2400 - 9s - loss: 0.3650 - binary_accuracy: 0.8846 - val_loss: 0.4743 - val_binary_accuracy: 0.7867
Epoch 9/20
2400/2400 - 9s - loss: 0.3234 - binary_accuracy: 0.8988 - val

In [40]:
# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(x=X_test_v, y=y_test)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.44380539655685425
Accuracy:  0.804444432258606


In [57]:
# Build an end-to-end model that accepts raw strings: the already-adapted
# vectorize_layer is the first layer, followed by the same stack as the baseline.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    tf.keras.layers.Embedding(max_features+1, embedding_dims),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1),
])


In [58]:
# Print the layer-by-layer summary of the model currently bound to `model`.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_6 (TextVe (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 16)           160016    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 16)           0         
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
________________________________________________

In [59]:
# Compile the model currently bound to `model`: the loss treats the output as a
# logit (from_logits=True) and accuracy thresholds that output at 0.0.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=tf.keras.metrics.BinaryAccuracy(threshold=0.0),
)

In [None]:
# `train_ds` / `val_ds` were never defined in this notebook, so this cell raised a
# NameError on a fresh kernel. They are built here from the raw-text splits, because
# the model bound to `model` at this point takes strings as input.
def _make_text_dataset(texts, labels, batch_size=32, shuffle=False):
    """Wrap raw review strings and their labels in a batched tf.data.Dataset.

    Args:
        texts: sequence of raw strings (e.g. a pandas Series).
        labels: sequence of integer labels aligned with ``texts``.
        batch_size: number of examples per batch.
        shuffle: when True, shuffle the examples with a fixed seed.

    Returns:
        A ``tf.data.Dataset`` yielding ``(text, label)`` batches, where ``text``
        has shape ``(batch, 1)`` to match ``tf.keras.Input(shape=(1,), dtype=tf.string)``.
    """
    features = np.array(texts).reshape(-1, 1)
    targets = np.array(labels)
    dataset = tf.data.Dataset.from_tensor_slices((features, targets))
    if shuffle:
        # Buffer covers the whole split; seeded so runs are repeatable.
        dataset = dataset.shuffle(len(features), seed=42)
    return dataset.batch(batch_size)


train_ds = _make_text_dataset(X_train, y_train, shuffle=True)
val_ds = _make_text_dataset(X_val, y_val)

epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)