In [2]:
# Verify that TensorFlow can see a GPU before any training starts
import tensorflow as tf
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [3]:
# import all
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import json
from glob import glob
import gc

In [4]:
# Show the installed TensorFlow version (the recorded output of this cell is '2.2.0')
tf.__version__

'2.2.0'

In [34]:
# Load every tab-separated input file and split the rows into train/test sets
data_file = glob('/tf/deep_learning/sentiment_analysis/data/*.txt')
model_file = '/tf/deep_learning/sentiment_analysis/lib/model'
header_list = ["comments", "sentiment"]
# The files were written on Windows: read them as ISO-8859-1 with the python
# engine, otherwise pandas raises errors on them
frames = [
    pd.read_csv(
        path,
        sep='\t',
        names=header_list,
        encoding="ISO-8859-1",
        engine='python',
    )
    for path in data_file
]
data = pd.concat(frames, axis=0)
print('total length of the training data %s' % len(data))
# 80/20 train/test split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data["comments"],
    data["sentiment"],
    test_size=0.20,
    random_state=42,
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')
# The raw frames are no longer needed: drop them and reclaim the memory
del frames, data
gc.collect()

total length of the training data 3000
(2400,)
(2400,)
(600,)
(600,)


75

In [44]:
# Tokenise the reviews with the Keras Tokenizer; GloVe supplies the embeddings later
vocab_size = 20000
max_review_length = 100
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# The vocabulary is learned from the training reviews only
tokenizer.fit_on_texts(X_train)

# Training reviews -> integer sequences, padded/truncated at the end to max_review_length
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padding = pad_sequences(
    train_sequences,
    maxlen=max_review_length,
    padding='post',
    truncating='post',
)
# Test reviews go through the same tokenizer and the same padding settings
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padding = pad_sequences(
    test_sequences,
    maxlen=max_review_length,
    padding='post',
    truncating='post',
)
# Sanity check: feature and label shapes for both splits
print(train_padding.shape, y_train.shape, test_padding.shape, y_test.shape, sep='\n')

(2400, 100)
(2400,)
(600, 100)
(600,)


In [45]:
# TensorFlow 2.x needs plain NumPy arrays as model inputs and targets,
# so convert the padded sequences and the label Series explicitly.
# Features (train, test)
training_padded, testing_padded = np.array(train_padding), np.array(test_padding)
# Labels (train, test)
training_labels, testing_labels = np.array(y_train), np.array(y_test)

In [46]:
# Path to the 100-dimensional GloVe vectors.
# Download and unzip the archive from http://nlp.stanford.edu/data/glove.6B.zip first.
glove_data = '/tf/deep_learning/sentiment_analysis/glove.6B/glove.6B.100d.txt'
# Embedding width; must match the vector size of the GloVe file above (100d)
embedding_dim=100

In [47]:
# Build a {word: vector} lookup from the GloVe text file.
# Each line holds a token followed by its whitespace-separated float coefficients.
embeddings_index = dict()
# Open with an explicit UTF-8 encoding (the platform default may fail to decode
# the file) and use a context manager so the handle is closed even if parsing fails.
with open(glove_data, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [48]:
# Weight matrix for the Embedding layer: one row per token index, one column per
# GloVe dimension. Rows of words without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
# Walk over every word the tokenizer has seen
for word, index in tokenizer.word_index.items():
    if index >= vocab_size:
        # Indices at or beyond vocab_size have no row in the matrix: stop here
        break
    glove_vector = embeddings_index.get(word)
    if glove_vector is None:
        continue
    embedding_matrix[index] = glove_vector

In [49]:
# Inspect the embedding matrix; rows that were never assigned a GloVe vector remain all zeros
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
# # basic Sequential model
# model_glove = tf.keras.Sequential()
# model_glove.add(Embedding(vocab_size, 100, input_length=max_review_length, weights=[embedding_matrix], trainable=False))
# model_glove.add(Dropout(0.2))
# model_glove.add(Conv1D(64, 5, activation='relu'))
# model_glove.add(MaxPooling1D(pool_size=4))
# model_glove.add(LSTM(100))
# model_glove.add(Dense(1, activation='sigmoid'))
# model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
# Conv1D + LSTM model on top of the pre-trained GloVe embeddings.
# The Embedding layer is initialised from embedding_matrix and, because
# trainable=True, the word vectors are fine-tuned together with the rest of the
# network. Set trainable=False instead to see how the model behaves when the
# pre-trained word weights are kept frozen.
model_glove = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_review_length, weights=[embedding_matrix], trainable=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(100),
    # Single sigmoid unit: binary sentiment probability
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line)
model_glove.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 100)          2000000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 96, 64)            32064     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 101       
Total params: 2,098,165
Trainable params: 2,098,165
Non-trainable params: 0
___________________________________________

In [65]:
# Train the Conv1D + LSTM GloVe model, holding out 20% of the training rows for validation.
# Uses training_padded (the NumPy copy of the padded sequences) like every other
# fit/evaluate call in this notebook.
model_glove.fit(training_padded, training_labels, validation_split=0.2, batch_size=1, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25

KeyboardInterrupt: 

In [None]:
# Evaluate the Conv1D + LSTM GloVe model on the held-out test set.
# The batch size is given literally (matching the fit call for this model):
# the batch_size variable is only assigned in a later cell, so referencing it
# here raises NameError when the notebook is run top to bottom.
score, acc = model_glove.evaluate(testing_padded, testing_labels, batch_size=1)
print(score, acc)

In [56]:
# 2nd approach: simple baseline model.
# The Embedding layer is learned from scratch (no GloVe weights), averaged over
# the sequence and fed into a small dense classifier.
batch_size=1
num_epochs = 10
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_review_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: binary sentiment probability
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Train with 20% of the training rows held out for validation
model.fit(training_padded, training_labels, batch_size=batch_size, epochs=num_epochs, validation_split=0.2, verbose=2)

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line)
model.summary()

# Final evaluation of the model on the held-out test set
score, acc = model.evaluate(testing_padded, testing_labels, batch_size=batch_size)
print(score, acc)

Epoch 1/10
1920/1920 - 44s - loss: 0.6936 - accuracy: 0.4906 - val_loss: 0.6940 - val_accuracy: 0.4750
Epoch 2/10
1920/1920 - 43s - loss: 0.6588 - accuracy: 0.6125 - val_loss: 0.5967 - val_accuracy: 0.7208
Epoch 3/10
1920/1920 - 43s - loss: 0.4369 - accuracy: 0.8214 - val_loss: 0.5832 - val_accuracy: 0.7000
Epoch 4/10
1920/1920 - 44s - loss: 0.2787 - accuracy: 0.8969 - val_loss: 0.4718 - val_accuracy: 0.7771
Epoch 5/10
1920/1920 - 44s - loss: 0.2091 - accuracy: 0.9219 - val_loss: 0.5255 - val_accuracy: 0.7625
Epoch 6/10
1920/1920 - 42s - loss: 0.1628 - accuracy: 0.9458 - val_loss: 0.5227 - val_accuracy: 0.7854
Epoch 7/10
1920/1920 - 43s - loss: 0.1252 - accuracy: 0.9604 - val_loss: 0.5405 - val_accuracy: 0.8042
Epoch 8/10
1920/1920 - 44s - loss: 0.1021 - accuracy: 0.9651 - val_loss: 0.5312 - val_accuracy: 0.8250
Epoch 9/10
1920/1920 - 44s - loss: 0.0908 - accuracy: 0.9698 - val_loss: 0.6018 - val_accuracy: 0.7958
Epoch 10/10
1920/1920 - 44s - loss: 0.0698 - accuracy: 0.9792 - val_loss:

In [60]:
# 2nd approach with GloVe: same averaged-embedding classifier as the simple
# baseline, but the Embedding layer is initialised from embedding_matrix and
# (trainable=True) fine-tuned during training.
batch_size=1
num_epochs = 10
model2_glove = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_review_length, weights=[embedding_matrix], trainable=True),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: binary sentiment probability
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model2_glove.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Train with 20% of the training rows held out for validation
model2_glove.fit(training_padded, training_labels, batch_size=batch_size, epochs=num_epochs, validation_split=0.2, verbose=2)

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line)
model2_glove.summary()

# Final evaluation of the model on the held-out test set
score, acc = model2_glove.evaluate(testing_padded, testing_labels, batch_size=batch_size)
print(score, acc)

Epoch 1/10
1920/1920 - 46s - loss: 0.6739 - accuracy: 0.5865 - val_loss: 0.6279 - val_accuracy: 0.6042
Epoch 2/10
1920/1920 - 46s - loss: 0.4895 - accuracy: 0.7807 - val_loss: 0.4754 - val_accuracy: 0.7750
Epoch 3/10
1920/1920 - 45s - loss: 0.3160 - accuracy: 0.8807 - val_loss: 0.4439 - val_accuracy: 0.7854
Epoch 4/10
1920/1920 - 44s - loss: 0.2200 - accuracy: 0.9193 - val_loss: 0.4380 - val_accuracy: 0.8125
Epoch 5/10
1920/1920 - 43s - loss: 0.1772 - accuracy: 0.9391 - val_loss: 0.4475 - val_accuracy: 0.8250
Epoch 6/10
1920/1920 - 45s - loss: 0.1323 - accuracy: 0.9531 - val_loss: 0.4634 - val_accuracy: 0.8333
Epoch 7/10
1920/1920 - 45s - loss: 0.1070 - accuracy: 0.9646 - val_loss: 0.5474 - val_accuracy: 0.8125
Epoch 8/10
1920/1920 - 46s - loss: 0.0880 - accuracy: 0.9745 - val_loss: 0.6209 - val_accuracy: 0.7937
Epoch 9/10
1920/1920 - 46s - loss: 0.0770 - accuracy: 0.9745 - val_loss: 0.5906 - val_accuracy: 0.8042
Epoch 10/10
1920/1920 - 46s - loss: 0.0662 - accuracy: 0.9781 - val_loss: