In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [2]:
#loads a GloVe embedding file so input words can be embedded
def loadEmbedding(embeddingFile, embed_dim=50, vocab_size=50000):
    """Read the leading words of a GloVe-style text file into a vocabulary and matrix.

    Every non-blank line is expected to hold a word followed by ``embed_dim``
    whitespace-separated numbers. The vector of the i-th word kept (0-based)
    is written to row ``i + 2`` of the matrix; rows 0 and 1 stay all-zero.
    Reading stops once the matrix is full, i.e. after ``vocab_size - 2`` words.

    NOTE(review): the two reserved rows presumably line up with the padding
    and out-of-vocabulary indices that keras TextVectorization places in front
    of a user-supplied vocabulary -- confirm against the Keras documentation.

    Args:
        embeddingFile: path of the UTF-8 text file to read.
        embed_dim: number of components every vector must have.
        vocab_size: number of rows of the returned matrix.

    Returns:
        (vocabulary, embeddingMatrix): the words in file order, and a
        ``(vocab_size, embed_dim)`` array holding their vectors.

    Raises:
        ValueError: if a non-blank line does not carry exactly ``embed_dim``
            numbers (previously a one-number line was silently broadcast
            across the whole row).
    """
    vocabulary = []
    embeddingMatrix = np.zeros((vocab_size, embed_dim))
    with open(embeddingFile, 'r', encoding='utf-8') as f:
        for lineNumber, line in enumerate(f, start=1):
            # The row is derived from the words kept so far, not from the raw
            # line count, so skipped blank lines cannot misalign word and row.
            row = len(vocabulary) + 2
            if row >= vocab_size:
                break
            parts = line.split(maxsplit=1)
            if not parts:
                # blank line (for example a trailing newline at end of file)
                continue
            word = parts[0]
            if len(parts) == 2:
                vect = np.fromstring(parts[1], "f", sep=" ")
            else:
                vect = np.empty(0, "f")
            if vect.shape[0] != embed_dim:
                raise ValueError(
                    f"line {lineNumber} of {embeddingFile!r}: expected {embed_dim} "
                    f"vector components for word {word!r}, found {vect.shape[0]}"
                )
            vocabulary.append(word)
            embeddingMatrix[row] = vect
    return vocabulary, embeddingMatrix

In [3]:
#num_tokens = vocabSize (Embedding input_dim); embedding_dim = embeddingSize, the Embedding output size (50)

#shared builder for the three GloVe-based recurrent classifiers defined below
def _buildGloveRNN(vocabulary, embeddingMatrix, makeRecurrentLayers, max_len, embeddingSize, vocabSize):
    """Assemble and compile a string-in, single-logit-out recurrent classifier.

    Layer stack, in order: string input -> TextVectorization (sequences of
    length ``max_len``) -> frozen Embedding initialised from
    ``embeddingMatrix`` -> the layers produced by ``makeRecurrentLayers`` ->
    Dense(16) -> Dense(1). The model summary is printed before returning.

    Args:
        vocabulary: word list handed to the TextVectorization layer.
        embeddingMatrix: array of shape ``(vocabSize, embeddingSize)`` used as
            the constant initial (non-trainable) embedding weights.
        makeRecurrentLayers: zero-argument callable returning the recurrent
            layers to insert. It is invoked only after the embedding layer has
            been created so layers are instantiated in stack order.
        max_len: output sequence length of the vectorization layer.
        embeddingSize: embedding output dimension.
        vocabSize: ``max_tokens`` of the vectorizer and ``input_dim`` of the embedding.

    Returns:
        The compiled, untrained ``keras.Sequential`` model.

    Raises:
        ValueError: if ``embeddingMatrix`` is not ``(vocabSize, embeddingSize)``.
    """
    # Fail early with a readable message instead of deep inside layer building.
    matrixShape = tuple(np.shape(embeddingMatrix))
    if matrixShape != (vocabSize, embeddingSize):
        raise ValueError(
            f"embeddingMatrix has shape {matrixShape}, expected "
            f"{(vocabSize, embeddingSize)} from vocabSize and embeddingSize"
        )

    model = keras.Sequential()
    model.add(keras.Input(shape=(1,), dtype=tf.string))
    model.add(keras.layers.TextVectorization(max_tokens=vocabSize, output_sequence_length=max_len, vocabulary=vocabulary))
    model.add(keras.layers.Embedding(input_dim=vocabSize, output_dim=embeddingSize, embeddings_initializer=keras.initializers.Constant(embeddingMatrix), trainable=False))
    for recurrentLayer in makeRecurrentLayers():
        model.add(recurrentLayer)
    # No activation is passed, so this layer is linear.
    # NOTE(review): confirm whether a nonlinearity (e.g. ReLU) was intended here.
    model.add(keras.layers.Dense(16))
    # Single raw logit; the loss and the metric below both work on logits.
    model.add(keras.layers.Dense(1))
    model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer='adam',
                  # `metrics` is documented as a list; a threshold of 0.0 on a
                  # logit corresponds to a probability of 0.5.
                  metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])
    model.summary()
    return model

#generates untrained RNN that uses GloVe embedding and a LSTM layer
def generateRNN1(vocabulary, embeddingMatrix, max_len=20, embeddingSize=50, vocabSize=50000):
    """Build the compiled, untrained classifier with one 256-unit LSTM layer.

    See ``_buildGloveRNN`` for the shared layer stack and argument meanings.
    """
    return _buildGloveRNN(
        vocabulary, embeddingMatrix,
        lambda: [keras.layers.LSTM(256)],
        max_len, embeddingSize, vocabSize,
    )

#generates untrained RNN that uses GloVe embedding and GRU layer
def generateRNN2(vocabulary, embeddingMatrix, max_len=20, embeddingSize=50, vocabSize=50000):
    """Build the compiled, untrained classifier with one 256-unit GRU layer.

    See ``_buildGloveRNN`` for the shared layer stack and argument meanings.
    """
    return _buildGloveRNN(
        vocabulary, embeddingMatrix,
        lambda: [keras.layers.GRU(256)],
        max_len, embeddingSize, vocabSize,
    )

#generates untrained RNN that uses GloVe embedding and 2 LSTM layers
def generateRNN3(vocabulary, embeddingMatrix, max_len=20, embeddingSize=50, vocabSize=50000):
    """Build the compiled, untrained classifier with two stacked LSTM layers.

    The first LSTM (256 units) returns the full sequence so that the second
    LSTM (64 units) can consume it. See ``_buildGloveRNN`` for the shared
    layer stack and argument meanings.
    """
    return _buildGloveRNN(
        vocabulary, embeddingMatrix,
        lambda: [keras.layers.LSTM(256, return_sequences=True), keras.layers.LSTM(64)],
        max_len, embeddingSize, vocabSize,
    )

In [4]:
#plots the training loss and accuracy recorded for one model
def draw_loss(history1):
    """Plot per-epoch training loss and accuracy on twin y axes of one figure.

    Args:
        history1: object whose ``history`` attribute maps metric names to
            per-epoch value lists, such as the value returned by ``model.fit``.

    Raises:
        KeyError: if ``history1.history`` has no ``'loss'`` entry or no
            accuracy entry under any of the recognised names.
    """
    recorded = history1.history

    # A metric object is logged under its own name, so a model compiled with
    # BinaryAccuracy records 'binary_accuracy' rather than 'acc'. Accept the
    # common spellings instead of hard-coding a key that may not exist.
    accuracyKey = next(
        (key for key in ('acc', 'accuracy', 'binary_accuracy') if key in recorded),
        None,
    )
    if accuracyKey is None:
        raise KeyError(
            "no accuracy metric in history; available keys: " + ", ".join(sorted(recorded))
        )

    _, ax1 = plt.subplots()

    #defines and labels the axes (ax2 shares the x axis with ax1)
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax2 = ax1.twinx()
    ax2.set_ylabel('Accuracy')

    #plots loss on the left y axis and accuracy on the right y axis
    ax1.plot(recorded['loss'], 'b-', label="model1-Loss")
    ax2.plot(recorded[accuracyKey], 'b:', label="model1-Accuracy")

    #legends are anchored to the right of the plot area, outside the axes
    ax1.legend(bbox_to_anchor=(1.1, 0), loc="lower left")
    ax2.legend(bbox_to_anchor=(1.1, 1), loc="upper left")

In [5]:
#load dataset: the CSV has no header row, so column names are supplied here
#and only the sentiment label and the tweet text columns are kept
filePath = 'training.1600000.processed.noemoticon.csv'
data = pd.read_csv(filePath, names=['sentiment', 'del1', 'del2', 'del3', 'del4', 'text'], usecols=['sentiment', 'text'], encoding='latin-1')

# gets labels and features (text) from dataset; label value 4 is remapped to 1
labels = data['sentiment'].replace(to_replace=4, value=1)
features = data['text']

# fixed seed so the split, and therefore every reported metric, is reproducible
SPLIT_SEED = 42

# splits dataset into training (80%), validation (10%), and testing (10%):
# first hold out 20%, then cut that holdout in half
trainingFeatures, holdoutFeatures, trainingLabels, holdoutLabels = train_test_split(
    features, labels, test_size=.2, random_state=SPLIT_SEED)
validationFeatures, testingFeatures, validationLabels, testingLabels = train_test_split(
    holdoutFeatures, holdoutLabels, test_size=.5, random_state=SPLIT_SEED)



In [6]:
# reads the GloVe vectors from disk: `vocab` is the word list and
# `embeddingMatrix` holds the matching vectors (loadEmbedding defaults apply)
localPath = 'glove.6B.50d.txt'
vocab, embeddingMatrix = loadEmbedding(embeddingFile=localPath)

In [7]:
# model1 (single LSTM): build it, train for 5 epochs while monitoring the
# validation split, then write the trained model to disk
model1 = generateRNN1(vocabulary=vocab, embeddingMatrix=embeddingMatrix)
history1 = model1.fit(
    x=trainingFeatures,
    y=trainingLabels,
    batch_size=256,
    epochs=5,
    validation_data=(validationFeatures, validationLabels),
)
model1.save('savedModels/model1')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 20)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 20, 50)            2500000   
                                                                 
 lstm (LSTM)                 (None, 256)               314368    
                                                                 
 dense (Dense)               (None, 16)                4112      
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2,818,497
Trainable params: 318,497
Non-trainable params: 2,500,000
________________________________________



INFO:tensorflow:Assets written to: savedModels/model1\assets


INFO:tensorflow:Assets written to: savedModels/model1\assets


In [8]:
# model2 (single GRU): build it, train for 5 epochs while monitoring the
# validation split, then write the trained model to disk
model2 = generateRNN2(vocabulary=vocab, embeddingMatrix=embeddingMatrix)
history2 = model2.fit(
    x=trainingFeatures,
    y=trainingLabels,
    batch_size=256,
    epochs=5,
    validation_data=(validationFeatures, validationLabels),
)
model2.save('savedModels/model2')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 20, 50)            2500000   
                                                                 
 gru (GRU)                   (None, 256)               236544    
                                                                 
 dense_2 (Dense)             (None, 16)                4112      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2,740,673
Trainable params: 240,673
Non-trainable params: 2,500,000
______________________________________



INFO:tensorflow:Assets written to: savedModels/model2\assets


INFO:tensorflow:Assets written to: savedModels/model2\assets


In [9]:
# model3 (two stacked LSTMs): build it, train for 5 epochs while monitoring
# the validation split, then write the trained model to disk
model3 = generateRNN3(vocabulary=vocab, embeddingMatrix=embeddingMatrix)
history3 = model3.fit(
    x=trainingFeatures,
    y=trainingLabels,
    batch_size=256,
    epochs=5,
    validation_data=(validationFeatures, validationLabels),
)
model3.save('savedModels/model3')

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_2 (TextV  (None, 20)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 20, 50)            2500000   
                                                                 
 lstm_1 (LSTM)               (None, 20, 256)           314368    
                                                                 
 lstm_2 (LSTM)               (None, 64)                82176     
                                                                 
 dense_4 (Dense)             (None, 16)                1040      
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                      



INFO:tensorflow:Assets written to: savedModels/model3\assets


INFO:tensorflow:Assets written to: savedModels/model3\assets


In [11]:
# scores model1 on the test split; the two returned values are unpacked as
# the loss and the accuracy metric
loss1, accuracy1 = model1.evaluate(x=testingFeatures, y=testingLabels)

# reports the test results
print("Model 1:")
print("Loss: ", loss1)
print("Accuracy: ", accuracy1)

Model 1:
Loss:  0.4220907688140869
Accuracy:  0.8038374781608582


In [12]:
# scores model2 on the test split; the two returned values are unpacked as
# the loss and the accuracy metric
loss2, accuracy2 = model2.evaluate(x=testingFeatures, y=testingLabels)

# reports the test results
print("Model 2:")
print("Loss: ", loss2)
print("Accuracy: ", accuracy2)

Model 2:
Loss:  0.4345569312572479
Accuracy:  0.7971875071525574


In [13]:
# scores model3 on the test split; the two returned values are unpacked as
# the loss and the accuracy metric
loss3, accuracy3 = model3.evaluate(x=testingFeatures, y=testingLabels)

# reports the test results
print("Model 3:")
print("Loss: ", loss3)
print("Accuracy: ", accuracy3)

Model 3:
Loss:  0.42237427830696106
Accuracy:  0.8048562407493591
