In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import gc
import json
import os
import time
import numpy as np
from sklearn.model_selection import train_test_split

# Tensorflow imports
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

For reproducibility, we fix the random seed.

In [3]:
seed = 42
set_random_seed(seed)

# Loading the Dataset

The dataset composed of multiple json objects in the following format:

```
{
  "text": ["sentence", "words", "here"]
  "sentiment": 1
}
```



In [4]:
folder = '/content/drive/MyDrive/CS 171/Final Project'
dataset_filename = 'Video_Games_Financial_Combination.json'

# For comparing the other datasets, include the original dataset
other_datasets = ['Video_Games_Clean.json',
                  'Arts_Crafts_and_Sewing_Clean.json',
                  'Office_Products_Clean.json',
                  'Financial_Clean.json',
                  'Video_Games_Reverse.json',
                  'Video_Games_Shuffle.json',
                  'Video_Games_Truncate.json',
                  'Video_Games_Financial_Combination.json']

# Remove the main dataset now to avoid repetition
other_datasets.remove(dataset_filename)
print(other_datasets)

['Video_Games_Clean.json', 'Arts_Crafts_and_Sewing_Clean.json', 'Office_Products_Clean.json', 'Financial_Clean.json', 'Video_Games_Reverse.json', 'Video_Games_Shuffle.json', 'Video_Games_Truncate.json']


In [5]:
def load_dataset(filename):
  text = []
  labels = []

  with open(os.path.join(folder, filename), 'r') as infile:
    for line in infile.readlines():
      sample = json.loads(line)
      text.append(sample['text'])
      labels.append(sample['sentiment'])

  text = np.asarray(text, dtype=object)
  labels = np.asarray(labels, dtype=int)

  return text, labels

In [6]:
text, labels = load_dataset(dataset_filename)

print(text[0])

['game', 'bit', 'hard', 'to', 'get', 'hang', 'of', 'but', 'when', 'great']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.3,
                                                    random_state=seed)
print(f'Training Data Input Shape: {X_train.shape}')
print(f'Training Data Output Shape: {y_train.shape}')
print(f'Testing Data Input Shape: {X_test.shape}')
print(f'Testing Data Output Shape: {y_test.shape}')

Training Data Input Shape: (3392,)
Training Data Output Shape: (3392,)
Testing Data Input Shape: (1454,)
Testing Data Output Shape: (1454,)


In [8]:
vocab_size = 10000
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

In [9]:
tokenizer.fit_on_texts(X_train)
print(f'Number of Documents: {tokenizer.document_count}')
print(f'Number of Words: {tokenizer.num_words}')

Number of Documents: 3392
Number of Words: 10000


In [10]:
tokenizer.word_counts

OrderedDict([('got', 279),
             ('little', 238),
             ('guy', 67),
             ('day', 195),
             ('came', 190),
             ('out', 755),
             ('in', 3981),
             ('summer', 14),
             ('dont', 498),
             ('know', 269),
             ('if', 1060),
             ('laziness', 1),
             ('kept', 35),
             ('from', 1031),
             ('writing', 17),
             ('review', 134),
             ('or', 925),
             ('just', 832),
             ('fact', 110),
             ('everybody', 6),
             ('already', 91),
             ('game', 5968),
             ('rule', 22),
             ('first', 649),
             ('bad', 198),
             ('stuff', 62),
             ('graphic', 676),
             ('kinda', 17),
             ('well', 533),
             ('arent', 81),
             ('on', 1722),
             ('par', 10),
             ('with', 1858),
             ('current', 34),
             ('but', 1892),
            

In [11]:
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

print(train_sequences[0])

[96, 122, 458, 158, 162, 31, 6, 1723, 50, 103, 16, 7545, 826, 18, 1492, 238, 24, 26, 288, 3022, 341, 103, 4, 1210, 38, 153, 490, 35, 1493, 47, 385, 11, 2162, 9, 850, 4, 8, 14, 17, 1363, 1211, 298, 113, 18, 4, 16, 53, 690, 96, 36, 218, 4, 40, 449, 700, 212, 27, 4437, 7, 164, 11, 195, 8, 215, 156, 24, 216, 851, 5, 787, 212, 125, 3, 3023, 25, 1166, 482, 74, 2517, 6, 41, 1309, 164, 19, 179, 3, 827, 4, 2, 107, 125, 3, 134, 169, 8, 74, 127, 491, 19, 1640, 3, 7546, 101, 1364, 4, 17, 132, 48, 612, 4, 459, 303, 3805, 22, 1016, 22, 1167, 303, 3024, 31, 9, 217, 3, 1641, 6, 7547, 21, 648, 23, 51, 7548, 113, 180, 677, 22, 1167, 303, 2, 72, 4438, 3, 303, 171, 3, 2045, 94, 3, 966, 303, 828, 266, 2, 482, 7549, 37, 7550, 5, 30, 7551, 1560, 231, 612, 396, 266, 173, 104, 122, 5473, 8, 869, 413, 2335, 1310, 2, 200, 6, 1909, 42]


In [12]:
sequence_length = 200

train_padded = pad_sequences(train_sequences, maxlen=sequence_length,
                             padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=sequence_length,
                            padding='post', truncating='post')

Now that we have all of the necessary data, we can free up the variables we no longer need.

In [13]:
del text
del labels
del X_train
del X_test
gc.collect()

43

# Training the Model
We will be training a LSTM to perform basic sentiment analysis on the text data that has been preprocessed.

In [14]:
embedding_dim = 16
lstm_out = 32

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=sequence_length),
    Bidirectional(LSTM(lstm_out)),
    Dense(10, activation='relu'),
    Dense(3, activation='sigmoid')
])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 16)           160000    
                                                                 
 bidirectional (Bidirection  (None, 64)                12544     
 al)                                                             
                                                                 
 dense (Dense)               (None, 10)                650       
                                                                 
 dense_1 (Dense)             (None, 3)                 33        
                                                                 
Total params: 173227 (676.67 KB)
Trainable params: 173227 (676.67 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
checkpoint_folder = os.path.join(folder, 'Model Checkpoint')

model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_folder,
                                            save_weights_only=False,
                                            monitor='val_loss', mode='min',
                                            save_best_only=True)

callbacks = [EarlyStopping(patience=2), model_checkpoint_callback]

In [16]:
history = model.fit(train_padded,
                    y_train,
                    epochs=10,
                    validation_data=(test_padded, y_test),
                    callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [17]:
original_loss, original_accuracy = model.evaluate(test_padded, y_test)

print(f'Loss: {original_loss}')
print(f'Accuracy: {original_accuracy}')

Loss: 0.7681838870048523
Accuracy: 0.7276478409767151


# Testing the Model on Other Datasets
Now, we will evaluate the model on all of the other datasets in order to see its performance on different datasets.

In [18]:
dataset_accuracies = []

for dataset in other_datasets:
  with open(os.path.join(folder, dataset), 'r') as infile:
    text, labels = load_dataset(dataset)

    # Convert to trainable data
    text_sequences = tokenizer.texts_to_sequences(text)
    text_padded = pad_sequences(text_sequences, maxlen=sequence_length,
                                padding='post', truncating='post')

    # Garbage collection after variables have been reassigned
    gc.collect()

    # Evaluate model
    loss, accuracy = model.evaluate(text_padded, labels)
    print(f'Dataset: {dataset}, Accuracy: {accuracy}')
    dataset_accuracies.append(accuracy)

Dataset: Video_Games_Clean.json, Accuracy: 0.7094669938087463
Dataset: Arts_Crafts_and_Sewing_Clean.json, Accuracy: 0.7712419033050537
Dataset: Office_Products_Clean.json, Accuracy: 0.7315067648887634
Dataset: Financial_Clean.json, Accuracy: 0.7707387804985046
Dataset: Video_Games_Reverse.json, Accuracy: 0.6898919939994812
Dataset: Video_Games_Shuffle.json, Accuracy: 0.6873294115066528
Dataset: Video_Games_Truncate.json, Accuracy: 0.7082495093345642


# Saving the Data

The data from every run will be saved to a single file named after the original dataset.

In [19]:
print(time.localtime())

time.struct_time(tm_year=2023, tm_mon=11, tm_mday=30, tm_hour=4, tm_min=57, tm_sec=1, tm_wday=3, tm_yday=334, tm_isdst=0)


In [20]:
dataset_basename = os.path.splitext(dataset_filename)[0]
logs_folder = 'Logs'

# Compute timestamp
localtime = time.localtime()
timestamp = (f'{localtime[0]}{localtime[1]}{localtime[2]}'
             f'{localtime[3]}{localtime[4]}')

logfile = os.path.join(folder, logs_folder,
                       f'log_{dataset_basename}_{timestamp}.txt')

In [21]:
with open(logfile, 'w') as outfile:
  outfile.write(f'Original file: {dataset_filename}, '
                f'Accuracy: {original_accuracy}\n')

  # Execute for each of the other datasets
  outfile.write('\n---------- Test Datasets ----------\n\n')

  for index, dataset in enumerate(other_datasets):
    outfile.write(f'File: {dataset}, Accuracy: {dataset_accuracies[index]}\n')