# Sentiment Analysis of Steam Reviews (Grid Search)

This file contains a modified version of [Sentiment Analysis of Steam Reviews.ipynb](<Sentiment Analysis of Steam Reviews.ipynb>), which has been used to execute a grid search for the model's hyperparameters. However, the code is distributed into larger functions and therefore is harder to document in finer detail. For a better explanation of the model, please see the original [Sentiment Analysis of Steam Reviews.ipynb](<Sentiment Analysis of Steam Reviews.ipynb>) version.

In [1]:
import datasets
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Hyperparameters
epochs = 15
batch_size_options = [256, 512, 1024]
max_tokens_options = [64, 128, 256, 512]
filter_size_options = [[3, 3], [3, 5], [3, 7], [5, 3], [5, 5], [5, 7]]

In [3]:
# Note: Setting the random seed has been moved further down to where the grid 
# search is performed to ensure that the seeds are reset with every loop.

In [4]:
# Select GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
def data_setup(batch_size, max_tokens):

    # Define a function to create a partial dataset
    def create_partial_dataset(nrows=1000, override=True):
        
        dataset_path = 'data/dataset_' + str(nrows) + '.csv'
        if (os.path.exists(dataset_path) and not override):
            return dataset_path
        
        # Load the full dataset
        # Location: https://www.kaggle.com/datasets/andrewmvd/steam-reviews
        df = pd.read_csv('data/dataset.csv')
        print(df.shape)
    
        # Make a partial dataset for exploration composed 
        # of 50% positive and 50% negative reviews.
        df = df.groupby('review_score').sample(n=int(nrows/2), random_state=14).sort_index()
    
        # Export to CSV
        df.to_csv(dataset_path)
    
        return dataset_path

    # Disable dataset progress bars
    datasets.utils.logging.disable_progress_bar() 
    
    # Create a partial dataset for exploration
    dataset_nrows = 10000
    dataset_path = create_partial_dataset(dataset_nrows, override=False)
    
    # Load the Steam reviews dataset
    full_data = datasets.load_dataset('csv', data_files=dataset_path, split='all')
    
    # Define a function to apply preprocessing fixes to the text
    def preprocessing_text(row):
    
        # Make the text lower case
        # Remove multiple space chars
        row['text'] = row['text'].lower()
        row['text'] = ' '.join(row['text'].split()).strip()
        return row
    
    # Define a function to apply preprocessing fixes to the labels
    def preprocessing_labels(row):
    
        # Alter the -1 (negative) label to be 0
        if (row['label'] == -1):
            row['label'] = 0
        return row
    
    # Remove unnecessary columns
    full_data = full_data.remove_columns(['Unnamed: 0', 'app_id', 'app_name', 'review_votes'])
    
    # Rename the columns to be generic
    full_data = full_data.rename_column('review_text', 'text')
    full_data = full_data.rename_column('review_score', 'label')
    
    # Remove entries that contain no text
    full_data = full_data.filter(lambda row: row['text'] is not None)
    
    # Apply preprocessing fixes to the data
    full_data = full_data.map(preprocessing_text)
    full_data = full_data.map(preprocessing_labels)
    
    # Define the function that will split the sentences into tokens
    def tokenize(row, max_tokens):
        tokenizer = get_tokenizer('basic_english')
        tokens = tokenizer(row['text'])[:max_tokens]
        return {'tokens': tokens}
    
    # Split the text into tokens
    full_data = full_data.map(
        tokenize, fn_kwargs={'max_tokens': max_tokens}
    )
    
    # Define a mapping of string tokens to integer values
    vocab = build_vocab_from_iterator(
        full_data['tokens'],
        min_freq=5,
        specials=['<unk>', '<pad>'],
    )
    vocab.set_default_index(vocab['<unk>'])

    # Store the index of the padding token
    pad_index = vocab['<pad>']
    
    # Define the function that replaces string tokens with integer values
    def numericalize(row, vocab):
        tokens = vocab.lookup_indices(row['tokens'])
        return {'tokens': tokens}
    
    # Replace the string tokens with integer values
    full_data = full_data.map(numericalize, fn_kwargs={'vocab': vocab})
    full_data = full_data.with_format(type='torch', columns=['tokens', 'label'])
    
    # Define the the function for collating and batching the data
    def collate_fn(pad_index):
        def collate_fn(batch):
            batch_tokens = [i['tokens'] for i in batch]
            batch_tokens = nn.utils.rnn.pad_sequence(
                batch_tokens, padding_value=pad_index, batch_first=True
            )
            batch_label = [i['label'] for i in batch]
            batch_label = torch.stack(batch_label)
            batch = {'tokens': batch_tokens, 'label': batch_label}
            return batch
    
        return collate_fn
    
    # Split the dataset into training/validation and test sets
    full_data = full_data.train_test_split(test_size=0.2)
    train_valid_data = full_data['train']
    test_data = full_data['test']
    
    # Further split the training set into training and validation sets
    train_valid_data = train_valid_data.train_test_split(test_size=0.25)
    train_data = train_valid_data['train']
    valid_data = train_valid_data['test']

    # Calculate the number of unique labels
    unique_labels = len(train_data.unique('label'))
        
    # Create the data loaders for training section
    train_data_loader = DataLoader(train_data, batch_size, collate_fn=collate_fn(pad_index), shuffle=True)
    valid_data_loader = DataLoader(valid_data, batch_size, collate_fn=collate_fn(pad_index))
    test_data_loader = DataLoader(test_data, batch_size, collate_fn=collate_fn(pad_index))

    return train_data_loader, valid_data_loader, test_data_loader, vocab, unique_labels

In [6]:
# Define the convolutional neural network
class CNN(nn.Module):

    # Define the function for initliasing the CNN model
    def __init__(
        self,
        embedding_dim, # The size of each embedding vector
        filters, # Number of filters
        filter_size, # The size of the filters
        max_tokens, # Max tokens
        vocab, # Dictionary of embeddings
        output_dim, # No. of unique labels
        pad_index, # The index of the pad, which will get defaulted to all zeros
        device # Run on the CPU or GPU
    ):
        super().__init__()

        # Store for later use
        self.vocab = vocab
        self.device = device
     
        # Lookup table that stores embeddings
        num_embeddings = len(vocab)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=pad_index)
        
        # Convolution layers
        # Embedding vector size, Number of filters, Filter/Kernel size
        self.conv1 = nn.Conv1d(embedding_dim, filters, filter_size[0], padding='same')
        self.conv2 = nn.Conv1d(embedding_dim, filters, filter_size[1], padding='same')
        
        # Linear layer
        # No. of convolution layers * Earlier output channels, No. of unique labels
        self.fc1 = nn.Linear(2 * filters, output_dim)

        # Define the loss function and optimiser
        # Cross-entropy loss and Adaptive Moment Estimation
        self.criterion = nn.CrossEntropyLoss().to(self.device)
        self.optimizer = optim.Adam(self.parameters())

    # Define the function to return the total model parameters
    def total_parameters(self):
    
        # Compute the total number of parameters
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    # Define the function to process a pass through the CNN
    def forward(self, tokens):

        # Tokens
        # Size: [Batch size, Max tokens]
        # Example: [512, 256]

        # Token embeddings
        # Size: [Batch size, Max tokens, Embedding vector size]
        # Example: [512, 256, 300]
        embedded = self.embedding(tokens)

        # Re-arrange the token embeddings
        # Size: [Batch size, Embedding vector size, Max tokens]
        # Example: [512, 300, 256]
        embedded = embedded.permute(0, 2, 1)

        # First convolutional layer plus ReLU activation
        # Size: [Batch size, Output channels, Max tokens - Filter size + 1]
        # Example: [512, 100, 254]
        conved1 = torch.relu(self.conv1(embedded))

        # Second convolutional layer plus ReLU activation
        # Size: [Batch size, Output channels, Max tokens - Filter size + 1]
        # Example: [512, 100, 254]
        conved2 = torch.relu(self.conv2(embedded))    

        # First max pool layer
        # Size: [Batch size, Output channels]
        # Example: [512, 100]
        pooled1 = conved1.max(dim=-1).values

        # Second max pool layer
        # Size: [Batch size, Output channels]
        # Example: [512, 100]
        pooled2 = conved1.max(dim=-1).values
        
        # Concatenate the tensors returned by the pooled layers
        # Size: [Batch size, 2*Output channels]
        # Example: [512, 200]
        cat = torch.cat([pooled1, pooled2], dim=-1)
        
        # Linear layer
        # Size: [Batch size, No. of unique labels]
        # Example: [512, 2]
        prediction = self.fc1(cat)      
        return prediction

    # Define the function to train the model
    def train(self, train_loader):
    
        # Loop through every batch in the training data
        running_loss = 0.0
        running_accuracy = 0.0
        for batch in train_loader:
    
            # Send the data to the GPU
            tokens, labels = batch['tokens'].to(self.device), batch['label'].to(self.device)
    
            # Zero the parameter gradients
            self.optimizer.zero_grad()
    
            # Forward + backward + optimize
            predictions = self(tokens)
            loss = self.criterion(predictions, labels)
            loss.backward()
            self.optimizer.step()
    
            # Calculate the running loss and accuracy
            running_loss += loss.item()
            running_accuracy += get_accuracy(predictions, labels).item()
            
        return (running_loss / len(train_loader), 
               running_accuracy / len(train_loader))

    # Define the function to evaluate the model
    def evaluate(self, test_loader):
    
        # Loop through every batch in the test data
        running_loss = 0.0
        running_accuracy = 0.0
        with torch.no_grad():
            for batch in test_loader:
    
                # Send the data to the GPU
                tokens, labels = batch['tokens'].to(self.device), batch['label'].to(self.device)
    
                # Evaluate the predicted label
                predictions = self(tokens)
                loss = self.criterion(predictions, labels)
        
                # Calculate the running loss and accuracy
                running_loss += loss.item()
                running_accuracy += get_accuracy(predictions, labels).item()
    
        return (running_loss / len(test_loader), 
               running_accuracy / len(test_loader))

In [7]:
# Define a function to initialise a model
def create_cnn_model(filter_size, max_tokens, vocab, unique_labels, pad_index, device):

    # Note: The embedding vector size has to remain at 300 in order to match 
    # that returned by GloVe. It can be changed if the usage of GloVe is removed.
    
    # Model inputs
    embedding_dim = 300
    filters = 100

    # Initialise the model
    model = CNN(
        embedding_dim, # The size of each embedding vector
        filters, # Number of filters  
        filter_size, # The size of the filters
        max_tokens, # Max tokens
        vocab, # Dictionary of embeddings
        unique_labels, # No. of unique labels
        pad_index, # The index of the pad, which will get defaulted to all zeros
        device # Run on the CPU or GPU
    )
    
    # Use starting weights from GloVe
    # https://nlp.stanford.edu/projects/glove/
    vectors = torchtext.vocab.GloVe(dim=embedding_dim)
    model.embedding.weight.data = vectors.get_vecs_by_tokens(vocab.get_itos())
    
    # Move the model to the GPU
    model = model.to(device)

    return model

In [8]:
# Define a function to compute the accuracy of the predictions
def get_accuracy(prediction, label):
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / len(prediction)
    return accuracy

In [9]:
# Results storage
total_time = 0
best_valid_loss_all_runs = float('inf')
best_model_state = {}
results = {
    'Batch Size':[], 'Max Tokens': [], 
    'Filter Size':[], 'Epoch': [], 
    'Training (Loss)': [], 'Training (Accuracy)': [], 
    'Validation (Loss)': [], 'Validation (Accuracy)': [], 
    'Testing (Loss)': [], 'Testing (Accuracy)': []
}

# Loop through hyperparameter options
for batch_size in batch_size_options:
    for max_tokens in max_tokens_options:
        for filter_size in filter_size_options:

            # Set the random seed for deterministic behaviour
            seed = 14
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed(seed)
            torch.backends.cudnn.deterministic = True

            # Only run if the max tokens are larger than the batch size
            if max_tokens >= batch_size:
                continue
            
            # Results storage
            best_train_loss = float('inf')
            best_train_acc = float('inf')
            best_valid_loss = float('inf')
            best_valid_acc = float('inf')
            best_epoch = 0

            # Short description of hyperparameters for feedback
            hyperparameter_desc = str(batch_size)+'-'+str(max_tokens)+'-'+str(filter_size) 
            
            # Setup the data
            train_data_loader, valid_data_loader, test_data_loader, vocab, unique_labels = data_setup(batch_size, max_tokens)
        
            # Store the index of the padding token
            pad_index = vocab['<pad>']
            
            # Create the model
            model = create_cnn_model(filter_size, max_tokens, vocab, unique_labels, pad_index, device)
            
            # Run the required number of epochs
            start = time.time()
            print('HP:', hyperparameter_desc, end=' | ')
            print('Epoch: ', end='')
            for epoch in range(1, epochs+1):
            
                # Run the model
                print(epoch, end=' ')
                train_loss, train_acc = model.train(train_data_loader)
                valid_loss, valid_acc = model.evaluate(valid_data_loader)
            
                # Save the best set of weightings within the epoch
                if valid_loss < best_valid_loss:
                    torch.save(model.state_dict(), 'checkpoints/cnn_grid_search.pt')
                    best_train_loss = train_loss
                    best_train_acc = train_acc
                    best_valid_loss = valid_loss
                    best_valid_acc = valid_acc
                    best_epoch = epoch

                # Save the best set of weights across all runs
                if valid_loss < best_valid_loss_all_runs:
                    best_valid_loss_all_runs = valid_loss
                    best_model_state = model.state_dict()

            # Load the best set of weighting within the epoch
            model.load_state_dict(torch.load('checkpoints/cnn_grid_search.pt'))

            # Compute the testing loss and accuracy
            test_loss, test_acc = model.evaluate(test_data_loader)
        
            # Store the results of the run
            results['Batch Size'].append(batch_size)
            results['Max Tokens'].append(max_tokens)
            results['Filter Size'].append(filter_size)
            results['Epoch'].append(best_epoch)
            results['Training (Loss)'].append(best_train_loss)
            results['Training (Accuracy)'].append(best_train_acc)
            results['Validation (Loss)'].append(best_valid_loss)
            results['Validation (Accuracy)'].append(best_valid_acc)
            results['Testing (Loss)'].append(test_loss)
            results['Testing (Accuracy)'].append(test_acc)
            
            # Print the training time
            end = time.time()
            total_time += end-start
            print('| Validation (Loss):', str(np.round(best_valid_loss, 6)), end=' ')
            print('| Training Time:', str(np.round(end-start, 2))+'s')

# Save the best weighting across all runs
torch.save(best_model_state, 'checkpoints/cnn_grid_search.pt')

print('\nTotal Exploration Time:', str(np.round(total_time, 2))+'s')

HP: 256-64-[3, 3] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.453486 | Training Time: 14.47s
HP: 256-64-[3, 5] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.45827 | Training Time: 16.02s
HP: 256-64-[3, 7] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.461844 | Training Time: 15.73s
HP: 256-64-[5, 3] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.459697 | Training Time: 16.89s
HP: 256-64-[5, 5] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.454064 | Training Time: 15.88s
HP: 256-64-[5, 7] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.455731 | Training Time: 16.59s
HP: 256-128-[3, 3] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.436439 | Training Time: 18.46s
HP: 256-128-[3, 5] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | Validation (Loss): 0.436563 | Training Time: 19.68s
HP: 256-128-[3, 7] | Epoch: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

In [10]:
# Convert to data frame and then output the results
results_df = pd.DataFrame(results)
results_sorted_df = results_df.sort_values('Validation (Loss)')
results_sorted_df.to_csv('results/grid_search_results.csv', index=False)
results_sorted_df.style.hide()

Batch Size,Max Tokens,Filter Size,Epoch,Training (Loss),Training (Accuracy),Validation (Loss),Validation (Accuracy),Testing (Loss),Testing (Accuracy)
1024,512,"[3, 7]",12,0.263356,0.871315,0.413981,0.789345,0.430199,0.792425
512,256,"[3, 3]",7,0.28671,0.861777,0.417407,0.782301,0.431387,0.794389
512,256,"[3, 7]",7,0.294314,0.856717,0.41924,0.783,0.4318,0.794231
1024,512,"[3, 3]",12,0.270199,0.868266,0.420068,0.782334,0.435397,0.787417
1024,256,"[3, 7]",11,0.284882,0.869581,0.420664,0.780331,0.433918,0.791862
1024,256,"[3, 3]",12,0.264852,0.87129,0.420851,0.773419,0.4372,0.78311
1024,512,"[3, 5]",12,0.274947,0.863903,0.421508,0.780381,0.435257,0.789808
1024,512,"[5, 3]",10,0.272684,0.869568,0.422161,0.781821,0.437378,0.783511
512,256,"[5, 3]",7,0.262334,0.872818,0.423366,0.781918,0.432806,0.790272
512,256,"[3, 5]",7,0.284502,0.86342,0.423766,0.784412,0.431039,0.787289


In [11]:
# Ouput the best result
final_results_sorted_df = results_sorted_df.head(1)
final_results_sorted_df.style.hide()

Batch Size,Max Tokens,Filter Size,Epoch,Training (Loss),Training (Accuracy),Validation (Loss),Validation (Accuracy),Testing (Loss),Testing (Accuracy)
1024,512,"[3, 7]",12,0.263356,0.871315,0.413981,0.789345,0.430199,0.792425
