# Data Preparation:

1) Read the CSV files from the train and test directories.
2) Tokenize the text data and save it in batches.
3) Combine the batches into a single tokenized .pt file.

In [7]:
# !pip install sentencepiece
# !pip show sentencepiece
# !pip show transformers
# !pip install chardet

import os
import pandas as pd
import torch
from torch import nn
from transformers import BigBirdTokenizer
from torch.utils.data import DataLoader, TensorDataset
import csv
import numpy as np
import chardet

# Report the runtime environment: PyTorch build, visible GPU count, CUDA usability.
print("PyTorch version " + torch.__version__)
print("Num GPUs Available: ", torch.cuda.device_count())
print(torch.cuda.is_available())

# Pick the compute device: CUDA when it is usable, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

'''
remember to call .to(device) on model, inputs, labels, etc. so that it uses the GPU!!
'''

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version 2.0.1
Num GPUs Available:  1
True
cuda


'\nremember to call .to(device) on model, inputs, labels, etc. so that it uses the GPU!!\n'

# Tokenization

In [18]:
def detect_encoding(file_path):
    """Guess a file's text encoding from a sample of its leading bytes.

    Args:
        file_path: Path of the file to inspect; it is opened in binary mode.

    Returns:
        The value stored under the 'encoding' key of the chardet.detect result.
    """
    with open(file_path, 'rb') as handle:
        # Only the first 10000 bytes are sampled for the guess.
        sample = handle.read(10000)
    guess = chardet.detect(sample)
    return guess['encoding']

def tokenize_row(row, tokenizer, max_length=4096):
    """Concatenate a row's text fields and run them through the tokenizer.

    The text is built as "<body> <subject> <comments>". Missing values
    (NaN / None, which is how pandas represents empty CSV cells) are replaced
    by an empty string so that the literal words "nan" / "None" are not
    tokenized as if they were part of the message.

    Args:
        row: Mapping-like object (e.g. a pandas Series) with 'body', 'subject'
            and 'comments' entries.
        tokenizer: Callable invoked as
            tokenizer(text, max_length=..., truncation=True,
                      padding='max_length', return_tensors='pt').
        max_length: Length passed to the tokenizer for truncation and padding.
            Defaults to 4096, the value previously hard-coded here.

    Returns:
        Whatever the tokenizer returns for the concatenated text.
    """
    def _clean(value):
        # pd.isna is True for NaN and None; every other value is kept as-is.
        return "" if pd.isna(value) else value

    text = f"{_clean(row['body'])} {_clean(row['subject'])} {_clean(row['comments'])}"
    return tokenizer(text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')

def tokenize_and_save(directory, output_dir, batch_size=10000):
    """Tokenize every CSV in `directory` and write the results to `output_dir` in batches.

    Each row is tokenized with `tokenize_row` and its 'label' column is cast
    to int. Once `batch_size` rows have accumulated they are saved as a
    TensorDataset of (input_ids, attention_mask, labels) named
    "<csv filename>_batch_<row index>.pt". Rows left over at the end are saved
    as "<last csv filename>_batch_final.pt".

    The accumulators are not reset between files, so a batch can contain rows
    from more than one CSV; it is named after the file being read when it fills.

    Changes from the earlier version of this function:
      * The leftover batch used to be named "<name without .csv>_final_batch.pt",
        which does not share the "<csv filename>_batch" prefix of the other
        batches and was therefore skipped by prefix-based combining. It now
        uses the same prefix.
      * The leftover batch was named after whatever entry os.listdir returned
        last, even if that entry was not a CSV. The last CSV actually
        processed is now tracked explicitly.
      * `output_dir` is created if it does not exist.
      * Directory entries are processed in sorted order so runs are repeatable.

    Args:
        directory: Folder containing the input .csv files.
        output_dir: Folder that receives the .pt batch files.
        batch_size: Number of rows per saved batch.
    """
    tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
    os.makedirs(output_dir, exist_ok=True)

    batch_input_ids, batch_attention_masks, batch_labels = [], [], []
    last_csv = None

    def _flush(batch_name):
        # Save the accumulated rows as one TensorDataset, then empty the accumulators.
        dataset = TensorDataset(
            torch.cat(batch_input_ids),
            torch.cat(batch_attention_masks),
            torch.stack(batch_labels),
        )
        torch.save(dataset, os.path.join(output_dir, batch_name))
        batch_input_ids.clear()
        batch_attention_masks.clear()
        batch_labels.clear()

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue

        print(f"Processing {filename}")
        last_csv = filename
        df = pd.read_csv(os.path.join(directory, filename))

        for index, row in df.iterrows():
            # Tokenize each row
            encodings = tokenize_row(row, tokenizer)
            label = int(row['label'])

            # Accumulate results
            batch_input_ids.append(encodings['input_ids'])
            batch_attention_masks.append(encodings['attention_mask'])
            batch_labels.append(torch.tensor(label))

            if len(batch_labels) >= batch_size:
                _flush(f"{filename}_batch_{index}.pt")
                print(f"Saved batch {index}")

    # Save any remaining data in the last batch
    if batch_labels:
        _flush(f"{last_csv}_batch_final.pt")
        print("Saved final batch")

# Directories
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook
# will not run elsewhere without editing them.
train_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\train\\'
test_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\test\\'
output_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\'

# Process and tokenize data
# Both calls write into the same output_dir and use the default batch_size.
# NOTE(review): the captured output below reports a save every 100 rows, which
# suggests the cell was last run with a different batch_size — confirm.
tokenize_and_save(train_dir, output_dir)
tokenize_and_save(test_dir, output_dir)


Processing train_mixed_data_1.csv
Saved batch 99
Saved batch 199
Saved batch 299
Saved batch 399
Saved batch 499
Saved batch 599
Saved batch 699
Saved batch 799
Saved batch 899
Saved batch 999
Saved batch 1099
Saved batch 1199
Saved batch 1299
Saved batch 1399
Saved batch 1499
Saved batch 1599
Saved batch 1699
Saved batch 1799
Saved batch 1899
Saved batch 1999
Saved batch 2099
Saved batch 2199
Saved batch 2299
Saved batch 2399
Saved batch 2499
Saved batch 2599
Saved batch 2699
Saved batch 2799
Saved batch 2899
Saved batch 2999
Saved batch 3099
Saved batch 3199
Saved batch 3299
Saved batch 3399
Saved batch 3499
Saved batch 3599
Saved batch 3699
Saved batch 3799
Saved batch 3899
Saved batch 3999
Saved batch 4099
Saved batch 4199
Saved batch 4299
Saved batch 4399
Saved batch 4499
Saved batch 4599
Saved batch 4699
Saved batch 4799
Saved batch 4899
Saved batch 4999
Saved batch 5099
Saved batch 5199
Saved batch 5299
Saved batch 5399
Saved batch 5499
Saved batch 5599
Saved batch 5699
Saved ba

# Combine the tokenized batches into a single file

In [1]:
import torch
import os

def combine_batches(directory, file_prefix):
    """Concatenate all saved batches that share a filename prefix into one file.

    Every file in `directory` whose name starts with `file_prefix` and ends
    with '.pt' is loaded (each is expected to be a TensorDataset holding
    input IDs, attention masks and labels, in that order). The three tensors
    are concatenated along dim 0 across files and saved as a plain tuple
    (input_ids, attention_mask, labels) to
    "<directory>/<file_prefix>_combined.pt".

    The output file itself matches the prefix/suffix filter, so it is excluded
    explicitly; otherwise a second run would try to read the combined tuple as
    a batch and fail. Files are processed in sorted name order so the row
    order of the result is repeatable.

    Args:
        directory: Folder containing the batch files; also receives the output.
        file_prefix: Leading part of the batch filenames to combine.

    Raises:
        FileNotFoundError: If no batch file in `directory` matches `file_prefix`.
    """
    combined_name = f"{file_prefix}_combined.pt"
    batch_files = sorted(
        name for name in os.listdir(directory)
        if name.startswith(file_prefix) and name.endswith('.pt') and name != combined_name
    )
    if not batch_files:
        raise FileNotFoundError(
            f"No batch files starting with {file_prefix!r} found in {directory}"
        )

    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for filename in batch_files:
        file_path = os.path.join(directory, filename)
        # The batches are pickled TensorDataset objects, which cannot be read
        # in weights-only mode. Full unpickling executes code from the file,
        # so only point this at batch files you produced yourself.
        batch = torch.load(file_path, weights_only=False)

        # Add Debugging Information
        print(f"Loading {filename}: {len(batch.tensors[0])} rows")

        # Append batch contents to respective lists
        input_ids_list.append(batch.tensors[0])
        attention_mask_list.append(batch.tensors[1])
        labels_list.append(batch.tensors[2])

    # Concatenate all batches and save them as a tuple of three tensors
    combined_batch = (
        torch.cat(input_ids_list, dim=0),
        torch.cat(attention_mask_list, dim=0),
        torch.cat(labels_list, dim=0),
    )
    torch.save(combined_batch, os.path.join(directory, combined_name))

    print(f"Combined file saved as {combined_name}")

# Folders holding the per-batch .pt files for each split.
# NOTE(review): the tokenization cell writes every batch into the single
# 'tokenized' folder, while these paths point at 'tokenized\train' and
# 'tokenized\test' — presumably the files were moved by hand; confirm.
train_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\train\\'  
test_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\test\\'    


# Combine each split; the prefix selects the '<csv filename>_batch_*' files.
combine_batches(test_dir, 'test_mixed_data_1.csv_batch')
combine_batches(train_dir, 'train_mixed_data_1.csv_batch')

Loading test_mixed_data_1.csv_batch_10099.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10199.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10299.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10399.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10499.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10599.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10699.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10799.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10899.pt: 100 rows
Loading test_mixed_data_1.csv_batch_1099.pt: 100 rows
Loading test_mixed_data_1.csv_batch_10999.pt: 100 rows
Loading test_mixed_data_1.csv_batch_11099.pt: 100 rows
Loading test_mixed_data_1.csv_batch_11199.pt: 100 rows
Loading test_mixed_data_1.csv_batch_11299.pt: 100 rows
Loading test_mixed_data_1.csv_batch_11399.pt: 100 rows
Loading test_mixed_data_1.csv_batch_11499.pt: 100 rows
Loading test_mixed_data_1.csv_batch_11599.pt: 100 rows
Loading test_mixed_data_1.csv_batch_11699.pt: 100 rows
Loading tes

# Verify the combination of batches is correct

- Check the tensor shapes and the total number of rows in the combined train and test files.

In [3]:
import torch

def verify_combined_data(file_path):
    """Print the shapes and row count of a combined tokenized file.

    Args:
        file_path: Path to a file that torch.load reads back as a sequence of
            exactly three items: input IDs, attention mask, labels.
    """
    input_ids, attention_mask, labels = torch.load(file_path)

    # One line per tensor, followed by the row count taken from the input IDs.
    shape_report = (
        ("Input IDs shape", input_ids.shape),
        ("Attention Mask shape", attention_mask.shape),
        ("Labels shape", labels.shape),
    )
    for title, shape in shape_report:
        print(f"{title}: {shape}")
    print(f"Total number of rows: {input_ids.shape[0]}")

# Example usage
# NOTE(review): these paths point at a 'combined' subfolder, whereas
# combine_batches saves its output directly into the batch directory —
# presumably the combined files were moved by hand; confirm.
train_combined_file = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\train\\combined\\train_mixed_data_1.csv_batch_combined.pt'
test_combined_file = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\test\\combined\\test_mixed_data_1.csv_batch_combined.pt'

# Print shapes and row counts for the train file, then the test file.
verify_combined_data(train_combined_file)
verify_combined_data(test_combined_file)


Input IDs shape: torch.Size([229500, 4096])
Attention Mask shape: torch.Size([229500, 4096])
Labels shape: torch.Size([229500])
Total number of rows: 229500
Input IDs shape: torch.Size([57300, 4096])
Attention Mask shape: torch.Size([57300, 4096])
Labels shape: torch.Size([57300])
Total number of rows: 57300


# Verify that the text is tokenized correctly

- Count the rows across the individual batch `.pt` files in the train and test directories, so the totals can be compared with the combined files above.

In [11]:
import os
import torch
from torch.utils.data import TensorDataset

def count_rows_in_batches(directory):
    """Sum the row counts of every TensorDataset stored as a .pt file in a folder.

    Each '.pt' file directly inside `directory` is loaded with torch.load.
    Files that hold a TensorDataset contribute the length of their first
    tensor's leading dimension; any other loaded object is reported and
    skipped. Files without the '.pt' suffix are ignored.

    Args:
        directory: Folder to scan.

    Returns:
        Total number of rows found across all TensorDataset files.
    """
    row_total = 0

    for entry in os.listdir(directory):
        if not entry.endswith('.pt'):
            continue

        loaded = torch.load(os.path.join(directory, entry))

        # Guard clause: only TensorDataset files can be counted.
        if not isinstance(loaded, TensorDataset):
            print(f"Unexpected structure in {entry}. Cannot count rows.")
            continue

        rows_here = loaded.tensors[0].shape[0]
        row_total += rows_here
        print(f"Processed {entry}: {rows_here} rows")

    return row_total

# Example usage
# Folders holding the per-batch .pt files for each split (machine-specific paths).
train_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\train\\'
test_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\test\\'

# Only .pt files directly inside each folder are counted; subfolders are not scanned.
train_rows = count_rows_in_batches(train_dir)
test_rows = count_rows_in_batches(test_dir)

print(f"Total rows in train batches: {train_rows}")
print(f"Total rows in test batches: {test_rows}")


Processed train_mixed_data_1.csv_batch_100099.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100199.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100299.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100399.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100499.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100599.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100699.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100799.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100899.pt: 100 rows
Processed train_mixed_data_1.csv_batch_10099.pt: 100 rows
Processed train_mixed_data_1.csv_batch_100999.pt: 100 rows
Processed train_mixed_data_1.csv_batch_101099.pt: 100 rows
Processed train_mixed_data_1.csv_batch_101199.pt: 100 rows
Processed train_mixed_data_1.csv_batch_101299.pt: 100 rows
Processed train_mixed_data_1.csv_batch_101399.pt: 100 rows
Processed train_mixed_data_1.csv_batch_101499.pt: 100 rows
Processed train_mixed_data_1.csv_batch_101599.pt: 100 row

# Load the tokenized data

In [5]:
# These two names hold file PATHS (strings), not loaded datasets; the files
# must be read with torch.load before they can be used as data.
# NOTE(review): the file names here ('*_tokenized_dataset.pt') differ from the
# '*_batch_combined.pt' files checked in the verification cell — presumably
# renamed by hand; confirm they exist.
train_tokenized_dataset = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\train\\combined\\train_tokenized_dataset.pt'
test_tokenized_dataset = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\tokenized\\test\\combined\\test_tokenized_dataset.pt'

# Define the RNN/LSTM model

Inspired by the following paper: 

A. Topbaş, A. Jamil, A. A. Hameed, S. M. Ali, S. Bazai and S. A. Shah, "Sentiment Analysis for COVID-19 Tweets Using Recurrent Neural Network (RNN) and Bidirectional Encoder Representations (BERT) Models," 2021 International Conference on Computing, Electronic and Electrical Engineering (ICE Cube), Quetta, Pakistan, 2021, pp. 1-6, doi: 10.1109/ICECube53880.2021.9628315.

In [6]:
# Define the RNN/LSTM Model
class RNNModel(nn.Module):
    """Bidirectional multi-layer LSTM followed by a linear classification head.

    Args:
        input_dim: Number of features per time step fed to the LSTM.
        hidden_dim: Hidden size of each LSTM direction.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of output classes (size of the logits vector).
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True)
        # The LSTM is bidirectional, so its output has 2 * hidden_dim features.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim).

        Args:
            x: Either a (batch, seq_len, input_dim) tensor, or a 2-D
               (batch, input_dim) tensor such as a batch of token-ID rows.
               A 2-D input is treated as a sequence of length 1. Any input is
               cast to the model's floating-point dtype, because the LSTM
               cannot consume integer tensors.

        NOTE(review): feeding raw token IDs as numeric features makes the
        model run, but an nn.Embedding layer in front of the LSTM is the usual
        way to consume token IDs — consider revisiting the architecture.
        """
        if x.dim() == 2:
            # (batch, features) -> (batch, 1, features) so batch_first holds.
            x = x.unsqueeze(1)
        x = x.to(dtype=self.fc.weight.dtype)

        # Zero initial states, created on the input's own device instead of
        # relying on a module-level `device` variable. The leading dimension
        # is layers * 2 because the LSTM is bidirectional.
        h0 = torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype)
        c0 = torch.zeros_like(h0)

        out, _ = self.rnn(x, (h0, c0))
        # Classify from the LSTM output at the last time step.
        out = self.fc(out[:, -1, :])
        return out

# Model parameters
input_dim = 4096 # Matches the max_length (4096) that each tokenized row is padded/truncated to
hidden_dim = 256
layer_dim = 2
output_dim = 2 # Binary classification (warranted or unwarranted)

# Instantiate the model and move it to the selected device
model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim).to(device)

# Define training parameters: cross-entropy loss over the two classes, Adam optimizer
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training function 
def train_model(model, train_loader, criterion, optimizer, num_epochs=3):
    """Run a standard supervised training loop.

    Batches are moved to the device that holds the model's parameters, so the
    function no longer depends on a module-level `device` variable being
    defined and in sync with the model.

    Args:
        model: Module called as model(texts) to produce logits.
        train_loader: Iterable with a length, yielding (texts, masks, labels)
            triples. The masks are unpacked but not used by the model.
        criterion: Loss callable invoked as criterion(outputs, labels).
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over `train_loader`.
    """
    # Use the model's own device; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        for i, (texts, _masks, labels) in enumerate(train_loader):
            # The masks are never consumed, so they are not transferred.
            texts, labels = texts.to(target_device), labels.to(target_device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Progress line every 100 steps.
            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')


NameError: name 'nn' is not defined

In [None]:
# Create DataLoaders
def _load_tensor_dataset(path):
    """Read a tokenized dataset file and return it as a TensorDataset.

    Accepts either a pickled TensorDataset or a tuple of tensors
    (input_ids, attention_mask, labels), which is wrapped in a TensorDataset.
    """
    # Full unpickling is needed for TensorDataset objects; only load files
    # that were produced by this notebook.
    loaded = torch.load(path, weights_only=False)
    if isinstance(loaded, TensorDataset):
        return loaded
    return TensorDataset(*loaded)

# train_tokenized_dataset / test_tokenized_dataset are file paths, so the data
# has to be loaded first; handing the path string to DataLoader would make it
# iterate over the characters of the path.
train_loader = DataLoader(dataset=_load_tensor_dataset(train_tokenized_dataset), batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=_load_tensor_dataset(test_tokenized_dataset), batch_size=32, shuffle=False)

# Train the model
train_model(model, train_loader, criterion, optimizer)

# Save the model
torch.save(model.state_dict(), 'rnn_model.pth')

# Test and Evaluate the Model

In [None]:
# Testing and Evaluation Function
def test_model(model, test_loader, criterion):
    model.eval()
    total_loss, total_correct, total_samples = 0, 0, 0

    with torch.no_grad():
        for texts, masks, labels in test_loader:
            texts, masks, labels = texts.to(device), masks.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
    
    avg_loss = total_loss / len(test_loader)
    accuracy = total_correct / total_samples
    print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')

# The model is already trained (and its weights saved to 'rnn_model.pth') in
# the previous cell. Training it again here would evaluate a model that has
# seen extra epochs and no longer matches the saved checkpoint, so this cell
# only evaluates.

# Test the model
test_model(model, test_loader, criterion)
