# Data Preparation:

1) Read CSV files from train and test directories.
2) Concatenate 'cleaned_text' and 'subject' fields into a single text field for processing.
3) Normalize and tokenize text data.

In [4]:
import os
import pandas as pd
import torch
from torch import nn
from transformers import BigBirdTokenizer
from torch.utils.data import DataLoader, TensorDataset

print("PyTorch version " + torch.__version__)
print("Num GPUs Available: ", torch.cuda.device_count())
print(torch.cuda.is_available())

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Remember to call .to(device) on model, inputs, labels, etc. so that it uses the GPU!!
# (Written as a comment on purpose: a bare triple-quoted string at the end of a
# cell is an expression, so the notebook would echo its repr as cell output.)

PyTorch version 2.0.1
Num GPUs Available:  1
True
cuda


' \nremember to call .to(device) on model, inputs, labels, etc. so that it uses the GPU!! \n'

# Tokenization

In [5]:
# Directories
# The data root defaults to the original author's local folder. Set the
# PROJECT_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
data_root = os.environ.get(
    'PROJECT_DATA_DIR',
    'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data',
)
train_dir = os.path.join(data_root, 'train')
test_dir = os.path.join(data_root, 'test')

# load and concatenate data
def load_and_concatenate(directory):
    """Read every ``.csv`` file in ``directory`` into one DataFrame with a combined ``text`` column.

    Each file must contain ``cleaned_text`` and ``subject`` columns. They are
    joined with a single space into a new ``text`` column; a missing value in
    either column is treated as an empty string so that one missing field does
    not wipe out the whole row's text.

    Args:
        directory: Path of the folder to scan (non-recursive).

    Returns:
        A single DataFrame with the rows of all CSV files (read in sorted
        filename order) and a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.csv`` files.
        KeyError: If a file lacks the ``cleaned_text`` or ``subject`` column.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore any saved tensors) reproducible between runs.
    csv_names = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_names:
        raise ValueError(f"No .csv files found in {directory!r}")

    dataframes = []
    for filename in csv_names:
        df = pd.read_csv(os.path.join(directory, filename))
        # NaN + str is NaN in pandas, so fill missing fields before concatenating.
        cleaned = df['cleaned_text'].fillna('').astype(str)
        subject = df['subject'].fillna('').astype(str)
        # strip() removes the dangling separator left when one side is empty.
        df['text'] = (cleaned + ' ' + subject).str.strip()
        dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

# Tokenize the data
def tokenize_dataframe(df, tokenizer=None, max_length=4096):
    """Tokenize the ``text`` column of ``df`` into fixed-length tensors.

    Args:
        df: DataFrame with a ``text`` column. It is not modified.
        tokenizer: Optional callable tokenizer. When ``None`` the
            ``google/bigbird-roberta-base`` BigBird tokenizer is loaded. Pass an
            existing instance to avoid reloading it for every split.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        Whatever the tokenizer returns for the list of texts when called with
        ``truncation=True, padding='max_length', return_tensors='pt'``.

    Raises:
        KeyError: If ``df`` has no ``text`` column.
        Exception: Any tokenizer error is re-raised after diagnostics are printed.
    """
    # Build a separate string Series instead of assigning back into df, so the
    # caller's frame is left untouched. Missing values become empty strings
    # rather than the literal text 'nan'.
    texts = df['text'].fillna('').astype(str)

    try:
        # Initialize the tokenizer only if the caller did not supply one
        if tokenizer is None:
            tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')

        # Tokenize the text
        return tokenizer(texts.tolist(), max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
    except Exception as e:
        print("Error encountered:", e)
        print("Data type of 'text' column:", texts.dtype)
        # Guard the element lookup: on an empty frame iloc[0] would raise
        # IndexError and hide the original exception.
        if len(texts) > 0:
            print("Data type of first element in 'text':", type(texts.iloc[0]))
        # Re-raise the exception to ensure it's not silently handled
        raise



# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Load train and test data
train_data = load_and_concatenate(train_dir)
test_data = load_and_concatenate(test_dir)


train_encodings = tokenize_dataframe(train_data)
test_encodings = tokenize_dataframe(test_data)

# Convert to TensorDataset
# Labels are cast to int64 explicitly: nn.CrossEntropyLoss expects class indices
# as a LongTensor, and the dtype inferred from the CSV is not guaranteed to be
# integer. astype raises if a label is missing instead of producing a bad tensor.
train_labels = torch.tensor(train_data['label'].astype('int64').to_numpy())
test_labels = torch.tensor(test_data['label'].astype('int64').to_numpy())
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Save the tokenized data
torch.save(train_encodings, 'train_encodings.pt')
torch.save(test_encodings, 'test_encodings.pt')

# Define batch size
batch_size = 16  # Reduced batch size

# Create DataLoaders with reduced batch size
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Save the datasets
torch.save(train_dataset, 'train_dataset.pt')
torch.save(test_dataset, 'test_dataset.pt')


Error encountered: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Data type of 'text' column: object
Data type of first element in 'text': <class 'str'>


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

: 

# Define the RNN/LSTM model

Inspired by the following paper: 

A. Topbaş, A. Jamil, A. A. Hameed, S. M. Ali, S. Bazai and S. A. Shah, "Sentiment Analysis for COVID-19 Tweets Using Recurrent Neural Network (RNN) and Bidirectional Encoder Representations (BERT) Models," 2021 International Conference on Computing, Electronic and Electrical Engineering (ICE Cube), Quetta, Pakistan, 2021, pp. 1-6, doi: 10.1109/ICECube53880.2021.9628315.

In [2]:
# Define the RNN/LSTM Model
class RNNModel(nn.Module):
    """Bidirectional multi-layer LSTM followed by a linear classification head.

    Args:
        input_dim: Number of features per timestep of the input.
        hidden_dim: Hidden size of each LSTM direction.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of output logits (classes).

    The forward pass takes a float tensor of shape (batch, seq, input_dim)
    and returns logits of shape (batch, output_dim).
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence hidden_dim * 2 input features.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # No explicit (h0, c0): nn.LSTM defaults to zero initial states created
        # alongside the input, so the module no longer depends on a global
        # `device` variable and works wherever the model and input live.
        _, (h_n, _) = self.rnn(x)
        # h_n is ordered (layer0_fwd, layer0_bwd, ..., lastlayer_fwd, lastlayer_bwd).
        # Use the last layer's final state of EACH direction. Indexing the output
        # at the last timestep would give the backward direction's state after it
        # has seen only one token.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        return self.fc(summary)

# Model parameters
# Vocabulary size published for google/bigbird-roberta-base.
# TODO confirm against len(tokenizer) for the tokenizer actually used.
vocab_size = 50358
# Width of the token embeddings fed to the LSTM (tunable). Note that the 4096
# used during tokenization is the padded sequence LENGTH, not a feature size.
input_dim = 128
hidden_dim = 256
layer_dim = 2
output_dim = 2 # Binary classification (warranted or unwarranted)

# Instantiate the model
# The data loaders yield int64 token ids of shape (batch, seq), while nn.LSTM
# needs float features of shape (batch, seq, input_dim). The embedding layer
# in front of the RNN bridges the two.
model = nn.Sequential(
    nn.Embedding(vocab_size, input_dim),
    RNNModel(input_dim, hidden_dim, layer_dim, output_dim),
).to(device)

# Define training parameters
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training function 
def train_model(model, train_loader, criterion, optimizer, num_epochs=3):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(texts)``.
        train_loader: Iterable yielding ``(texts, masks, labels)`` batches.
            The masks are not used by this loop.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over the loader.

    Prints the current batch loss every 100 steps. Returns nothing.
    """
    # Move batches to wherever the model's parameters live instead of relying on
    # a global `device`, so the function also works for a model on another device.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        for i, (texts, _masks, labels) in enumerate(train_loader):
            # The attention masks are not consumed by the model, so they are not
            # copied to the device.
            texts, labels = texts.to(run_device), labels.to(run_device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Create DataLoaders
# Reuse the batch size configured in the data-preparation cell instead of a
# second hard-coded value, so training uses the same (reduced) batch size.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Train the model
train_model(model, train_loader, criterion, optimizer)

# Save the model
torch.save(model.state_dict(), 'rnn_model.pth')

NameError: name 'train_dataset' is not defined

: 

# Test and Evaluate the Model

In [None]:
# Testing and Evaluation Function
def test_model(model, test_loader, criterion):
    model.eval()
    total_loss, total_correct, total_samples = 0, 0, 0

    with torch.no_grad():
        for texts, masks, labels in test_loader:
            texts, masks, labels = texts.to(device), masks.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
    
    avg_loss = total_loss / len(test_loader)
    accuracy = total_correct / total_samples
    print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')

# The model is already trained (and its weights saved to 'rnn_model.pth') by the
# training cell earlier in the notebook. Training again here would evaluate
# weights that differ from the saved checkpoint, so this cell only evaluates.

# Test the model
test_model(model, test_loader, criterion)
