In [47]:
from transformers import BertModel, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
import torch
import json

# read the category mapping
with open('../data/processed/category_mapping.json', 'r') as f:
    category_mapping = json.load(f)

model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
model.to('cuda')
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load dataset
dataset = load_dataset('csv', data_files='../data/processed/cleaned_tweets.csv')



Generating train split: 0 examples [00:00, ? examples/s]

In [48]:
# Split the dataset into train and test (90-10 split first)
train_test_split = dataset['train'].train_test_split(test_size=0.2)

# Split the test set further into validation and test (50-50 split)
val_test_split = train_test_split['test'].train_test_split(test_size=0.5)

# Combine the splits into a single DatasetDict
dataset_split = {
    'train': train_test_split['train'],
    'val': val_test_split['train'],
    'test': val_test_split['test']
}

train_dataset = dataset_split['train']
val_dataset = dataset_split['val']
test_dataset = dataset_split['test']

print(f"Train dataset: {train_dataset}")
print(f"Validation dataset: {val_dataset}")
print(f"Test dataset: {test_dataset}")


Train dataset: Dataset({
    features: ['text', 'sentiment'],
    num_rows: 6391
})
Validation dataset: Dataset({
    features: ['text', 'sentiment'],
    num_rows: 799
})
Test dataset: Dataset({
    features: ['text', 'sentiment'],
    num_rows: 799
})


In [49]:
def tokenize_data(data):
    return tokenizer(data['text'], padding='max_length', truncation=True, max_length=256, return_tensors='pt')

train_tok_dataset = train_dataset.map(tokenize_data)
val_tok_dataset = val_dataset.map(tokenize_data)
test_tok_dataset = test_dataset.map(tokenize_data)

# remove the original text column
train_tok_dataset = train_tok_dataset.remove_columns('text')
val_tok_dataset = val_tok_dataset.remove_columns('text')
test_tok_dataset = test_tok_dataset.remove_columns('text')

# group the data into a DatasetDict
tokenized_datasets = {
    'train': train_tok_dataset,
    'val': val_tok_dataset,
    'test': test_tok_dataset
}

    

Map:   0%|          | 0/6391 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

In [50]:
train_tok_dataset

Dataset({
    features: ['sentiment', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6391
})

In [51]:
# forward the train sentences through the model in batches of 8
batch = tokenized_datasets['train'][:8]
batch['input_ids'] = torch.tensor(batch['input_ids']).to('cuda')
batch['input_ids'] = batch['input_ids'].view(8, -1)
batch['attention_mask'] = torch.tensor(batch['attention_mask']).to('cuda')
batch['attention_mask'] = batch['attention_mask'].view(8, -1)
# print(batch['input_ids'])
output = model(batch['input_ids'], attention_mask=batch['attention_mask'])
cls = output['last_hidden_state'][:, 0, :]



In [52]:
# let's define a class for the encoder block
class TicketEncoder(torch.nn.Module):
    def __init__(self, model):
        super(TicketEncoder, self).__init__()
        self.model = model
        
    def forward(self, input_ids, attention_mask):
        output = self.model(input_ids, attention_mask=attention_mask)
        cls = output.pooler_output
        return cls

In [53]:
encoder = TicketEncoder(model)
encoder.to('cuda')
encoder(batch['input_ids'], batch['attention_mask']).shape

torch.Size([8, 768])

In [54]:
# let's define a class for the classifier
class TicketClassifier(torch.nn.Module):
    def __init__(self, encoder, num_classes):
        super(TicketClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = torch.nn.Linear(encoder.model.config.hidden_size, num_classes)
        
    def forward(self, input_ids, attention_mask):
        cls = self.encoder(input_ids, attention_mask)
        logits = self.classifier(cls)
        probs = torch.nn.functional.softmax(logits, dim=-1)
        return probs

In [55]:
classifier = TicketClassifier(encoder, len(category_mapping))
classifier.to('cuda')
classifier(batch['input_ids'], batch['attention_mask'])

tensor([[0.5416, 0.4584],
        [0.5097, 0.4903],
        [0.5834, 0.4166],
        [0.5366, 0.4634],
        [0.5776, 0.4224],
        [0.5512, 0.4488],
        [0.5472, 0.4528],
        [0.5343, 0.4657]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [56]:
class BERTTicketClassifier(torch.nn.Module):
    def __init__(self, model_name, num_classes):
        super(BERTTicketClassifier, self).__init__()
        self.model = BertModel.from_pretrained(model_name)
        self.encoder = TicketEncoder(self.model)
        self.classifier = torch.nn.Linear(self.model.config.hidden_size, num_classes)
        
    def forward(self, input_ids, attention_mask):
        cls = self.encoder(input_ids, attention_mask)
        logits = self.classifier(cls)
        return logits

In [57]:
# let's define a class that calculates the loss
class SimpleLoss(torch.nn.Module):
    def __init__(self):
        super(SimpleLoss, self).__init__()
        
    def forward(self, probs, targets):
        return torch.nn.functional.cross_entropy(probs, targets)


In [58]:
from tqdm import tqdm

# let's define a function that will train the model for one epoch
def train_epoch(model, data, loss_fn, optimizer, batch_size=8):
    model.train()
    batches = []
    for i in range(0, len(data), batch_size):
        batches.append(data[i:i+batch_size])

    total_loss = 0
    for batch in tqdm(batches, desc="Training", unit="batch"):
        optimizer.zero_grad()
        input_ids = torch.tensor(batch['input_ids']).to('cuda')
        input_ids = input_ids.view(batch_size, -1)
        attention_mask = torch.tensor(batch['attention_mask']).to('cuda')
        attention_mask = attention_mask.view(batch_size, -1)
        targets = torch.tensor(batch['sentiment']).to('cuda')
        probs = model(input_ids, attention_mask = attention_mask)
        loss = loss_fn(probs, targets)
        print(f"Loss: {loss.item()}")
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(batches)


In [59]:
model = BERTTicketClassifier(model_name, len(category_mapping))
model.to('cuda')
loss_fn = SimpleLoss()
loss_fn.to('cuda')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

loss = train_epoch(model, tokenized_datasets['train'], loss_fn, optimizer, batch_size=8)

Training:   0%|          | 0/799 [00:00<?, ?batch/s]

{'sentiment': [0, 0, 4, 4, 4, 4, 0, 4], 'input_ids': [[[101, 2003, 3564, 11471, 2128, 2571, 14071, 2050, 2175, 2041, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[101, 2125, 2000, 1996, 7435, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Training:   0%|          | 0/799 [00:08<?, ?batch/s]


RuntimeError: CUDA error: the launch timed out and was terminated
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
