# BERT Sentiment Classification (details)

Contact: christian.winkler@datanizing.com

Original parts from https://mccormickml.com/2019/07/22/BERT-fine-tuning/

# Load PyTorch and determine GPU/CPU

In [None]:
import torch

# Prefer the first CUDA device when one is visible, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

if has_cuda:
    print("Using GPU %s" % torch.cuda.get_device_name(0))
else:
    print("Using CPU :-(")

# Read labeled data

In [None]:
import pandas as pd

# file with the labeled data, read into a DataFrame
data_file = "10000_All_Beauty.json.xz"
df = pd.read_json(data_file)

In [None]:
# show the loaded DataFrame as cell output
df

In [None]:
# Binary target: a rating of 5 counts as positive (1), every other rating as negative (0).
# The column is kept as integers because it is turned into a torch tensor later on.
df["sentiment"] = (df["rating"] == 5).astype("int64")

# reproducible peek at ten random rows
df.sample(10, random_state=42)

In [None]:
# Convert the two columns to NumPy arrays.
# to_numpy() always returns an ndarray, whereas .values can hand back a pandas
# extension array depending on the column dtype.
text = df["text"].to_numpy()
labels = df["sentiment"].to_numpy()

# Tokenization

In [None]:
from transformers import AutoTokenizer

# Alternative model: to try classic BERT instead, enable the line below and
# lower the max_length limit in the tokenization cell to 512.
# model_name = "bert-base-uncased"
model_name = "answerdotai/ModernBERT-base"

# the tokenizer is loaded from the same checkpoint name as the model
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [None]:
# Length of the longest text, measured in tokens (special tokens included)
token_counts = (len(tokenizer.encode(t, add_special_tokens=True)) for t in text)
max_len = max(token_counts)
max_len

In [None]:
# Tokenize all texts in one batched call; the result holds the token ids and the
# matching attention masks as 2-D tensors (one row per text).
#
# Pad only as far as the longest text needs, capped at 1024 tokens
# (use 512 as the cap for bert-base-uncased).
max_length = min(max_len, 1024)

encoded = tokenizer(
    list(text),
    add_special_tokens=True,      # add the model's special start/end tokens
    max_length=max_length,
    truncation=True,              # cut texts that are longer than max_length
    padding="max_length",         # replaces the deprecated pad_to_max_length=True
    return_attention_mask=True,   # create attention masks
    return_tensors="pt",          # PyTorch tensors as result
)
input_ids = encoded["input_ids"]
attention_masks = encoded["attention_mask"]

# the labels have to become a tensor as well
labels = torch.tensor(labels)

In [None]:
# raw text of the first sample
text[0]

In [None]:
# the first sample split into tokens by the tokenizer
tokenizer.tokenize(text[0])

In [None]:
# Token ids of the first sample without the padding positions.
# Selecting by attention mask keeps the special tokens that encoding adds;
# slicing by len(tokenizer.tokenize(...)) would cut the ids off too early.
input_ids[0][attention_masks[0].bool()].numpy()

# Split data

In [None]:
from torch.utils.data import TensorDataset, random_split

# From here on only the tensors are used
dataset = TensorDataset(input_ids, attention_masks, labels)

# 3:1 split: 75 % of the samples for training, the remainder for validation
n_samples = len(dataset)
train_size = int(0.75 * n_samples)
val_size = n_samples - train_size

# seed torch's global RNG right before the split
torch.manual_seed(42)
split_lengths = [train_size, val_size]
train_dataset, val_dataset = random_split(dataset, split_lengths)

print(train_size, val_size)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# adjust the batch size as needed
batch_size = 32

# training batches are drawn in random order, validation batches in dataset order
train_sampler = RandomSampler(train_dataset)
validation_sampler = SequentialSampler(val_dataset)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=validation_sampler,
)

# Load model

In [None]:
from transformers import AutoModelForSequenceClassification
# AdamW is taken from PyTorch: the implementation that used to ship with
# transformers is deprecated and not exported by recent releases.
from torch.optim import AdamW

# model and tokenizer must match, so both are loaded from the same model_name
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, device_map="auto",
    num_labels = 2, # only positive and negative sentiments
    output_attentions = False,
    output_hidden_states = False # we don't (yet) need embeddings
)

In [None]:
# AdamW is the standard optimizer here. PyTorch's implementation is used because
# the one from transformers is deprecated; eps and weight_decay are spelled out to
# match the defaults of that former implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from transformers import get_linear_schedule_with_warmup

# number of passes over the training data (four is just a guess)
epochs = 4

# total number of training batches across all epochs
total_steps = epochs * len(train_dataloader)

# linear schedule without a warm-up phase
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import numpy as np

# calculate accuracy
def flat_accuracy(preds, labels):
    """Return the share of samples whose highest-scoring class equals the label.

    ``preds`` is a 2-D array with one row of class scores per sample; ``labels``
    is an array of class indices that is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = (predicted == expected).sum()
    return n_correct / len(expected)

In [None]:
import random
import numpy as np
from tqdm.auto import trange, tqdm

# re-seed every RNG in use: Python, NumPy, torch and torch's CUDA generators
seed_val = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

# one statistics entry per epoch is collected here
training_stats = []

# Fine-tune for `epochs` epochs. Every epoch trains on all training batches and then
# evaluates on the validation set; one statistics entry per epoch is appended to
# `training_stats`.
#
# Loss and accuracy are averaged per sample rather than per batch, so a smaller
# final batch does not get the same weight as a full one.
for epoch_i in trange(epochs, desc="Epoche"):
    # ---------------- training ----------------
    # switch the model to training mode
    model.train()

    # loss summed over all training samples of this epoch
    total_train_loss = 0.0
    n_train_samples = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        # unpack the batch and move it to the selected device
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        batch_len = b_labels.size(0)

        # clear the gradients of the previous step
        model.zero_grad()

        # forward pass; because labels are passed, the result also carries the loss
        res = model(b_input_ids,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # weight the batch loss with the batch size before accumulating
        total_train_loss += res.loss.item() * batch_len
        n_train_samples += batch_len

        # backward propagation to calculate the gradients
        res.loss.backward()

        # clip the gradient norm to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update the parameters, then the learning rate
        optimizer.step()
        scheduler.step()

    # average training loss per sample (guarded against an empty dataloader)
    avg_train_loss = total_train_loss / max(n_train_samples, 1)

    # ---------------- validation ----------------
    # switch the model to evaluation mode
    model.eval()

    total_eval_loss = 0.0
    total_eval_correct = 0.0
    n_eval_samples = 0

    for batch in tqdm(validation_dataloader, desc="Validierung"):
        # unpack the validation batch and move it to the selected device
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        batch_len = b_labels.size(0)

        # no backward propagation here, so no gradients need to be tracked
        with torch.no_grad():
            # run the prediction
            res = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # accumulate the eval loss, weighted with the batch size
        total_eval_loss += res.loss.item() * batch_len

        # move the results to the CPU as NumPy arrays to compute the accuracy;
        # float() keeps the conversion working for reduced-precision logits
        logits = res.logits.detach().float().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # flat_accuracy returns a fraction; scale it back to a count of correct samples
        total_eval_correct += flat_accuracy(logits, label_ids) * batch_len
        n_eval_samples += batch_len

    # accuracy over all validation samples
    avg_val_accuracy = total_eval_correct / max(n_eval_samples, 1)
    tqdm.write("Accuracy: %f" % avg_val_accuracy)

    # loss over all validation samples
    avg_val_loss = total_eval_loss / max(n_eval_samples, 1)
    tqdm.write("Validation loss %f" % avg_val_loss)

    # save statistics for plotting later
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Validierung Loss': avg_val_loss,
            'Accuracy': avg_val_accuracy
        }
    )

In [None]:
import pandas as pd

# one row per epoch, indexed by the epoch number
stats_frame = pd.DataFrame(data=training_stats)
df_stats = stats_frame.set_index("epoch")
df_stats

In [None]:
# plot all collected statistics columns against the epoch index
df_stats.plot()