# Bert flow

This BERT flow is based on and requires "df_cleaned_engl.pkl".

(For memory efficiency, run prep & tokenization separately from training.)

In [None]:
#!pip install -r requirements_bert.txt

In [None]:
import numpy as np
import pandas as pd
import torch
import os
import re
import pickle
import torchmetrics
import pytorch_lightning as pl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, DataCollatorWithPadding
from transformers import BertForSequenceClassification, AdamW
from transformers import BertTokenizer
from tqdm import tqdm
from torchmetrics.functional import accuracy
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
tqdm.pandas()

## Prep & tokenization

In [None]:
def strip_lyrics(lyrics):
    """Clean a raw lyrics string for tokenization.

    Applied in order: drop ``[...]`` annotations (within a single line),
    drop backslash-led tokens, turn newlines into spaces and delete single
    quotes. The result is then trimmed and every whitespace run is collapsed
    to one space.

    Args:
        lyrics: Raw lyrics text.

    Returns:
        The cleaned, single-line lyrics string.
    """
    # (pattern, replacement) pairs, applied sequentially; order matters
    # because bracket removal must see the text before newlines are replaced.
    substitutions = (
        (r'\[.*?\]', ''),
        (r'\\[^\s]*', ''),
        (r'\n', ' '),
        (r"'", ''),
    )

    cleaned = lyrics
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Trim the ends, then squeeze internal whitespace runs to a single space.
    return re.sub(r'\s+', ' ', cleaned.strip())

In [None]:
def generate_subset(df, n=None, p=None):
    """Draw a per-tag random subset of ``df`` and clean its lyrics.

    Every ``tag`` group is sampled with ``random_state=1``. Afterwards the
    metadata columns are dropped and ``strip_lyrics`` is applied to the
    ``lyrics`` column.

    Args:
        df: DataFrame with at least the columns ``tag``, ``lyrics``,
            ``title``, ``artist``, ``year``, ``id``, ``language`` and
            ``word_count``.
        n: Number of rows to draw from every tag. Takes precedence over ``p``.
        p: Fraction of every tag to draw (rounded up per tag).

    Returns:
        A new DataFrame with a fresh 0..N-1 index, holding the sampled rows
        ordered by tag.

    Raises:
        ValueError: If neither ``n`` nor ``p`` is given.
    """
    if n is None and p is None:
        raise ValueError("generate_subset requires either n (rows per tag) or p (fraction per tag)")

    def sample_size(group):
        # n wins when both are supplied, matching the original branch order.
        if n is not None:
            return n
        return int(np.ceil(group.shape[0] * p))

    # Sample each group explicitly instead of groupby.apply: apply operating
    # on the grouping column is deprecated in recent pandas, and the "tag"
    # column must survive into the result.
    sampled_groups = [
        group.sample(n=sample_size(group), random_state=1)
        for _, group in df.groupby("tag")
    ]
    if sampled_groups:
        df_sub = pd.concat(sampled_groups).reset_index(drop=True)
    else:
        # No groups at all (empty input): keep the schema, return zero rows.
        df_sub = df.iloc[:0].reset_index(drop=True)

    print(f"Subset created with {df_sub.shape[0]} samples")

    # Drop the columns that are not needed downstream.
    df_sub = df_sub.drop(columns=["title", 'artist', 'year', "id", "language", "word_count"])

    # Clean the lyrics text (regex based, see strip_lyrics).
    print("Applying strip_lyrics...")
    df_sub['lyrics'] = df_sub['lyrics'].progress_apply(strip_lyrics)

    return df_sub

In [None]:
def tokenize_with_bert(dataframe, model_name='bert-base-uncased', max_length=512):
    """Encode the ``lyrics`` column into fixed-length BERT token-id lists.

    Each song is truncated and padded to exactly ``max_length`` ids.

    Args:
        dataframe: DataFrame with a ``lyrics`` column of strings.
        model_name: Tokenizer checkpoint passed to
            ``BertTokenizer.from_pretrained``.
        max_length: Length every encoded sequence is truncated/padded to.

    Returns:
        An unnamed Series of token-id lists with a fresh 0..N-1 index, in the
        same row order as ``dataframe``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

    def tokenize_song(song):
        return tokenizer.encode(song, max_length=max_length, truncation=True, padding='max_length')

    print("Tokenizing...")
    # Only the text is needed, so apply over the Series directly; the index is
    # reset so the result is addressed purely by position.
    lyrics = dataframe['lyrics'].reset_index(drop=True)
    tokenized = lyrics.progress_apply(tokenize_song)
    tokenized.name = None
    return tokenized

In [None]:
# Load the cleaned English lyrics DataFrame from its pickle file.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# from a trusted source.
print("Loading data...")
df = pd.read_pickle('/work/cleaned_df/df_cleaned_engl.pkl')
print("Loaded in df with shape: ", df.shape)

In [None]:
# Count the whitespace-separated words of every song.
print("Adding word count...")
df['word_count'] = df['lyrics'].progress_apply(lambda text: len(text.split()))

# Keep songs with more than 25 and fewer than 5000 words.
print("Filtering songs based on word count...")
word_count_ok = df['word_count'].gt(25) & df['word_count'].lt(5000)
df = df[word_count_ok]
print("Reduced size to: ", df.shape)

# Keep songs released from 1960 through 2023 (both bounds inclusive).
print("Filtering songs based on year...")
df = df[df['year'].between(1960, 2023)]
print("Reduced size to: ", df.shape)

####### SET PARAMS HERE #######
train_samples_per_tag = 20000
holdout_fraction = 0.05  # a per-tag fraction keeps the class distribution

# Generate training and test set (the validation set is split from the
# training set later). The test rows are held out *before* the training subset
# is drawn: sampling both from the same frame with the same fixed seed lets
# test rows also end up in the training data.
# Positions are used instead of index labels so a non-unique index is fine.
row_positions = pd.Series(np.arange(len(df)))
holdout_positions = (
    row_positions.groupby(df["tag"].to_numpy())
    .sample(frac=holdout_fraction, random_state=1)
    .to_numpy()
)
is_holdout = np.zeros(len(df), dtype=bool)
is_holdout[holdout_positions] = True

print("Generating training subset...")
# NOTE: every tag needs at least `train_samples_per_tag` rows left after the
# hold-out has been removed.
df_trainings_subsample = generate_subset(df[~is_holdout], n=train_samples_per_tag)
print("Generating validation subset...")
# p=1.0 keeps every held-out row; generate_subset still shuffles and cleans them.
df_validation_subsample = generate_subset(df[is_holdout], p=1.0)

In [None]:
# Tokenize the lyrics of the training and validation subsets.
trainings_subsample_tokenized = tokenize_with_bert(df_trainings_subsample)
validation_subsample_tokenized = tokenize_with_bert(df_validation_subsample)

## BERT training

In [None]:
class LyricsDataset(Dataset):
    """Dataset pairing token-id sequences with their class labels.

    Both ``encodings`` and ``labels`` are accessed positionally via ``.iloc``,
    so they are expected to be pandas objects of equal length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        # Positional lookup: the index labels of the two Series are ignored.
        token_ids = self.encodings.iloc[idx]
        target = self.labels.iloc[idx]
        return {
            'input_ids': torch.as_tensor(token_ids),
            'labels': torch.as_tensor(target),
        }

class LyricsClassifier(pl.LightningModule):
    """BERT sequence classifier wrapped as a LightningModule.

    Args:
        model_name: Checkpoint name passed to
            ``BertForSequenceClassification.from_pretrained``.
        num_labels: Number of target classes.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=5):
        super().__init__()
        self.save_hyperparameters()
        self.bert = BertForSequenceClassification.from_pretrained(
            self.hparams.model_name,
            num_labels=self.hparams.num_labels,
        )
        # `compute_on_step` is not passed: it was dropped from torchmetrics
        # before `task=` was introduced, so combining the two is rejected as
        # an unexpected keyword argument.
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_labels)

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Run BERT on a batch of token ids.

        Args:
            input_ids: Batch of token-id sequences.
            labels: Optional class ids; when given, the output carries a loss.
            attention_mask: Optional mask. When omitted it is derived from the
                padding token so padded positions are not attended to.

        Returns:
            The model output of ``BertForSequenceClassification``.
        """
        if attention_mask is None:
            pad_token_id = self.bert.config.pad_token_id
            if pad_token_id is not None:
                # The sequences are padded to a fixed length during
                # tokenization, so the mask has to be rebuilt from the ids.
                attention_mask = (input_ids != pad_token_id).long()
        return self.bert(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        outputs = self.forward(batch['input_ids'], batch['labels'])
        loss = outputs.loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the accuracy for one validation batch and return it."""
        outputs = self.forward(batch['input_ids'], batch['labels'])
        predicted = outputs.logits.argmax(dim=1)
        # Calling the metric returns this batch's accuracy and accumulates
        # state; logging the metric object lets the epoch value be computed
        # over all samples rather than as a mean of batch means.
        batch_accuracy = self.accuracy(predicted, batch['labels'])
        self.log('val_accuracy', self.accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return batch_accuracy

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters (lr=1e-5)."""
        # torch's AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are set to the defaults of the transformers version.
        return torch.optim.AdamW(self.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)


def load_data():
    """Return the tokenized training lyrics and their integer-encoded tags.

    Reads the module-level ``trainings_subsample_tokenized`` and
    ``df_trainings_subsample``. The ``tag`` column of the latter is
    overwritten in place with the ``LabelEncoder`` output.

    Returns:
        Tuple ``(encodings, labels)``: the tokenized lyrics and the encoded
        ``tag`` column.
    """
    # Map the tags to integer class ids and store them back on the frame.
    tag_encoder = LabelEncoder()
    encoded_tags = tag_encoder.fit_transform(df_trainings_subsample['tag'])
    df_trainings_subsample['tag'] = encoded_tags

    return trainings_subsample_tokenized, df_trainings_subsample['tag']

def main():
    """Train the lyrics classifier on the tokenized training subset.

    Splits off 10% of the training data for validation, builds the data
    loaders and fits a ``LyricsClassifier`` for up to 3 epochs using half of
    the training batches per epoch at 16-bit precision.

    Returns:
        Tuple ``(model, data_collator)``: the fitted model and the collator
        used to batch the data.
    """
    encodings, labels = load_data()

    # Prepare tokenizer and data collator.
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Prepare datasets.
    train_encodings, val_encodings, train_labels, val_labels = train_test_split(
        encodings, labels, test_size=0.1, random_state=42
    )
    train_dataset = LyricsDataset(train_encodings, train_labels)
    val_dataset = LyricsDataset(val_encodings, val_labels)

    # Size the classification head from the data instead of relying on the
    # default of 5: the labels are encoded as 0..k-1, so k distinct values
    # need k outputs.
    model = LyricsClassifier(num_labels=int(labels.nunique()))

    # Data loaders.
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=data_collator)

    # Prepare trainer.
    trainer = pl.Trainer(precision=16, limit_train_batches=0.5, max_epochs=3)

    # Training.
    trainer.fit(model, train_loader, val_loader)

    return model, data_collator

In [None]:
# Run the actual training; keeps the fitted model and the batch collator for
# the testing cells below.
model, data_collator = main()

## testing

In [None]:
#loading in model from checkpoint instead of training
#model = LyricsClassifier.load_from_checkpoint(checkpoint_path="/content/lightning_logs/version_3/checkpoints/epoch=0-step=1406.ckpt")

## This has not yet been tested in the flow, but it was adjusted

In [None]:
test_encodings = validation_subsample_tokenized
# NOTE(review): a fresh LabelEncoder is fitted on the test tags, so the class
# ids only match the ones used in training if both subsets contain exactly the
# same set of tags - confirm.
label_encoder = LabelEncoder()
df_validation_subsample['tag'] = label_encoder.fit_transform(df_validation_subsample['tag'])
test_labels = df_validation_subsample['tag']
test_dataset = LyricsDataset(test_encodings, test_labels)

# Build the collator here instead of reusing the one returned by main(), so
# this cell also works when the model was loaded from a checkpoint and main()
# never ran.
test_collator = DataCollatorWithPadding(tokenizer=BertTokenizerFast.from_pretrained('bert-base-uncased'))

#ADJUST IF POSSIBLE ON LARGE GPU!!!! LOWER INFERENCE TIME WHEN BATCH SIZE IS HIGHER BUT ALSO MORE MEMORY CONSUMPTION
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=test_collator, num_workers=4)

# Move model to device once
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()  # Set the model to evaluation mode

true_labels = []
predicted_labels = []


for batch in tqdm(test_loader, desc="Inference"):
    batch_input_ids = batch['input_ids'].to(device)
    batch_labels = batch['labels'].to(device)

    # Inference without gradient tracking
    with torch.no_grad():
        outputs = model(batch_input_ids, batch_labels)

    # The predicted label is the index of the largest logit
    _, preds = torch.max(outputs.logits, 1)
    predicted_labels.extend(preds.cpu().numpy())
    true_labels.extend(batch_labels.cpu().numpy())

# Classification report
print(classification_report(true_labels, predicted_labels, zero_division=0))

# Confusion Matrix
cm = confusion_matrix(true_labels, predicted_labels)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()