In [1]:
#import pickle
import numpy as np
import pandas as pd
import pickle
import torch
import os
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, DataCollatorWithPadding
from transformers import BertForSequenceClassification, AdamW
import pytorch_lightning as pl

### Create a (very) small subset: 10,000 randomly sampled songs per genre

In [None]:
# Load the pickled data set (the file name suggests stop words were already removed).
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('/work/pkl_files/stopwords_removed.pkl', 'rb') as f:
    data = pickle.load(f)
# convert to dataframe
df = pd.DataFrame(data)

# Draw a class-balanced sub-sample: up to SAMPLES_PER_TAG rows per value of "tag".
SAMPLES_PER_TAG = 10000
df_sub = (
    df.groupby('tag')
    # min(...) keeps groups with fewer rows instead of raising a ValueError;
    # groups with at least SAMPLES_PER_TAG rows are sampled exactly as before
    .apply(lambda x: x.sample(n=min(len(x), SAMPLES_PER_TAG), random_state=1))
    .reset_index(drop=True)
)
# index=False: the fresh RangeIndex carries no information and would otherwise
# come back as a spurious "Unnamed: 0" column when the CSV is read again
df_sub.to_csv("/work/sub_sample_stopwords.csv", index=False)

### load in subset and tokenize for bert

In [15]:
# Import the CSV sub-sample. The location can be overridden with the
# SUB_SAMPLE_CSV environment variable so the notebook also runs on other
# machines; without the variable the original local path is used.
sub_sample_csv = os.environ.get(
    'SUB_SAMPLE_CSV',
    '/Users/davidbellenberg/github_projects/GenrefromLyrics/sub_sample_stopwords.csv',
)
df_sub_sample = pd.read_csv(sub_sample_csv)

In [None]:
# Count missing values per column (2 NaN values in the lyrics column) ...
nan_counts = df_sub_sample.isna().sum()
print(nan_counts)
# ... and discard every row that contains at least one missing value
df_sub_sample = df_sub_sample.dropna(axis=0, how='any')

In [19]:
# Number of remaining rows per genre (sanity check that the classes stay balanced)
genre_counts = df_sub_sample['tag'].value_counts()
genre_counts

tag
country    10000
pop        10000
rap        10000
rb         10000
rock        9998
Name: count, dtype: int64

### Tokenize the subset (each song truncated/padded to 512 tokens)

In [22]:
from transformers import BertTokenizer

def tokenize_with_bert(dataframe, column_name, max_length=512, log_every=1000):
    """Tokenize one text column of a DataFrame with the ``bert-base-uncased`` tokenizer.

    Each entry is encoded to a list of token ids that is truncated and padded
    to exactly ``max_length`` ids (``padding='max_length'``).

    Args:
        dataframe: DataFrame that holds the texts.
        column_name: Name of the column with the texts to tokenize.
        max_length: Fixed length of every encoded sequence (default 512).
        log_every: Print a progress line every ``log_every`` texts; a falsy
            value disables the progress output.

    Returns:
        pandas.Series with a fresh ``RangeIndex`` (0..n-1) whose values are
        the token-id lists, in the row order of ``dataframe``.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    songs = dataframe[column_name].tolist()
    total_songs = len(songs)

    tokenized = []
    # Progress is reported by position, not by index label: after rows were
    # dropped the labels have gaps and can exceed the number of songs.
    for position, song in enumerate(songs):
        tokenized.append(
            tokenizer.encode(song, max_length=max_length, truncation=True, padding='max_length')
        )
        if log_every and position % log_every == 0:
            print(f"Tokenizing song {position}/{total_songs}")

    # dtype=object keeps the lists as values and avoids dtype inference on empty input
    return pd.Series(tokenized, dtype=object)

# Tokenize the "lyrics" column of the sub-sample
tokenized_lyrics = tokenize_with_bert(dataframe=df_sub_sample, column_name='lyrics')


Tokenizing song 0/49998
Tokenizing song 1000/49998
Tokenizing song 2000/49998
Tokenizing song 3000/49998
Tokenizing song 4000/49998
Tokenizing song 5000/49998
Tokenizing song 6000/49998
Tokenizing song 7000/49998
Tokenizing song 8000/49998
Tokenizing song 9000/49998
Tokenizing song 10000/49998
Tokenizing song 11000/49998
Tokenizing song 12000/49998
Tokenizing song 13000/49998
Tokenizing song 14000/49998
Tokenizing song 15000/49998
Tokenizing song 16000/49998
Tokenizing song 17000/49998
Tokenizing song 18000/49998
Tokenizing song 19000/49998
Tokenizing song 20000/49998
Tokenizing song 21000/49998
Tokenizing song 22000/49998
Tokenizing song 23000/49998
Tokenizing song 24000/49998
Tokenizing song 25000/49998
Tokenizing song 26000/49998
Tokenizing song 27000/49998
Tokenizing song 28000/49998
Tokenizing song 29000/49998
Tokenizing song 30000/49998
Tokenizing song 31000/49998
Tokenizing song 32000/49998
Tokenizing song 33000/49998
Tokenizing song 34000/49998
Tokenizing song 35000/49998
Token

#### Save the tokenized data (and labels) as .pickle files

In [26]:
# Save the tokenized lyrics
with open('tokenized_lyrics.pickle', 'wb') as handle:
    pickle.dump(tokenized_lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Label-encode the "tag" column. This overwrites the column in place: the
# genre names are replaced by integer class ids.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_sub_sample['tag'] = le.fit_transform(df_sub_sample['tag'])

with open('labels_le.pickle', 'wb') as handle:
    pickle.dump(df_sub_sample['tag'], handle, protocol=pickle.HIGHEST_PROTOCOL)

# Also persist the fitted encoder. Once "tag" is overwritten, le.classes_ is the
# only place that still maps the integer ids back to genre names.
# NOTE: re-running this cell refits the encoder on the already-encoded integers.
with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(le, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

# BERT training (GPU required!)

In [None]:
class LyricsDataset(Dataset):
    """Map-style dataset that pairs token-id sequences with their labels.

    ``encodings`` and ``labels`` are read positionally through ``.iloc``,
    so both must support that accessor (e.g. pandas Series).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` untouched if it is already a tensor, else wrap it in one."""
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value)

    def __len__(self):
        # The number of samples is defined by the encodings
        return len(self.encodings)

    def __getitem__(self, idx):
        token_ids = self.encodings.iloc[idx]
        target = self.labels.iloc[idx]
        return {
            'input_ids': self._as_tensor(token_ids),
            'labels': self._as_tensor(target),
        }

class LyricsClassifier(pl.LightningModule):
    """Lightning wrapper around ``BertForSequenceClassification``.

    Args:
        model_name: Name or path of the pretrained BERT checkpoint.
        num_labels: Number of target classes of the classification head.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=5):
        super().__init__()
        # Stores model_name / num_labels in self.hparams (and in checkpoints)
        self.save_hyperparameters()
        self.bert = BertForSequenceClassification.from_pretrained(self.hparams.model_name,
                                                                  num_labels=self.hparams.num_labels)

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Run the BERT classifier.

        Args:
            input_ids: Tensor of token ids.
            labels: Optional class ids; when given, the output carries a loss.
            attention_mask: Optional mask. When omitted it is derived from the
                padding token id, so padded positions are not attended to.

        Returns:
            The output object of the wrapped model (``.logits``, and ``.loss``
            when ``labels`` is given).
        """
        if attention_mask is None:
            pad_token_id = self.bert.config.pad_token_id
            if pad_token_id is not None:
                # NOTE: this block previously passed no mask at all, so checkpoints
                # trained with the old code attended to padding as well.
                attention_mask = (input_ids != pad_token_id).long()
        return self.bert(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(batch['input_ids'], batch['labels'])
        loss = outputs.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        # Needed so that a validation dataloader passed to the trainer is actually used
        outputs = self(batch['input_ids'], batch['labels'])
        self.log('val_loss', outputs.loss, prog_bar=True)
        return outputs.loss

    def configure_optimizers(self):
        # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are set explicitly to keep the old optimizer's defaults.
        return torch.optim.AdamW(self.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

### TODO: Callbacks for logging metrics and saving checkpoints


def load_data(data_dir="/content/drive/MyDrive/NLP/small_sample_tokenized"):
    """Load the pickled token ids and labels from ``data_dir``.

    Args:
        data_dir: Directory containing ``tokenized_lyrics.pickle`` and
            ``labels_le.pickle``. Defaults to the original Google Drive folder.

    Returns:
        Tuple ``(encodings, labels)`` exactly as stored in the two pickle files.

    Raises:
        FileNotFoundError: If one of the two files does not exist.
    """
    encodings_path = os.path.join(data_dir, "tokenized_lyrics.pickle")
    labels_path = os.path.join(data_dir, "labels_le.pickle")

    # NOTE: pickle.load can execute arbitrary code -- only load files you trust.
    with open(encodings_path, 'rb') as f:
        encodings = pickle.load(f)
    with open(labels_path, 'rb') as f:
        labels = pickle.load(f)

    return encodings, labels


def main():
    """Fine-tune the lyrics classifier on the pickled, pre-tokenized sub-sample.

    Loads encodings and labels, holds out 10% for validation, and trains for
    3 epochs with 16-bit precision on half of the training batches per epoch.
    """
    # Seed python / numpy / torch so that head initialisation and shuffling
    # are reproducible between runs
    pl.seed_everything(42, workers=True)

    encodings, labels = load_data()

    # Prepare tokenizer and data collator
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Prepare datasets; stratify keeps the class proportions identical in
    # the training and the validation split
    train_encodings, val_encodings, train_labels, val_labels = train_test_split(
        encodings, labels, test_size=0.1, random_state=42, stratify=labels
    )
    train_dataset = LyricsDataset(train_encodings, train_labels)
    val_dataset = LyricsDataset(val_encodings, val_labels)

    # Prepare model
    model = LyricsClassifier()

    # Prepare data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=data_collator)

    # Prepare trainer
    trainer = pl.Trainer(precision=16, limit_train_batches=0.5,max_epochs=3)

    # Train the model
    trainer.fit(model, train_loader, val_loader)


if __name__ == "__main__":
    main()


# Evaluating performance from a checkpoint file

In [None]:
#!pip install tensorboard

In [None]:
# TODO: create test_encodings, test_labels for checkpoint testing

In [3]:
from transformers import pipeline
from sklearn.metrics import classification_report
import numpy as np

def evaluate_model(model, test_dataset, tokenizer=None):
    """Run ``model`` over ``test_dataset`` and build a classification report.

    Args:
        model: Classifier whose call accepts ``input_ids`` and ``labels`` as
            keywords and returns an object with ``.logits``; it must expose
            ``.device``.
        test_dataset: Dataset yielding dicts with ``'input_ids'`` and ``'labels'``.
        tokenizer: Tokenizer handed to the padding collator. When omitted, the
            ``bert-base-uncased`` fast tokenizer is loaded (previously an
            undefined global ``tokenizer`` was referenced here).

    Returns:
        dict produced by ``classification_report(..., output_dict=True)``.
    """
    if tokenizer is None:
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    # Prepare data loader
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=data_collator)

    # Move model to evaluation mode
    model.eval()

    # Prepare prediction and label lists
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in test_loader:
            # Only forward the two tensors the model is known to accept: the
            # collated batch also holds keys (e.g. an attention mask) that a
            # forward(input_ids, labels) signature would reject.
            input_ids = batch['input_ids'].to(model.device)
            labels = batch['labels'].to(model.device)
            outputs = model(input_ids=input_ids, labels=labels)

            # Predicted class = index of the largest logit
            preds = outputs.logits.argmax(dim=1)

            predictions.extend(preds.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    # Generate classification report
    report = classification_report(actuals, predictions, output_dict=True)

    return report

def load_and_evaluate_model(checkpoint_path, test_dataset):
    """Restore a ``LyricsClassifier`` from a checkpoint and print its test metrics.

    Args:
        checkpoint_path: Path of the Lightning checkpoint file.
        test_dataset: Dataset passed on to ``evaluate_model``.
    """
    model = LyricsClassifier.load_from_checkpoint(checkpoint_path)

    report = evaluate_model(model, test_dataset)

    for label, metrics in report.items():
        if isinstance(metrics, dict):
            print(f'\nClass: {label}')
            for metric, value in metrics.items():
                print(f'{metric}: {value}')
        else:
            # With output_dict=True the report also holds scalar entries
            # (the overall 'accuracy'); calling .items() on them would fail.
            print(f'\n{label}: {metrics}')



In [None]:
# Usage example
test_dataset = LyricsDataset(test_encodings, test_labels)  # You need to prepare your test_encodings and test_labels
load_and_evaluate_model('/content/lightning_logs/version_2/checkpoints/epoch=1-step=2812.ckpt', test_dataset)
