In [56]:
import pandas as pd
import numpy as np
import torch
import torch.nn
import os
import re
import time
import matplotlib.pyplot as plt
import copy

In [57]:
%%bash
export PYDEVD_DISABLE_FILE_VALIDATION=1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [58]:
data = pd.read_feather("../../data_preprocessing/filtered_data.feather")

In [59]:
def preprocess_text(x: pd.Series) -> list[str]: 
    x_ = x.copy()
    x_ = x_.apply(lambda x: x.replace('\n', ' '))
    x_ = x_.apply(lambda x: x.replace('\u2005', ' '))
    x_ = [re.sub(r'\[.*?\]\s*', '', line) for line in x_]
    x_ = [re.sub(r'\(.*?\)\s*', '', line) for line in x_]
    return x_

data["processed_lyrics"] = preprocess_text(data.lyrics)

In [60]:
word_counts = [len(s.split()) for s in data.processed_lyrics]

In [61]:
mask = (pd.Series(word_counts) < 1000)
filtered_data = data[mask]
filtered_data = filtered_data[~(filtered_data.artist == "Glee Cast")]

In [62]:
genre_breakdown = filtered_data.tag.value_counts()
genre_breakdown

tag
pop        181300
rap        177940
rock       128971
rb          34507
country     15380
Name: count, dtype: int64

In [63]:
top_10000_per_genre = pd.DataFrame()

for genre in genre_breakdown.index:
    genre_data = filtered_data[filtered_data["tag"] == genre]
    top_10000 = genre_data.sort_values("views", ascending=False).iloc[:10000, :]
    top_10000_per_genre = pd.concat([top_10000_per_genre, top_10000])

filtered_data = top_10000_per_genre.reset_index(drop=True)


In [64]:
for genre in genre_breakdown.index:
    print(genre + ": \n " + "-"* 20)
    genre_data = filtered_data[filtered_data["tag"] == genre]
    print(genre_data.artist.value_counts()[:15])

pop: 
 --------------------
artist
Lana Del Rey         131
Taylor Swift         103
One Direction         83
Justin Bieber         81
Ed Sheeran            78
Beyonc                78
Little Mix            72
Lady Gaga             71
Ariana Grande         70
Shawn Mendes          66
twenty one pilots     60
Billie Eilish         57
Halsey                55
Rihanna               54
Katy Perry            52
Name: count, dtype: int64
rap: 
 --------------------
artist
Drake                         215
Eminem                        198
Lil Wayne                     174
Kanye West                    156
J. Cole                       134
JAY-Z                         128
Juice WRLD                    126
Future                        123
$UICIDEBOY$                   120
Logic                         112
Meek Mill                     112
YoungBoy Never Broke Again    107
Mac Miller                    103
Young Thug                    101
Lil Uzi Vert                  101
Name: count, dtype:

## Train Test Split

In [65]:
from sklearn.model_selection import train_test_split

In [66]:
x = filtered_data.processed_lyrics
y = filtered_data.tag

In [67]:
X_, X_test, y_, y_test = train_test_split(x, y, random_state=1234, test_size=0.5, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_, y_, random_state=1234, test_size=0.4, stratify=y_)
del X_, y_

## Dataset Loader

In [68]:
from helper_functions.dataset_loader import TextDataset

In [69]:
BATCH_SIZE = 64
MAX_LENGTH = 256

training_set = TextDataset(X_train, y_train, batch_size=BATCH_SIZE, max_length=MAX_LENGTH)
validation_set = TextDataset(X_valid, y_valid, batch_size=BATCH_SIZE, max_length=MAX_LENGTH, shuffle = False)
testing_set = TextDataset(X_test, y_test, batch_size=BATCH_SIZE, max_length=MAX_LENGTH, shuffle= False)

In [70]:
training_set.head()

Unnamed: 0,Text,Label
0,"Oh, put your lovin' hand out, baby I'm beggin'...",rap
1,"Do you recall, not long ago We would walk on t...",pop
2,"Now it's time, I fear to tell I've been holdin...",rb
3,And hell is just a government creation And all...,rap
4,"I'm not supposed to be thinking 'bout you, lyi...",country
5,"Once again man, thank you for coming out tonig...",rap
6,Love isn't really real That's just how you mak...,country
7,If I could turn back time If I could find a wa...,pop
8,"Faow, green fly like Peter, whoa, ff Green fly...",rap
9,I will shape myself into your pocket Invisible...,rock


In [71]:
validation_set.head()

Unnamed: 0,Text,Label
0,"Come on, come on, come on, come on Come on, it...",rock
1,I'm tryin' to go to church Get some chicken wi...,rap
2,Now she wanna fuck with me Live a life of luxu...,rb
3,We broke Everything that was right we both enj...,pop
4,"There was blood in my tears ""thgir ton s'ti - ...",rap
5,I got enough on my mind That when she pulls me...,rock
6,Land of the silver birch Home of the beaver Wh...,country
7,"No Take time for me, yeah No, no, no Take tim...",rb
8,I'm so hurt To think that you lied to me I'm h...,country
9,"I like the vision of us, but something more 'C...",rb


## Pytorch Distilbert

In [72]:
from helper_functions.distilbert_architecture import DistilBertForSequenceClassification
from helper_functions.pytorch_trainer import PytorchTrainer

In [73]:
import torch.nn as nn
from transformers import DistilBertConfig
from tqdm import tqdm

In [74]:
config = config = DistilBertConfig(num_labels = len(set(y_train)))
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = DistilBertForSequenceClassification(config = config)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)


In [75]:
train_loader = training_set.get_dataloader()
val_loader=validation_set.get_dataloader()

In [76]:
model = model.to(device)

In [77]:
"""trainer = PytorchTrainer(device=device)

trained_model, trained_optimizer = trainer.run(
    model=model,
    optimizer=optimizer,
    epochs=5,
    train_loader=train_loader,
    val_loader=val_loader
)

print(trainer.history)"""

'trainer = PytorchTrainer(device=device)\n\ntrained_model, trained_optimizer = trainer.run(\n    model=model,\n    optimizer=optimizer,\n    epochs=5,\n    train_loader=train_loader,\n    val_loader=val_loader\n)\n\nprint(trainer.history)'

In [78]:
trainer = PytorchTrainer(device=device)

trainer.best_model = "checkpoints/1_val_acc_1.00.pth"

In [79]:
predictions = trainer.predict(model = model, testing_loader= val_loader)

Starting Predictions...


100%|██████████| 157/157 [01:43<00:00,  1.51it/s]

Testing Loss: 1.8603 | Validation Accuracy: 0.0000





In [80]:
predictions

[tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(2, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(2, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(2, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(3, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='mps:0'),
 tensor(0, device='m

In [81]:
predictions_cpu = [tensor.cpu().numpy() for tensor in predictions]
predictions_array = np.stack(predictions_cpu, axis=0) 

In [82]:
np.sum(predictions_array == validation_set.text_labels)/len(predictions_array)

0.0

In [83]:
predictions_array

array([0, 0, 0, ..., 0, 0, 0])

In [85]:
pd.Series(validation_set.text_labels).value_counts()

1.0    10000
Name: count, dtype: int64

In [88]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer

class TextDataset:
    def __init__(self, text_dataset, labels, max_length=128, batch_size=32, shuffle=True) -> None:
        assert len(text_dataset) == len(labels), "Text samples do not match Labels"
        self.shuffle = shuffle

        self.genre_to_id = {"rap": 0, "rock": 1, "rb": 2, "pop": 3, "country": 4}
        self.id_to_genre = {v: k for k, v in self.genre_to_id.items()}
        
        checkpoint = "distilbert-base-uncased"
        self.text_dataset = list(text_dataset)
        self.text_labels = [self.genre_to_id[label] for label in labels]
        self.num_classes = len(self.genre_to_id)
        self.max_length = max_length
        self.batch_size = batch_size
        
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        
        # Tokenize the dataset
        self.input_ids, self.attention_masks = self.tokenize()
        
        # Create the DataLoader and save it internally
        self.dataloader = self.create_dataloader()
        
    def __len__(self):
        return len(self.text_dataset)
    
    def to_dataframe(self):
        return pd.DataFrame({
            "Text": self.text_dataset,
            "Label": [self.id_to_genre[label] for label in self.text_labels]
        })
    
    def __repr__(self):
        return repr(self.to_dataframe())
    
    def tokenize(self):
        inputs = self.tokenizer.batch_encode_plus(
            self.text_dataset,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            truncation=True,
            return_tensors="np"
        )
        
        input_ids = np.asarray(inputs['input_ids'], dtype='int32')
        attention_masks = np.asarray(inputs['attention_mask'], dtype='int32')
        
        return input_ids, attention_masks
    
    def create_dataloader(self):
        labels_tensor = torch.tensor(self.text_labels, dtype=torch.long)
        input_ids_tensor = torch.tensor(self.input_ids, dtype=torch.long)
        attention_masks_tensor = torch.tensor(self.attention_masks, dtype=torch.long)
        
        dataset = TensorDataset(input_ids_tensor, attention_masks_tensor, labels_tensor)
        
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=self.shuffle)
    
    def get_dataloader(self):
        return self.dataloader
        
    def head(self, n=10):
        df = self.to_dataframe()
        return df.head(n=n)


In [91]:
training  = TextDataset(X_train, y_train)

In [92]:
training.text_labels

[0,
 3,
 2,
 0,
 4,
 0,
 4,
 3,
 0,
 1,
 4,
 4,
 4,
 3,
 2,
 1,
 0,
 1,
 4,
 2,
 1,
 4,
 2,
 3,
 0,
 3,
 4,
 0,
 0,
 1,
 4,
 1,
 2,
 0,
 4,
 1,
 1,
 3,
 2,
 3,
 0,
 1,
 2,
 4,
 0,
 2,
 0,
 0,
 3,
 1,
 2,
 0,
 4,
 4,
 0,
 0,
 2,
 0,
 1,
 1,
 4,
 2,
 1,
 2,
 3,
 3,
 1,
 1,
 4,
 4,
 2,
 2,
 4,
 2,
 3,
 4,
 0,
 3,
 4,
 0,
 1,
 1,
 1,
 2,
 1,
 1,
 0,
 0,
 1,
 3,
 4,
 4,
 4,
 4,
 1,
 4,
 4,
 4,
 1,
 4,
 2,
 2,
 3,
 2,
 1,
 1,
 0,
 2,
 4,
 1,
 0,
 0,
 0,
 1,
 3,
 4,
 3,
 1,
 2,
 4,
 0,
 4,
 3,
 3,
 3,
 4,
 3,
 1,
 4,
 1,
 4,
 4,
 3,
 3,
 2,
 0,
 4,
 4,
 2,
 1,
 0,
 0,
 4,
 0,
 3,
 4,
 4,
 2,
 4,
 0,
 1,
 1,
 1,
 3,
 4,
 2,
 3,
 3,
 1,
 4,
 1,
 3,
 3,
 3,
 3,
 4,
 1,
 2,
 1,
 3,
 0,
 0,
 1,
 1,
 3,
 0,
 4,
 1,
 2,
 1,
 0,
 2,
 4,
 0,
 4,
 2,
 1,
 3,
 1,
 4,
 2,
 4,
 4,
 1,
 1,
 1,
 1,
 3,
 0,
 2,
 2,
 1,
 4,
 3,
 3,
 0,
 1,
 4,
 2,
 2,
 4,
 4,
 4,
 1,
 2,
 2,
 0,
 0,
 3,
 2,
 1,
 4,
 4,
 2,
 0,
 4,
 1,
 2,
 2,
 4,
 2,
 3,
 2,
 4,
 0,
 0,
 1,
 3,
 1,
 4,
 4,
 3,
 3,
 2,
 4,
 2,
 3,
 4,
 0,
 4,
