In [None]:
# BERT MODEL LIVE TESTER

In [1]:
import numpy as np
import pandas as pd
import torch
import os
import re
import pickle
import torchmetrics
import pytorch_lightning as pl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, DataCollatorWithPadding
from transformers import BertForSequenceClassification, AdamW
from transformers import BertTokenizer
from tqdm import tqdm
from torchmetrics.functional import accuracy
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
tqdm.pandas()

In [26]:
def strip_lyrics(lyrics):
    """Normalise raw lyrics text into a single clean line.

    Applied in order: removal of ``[...]`` annotations, removal of
    backslash-prefixed tokens, newline-to-space conversion and removal of
    single quotes. Finally the ends are trimmed and every run of whitespace
    is collapsed to one space.

    Args:
        lyrics: Raw lyrics string.

    Returns:
        The cleaned lyrics string.
    """
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r'\[.*?\]', ''),    # bracketed tags ('.' does not cross line breaks)
        (r'\\[^\s]*', ''),   # a backslash plus the non-space characters after it
        (r'\n', ' '),        # line breaks become spaces
        (r"'", ''),          # single quotes
    )

    cleaned = lyrics
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Trim both ends, then squeeze whitespace runs down to a single space.
    return re.sub(r'\s+', ' ', cleaned.strip())

In [2]:
class LyricsDataset(Dataset):
    """Map-style dataset pairing tokenised lyrics with their labels.

    Both containers are read positionally through ``.iloc``, so they must be
    objects that support it (e.g. pandas Series).
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The sample count is taken from the encodings container.
        return len(self.encodings)

    def __getitem__(self, idx):
        # Convert the idx-th entry of each container to a tensor.
        token_ids = torch.as_tensor(self.encodings.iloc[idx])
        target = torch.as_tensor(self.labels.iloc[idx])
        return {'input_ids': token_ids, 'labels': target}

class LyricsClassifier(pl.LightningModule):
    """Lightning wrapper around ``BertForSequenceClassification`` for lyrics classification.

    Only ``input_ids`` (and optionally ``labels``) are passed to BERT; no
    attention mask is forwarded.
    NOTE(review): transformers' ``AdamW`` (used in ``configure_optimizers``)
    is deprecated upstream in favour of ``torch.optim.AdamW``; switching
    would need ``eps``/``weight_decay`` set explicitly to keep the same
    hyper-parameters -- confirm before changing.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=5):  # TODO(@RIES): try "bert-large-uncased" with the A100
        super().__init__()
        # Makes the constructor arguments available as self.hparams.
        self.save_hyperparameters()
        self.bert = BertForSequenceClassification.from_pretrained(self.hparams.model_name,
                                                                  num_labels=self.hparams.num_labels)
        # `compute_on_step` is not an argument of the task-based torchmetrics
        # API, so it is no longer passed. The metric object is kept as an
        # attribute but is not used by the steps below.
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_labels)

    def forward(self, input_ids, labels=None):
        """Run BERT on ``input_ids``; when ``labels`` is given the output also carries a loss."""
        return self.bert(input_ids, labels=labels)

    def training_step(self, batch, batch_idx):
        """Return the classification loss for one batch and log it as ``train_loss``."""
        outputs = self.forward(batch['input_ids'], batch['labels'])
        loss = outputs.loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the fraction of correct predictions in the batch and log it as ``val_accuracy``."""
        labels = batch['labels']
        outputs = self.forward(batch['input_ids'], labels)
        # Index of the highest logit per sample is the predicted class.
        predicted = outputs.logits.argmax(dim=1)
        correct = (predicted == labels).sum().item()
        # Named batch_accuracy so it does not shadow the imported `accuracy` function.
        batch_accuracy = correct / len(labels)
        self.log('val_accuracy', batch_accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return batch_accuracy

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at a learning rate of 1e-5."""
        return AdamW(self.parameters(), lr=1e-5)

In [9]:
# Checkpoint location: set the LYRICS_BERT_CKPT environment variable to
# override; the default is the path the notebook was originally run with.
CHECKPOINT_PATH = os.environ.get(
    "LYRICS_BERT_CKPT",
    "/Users/davidbellenberg/github_projects/GenrefromLyrics/notebooks/BERT/epoch=1-step=145040.ckpt",
)

# Load the fine-tuned weights onto the CPU.
model = LyricsClassifier.load_from_checkpoint(checkpoint_path=CHECKPOINT_PATH, map_location=torch.device('cpu'))
# This notebook only runs inference, so switch to evaluation mode explicitly
# (the trailing semicolon suppresses the model repr in the cell output).
model.eval();

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
from transformers import BertTokenizer
import torch.nn.functional as F

# Tokenizers already loaded by predict_genre, keyed by pretrained model name,
# so repeated predictions do not re-instantiate the tokenizer each time.
_TOKENIZER_CACHE = {}


def predict_genre(lyrics):
    """Return the model's class probabilities for the given lyrics.

    Uses the module-level ``model`` loaded earlier in the notebook.

    Args:
        lyrics: Lyrics text to classify. Input longer than 512 tokens is
            truncated by the tokenizer.

    Returns:
        numpy array holding the softmax over the last axis of the model's
        logits (presumably one row per input and one column per class).
    """
    # Use the same tokenizer model
    tokenizer_name = 'bert-base-uncased'
    tokenizer = _TOKENIZER_CACHE.get(tokenizer_name)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        _TOKENIZER_CACHE[tokenizer_name] = tokenizer

    # Preprocess the lyrics
    inputs = tokenizer(lyrics, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Only input_ids are passed to the model; the attention mask produced by
    # the tokenizer is not used.
    # NOTE(review): this matters if several texts of different lengths are
    # passed at once, since padded positions would not be masked.
    input_ids = inputs['input_ids'].to(model.device)

    # Make sure the model is in evaluation mode before predicting.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        # Softmax turns the logits into probabilities.
        probabilities = F.softmax(outputs.logits, dim=-1)

    # Move to the CPU and convert to numpy for easier inspection.
    return probabilities.cpu().numpy()


In [28]:
# Example usage
text = "Hank, Beugard, Big Sur Y'all know how this - go, you know All eyes on me, OG Roll up in the club and shit (is that right?) All eyes on me All eyes on me Ay you know what? I bet you got it twisted, you don't know who to trust So many player-hatin' - tryna sound like us Say they ready for the funk, but I don't think they knowin' Straight to the depths of Hell is where them cowards goin' Well, are you still down? Holla when you see me And let these devils be sorry for the day they finally freed me I got a caravan of - every time we ride Hittin' - up when we pass by Until I die, live the life of a boss player, 'cause even when I'm high F- with me and get crossed later, the futures in my eyes 'Cause all I want is cash and thangs A five-double-oh Benz, flauntin' flashy rings Uhh, - pursue me like a dream Been known to disappear before your eyes just like a dope fiend It seems, my main thing was to be major paid The game sharper than a - razor blade Say money bring - bring lies One - gettin' jealous and - die Depend on me like the first and fifteenth They might hold me for a second, but these punks won't get me We got four niggas in low riders and ski masks Screamin', Thug Life every time they pass, all eyes on me Live the life of a thug - until the day I die Live the life of a boss player All eyes on me All eyes on me Live the life of a thug - until the day I die Live the life of a boss player 'cause even gettin' high"

# Clean the example lyrics with strip_lyrics before prediction
text_stripped = strip_lyrics(text)

# Class probabilities returned by predict_genre for the cleaned text
probabilities = predict_genre(text_stripped)


In [30]:
!pip install gradio

Collecting gradio
  Downloading gradio-3.32.0-py3-none-any.whl (19.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting gradio-client>=0.2.4
  Downloading gradio_client-0.2.5-py3-none-any.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.1/288.1 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting altair>=4.2.0
  Downloading altair-5.0.1-py3-none-any.whl (471 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.5/471.5 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting python-multipart
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0

In [None]:
import numpy as np
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
import string

import joblib
import re
import gradio as gr


def predict_genre(Artist, Title, Lyrics, File=None):
    """Predict genre probabilities for a song with the saved Random Forest model.

    NOTE: this reuses the name of the BERT-based ``predict_genre`` defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        Artist: Artist name from the UI; not used for the prediction.
        Title: Song title from the UI; not used for the prediction.
        Lyrics: Lyrics text to classify.
        File: Optional file input; not used. Defaults to ``None`` so the
            function can be called with only the three text inputs.

    Returns:
        dict mapping each entry of ``clf.classes_`` to its predicted
        probability (as a plain ``float``).
    """
    # Preprocess the lyrics (the parameter is `Lyrics`; the lowercase name
    # `lyrics` does not exist in this scope).
    # NOTE(review): `preprocess` is not defined in the visible part of this
    # notebook -- it must be defined before this function is called.
    vector = preprocess(Lyrics)

    # Reshape the vector to 2D array as the model.predict expects 2D array
    vector = vector.reshape(1, -1)

    # Load the trained Random Forest Classifier
    clf = joblib.load('/work/NLP_Project/GenreFromLyricsShared/random_forest.sav')

    # Get the probabilities of each class (first and only row)
    probabilities = clf.predict_proba(vector)[0]

    # Map each probability with the corresponding genre
    genres = clf.classes_
    result = {genre: float(prob) for genre, prob in zip(genres, probabilities)}

    return result



# HTML passed as the interface description (an image tag for the Spotify logo).
description = '<img src="https://storage.googleapis.com/pr-newsroom-wp/1/2018/11/Spotify_Logo_CMYK_Green.png" alt="Spotify Logo">'

# Components are taken from the top-level gradio namespace; the older
# gr.inputs.* / gr.outputs.* namespaces are deprecated.
iface = gr.Interface(
    fn=predict_genre,
    inputs=[
        gr.Textbox(lines=1, placeholder='Artist Here...'),
        gr.Textbox(lines=1, placeholder='Title Here...'),
        gr.Textbox(lines=4, placeholder='Lyrics Here...'),
        # A file-upload input is currently disabled.
    ],
    outputs=gr.Label(label="Genre Suggestion"),
    description=description
)
# NOTE: share=True requests a publicly reachable link for the app; drop it
# when the demo should stay local.
iface.launch(debug=True, share=True)