In [None]:
import numpy as np
import pandas as pd
import os
import re
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, DataCollatorWithPadding
from transformers import BertForSequenceClassification, AdamW
import pytorch_lightning as pl
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder

# Register tqdm's `progress_apply` on pandas objects; the cells below call it
# in place of plain `apply`.
tqdm.pandas()

In [None]:
def strip_lyrics(lyrics):
    """Normalise a raw lyrics string into one line of space-separated text.

    The rules run in this order: remove ``[...]`` spans that sit on a single
    line, remove every backslash together with the non-whitespace run that
    follows it, replace newlines with spaces, delete single quotes, then trim
    both ends and squeeze each whitespace run down to one space.

    Args:
        lyrics: Raw lyrics text (``str``).

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs. Order matters: each rule sees the output
    # of the previous one.
    substitutions = (
        (r'\[.*?\]', ''),   # bracketed spans ('.' does not cross a newline)
        (r'\\[^\s]*', ''),  # backslash-led tokens
        (r'\n', ' '),       # line breaks
        (r"'", ''),         # single quotes
    )

    cleaned = lyrics
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Trim the ends, then collapse every whitespace run into a single space.
    return re.sub(r'\s+', ' ', cleaned.strip())

In [None]:
# Input location and filter bounds, named so they are easy to find and tune.
CLEANED_DF_PATH = '/work/cleaned_df/df_cleaned_engl.pkl'
MIN_WORD_COUNT, MAX_WORD_COUNT = 25, 5000  # both bounds exclusive
MIN_YEAR, MAX_YEAR = 1960, 2023            # both bounds inclusive

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that were produced by a trusted pipeline.
with open(CLEANED_DF_PATH, 'rb') as f:
    data = pickle.load(f)

# Convert to a DataFrame and attach a whitespace-token word count. The count is
# taken on the raw lyrics, i.e. before strip_lyrics is applied. `assign`
# returns a new frame, so the loaded `data` object is not mutated.
df = pd.DataFrame(data)
df = df.assign(word_count=df['lyrics'].progress_apply(lambda x: len(x.split())))

# Row filters: MIN_WORD_COUNT < words < MAX_WORD_COUNT and
# MIN_YEAR <= year <= MAX_YEAR.
length_ok = (df['word_count'] > MIN_WORD_COUNT) & (df['word_count'] < MAX_WORD_COUNT)
year_ok = (df['year'] >= MIN_YEAR) & (df['year'] <= MAX_YEAR)

# Apply both filters and drop the listed columns. `drop` without `inplace`
# returns an independent frame, so the column assignment below never writes
# into a filtered view.
df = df.loc[length_ok & year_ok].drop(
    columns=["title", 'artist', 'year', "id", "language", "word_count"]
)

# Clean the lyrics text with strip_lyrics (regex-based).
df['lyrics'] = df['lyrics'].progress_apply(strip_lyrics)

## Tokenizing for BERT

In [None]:
def tokenize_with_bert(dataframe, model_name='bert-base-uncased', max_length=512):
    """Encode every entry of ``dataframe['lyrics']`` with a BERT tokenizer.

    Args:
        dataframe: DataFrame with a ``lyrics`` column of strings.
        model_name: Name or path handed to ``BertTokenizer.from_pretrained``.
        max_length: Length each encoded song is truncated or padded to.

    Returns:
        An unnamed Series with a fresh 0..n-1 index (the index of
        ``dataframe`` is discarded) holding the ``tokenizer.encode`` output
        for each song, in row order.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

    def tokenize_song(song):
        # Truncate long songs and pad short ones so every result has
        # exactly `max_length` entries.
        return tokenizer.encode(song, max_length=max_length, truncation=True, padding='max_length')

    # Only the lyrics text is needed; drop the original index so the result is
    # positionally indexed.
    lyrics = dataframe['lyrics'].reset_index(drop=True)
    tokenized = lyrics.progress_apply(tokenize_song)
    # Callers receive an unnamed Series, not one called 'lyrics'.
    tokenized.name = None
    return tokenized

# Tokenize the cleaned lyrics with the default model and sequence length.
tokenized_lyrics = tokenize_with_bert(df)

### Saving tokenized lyrics and encoded labels to .pickle files

In [None]:
# The two artifacts written below are paired by position, so they must have
# the same number of rows. A mismatch means `tokenized_lyrics` was produced
# from a different state of `df`.
assert len(tokenized_lyrics) == len(df), "tokenized_lyrics and df differ in length"

# Save tokenized lyrics
with open('tokenized_lyrics.pickle', 'wb') as handle:
    pickle.dump(tokenized_lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Label-encode the "tag" column
le = LabelEncoder()

# Create a new df for label encoding so `df` keeps the original tag values
df_enc = df.copy()

df_enc['tag'] = le.fit_transform(df_enc['tag'])

# NOTE(review): the fitted encoder `le` is not persisted in this cell; the
# integer-to-tag mapping (`le.classes_`) is needed to decode predictions.

# `tokenized_lyrics` carries a fresh 0..n-1 index, whereas `df_enc` still has
# the sparse index left over from row filtering. Reset the label index before
# saving so both pickled Series line up by index label as well as by position.
with open('labels_le.pickle', 'wb') as handle:
    pickle.dump(df_enc['tag'].reset_index(drop=True), handle, protocol=pickle.HIGHEST_PROTOCOL)