In [3]:
import pandas as pd
import numpy as np
import re

In [5]:
data = pd.read_feather("data_preprocessing/filtered_data.feather")

In [6]:
x = data.lyrics
y = data.tag

In [20]:
import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("chens musicbox")

<Experiment: artifact_location='mlflow-artifacts:/710064176314707886', creation_time=1723632908969, experiment_id='710064176314707886', last_update_time=1723632908969, lifecycle_stage='active', name='chens musicbox', tags={}>

In [27]:
os.listdir()

['.DS_Store',
 'mlartifacts',
 'modeling',
 'mlruns',
 'README.md',
 'data_preprocessing',
 'lyric_classifier.ipynb',
 '.git']

In [41]:
pd.DataFrame(pd.read_feather("modeling/processed_lyrics.feather")[0]).to_feather("modeling/processed_lyrics.feather", index = False)

TypeError: write_feather() got an unexpected keyword argument 'index'

## Preprocessing

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
def preprocess_text(x):
    x_ = x.copy()
    x_ = x_.apply(lambda x: x.replace('\n', ' '))
    x_ = x_.apply(lambda x: x.replace('\u2005', ' '))
    x_ = [re.sub(r'\[.*?\]\s*', '', line) for line in x_]
    x_ = [re.sub(r'\(.*?\)\s*', '', line) for line in x_]
    return x_

In [9]:
x = preprocess_text(x)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.3, random_state=1234, shuffle = True)

In [11]:
X_train[:10]

["I'm leaning on the everlasting arms That place me on the Rock that stands And I hold true to the One who breaks my fall And lifts me time and time again  Oh, my God so good, You never give up You never give up on me Oh, what joy I've found because of Your love Because of Your love for me Oh, my God so good, You never give up You never give up on me Oh, what joy I've found because of Your love Because of Your love for me  This freedom purchased by the highest price This grace outweighing all my shame And I'm made new through the power of sacrifice From death now raised to life again Oh, my God so good, You never give up You never give up on me Oh, what joy I've found because of Your love Because of Your love for me Oh, my God so good, You never give up You never give up on me Oh, what joy I've found because of Your love Because of Your love for me, yeah This love for me, yeah Oh-ohh  I'm not a slave to sin so I'm singing You are good Buried the Christ to rise in Your freedom You are g

## Baseline with LR

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [13]:
vectorizer = CountVectorizer(ngram_range = (1,2), max_df = 0.5, stop_words='english')

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


In [15]:
import os
import pickle

In [None]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(min_df = 0.01, max_df = 0.1, ngram_range = (1,2), stop_words = "english")),
    ("scale", StandardScaler(with_mean=False)),
    ("lr", OneVsRestClassifier(LogisticRegression(solver='liblinear', multi_class='ovr', penalty = "l1"))),
])


param_grid = {
    'tfidf__min_df': [100, 0.01, 0.05],
    'tfidf__max_df': [0.1, 0.7],
    'lr__estimator__penalty': ['l1', 'l2'], # Access nested parameters for OneVsRestClassifier
    'lr__estimator__max_iter': [100, 300]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring="accuracy", verbose = 3)


pipeline.fit(X_train, y_train)

with open(os.path.join('baseline.pkl'), 'wb') as file:
    pickle.dump(pipeline, file)

## Training

In [57]:
from transformers import DistilBertModel, DistilBertTokenizer
import torch

In [55]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [58]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [60]:
model = DistillBERTClass()

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [63]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [64]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 