## Adding sentiment

In [21]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import transformers
from transformers import DistilBertTokenizer, DistilBertModel, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelWithLMHead
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, balanced_accuracy_score, accuracy_score

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# Plot defaults for the whole notebook: 8x6 figures and seaborn's 'paper' context scaled by 1.2.
plt.rcParams['figure.figsize'] = (8,6)
sns.set_context('paper', 1.2)


from time import time
import gc

# Single seed shared by NumPy, PyTorch and (further down) the train/validation split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x21303a78970>

In [22]:
class LyricsDataset(Dataset):
    """Dataset that tokenizes one lyric per access and pairs it with its target row.

    Args:
        lyrics: Indexable collection of lyric texts; each item is converted with ``str``.
        targets: Indexable collection of targets, aligned with ``lyrics`` by position.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer in this notebook).
        max_len: Length every encoded sequence is truncated / padded to.
    """

    def __init__(self, lyrics, targets, tokenizer, max_len):
        self.lyrics, self.targets = lyrics, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of lyrics in the dataset."""
        return len(self.lyrics)

    def __getitem__(self, item):
        """Return a dict with the raw text, flattened token ids / attention mask and a long target tensor."""
        text = str(self.lyrics[item])
        label = self.targets[item]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_kwargs)

        sample = {'lyrics': text}
        # The tokenizer is asked for 'pt' tensors; flatten() drops the leading batch axis it adds.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

def create_data_loader(df, tokenizer, max_len, batch_size, target_cols, shuffle=False):
    """Wrap a lyrics DataFrame in a ``LyricsDataset`` and return a ``DataLoader`` over it.

    Args:
        df: DataFrame with a ``'lyrics'`` column and the columns named in ``target_cols``.
        tokenizer: Tokenizer handed to ``LyricsDataset``.
        max_len: Sequence length every lyric is truncated / padded to.
        batch_size: Number of samples per batch.
        target_cols: Column label(s) of ``df`` used as targets.
        shuffle: Reshuffle the samples at every epoch. Defaults to ``False`` (the
            previous, fixed-order behaviour); pass ``True`` for a training loader.

    Returns:
        A ``DataLoader`` that loads batches in the main process (``num_workers=0``).
    """
    ds = LyricsDataset(
        lyrics=df['lyrics'].to_numpy(),
        targets=df[target_cols].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0
    )

def get_scores(true, pred):
    """Compute the classification metrics reported for one batch.

    Args:
        true: Ground-truth labels.
        pred: Predicted labels, same layout as ``true``.

    Returns:
        Dict with micro- and weighted-averaged precision, recall and F1
        (``zero_division=0``), plus ``"accuracy/balanced"``.
        Note that ``"accuracy/balanced"`` is computed with ``accuracy_score``,
        not ``balanced_accuracy_score``; the key name is kept because other
        cells read it.
    """
    metric_fns = (
        ("precision_score", precision_score),
        ("recall_score", recall_score),
        ("f1_score", f1_score),
    )

    results = {}
    for metric_name, metric_fn in metric_fns:
        for averaging in ("micro", "weighted"):
            results[f"{metric_name}/{averaging}"] = metric_fn(
                true, pred, average=averaging, zero_division=0
            )
    results["accuracy/balanced"] = accuracy_score(true, pred)
    return results


def train_epoch(
        model,
        data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler
):
    """Run one training pass over ``data_loader`` and return the averaged metrics.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    (max norm 1.0), one optimizer step and one scheduler step.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; returns logits.
        data_loader: Iterable of dicts with ``"input_ids"``, ``"attention_mask"``, ``"targets"``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.

    Returns:
        One-row DataFrame holding the mean of each per-batch metric from
        ``get_scores`` plus a ``'loss'`` column with the mean batch loss.
    """
    model = model.train()

    losses = []
    # Collect per-batch metric dicts and build the frame once at the end;
    # growing a DataFrame with pd.concat inside the loop is quadratic in the batch count.
    batch_scores = []

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].float().to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = loss_fn(outputs, targets)
        # A logit above 0 corresponds to a sigmoid probability above 0.5.
        pred_class = (outputs > 0) * 1

        batch_scores.append(get_scores(targets.cpu(), pred_class.cpu()))
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    scores = pd.DataFrame(pd.DataFrame(batch_scores).mean()).T
    scores['loss'] = np.mean(losses)

    return scores


def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; returns logits.
        data_loader: Iterable of dicts with ``"input_ids"``, ``"attention_mask"``, ``"targets"``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        One-row DataFrame holding the mean of each per-batch metric from
        ``get_scores`` plus a ``'loss'`` column with the mean batch loss.
    """
    model = model.eval()

    losses = []
    # Collect per-batch metric dicts and build the frame once at the end;
    # growing a DataFrame with pd.concat inside the loop is quadratic in the batch count.
    batch_scores = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].float().to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # A logit above 0 corresponds to a sigmoid probability above 0.5.
            pred_class = (outputs > 0) * 1
            batch_scores.append(get_scores(targets.cpu(), pred_class.cpu()))

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

    scores = pd.DataFrame(pd.DataFrame(batch_scores).mean()).T
    scores['loss'] = np.mean(losses)

    return scores



In [23]:
# Tokenizer class, encoder class and checkpoint name shared by both experiments below.
# NOTE(review): `DEFAUTL_TOKENIZER_CLASS` is misspelled ("DEFAUTL"); the tokenizer cell
# references it under this exact name, so any rename has to touch both places.
DEFAUTL_TOKENIZER_CLASS = DistilBertTokenizer
DEFAULT_MODEL_CLASS = DistilBertModel
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'

In [24]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Read dataset

In [25]:
dataset = pd.read_csv("data/train/small_balanced.csv")
# Replace the 'genre' column with one indicator column per genre.
# The dtype is pinned to uint8 because the target columns are later selected with
# select_dtypes('uint8'); recent pandas versions would otherwise produce bool dummies
# and that selection would come back empty.
dataset = pd.concat(
    [dataset.drop('genre', axis=1), pd.get_dummies(dataset['genre'], dtype='uint8')],
    axis=1,
)

In [26]:
# Preview the first rows to check the one-hot genre columns next to the lyrics.
dataset.head()

Unnamed: 0,lyrics,tokens,Country,Hip-Hop,Metal,Pop,Rock
0,i just love when i am with you yeah this shit ...,"['love', 'yeah', 'shit', 'ten', 'used', 'frien...",0,1,0,0,0
1,she is got butterflies you have got them too y...,"['got', 'butterfly', 'got', 'look', 'eye', 'qu...",1,0,0,0,0
2,after the morning there comes an evening and a...,"['morning', 'come', 'evening', 'evening', 'ano...",1,0,0,0,0
3,looking for food as i dig through the trash th...,"['looking', 'food', 'dig', 'trash', 'capitalis...",0,0,0,0,1
4,where have i been what should i be looking for...,"['looking', 'going', 'come', 'age', 'strong', ...",0,0,0,1,0


## Tokenizer preparation

In [27]:
# Load the pretrained tokenizer that matches the encoder checkpoint.
tokenizer = DEFAUTL_TOKENIZER_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [28]:
# Token length every lyric is truncated / padded to by LyricsDataset.
MAX_LEN = 256

In [29]:
# Hold out 10% of the rows for validation; the split is seeded with RANDOM_SEED.
df_train, df_val = train_test_split(dataset, test_size=0.1, random_state=RANDOM_SEED)
# Target columns = every uint8 column of `dataset`.
# NOTE(review): this presumably picks up the one-hot genre columns; it only works if they
# are stored as uint8 — confirm for the installed pandas version.
targets = dataset.select_dtypes('uint8').columns

In [30]:
# Samples per batch for both the training and the validation loader.
BATCH_SIZE = 32

# NOTE(review): create_data_loader does not pass `shuffle` to DataLoader, so the training
# batches are served in the same order every epoch — confirm this is intended.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, target_cols=targets)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE, target_cols=targets)

In [31]:
# Sanity check: pull a single batch and list the fields LyricsDataset produces.
data = next(iter(train_data_loader))
data.keys()

dict_keys(['lyrics', 'input_ids', 'attention_mask', 'targets'])

In [32]:
# Shapes of one batch: token ids and attention mask first, then the targets.
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32, 5])


## Bert fine-tuning without sentiment

In [33]:
class Bert(nn.Module):
    """Transformer encoder with a small classification head on mean-pooled token embeddings.

    Head: Linear(hidden, hidden) -> ReLU -> Dropout(0.3) -> Linear(hidden, n_classes).
    The forward pass returns raw logits (no sigmoid / softmax).

    Args:
        n_classes: Number of output logits.
        embedding_model: Encoder called with ``input_ids`` / ``attention_mask`` whose
            output exposes ``last_hidden_state`` and whose ``config`` exposes ``hidden_size``.
    """

    def __init__(self, n_classes, embedding_model):
        super(Bert, self).__init__()
        self.bert = embedding_model
        # Read the width once, through `hidden_size`, for both layers
        # (the original mixed `config.dim` and `config.hidden_size`).
        hidden_size = self.bert.config.hidden_size
        self.pool = nn.Linear(hidden_size, hidden_size)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, n_classes) for a batch of token ids."""
        model_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_states = model_output.last_hidden_state
        # Average only over real tokens: sequences are padded to a fixed length, and a
        # plain mean over axis 1 would mix the padding positions into the embedding.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
        embeddings = (hidden_states * mask).sum(dim=1) / token_counts
        pooled_output = torch.relu(self.pool(embeddings))
        output = self.drop(pooled_output)
        return self.out(output)

In [34]:
# Build the classifier on top of the pretrained encoder (one logit per target column)
# and move it to the selected device.
model = Bert(len(targets), embedding_model=DEFAULT_MODEL_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME))
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
# Number of full passes over the training loader.
EPOCHS = 7

optimizer = AdamW(model.parameters(), lr=2e-5)
# train_epoch steps the scheduler once per batch, so the schedule spans batches-per-epoch * EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS

# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# The model returns raw logits, so the loss applies the sigmoid itself.
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [None]:
# Fine-tune for EPOCHS epochs; `history` gets one row per epoch with the
# train_* / val_* metrics, and the encoder weights of the best epoch are checkpointed.
history = pd.DataFrame()
best_accuracy = 0

start_time = time()
for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f"Previous epoch took {(time() - start_time)/60:.2f} minutes")
    print('-' * 10)

    start_time = time()

    train_scores = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler
    )

    print(f'Train scores {train_scores.to_dict()}')

    val_scores = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device
    )

    print(f'Val scores {val_scores.to_dict()}')
    print()

    train_scores.columns = "train_" + train_scores.columns
    val_scores.columns = "val_" + val_scores.columns
    scores = pd.concat([train_scores, val_scores], axis = 1)
    scores['epoch'] = epoch
    history = pd.concat([history, scores])

    # Keep `best_accuracy` a plain scalar. Storing the one-row Series (as before) made the
    # next epoch's comparison produce a Series, and `if <Series>:` raises ValueError.
    val_accuracy = val_scores['val_accuracy/balanced'].iloc[0]
    if val_accuracy > best_accuracy:
        # Only the encoder (model.bert) is saved; the classification head is not.
        torch.save(model.bert.state_dict(), 'bert_best_model_state.bin')
        best_accuracy = val_accuracy

Epoch 1/7
Previous epoch took 0.00 minutes
----------


In [None]:
# Persist the per-epoch metrics of the run without sentiment features.
history.to_csv("results/Bert_fine_tuning.csv", index=False)

In [None]:
## releasing gpu memory, to load another model
model.cpu()
# The optimizer still holds its per-parameter state tensors (and the scheduler holds the
# optimizer), so drop them together with the model; all three are re-created in the
# cells below before the next training run.
del model, optimizer, scheduler
gc.collect()
torch.cuda.empty_cache()

## Bert fine-tuning with sentiment

In [None]:
class BertSentiment(nn.Module):
    """Classifier on mean-pooled encoder embeddings concatenated with sentiment probabilities.

    Head: Linear(features, features) -> ReLU -> Dropout(0.3) -> Linear(features, n_classes),
    where ``features = hidden_size + number of sentiment classes``.
    The forward pass returns raw logits (no sigmoid / softmax).

    Args:
        n_classes: Number of output logits.
        embedding_model: Encoder called with ``input_ids`` / ``attention_mask`` whose
            output exposes ``last_hidden_state`` and whose ``config`` exposes ``hidden_size``.
        sentiment_model: Classifier called with the same inputs whose output exposes
            ``logits``. Its class count is read from ``config.num_labels`` and falls back
            to 6 (the previously hard-coded width) when that attribute is missing.
    """

    def __init__(self, n_classes, embedding_model, sentiment_model):
        super(BertSentiment, self).__init__()
        self.bert = embedding_model
        self.sentiment = sentiment_model
        hidden_size = self.bert.config.hidden_size
        n_sentiment = getattr(getattr(sentiment_model, 'config', None), 'num_labels', 6)
        feature_size = hidden_size + n_sentiment
        self.pool = nn.Linear(feature_size, feature_size)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(feature_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, n_classes) for a batch of token ids."""
        model_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_states = model_output.last_hidden_state
        # Average only over real tokens: sequences are padded to a fixed length, and a
        # plain mean over axis 1 would mix the padding positions into the embedding.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
        embeddings = (hidden_states * mask).sum(dim=1) / token_counts

        # NOTE(review): the sentiment model receives token ids produced by the encoder's
        # tokenizer — confirm both models share the same vocabulary.
        sentiment = self.sentiment(
            input_ids=input_ids,
            attention_mask=attention_mask)
        sentiment_probs = nn.functional.softmax(sentiment.logits, dim=1)

        embeddings_with_sentiment = torch.cat([embeddings, sentiment_probs], dim=1)
        pooled_output = torch.relu(self.pool(embeddings_with_sentiment))
        output = self.drop(pooled_output)
        return self.out(output)

In [None]:
# Build the sentiment-augmented classifier: a pretrained encoder plus a pretrained
# emotion classifier whose softmaxed logits are concatenated to the pooled embedding.
model = BertSentiment(len(targets),
                      embedding_model=DEFAULT_MODEL_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME),
                      sentiment_model=AutoModelForSequenceClassification.from_pretrained("gokuls/BERT-tiny-emotion-intent")
                     )
model = model.to(device)

In [None]:
# Number of full passes over the training loader.
EPOCHS = 7

# model.parameters() covers the sentiment sub-model too, so it is fine-tuned along with the encoder.
optimizer = AdamW(model.parameters(), lr=2e-5)
# train_epoch steps the scheduler once per batch, so the schedule spans batches-per-epoch * EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS

# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# The model returns raw logits, so the loss applies the sigmoid itself.
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [None]:
# Fine-tune the sentiment-augmented model for EPOCHS epochs; `history` gets one row per
# epoch with the train_* / val_* metrics, and the encoder weights of the best epoch are
# checkpointed.
history = pd.DataFrame()
best_accuracy = 0

start_time = time()
for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f"Previous epoch took {(time() - start_time)/60:.2f} minutes")
    print('-' * 10)

    start_time = time()

    train_scores = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler
    )

    print(f'Train scores {train_scores.to_dict()}')

    val_scores = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device
    )

    print(f'Val scores {val_scores.to_dict()}')
    print()

    train_scores.columns = "train_" + train_scores.columns
    val_scores.columns = "val_" + val_scores.columns
    scores = pd.concat([train_scores, val_scores], axis = 1)
    scores['epoch'] = epoch
    history = pd.concat([history, scores])

    # Keep `best_accuracy` a plain scalar. Storing the one-row Series (as before) made the
    # next epoch's comparison produce a Series, and `if <Series>:` raises ValueError.
    val_accuracy = val_scores['val_accuracy/balanced'].iloc[0]
    if val_accuracy > best_accuracy:
        # Only the encoder (model.bert) is saved; the sentiment model and head are not.
        # NOTE(review): this is the same file name the run without sentiment writes to,
        # so that checkpoint gets overwritten here — confirm this is intended.
        torch.save(model.bert.state_dict(), 'bert_best_model_state.bin')
        best_accuracy = val_accuracy

In [None]:
# Persist the per-epoch metrics of the run with sentiment features.
history.to_csv("results/BertSentiment_fine_tuning.csv", index=False)