In [None]:
from transformers import GPT2Tokenizer, GPT2Model, TrainingArguments, Trainer
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.optim import lr_scheduler
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score


In [None]:
pip install datasets

In [None]:
from datasets import Dataset

In [None]:
# Checkpoint name shared by the tokenizer and by the GPT-2 backbone loaded later.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [None]:
# Release cached GPU memory held by PyTorch; there is nothing to release without a GPU.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
import pandas as pd

# Load the preprocessed lyrics table.
# The CSV is assumed to provide a 'lyrics' text column and a 'Mood_encod'
# column holding the mood label as a number (0-3).
df = pd.read_csv("lyrics_preprocessed.csv")

# Plain array views of the two columns, for use outside the DataFrame.
lyrics, mood = (df[column].values for column in ('lyrics', 'Mood_encod'))



In [None]:
# df['lyrics'].shape
# df['Mood_encod'].value_counts()


In [None]:
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

lyric_texts = df['lyrics'].tolist()
mood_labels = df['Mood_encod'].tolist()  # assumed to contain already-encoded labels

# Tokenize every lyric, truncating and padding, and get PyTorch tensors back.
tokenized_data = tokenizer(
    lyric_texts,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Bundle token ids, attention masks and labels into a single Dataset object.
dataset_columns = {
    'input_ids': tokenized_data['input_ids'],
    'attention_mask': tokenized_data['attention_mask'],
    'labels': mood_labels,
}
dataset = Dataset.from_dict(dataset_columns)
# dataset.set_format("torch")

# dataset = tokenized_data
# dataset['labels'] = torch.tensor(df['Mood_encod'].tolist())
# Split the dataset into 80% train and 20% validation
# train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [None]:
# 80/20 train/validation split.
# The split sizes are derived from the dataset length instead of fixed row
# numbers, so the cell neither fails on a smaller dataset nor silently drops
# rows from a larger one. For 1890 rows this yields the same 1512/378 split.
VAL_FRACTION = 0.2

# Shuffle once with a fixed seed and slice both subsets from the same permutation.
shuffled_dataset = dataset.shuffle(seed=42)
num_val = int(round(len(shuffled_dataset) * VAL_FRACTION))
num_train = len(shuffled_dataset) - num_val

train_dataset = shuffled_dataset.select(range(num_train))
val_dataset = shuffled_dataset.select(range(num_train, len(shuffled_dataset)))

In [None]:
#defining classification head

class GPT2MoodClassifier(nn.Module):
    """GPT-2 backbone followed by a linear mood-classification head.

    Args:
        gpt2_model: Backbone module. It is called as
            ``gpt2_model(input_ids, attention_mask=...)`` and must return a
            sequence whose first element holds the per-token hidden states.
            ``gpt2_model.config.hidden_size`` sets the head's input width.
        num_classes: Number of classes the head predicts.
    """

    def __init__(self, gpt2_model, num_classes = 4):
        super().__init__()
        self.gpt2_model = gpt2_model
        self.classification_head = nn.Linear(gpt2_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask = None, labels = None):
        """Classify each sequence from the hidden state of its last real token.

        Args:
            input_ids: Token ids, one row per sequence.
            attention_mask: Optional mask with 1 for real tokens and 0 for
                padding. When omitted, the final position of every row is used.
            labels: Optional class indices. When given, the cross-entropy loss
                is computed as well.

        Returns:
            The logits tensor when ``labels`` is None; otherwise a dict with
            keys ``"loss"`` and ``"logits"``.
        """
        outputs = self.gpt2_model(input_ids, attention_mask = attention_mask)
        last_hidden_state = outputs[0]
        batch_size, seq_len = last_hidden_state.shape[:2]
        state_device = last_hidden_state.device

        # GPT-2 attends causally, so position 0 only ever sees the first token.
        # The last attended position is the one that has seen the whole lyric.
        if attention_mask is None:
            last_token_idx = torch.full(
                (batch_size,), seq_len - 1, dtype = torch.long, device = state_device
            )
        else:
            positions = torch.arange(seq_len, device = attention_mask.device)
            # Largest position whose mask is 1; valid for left or right padding.
            last_token_idx = (attention_mask.long() * positions).argmax(dim = -1)
            last_token_idx = last_token_idx.to(state_device)

        batch_idx = torch.arange(batch_size, device = state_device)
        pooled_state = last_hidden_state[batch_idx, last_token_idx]
        logits = self.classification_head(pooled_state)

        if labels is None:
            return logits

        # Returning the loss alongside the logits lets a stock Trainer use the model.
        loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}

# Load the pretrained GPT-2 backbone named by `model_name`.
gpt2_model = GPT2Model.from_pretrained(model_name)

# One output unit per distinct mood label present in the data.
distinct_moods = set(mood)
num_classes = len(distinct_moods)

model = GPT2MoodClassifier(gpt2_model, num_classes=num_classes)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Kept as the cell's final expression so the moved model's structure is displayed.
model.to(device)

GPT2MoodClassifier(
  (gpt2_model): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (classification_head): Linear(in_features=768, out_features=4, bias=True)
)

In [None]:
pip install transformers[torch]


In [None]:
pip install accelerate -U

In [None]:
# Cross-entropy criterion; not referenced again in the cells visible here.
loss_fn = nn.CrossEntropyLoss()

# Training configuration; checkpoints are written under "checkpoint_path".
training_args = TrainingArguments(
    "checkpoint_path",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Gradients are accumulated over 4 steps, i.e. 2 * 4 = 8 samples per update per device.
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    # NOTE(review): 1e-3 looks high for fine-tuning a pretrained backbone — confirm.
    learning_rate=1e-3,
    # fp16 mixed precision needs a CUDA device. It is requested only when one is
    # available, so the notebook's CPU fallback does not fail at this cell.
    fp16=torch.cuda.is_available(),
)

In [None]:
# ! pip install evaluate

In [None]:
# import evaluate
import numpy as np
# metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The index of the largest logit
            along the last axis is taken as the predicted class.

    Returns:
        dict: ``{'accuracy_score': value}`` where ``value`` is the result of
        ``accuracy_score(labels, predictions)``.
    """
    class_scores, true_labels = eval_pred
    predicted_labels = np.argmax(class_scores, axis=-1)
    accuracy = accuracy_score(true_labels, predicted_labels)
    return {'accuracy_score': accuracy}

class CustomTrainer(Trainer):
    """Trainer that scores the model's logits with cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs = False, **kwargs):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Model to call. It may return either a logits tensor or a
                dict containing a ``"logits"`` entry.
            inputs: Batch mapping that must contain ``"labels"``; all other
                entries are forwarded to the model as keyword arguments.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra keyword arguments passed by the Trainer (newer
                versions pass ``num_items_in_batch``); accepted and ignored.

        Returns:
            The loss, or ``(loss, {"logits": logits})`` when ``return_outputs``
            is true.
        """
        labels = inputs["labels"]
        # Build the forward arguments without mutating the caller's batch.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs["logits"] if isinstance(outputs, dict) else outputs
        loss = nn.CrossEntropyLoss()(logits, labels)
        # Outputs are returned as a dict: during evaluation the Trainer slices
        # non-dict outputs with [1:], which would drop the first row of a bare
        # logits tensor.
        return (loss, {"logits": logits}) if return_outputs else loss


In [None]:
# Wire the model, training configuration, datasets and metric function together.
# NOTE(review): the stock Trainer is used here, not the CustomTrainer defined
# earlier in this notebook — confirm which one is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the `trainer` configured above; left as the cell's final
# expression so the value it returns is displayed.
trainer.train()