# 📓 Setting up notebook

In [None]:
import os

# True when the string form of the active IPython shell mentions 'google.colab'.
RunningInCOLAB = 'google.colab' in str(get_ipython())

if RunningInCOLAB:
  # Mount Google Drive at /content/drive (remounting if it is already mounted)
  # so the project folder below becomes reachable.
  from google.colab import drive
  drive.mount("/content/drive", force_remount=True)

# change path to where you cloned the repo
# NOTE(review): this default is a Google Drive path; when running outside Colab it
# must be edited, otherwise os.chdir fails.
data_path = "/content/drive/MyDrive/Technion/Year 4/Deep Learning/Deep Course Dozo/Project"
# Make the project folder the working directory: later cells use relative paths
# such as "labeled_lyrics.csv" and "tokenized_ds".
os.chdir(data_path)

# Hugging Face checkpoint name used for both the tokenizer and the models below.
model_name = "bert-base-uncased"

Mounted at /content/drive


In [None]:
# installing dependencies
if RunningInCOLAB:
  !pip install transformers datasets evaluate accelerate peft wandb
else:
  %pip install transformers datasets evaluate accelerate peft wandb

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.1-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.3/297.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Col



------






# ⏬ Imports

In [None]:
import pandas as pd
import torch
import numpy as np
import torchtext
from torch.utils.data import Dataset, DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchdata.datapipes.iter import IterDataPipe,IterableWrapper
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import math
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from sklearn.metrics import mean_squared_error
from typing import Dict, Any, Union
import datasets
from datetime import datetime

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 🏋 Setting up WandB

If you are only training, you can skip this section, unless you want to load hyperparameters from a WandB sweep or report your training runs to WandB.


In [None]:
# Log in to the Weights & Biases profile. As the captured output shows, this may
# prompt for an API key, which is then appended to the netrc file.
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# W&B settings for the hyperparameter sweep: the project the sweep is created under
# (read by the `wandb.sweep` call further down).
config = {"wandb": {"project": "Song Sentiment Analysis Sweeps"}}

# 🎶 Loading + Tokenizing Dataset

## Configuration


In [None]:
# Set to True if you want to load an existing tokenized dataset.
load_tokenized = False

# If load_tokenized is True, the dataset is loaded from here.
# If it is False, the freshly tokenized dataset is saved here.
tokenized_path = "tokenized_ds"

# Fraction of the dataset to use; change according to mode.
# We used 0.1 for the hyperparameter sweep and 0.4 for training.
ds_frac = 0.00001

## Code

In [None]:
# Load the tokenizer that matches `model_name` (the name itself is set in the setup cell).
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
  """Tokenize the lyrics stored under the "seq" key of `examples`.

  Truncation is enabled, so inputs longer than the tokenizer's maximum length
  are cut rather than rejected. Padding is not applied here.

  Returns:
    Whatever the module-level `tokenizer` returns for the given lyrics.
  """
  lyrics = examples["seq"]
  return tokenizer(lyrics, truncation=True)

In [None]:
from datasets import load_from_disk  # Importing necessary libraries

def load_and_tokenize(frac, csv_path="labeled_lyrics.csv", test_size=0.2, seed=42):
  """Sample the labeled-lyrics CSV, split it into train/test and tokenize both splits.

  Args:
    frac: Fraction of the CSV rows to keep (forwarded to `DataFrame.sample`).
    csv_path: CSV file to read; only its "seq" and "label" columns are used.
    test_size: Share of the sampled rows that goes into the "test" split.
    seed: Seed used for the sampling, the shuffle and the train/test split.

  Returns:
    The train/test split after mapping `preprocess_function` over it in batches.
  """
  full_df = pd.read_csv(csv_path)
  sampled_df = full_df.sample(frac=frac, random_state=seed)  # Sample a fraction of the DataFrame

  # Create a Hugging Face Dataset from the two columns we need and shuffle it.
  ds = Dataset.from_pandas(sampled_df[["seq","label"]]).shuffle(seed=seed)

  # Seed the split as well: without it the train/test membership changes on every
  # run even though the sampling and shuffling above are seeded.
  split_ds = ds.train_test_split(test_size=test_size, seed=seed)

  # Tokenize both splits using preprocess_function.
  tokenized_splits = split_ds.map(preprocess_function, batched=True)
  return tokenized_splits

# Obtain the tokenized dataset: load it from disk when requested, otherwise build it.
# Start from None so that a failed load can never leave a stale `tokenized` from an
# earlier execution of this cell in place.
tokenized = None

if load_tokenized:
  try:
    tokenized = load_from_disk(tokenized_path)  # Attempt to load the tokenized dataset from disk
    print('Loaded existing tokenized dataset')
  except Exception as e:
    # Report the problem and fall back to tokenizing from scratch, so the cells
    # below always find a usable `tokenized`.
    print(e)
    print('Could not load a tokenized dataset from', tokenized_path, '- creating it instead')

if tokenized is None:  # Nothing was loaded: create the tokenized dataset and cache it
  tokenized = load_and_tokenize(ds_frac)
  tokenized.save_to_disk(tokenized_path)
  # Report the real location. The previous message used "\tokenized_ds", whose "\t"
  # was printed as a tab, and it ignored `tokenized_path`.
  print('Saved tokenized dataset at', os.path.abspath(tokenized_path))

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving tokenized dataset at  /content/drive/MyDrive/Technion/Year 4/Deep Learning/Deep Course Dozo/Project 	okenized_ds


In [None]:
# Defining our data collator. It is built from the tokenizer and, as the repr below
# shows, uses padding=True and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Bare expression: displays the collator's configuration as the cell output.
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')



---



# 📭 Defining Metrics

In [None]:
import evaluate
# Loads the "mse" metric. compute_metrics calls it with squared=False, which is why
# the reported number is the root of the mean squared error (RMSE).
loss = evaluate.load("mse")

Downloading builder script:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

In [None]:
# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
  """Score a batch of evaluation predictions with the module-level `loss` metric.

  Args:
    eval_pred: A pair `(predictions, labels)`; the predictions are flattened to
      one dimension before scoring.

  Returns:
    The result of `loss.compute(..., squared=False)`. In the captured training
    output the value appears in the "Mse" column and equals the square root of
    the validation loss, i.e. it is an RMSE.
  """
  raw_predictions, references = eval_pred
  flat_predictions = raw_predictions.reshape(-1)
  return loss.compute(predictions=flat_predictions, references=references, squared=False)

# 🕔 Finding Optimal Hyper Parameters With WandB Sweep

## Configuration

Choose whether to continue an existing sweep or create a new one.

In [None]:
# Choose whether to load an existing sweep and continue it.
load_sweep = False
# This is where the sweep is saved locally.
output_dir = "Sweeps"

if load_sweep:
  # Insert the ID of the sweep to continue here.
  sweep_id = ""

else:
  # Objective of the search: minimise the evaluation loss.
  metric = dict(name='eval/loss', goal='minimize')

  # hyperparameters for sweep, set as you wish
  parameters_dict = dict(
      # Fixed values; lists tried previously are kept as hints.
      epochs=dict(value=1),                        # e.g. 'values': [5, 10, 15]
      per_device_train_batch_size=dict(value=16),  # e.g. 'values': [8, 12, 16]
      per_device_eval_batch_size=dict(value=16),   # e.g. 'values': [8, 12, 16]
      # Sampled from a log-uniform distribution between min and max.
      learning_rate=dict(distribution='log_uniform_values', min=1e-6, max=1e-3),
      weight_decay=dict(distribution='log_uniform_values', min=1e-3, max=1e-1),
      # Picked from a discrete set of choices.
      lr_scheduler_type=dict(values=['linear', 'cosine', 'polynomial', 'reduce_lr_on_plateau']),
      lora_r=dict(values=[4, 8, 16]),
  )

  sweep_config = dict(
      # Name under which wandb saves the sweep.
      # Alternative: 'Song Analysis Sweep ' + now.strftime("%d/%m/%Y %H:%M:%S")
      name='Song Analysis Sweep - Friday',
      method='bayes',
      metric=metric,
      parameters=parameters_dict,
  )

  # Register the sweep in the project configured in the WandB section.
  sweep_id = wandb.sweep(sweep_config, project=config["wandb"]["project"])

Create sweep with ID: d4izrwx7
Sweep URL: https://wandb.ai/zoharmilman/Song%20Sentiment%20Analysis%20Sweeps/sweeps/d4izrwx7


## Code

In [None]:
def train(config=None):
  """Run one sweep trial: fine-tune a LoRA-wrapped regression model and log it to W&B.

  Intended to be passed to `wandb.agent`. The hyperparameters of the trial are
  read from `wandb.config` after the run has been initialised.

  Args:
    config: Optional configuration forwarded to `wandb.init`.
  """
  # Initialize Weights & Biases run with the provided configuration
  run = wandb.init(config=config)

  # Set sweep configuration
  config = wandb.config
  run.name = f'lora_r: {config.lora_r} learning_rate: {config.learning_rate} weight_decay {config.weight_decay} lr_scheduler_type: {config.lr_scheduler_type}'

  # Give every trial its own checkpoint folder. With a shared folder each trial
  # writes to the same "checkpoint-<step>" directories as the previous one, which
  # triggers the "already exists and is non-empty ... saved results may be invalid"
  # warning and mixes files from different trials.
  run_output_dir = os.path.join(output_dir, run.id)

  # Set training arguments for the Trainer
  training_args = TrainingArguments(
      output_dir=run_output_dir,
      report_to='wandb',  # Turn on Weights & Biases logging
      num_train_epochs=config.epochs,
      learning_rate=config.learning_rate,
      weight_decay=config.weight_decay,
      per_device_train_batch_size=config.per_device_train_batch_size,
      per_device_eval_batch_size=config.per_device_eval_batch_size,
      lr_scheduler_type=config.lr_scheduler_type,
      save_strategy='epoch',
      evaluation_strategy='epoch',
      logging_strategy='epoch',
      load_best_model_at_end=True,
  )

  # Configure the LoRA adapter; alpha is tied to twice the rank.
  lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=['word_embeddings', 'position_embeddings', 'token_type_embeddings', 'query', 'key', 'value'],
    r=config.lora_r,
    lora_alpha=2*config.lora_r,
    lora_dropout=0.01
  )

  # Factory handed to the Trainer: builds a fresh single-output regression model
  # and wraps it with the LoRA configuration above.
  def model_init(trial):
      return get_peft_model(AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="regression", num_labels=1), lora_config)

  # Initialize the Trainer
  trainer = Trainer(
      model_init=model_init,
      args=training_args,
      data_collator=data_collator,
      train_dataset=tokenized['train'],  # `tokenized` holds the 'train' and 'test' splits
      eval_dataset=tokenized['test'],
      compute_metrics=compute_metrics
  )

  # Start training the model
  trainer.train()


In [None]:
# Running the sweep: the agent calls `train` for `count` trials of the sweep
# identified by `sweep_id` (three trials, as the captured output shows).
wandb.agent(sweep_id, train, count=3)

[34m[1mwandb[0m: Agent Starting Run: pt0dfzl8 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1.0529429978497931e-06
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler_type: reduce_lr_on_plateau
[34m[1mwandb[0m: 	per_device_eval_batch_size: 16
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.003158069771093345
[34m[1mwandb[0m: Currently logged in as: [33mzoharmilman[0m. Use [1m`wandb login --relogin`[0m to force relogin


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse
1,0.0237,0.066164,0.257224


Checkpoint destination directory Sweeps/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▁
eval/mse,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.06616
eval/mse,0.25722
eval/runtime,0.738
eval/samples_per_second,1.355
eval/steps_per_second,1.355
train/epoch,1.0
train/global_step,1.0
train/grad_norm,5.8751
train/learning_rate,0.0
train/loss,0.0237


[34m[1mwandb[0m: Agent Starting Run: qnp0qsxq with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 3.455826951779469e-06
[34m[1mwandb[0m: 	lora_r: 8
[34m[1mwandb[0m: 	lr_scheduler_type: cosine
[34m[1mwandb[0m: 	per_device_eval_batch_size: 16
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.09747836320074936


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse
1,0.0237,0.066731,0.258323


Checkpoint destination directory Sweeps/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.


VBox(children=(Label(value='0.001 MB of 0.022 MB uploaded\r'), FloatProgress(value=0.046814271365561815, max=1…

0,1
eval/loss,▁
eval/mse,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.06673
eval/mse,0.25832
eval/runtime,0.9922
eval/samples_per_second,1.008
eval/steps_per_second,1.008
train/epoch,1.0
train/global_step,1.0
train/grad_norm,5.8751
train/learning_rate,0.0
train/loss,0.0237


[34m[1mwandb[0m: Agent Starting Run: 6di13o5g with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0006727213564623558
[34m[1mwandb[0m: 	lora_r: 4
[34m[1mwandb[0m: 	lr_scheduler_type: cosine
[34m[1mwandb[0m: 	per_device_eval_batch_size: 16
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.08876039040138685


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse
1,0.0002,0.328084,0.572786


Checkpoint destination directory Sweeps/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.


VBox(children=(Label(value='0.001 MB of 0.022 MB uploaded\r'), FloatProgress(value=0.04685589519650655, max=1.…

0,1
eval/loss,▁
eval/mse,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,0.32808
eval/mse,0.57279
eval/runtime,0.7169
eval/samples_per_second,1.395
eval/steps_per_second,1.395
train/epoch,1.0
train/global_step,1.0
train/grad_norm,0.62304
train/learning_rate,0.0
train/loss,0.0002


# 📉 Training the Model

## Hyperparameter Configuration

In [None]:
# Set the hyperparameters for the final training run here.
best_params = dict(
    learning_rate=8.637e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.00862,
    lora_r=4,  # LoRA rank; lora_alpha is derived from it as 2 * lora_r
    lr_scheduler_type="reduce_lr_on_plateau",
)

## Training Configuration

In [None]:
# This is for WandB: the project under which the training run will be saved.
config_train = {"wandb": {"project": "Song Sentiment Analysis Run"}}

# Choose if you want to continue a run.
load_run = False
# The run id to continue. By default it is the last run started in the session.
run_id_to_continue = ""
# Name under which WandB will save the run.
run_name = "Testing"

# Directory where checkpoints will be saved.
# Note: this overwrites the `output_dir` set in the sweep configuration cell.
output_dir = "Testing"
# Choose if you want to load a checkpoint; the latest one is picked automatically.
load_checkpoint = True
# Set to True if you don't want to use the latest checkpoint.
load_custom_checkpoint = False
# Path to a checkpoint folder; only used when load_custom_checkpoint is True.
custom_checkpoint_dir = ""
# Directory that is searched for checkpoints. It should contain checkpoint folders.
checkpoints_dir = output_dir

## Code

In [None]:
# Configuring our model

# LoRA adapter configuration for the final training run.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    # The comma after 'token_type_embeddings' matters: without it Python joins the
    # two adjacent string literals into 'token_type_embeddingsquery', so neither
    # the token-type embeddings nor the query projections are targeted. The list
    # now matches the one used in the sweep's `train` function.
    target_modules=[
        'word_embeddings', 'position_embeddings', 'token_type_embeddings',
        'query', 'key', 'value',
    ],
    r=best_params['lora_r'],
    lora_alpha=best_params['lora_r']*2,
    lora_dropout=0.01
)


# Trainer settings: hyperparameters come from `best_params`; evaluation and
# checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to='wandb',  # Turn on Weights & Biases logging
    num_train_epochs=best_params['num_train_epochs'],
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=best_params["per_device_eval_batch_size"],
    weight_decay=best_params['weight_decay'],
    lr_scheduler_type=best_params['lr_scheduler_type'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # save_steps=2000,
    # eval_steps=2000,
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [None]:
import os

def list_sorted_files(directory_path):
    """Return the "checkpoint-<number>" entries of a directory, ordered by number.

    Entries whose name does not consist of the prefix "checkpoint-" followed only
    by digits are ignored. Only entry names are returned, not full paths.

    Args:
        directory_path: Directory to scan.

    Returns:
        A list of entry names sorted ascending by the integer after the prefix.
    """
    prefix = "checkpoint-"

    def step_of(entry):
        # Text that follows the "checkpoint-" prefix.
        return entry[len(prefix):]

    checkpoints = []
    for entry in os.listdir(directory_path):
        if entry.startswith(prefix) and step_of(entry).isdigit():
            checkpoints.append(entry)

    # Numeric ordering, so that e.g. checkpoint-10 comes after checkpoint-2.
    checkpoints.sort(key=lambda entry: int(step_of(entry)))
    return checkpoints

In [None]:
# Either continue from saved adapter weights or train a new adapter from scratch.
if load_checkpoint or load_custom_checkpoint:
  try:
    print("Trying to resume from checkpoint")
    if load_custom_checkpoint:
      path_to_latest_checkpoint = custom_checkpoint_dir
    else:
      available_checkpoints = list_sorted_files(checkpoints_dir)
      if not available_checkpoints:
        # Explicit message instead of the opaque "list index out of range".
        raise FileNotFoundError(f"No checkpoint-<step> folders found in '{checkpoints_dir}'")
      # The list is sorted by step, so the last entry is the latest checkpoint.
      path_to_latest_checkpoint = os.path.join(checkpoints_dir, available_checkpoints[-1])
      print("Latest checkpoint found at: ", path_to_latest_checkpoint)

    # Load the base model for sequence classification
    model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="regression", num_labels=1)
    # Load the saved adapter on top of the base model. is_trainable=True is required
    # here: by default PEFT loads adapters frozen (inference only), which defeats
    # the purpose of continuing the training.
    final_model = PeftModel.from_pretrained(model, path_to_latest_checkpoint, is_trainable=True)
    final_model.to(device)  # Move the model to the selected device

    if load_run:
      print('resuming run ', run_id_to_continue)
      # Initialize Weights & Biases run to resume training
      run = wandb.init(config=config_train, id=run_id_to_continue, resume="must")
      run.name = run_name

    # Initialize the Trainer with the loaded model
    trainer = Trainer(
      model=final_model,
      args=training_args,
      train_dataset=tokenized["train"],
      eval_dataset=tokenized["test"],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
    )

    # Continue training from the loaded weights.
    # NOTE(review): train() is called without resume_from_checkpoint, so only the
    # weights are restored; the epoch counter restarts at 1 and the existing
    # checkpoint-<step> folders get overwritten (see the captured output).
    trainer.train()

  except Exception as e:
    print(e)

else:  # If no checkpoint to load, train from scratch
  final_model = get_peft_model(AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="regression", num_labels=1), lora_config)
  final_model.to(device)

  run = wandb.init(config=config_train)  # Initialize Weights & Biases run for training
  run.name = run_name

  run_id_to_continue = run.id  # Remember the ID of the current run so it can be continued later

  # Initialize the Trainer to train from scratch
  trainer = Trainer(
    model=final_model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
  )

  print("Training from scratch, output=", training_args.output_dir)
  trainer.train()  # Start training the model from scratch


Trying to resume from checkpoint
Latest checkpoint found at:  Testing/checkpoint-20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.080324,0.283415
2,No log,0.078232,0.279699
3,No log,0.076071,0.27581
4,No log,0.073946,0.27193
5,No log,0.071854,0.268056
6,No log,0.069766,0.264132
7,No log,0.067757,0.260302
8,No log,0.065745,0.256407
9,No log,0.063767,0.252522
10,No log,0.061809,0.248615


Checkpoint destination directory Testing/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing/checkpoint-2 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing/checkpoint-3 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing/checkpoint-4 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing/checkpoint-5 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing/checkpoint-6 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Testing/checkpoint-7 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint de

# ⚡ Inference


## Configuration

In [None]:
# Checkpoint folders used for inference, given relative to the working directory.
lora_model_path = "Model 07_04_2024__07_23_59/checkpoint-13728" # Path to the LoRA model dir
frozen_bert_regression_head_path = "Model 07_04_2024__12_07_22/checkpoint-41171" # Path to the BERT model with a trained regression head; its model.safetensors file is read below

## Code

In [None]:
# Load the base AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="regression", num_labels=1)

# Load the LoRA adapter and merge it into the base model
model_to_merge = PeftModel.from_pretrained(model, lora_model_path)
final_model = model_to_merge.merge_and_unload()  # Merge the models

# The inference cell moves its inputs to `device`, so the model has to live there
# too; otherwise the forward pass fails on a GPU runtime. eval() keeps dropout off.
final_model = final_model.to(device)
final_model.eval()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load original model for comparison
# Import the submodule explicitly: `import safetensors` on its own does not load
# `safetensors.torch`, so the call below only worked if another library had
# already imported it.
import safetensors.torch
from torch.nn import Parameter

# Weights of the checkpoint that holds the trained regression head.
adapters_weights = safetensors.torch.load_file(os.path.join(frozen_bert_regression_head_path, "model.safetensors"))
original_model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="regression", num_labels=1)

# print(adapters_weights.keys())

# Loading regression head: only the classifier's bias and weight are replaced, the
# rest of the model keeps the pretrained weights.
original_model.classifier.bias = Parameter(adapters_weights['classifier.bias'])
original_model.classifier.weight = Parameter(adapters_weights['classifier.weight'])

# The inference cell moves its inputs to `device`, so the model has to live there too.
original_model = original_model.to(device)
original_model.eval()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Loading the full labeled-lyrics CSV to play around with. The inference cell reads
# its "seq", "song", "artist" and "label" columns.
full_df = pd.read_csv("labeled_lyrics.csv")

In [None]:
import random

# Some nice examples:
# 167 - Baby its cold outside
# 363 - Let it Snow!
# 60990 - Come on Home. An interesting case where one could argue that the data is bad and the model gives a better result than the label.
# 52124 - They Breed
# 3526 - Before He Cheats
# 9898 - Sweet Caroline
# 110063 - Ob-La-Di, Ob-La-Da
# 122763 - Jolene
index = 122763 # random.randrange(len(full_df["seq"])) for a random song
test = full_df['seq'][index] # You can use either a song from the df at index or a custom string.

# Truncate exactly as preprocess_function does for training: the tokenizer reports a
# model_max_length of 512, and longer lyrics would otherwise break the forward pass.
inputs = tokenizer(test, return_tensors="pt", truncation=True).to(device)
with torch.no_grad():
    original_output = original_model(**inputs)
    outputs = final_model(**inputs)


# Print the metadata of the chosen song; for input without a usable row in the
# DataFrame the lookups fail and only 'No label' is printed.
try:
  print('Song name: ', full_df['song'][index])
  print(test, '\n')
  print('Written by: ', full_df['artist'][index], '\n')
  print('Index: ', index)
  print('Real label: ', full_df["label"][index])
except Exception:  # not a bare except, so Ctrl-C / kernel interrupts still work
  print('No label')

print('bert model output: ', original_output.logits)
print('our model output: ', outputs.logits)

Song name:  Jolene
Jolene, Jolene, Jolene, Jolene
I'm begging of you please don't take my man
Jolene, Jolene, Jolene, Jolene
Please don't take him just because you can

Your beauty is beyond compare
With flaming locks of auburn hair
With ivory skin and eyes of emerald green
Your smile is like a breath of spring
Your voice is soft like summer rain
And I cannot compete with you
Jolene

He talks about you in his sleep
And there's nothing I can do to keep
From crying when he calls your name
Jolene

And I can easily understand
How you could easily take my man
But you don't know what he means to me
Jolene

Jolene, Jolene, Jolene, Jolene
I'm begging of you please don't take my man
Jolene, Jolene, Jolene, Jolene
Please don't take him just because you can

You could have your choice of men
But I could never love again
He's the only one for me
Jolene

I had to have this talk with you
My happiness depends on you
And whatever you decide to do
Jolene

Jolene, J