# Fine-tuning Llama 3.1 for sentiment analysis: Part 1

**Introduction**: Multiclass classification is a common use case for traditional ML models. However, now that open source foundation LLMs are so easily accessible, it's worth exploring how easily they can be applied to this domain as well.

This notebook is part 1 of a 2-part series exploring how easily Llama 3.1-8B-Instruct can be fine-tuned for three-part sentiment analysis (positive,
negative, neutral) on the [Dynasent](https://paperswithcode.com/dataset/dynasent) dataset.

**Findings**: The foundation model achieves 71% accuracy utilizing zero-shot prompting, and a fine-tuned version involving quantized LoRA adapters achieves 84% accuracy using 37,500 examples. A smaller dataset involving 3,750 examples achieved 81% accuracy.

**Environment**: Nvidia A100 on Google Colab with 40GB of GPU RAM, and 80GB of CPU RAM. Training completed in less than 45 mins, demonstrating the feasibility of this approach. Google Drive was used for storing access tokens.

**Summary of steps**:

Notebook 1 (this notebook):
  - Download the Dynasent dataset from Hugging Face.
  - Explore, clean, and prepare two smaller datasets:
    - A small dataset involving 3,750 examples.
    - A medium dataset involving 37,500 examples.

Notebook 2:
  - Load dataset, model and tokenizer
  - Execute test with foundation model
  - Fine-tune QLoRA adapters
  - Merge with foundation model
  - Execute test with fine-tuned model
  - Save model and upload to Hugging Face

# Setup: libraries, dependencies, configurations, helper functions

In [None]:
%%capture
%pip install -U transformers
%pip install -U datasets

In [None]:
import pandas as pd
import os
import transformers

from transformers import (AutoTokenizer)

from datasets import (Dataset,
                      DatasetDict,
                      concatenate_datasets,
                      load_dataset,
                      load_from_disk)

from google.colab import userdata
from huggingface_hub import login as hf_login

In [None]:
# mount google drive (the dataset cache and prepared datasets below live under it)
from google.colab import drive
drive.mount('/content/drive/')

# connect to huggingface using the token stored under the 'HF_TOKEN' key
# in Colab's userdata store
hf_auth_token = userdata.get('HF_TOKEN')
hf_login(hf_auth_token)

# base dataset cache config
# the pieces below are combined into the on-disk cache location:
#   <cache base>/<provider>/<namespace>/<dataset>/<part>
base_dataset_cache_base_directory = "/content/drive/MyDrive/.dataset_cache"
base_dataset_provider = "huggingface"
base_dataset_namespace = "dynabench"
base_dataset = "dynasent"
# name of the dataset part passed as the second argument to load_dataset
base_dataset_name_part = "dynabench.dynasent.r1.all"
# "<namespace>/<dataset>" identifier passed to load_dataset
base_dataset_name = f"{base_dataset_namespace}/{base_dataset}"
# resolves to /content/drive/MyDrive/.dataset_cache/huggingface/dynabench/dynasent/dynabench.dynasent.r1.all
base_dataset_cache_directory = f"{base_dataset_cache_base_directory}/{base_dataset_provider}/{base_dataset_name}/{base_dataset_name_part}"

# project config
# directory where the cleaned CSV and the working datasets are written
dataset_directory = "/content/drive/MyDrive/.data"
# the gold_label values kept in the working datasets
labels = ["positive", "negative", "neutral"]

In [None]:
# helper functions

def load_base_dataset(cache_directory, dataset_name, part_name=None, trust_remote_code=True):
  """Return the source dataset, downloading and caching it on first use.

  If `cache_directory` already exists the dataset is loaded from it with
  `load_from_disk`. Otherwise it is downloaded with `load_dataset`, written to
  `cache_directory`, and the freshly downloaded object is returned.

  Args:
    cache_directory: Directory holding (or to hold) the saved dataset.
    dataset_name: Dataset identifier passed to `load_dataset`.
    part_name: Optional second positional argument for `load_dataset`;
      omitted from the call when None.
    trust_remote_code: Forwarded to `load_dataset`. NOTE(review): True lets
      the dataset repository's loading code run locally - only use it with
      sources you trust.

  Returns:
    Whatever `load_dataset` / `load_from_disk` returns for this dataset.

  Raises:
    Any exception from the download or from `save_to_disk`; in the latter
    case the partially written cache directory is removed first.
  """
  import shutil

  # cache hit: nothing to download
  if os.path.exists(cache_directory):
    return load_from_disk(cache_directory)

  # not cached: download
  if part_name is not None:
    dataset = load_dataset(dataset_name, part_name, trust_remote_code=trust_remote_code)
  else:
    dataset = load_dataset(dataset_name, trust_remote_code=trust_remote_code)

  # cache locally; exist_ok avoids failing if the directory appeared in the
  # meantime
  os.makedirs(cache_directory, exist_ok=True)
  try:
    dataset.save_to_disk(cache_directory)
  except BaseException:
    # A failed or interrupted save (including a notebook interrupt) would
    # otherwise leave a directory that the next call mistakes for a valid
    # cache. Remove it so the next call downloads again, then re-raise.
    shutil.rmtree(cache_directory, ignore_errors=True)
    raise

  return dataset

def split_label_subset(label_df, train_size, val_size, test_size, random_state):
  """Shuffle `label_df` and cut it into consecutive train/val/test slices.

  Args:
    label_df: DataFrame to split (the caller passes the rows of one label).
    train_size: Number of rows for the first slice.
    val_size: Number of rows for the second slice.
    test_size: Number of rows for the third slice.
    random_state: Seed passed to `DataFrame.sample` for the shuffle.

  Returns:
    Tuple `(train_subset, val_subset, test_subset)`. The slices are taken
    positionally from the shuffled, re-indexed frame, so they come back
    shorter than requested when `label_df` does not have enough rows.
  """
  shuffled = label_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  # cumulative cut points: [0, train_end, val_end, test_end]
  cut_points = [0]
  for part_size in (train_size, val_size, test_size):
    cut_points.append(cut_points[-1] + part_size)

  train_subset, val_subset, test_subset = (
    shuffled.iloc[start:stop]
    for start, stop in zip(cut_points[:-1], cut_points[1:])
  )

  return train_subset, val_subset, test_subset

# note: dataset sizes applied per label
def create_working_dataset(df, labels, train_size, val_size, test_size, random_state):
  """Build label-balanced train/validation/test frames from `df`.

  The sizes are applied per label: each split receives `*_size` rows for every
  entry in `labels`, selected on the "gold_label" column.

  Args:
    df: DataFrame with a "gold_label" column.
    labels: Label values to draw rows for.
    train_size: Rows per label in the train split.
    val_size: Rows per label in the validation split.
    test_size: Rows per label in the test split.
    random_state: Seed used for every shuffle, making the result reproducible.

  Returns:
    Tuple `(train_df, val_df, test_df, final_df)`. Each split frame carries a
    "split" column ("train", "validation" or "test"); `final_df` is the three
    frames concatenated in that order with a fresh index.

  Raises:
    ValueError: If the combined frame does not contain
      `(train_size + val_size + test_size) * len(labels)` rows, which happens
      when a label has too few rows in `df`.
  """
  # shuffle once so sampling is random
  df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  # collect subsets per label
  train_list, val_list, test_list = [], [], []
  for label in labels:
    label_df = df[df["gold_label"] == label]
    train_df, val_df, test_df = split_label_subset(label_df, train_size, val_size, test_size, random_state)
    train_list.append(train_df)
    val_list.append(val_df)
    test_list.append(test_df)

  # concatenate the per-label pieces and reshuffle so labels are interleaved
  train_df = pd.concat(train_list, axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)
  val_df  = pd.concat(val_list, axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)
  test_df  = pd.concat(test_list, axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)

  # label each split, then combine again
  train_df["split"] = "train"
  val_df["split"]  = "validation"
  test_df["split"]  = "test"

  final_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

  # validate with an explicit raise: the check must survive `python -O`, and
  # the message is built with an f-string because str + int raises TypeError
  total_size = (train_size + val_size + test_size) * len(labels)
  if len(final_df) != total_size:
    raise ValueError(
      f"Final dataset should be {total_size} examples total, got {len(final_df)}."
    )

  return train_df, val_df, test_df, final_df

def save_working_dataset(train_df, val_df, test_df, final_df):
  """Write a working dataset to `dataset_directory` as CSV and as a DatasetDict.

  The output names are derived from the row count of `final_df`.

  Args:
    train_df: Train split; must contain a "split" column.
    val_df: Validation split; must contain a "split" column.
    test_df: Test split; must contain a "split" column.
    final_df: All splits combined; written to the CSV as-is.
  """
  size = len(final_df)
  csv_path = f"{dataset_directory}/sentiment_analysis_{size}.csv"
  hf_path = f"{dataset_directory}/sentiment_analysis_{size}"

  # the CSV keeps the "split" column so one flat file describes all splits
  final_df.to_csv(csv_path, index=False)

  # the huggingface version encodes the split in the dict key instead, so
  # the helper column is dropped from each frame
  frames_by_split = {"train": train_df, "validation": val_df, "test": test_df}
  hf_dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.drop(columns=["split"]))
    for split_name, frame in frames_by_split.items()
  })
  hf_dataset.save_to_disk(hf_path)

def create_and_save_working_dataset(df, labels, train_size, val_size, test_size, random_state):
  """Create a balanced working dataset from `df` and save it.

  Passes its arguments to `create_working_dataset` and hands the four
  resulting frames to `save_working_dataset`. Returns nothing.
  """
  train_df, val_df, test_df, final_df = create_working_dataset(
    df=df,
    labels=labels,
    train_size=train_size,
    val_size=val_size,
    test_size=test_size,
    random_state=random_state,
  )

  save_working_dataset(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    final_df=final_df,
  )

# Load, inspect, and save source dataset

In [None]:
# fetch the source dataset (downloaded on the first run, read from the cache afterwards)
dataset = load_base_dataset(
  cache_directory=base_dataset_cache_directory,
  dataset_name=base_dataset_name,
  part_name=base_dataset_name_part,
)
print(dataset)

# inspect the train split as a dataframe: columns, dtypes, non-null counts
df = dataset["train"].to_pandas()
df.info()

In [None]:
# show the first rows of the dataframe for a quick visual check
df.head()

In [None]:
# label distribution: row count per gold_label plus its share of all rows,
# largest share first
result = (
  df.groupby('gold_label').size().reset_index(name='count')
    .assign(**{'%': lambda counts: counts['count'] / counts['count'].sum() * 100})
    .sort_values('%', ascending=False)
)
print(result)

# Prepare working dataset

In [None]:
# ----------------------------------------------------------------------------
# Convert to Pandas dataframe, clean, and save
# ----------------------------------------------------------------------------

# Combine all splits into one; the working datasets are re-split from this pool
combined_dataset = concatenate_datasets([
  dataset["train"],
  dataset["validation"],
  dataset["test"]
])

# convert to pandas
df = combined_dataset.to_pandas()

# keep only the columns of interest
df = df[["sentence", "gold_label"]]

# drop rows where these columns are null (NaN)
# (non-inplace: mutating a frame derived by column selection in place is the
# usual trigger for pandas' SettingWithCopyWarning)
df = df.dropna(subset=["sentence", "gold_label"])

# drop rows where labels are not the desired labels
df = df[df["gold_label"].isin(labels)]

# drop rows where 'sentence' is an empty string (after stripping whitespace)
df = df[df["sentence"].str.strip() != ""]

# reset index for cleanliness
df = df.reset_index(drop=True)

# save an intermediate CSV of the combined/cleaned data
os.makedirs(dataset_directory, exist_ok=True)
df.to_csv(f"{dataset_directory}/sentiment_analysis_clean.csv", index=False)

In [None]:
# load cleaned
# keep_default_na=False: by default read_csv converts cells such as "NA",
# "N/A", "null" or "None" to NaN, which would turn a sentence consisting of
# exactly that text back into a missing value after cleaning already removed
# null and empty rows
df = pd.read_csv(f"{dataset_directory}/sentiment_analysis_clean.csv", keep_default_na=False)

# create two datasets - size: 3750, 37500 (sizes below are per label, 3 labels)
create_and_save_working_dataset(df, labels, train_size=1000, val_size=125, test_size=125, random_state=42)
create_and_save_working_dataset(df, labels, train_size=10000, val_size=1250, test_size=1250, random_state=42)

# Validate

In [None]:
# reload both saved working datasets and print their structure as a sanity check
working_dataset_paths = [
  f"{dataset_directory}/sentiment_analysis_3750",
  f"{dataset_directory}/sentiment_analysis_37500",
]
for working_dataset_path in working_dataset_paths:
  ds = load_from_disk(working_dataset_path)
  print(ds)