In [2]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install peft
!pip install accelerate
# !pip install bitsandbytes
!pip install -U bitsandbytes


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [3]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import (
    LlamaTokenizer,
    LlamaForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
import pandas as pd
import numpy as np
import sentencepiece
from sklearn.metrics import accuracy_score, classification_report
import bitsandbytes as bnb
from torch.cuda.amp import autocast

In [4]:
# get access to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data_path = "/content/drive/MyDrive/Colab Notebooks/ECE1786/project_llama/clean_data_gpt2.csv"

In [6]:
global_train_loader = None
global_val_loader = None
global_test_loader = None

In [7]:
CLASS_LIST = [str(i) for i in list(np.arange(4, 9.5, 0.5))]
CLASS_LIST.insert(0, '<4')
print(CLASS_LIST)

['<4', '4.0', '4.5', '5.0', '5.5', '6.0', '6.5', '7.0', '7.5', '8.0', '8.5', '9.0']


In [8]:
float('4.0') + 0.5

4.5

In [9]:
import numpy as np

def get_acc(prediction, truth):
    total = len(truth)
    prediction = list(prediction)
    truth = list(truth)
    correct = 0
    for pred, label in zip(prediction, truth):
      pred_grade = CLASS_LIST[pred]
      label_grade = CLASS_LIST[label]

      # add conditions
      if pred_grade == "4.0":
        if str(float(pred_grade) + 0.5) == label_grade:
          correct += 1
      elif pred_grade == "9.0":
        if str(float(pred_grade) - 0.5) == label_grade:
          correct += 1
      elif pred_grade == "<4":
        if label_grade == "<4":
          correct += 1
      else:
        if str(float(pred_grade) + 0.5) == label_grade or str(float(pred_grade) - 0.5) == label_grade:
          correct += 1
    return correct / total


In [10]:
# test get_acc
prediction = [0, 0, 1, 1, 11, 11, 9, 9, 8]
truth = [0, 1, 2, 3, 10, 9, 8, 10, 11]
print(get_acc(prediction, truth))

0.5555555555555556


In [11]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=2048):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]


        text = f"{item['prompt']}\n{item['essay']}"


        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(item['label'], dtype=torch.long)
        }

def load_model_and_tokenizer(model_name="NousResearch/Llama-2-7b-hf", num_labels=12):
    print("Loading tokenizer...")
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print("Loading model...")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    model = LlamaForSequenceClassification.from_pretrained(
        model_name,
        device_map="auto",
        num_labels=num_labels,
        quantization_config=quantization_config
    )

    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        target_modules=[
            "q_proj",
            "v_proj",
            "k_proj",
            "o_proj",
            "gate_proj",
            "down_proj",
            "up_proj"
        ]
    )

    print("Applying LoRA...")
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model, tokenizer

def prepare_dataloaders(data_path, tokenizer, batch_size=2):
    print("Loading dataset...")
    df = pd.read_csv(data_path)


    train_df = df.sample(n=2000, random_state=42)
    temp_df = df[~df.index.isin(train_df.index)]
    val_df = temp_df.sample(n=500, random_state=42)
    test_df = temp_df[~temp_df.index.isin(val_df.index)].sample(n=500, random_state=42)


    train_dataset = CustomDataset(train_df.to_dict('records'), tokenizer)
    val_dataset = CustomDataset(val_df.to_dict('records'), tokenizer)
    test_dataset = CustomDataset(test_df.to_dict('records'), tokenizer)


    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

def train_epoch(model, train_loader, optimizer, scheduler, device, epoch):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)


        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss


        loss.backward()


        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)


        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        if batch_idx % 50 == 0:
            print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')

    return total_loss / len(train_loader)

def evaluate(model, eval_loader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

            total_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    no_tolerance_accuracy = accuracy_score(all_labels, all_preds)
    with_tolerance_accuracy = get_acc(all_preds, all_labels)
    return total_loss / len(eval_loader), no_tolerance_accuracy, with_tolerance_accuracy

def main():

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(42)

    BATCH_SIZE = 2
    EPOCHS = 3
    LEARNING_RATE = 1e-4

    model, tokenizer = load_model_and_tokenizer()

    '''train_loader, val_loader, test_loader = prepare_dataloaders(
        '../data/dataset/processed/clean_data_gpt2.csv',
        tokenizer,
        BATCH_SIZE
    )'''
    train_loader, val_loader, test_loader = prepare_dataloaders(
        data_path,
        tokenizer,
        BATCH_SIZE
    )
    global_train_loader = train_loader
    global_val_loader = val_loader
    global_test_loader = test_loader

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1.0,
        end_factor=0.1,
        total_iters=len(train_loader) * EPOCHS
    )

    print("Starting training...")
    best_val_accuracy = 0
    for epoch in range(EPOCHS):
        print(f"\nEpoch {epoch+1}/{EPOCHS}")


        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, epoch)


        val_loss, val_accuracy, tol_val_accuracy = evaluate(model, val_loader, device)
        print(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Tol Val Accuracy: {tol_val_accuracy:.4f}")


        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            model.save_pretrained("./best_model")
            print(f"New best model saved with validation accuracy: {val_accuracy:.4f}")


    print("\nLoading best model for testing...")
    model = PeftModel.from_pretrained(model, "./best_model")
    test_loss, test_accuracy, tol_test_accuracy = evaluate(model, test_loader, device)
    print(f"\nTest Results - Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, tol_test_accuracy: {tol_test_accuracy:.4f}")

In [12]:
torch.cuda.set_per_process_memory_fraction(0.95)
main()

Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Applying LoRA...
trainable params: 20,037,632 || all params: 6,627,430,400 || trainable%: 0.3023
Loading dataset...


  with autocast():
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...

Epoch 1/3


  return fn(*args, **kwargs)


Epoch: 0, Batch: 0, Loss: 6.2454
Epoch: 0, Batch: 50, Loss: 3.7598
Epoch: 0, Batch: 100, Loss: 2.2651
Epoch: 0, Batch: 150, Loss: 1.9980
Epoch: 0, Batch: 200, Loss: 2.8105
Epoch: 0, Batch: 250, Loss: 1.5718
Epoch: 0, Batch: 300, Loss: 2.2275
Epoch: 0, Batch: 350, Loss: 2.3354
Epoch: 0, Batch: 400, Loss: 2.0635
Epoch: 0, Batch: 450, Loss: 3.1006
Epoch: 0, Batch: 500, Loss: 2.8521
Epoch: 0, Batch: 550, Loss: 4.0215
Epoch: 0, Batch: 600, Loss: 3.5752
Epoch: 0, Batch: 650, Loss: 1.9888
Epoch: 0, Batch: 700, Loss: 1.8677
Epoch: 0, Batch: 750, Loss: 2.9707
Epoch: 0, Batch: 800, Loss: 1.5732
Epoch: 0, Batch: 850, Loss: 1.6021
Epoch: 0, Batch: 900, Loss: 2.2158
Epoch: 0, Batch: 950, Loss: 2.2285


  with autocast():


Epoch 1 - Train Loss: 2.6101, Val Loss: 2.1883, Val Accuracy: 0.2180, Tol Val Accuracy: 0.2440
New best model saved with validation accuracy: 0.2180

Epoch 2/3


  with autocast():
  return fn(*args, **kwargs)


Epoch: 1, Batch: 0, Loss: 1.5640
Epoch: 1, Batch: 50, Loss: 2.4688
Epoch: 1, Batch: 100, Loss: 1.7402
Epoch: 1, Batch: 150, Loss: 2.6455
Epoch: 1, Batch: 200, Loss: 1.4051
Epoch: 1, Batch: 250, Loss: 1.6157
Epoch: 1, Batch: 300, Loss: 1.9326
Epoch: 1, Batch: 350, Loss: 2.9443
Epoch: 1, Batch: 400, Loss: 1.3872
Epoch: 1, Batch: 450, Loss: 1.8892
Epoch: 1, Batch: 500, Loss: 3.1807
Epoch: 1, Batch: 550, Loss: 2.7168
Epoch: 1, Batch: 600, Loss: 1.7666
Epoch: 1, Batch: 650, Loss: 2.2031
Epoch: 1, Batch: 700, Loss: 1.7671
Epoch: 1, Batch: 750, Loss: 1.3101
Epoch: 1, Batch: 800, Loss: 2.2153
Epoch: 1, Batch: 850, Loss: 2.0532
Epoch: 1, Batch: 900, Loss: 2.7451
Epoch: 1, Batch: 950, Loss: 1.7063


  with autocast():


Epoch 2 - Train Loss: 2.0685, Val Loss: 2.1708, Val Accuracy: 0.1680, Tol Val Accuracy: 0.3680

Epoch 3/3


  with autocast():
  return fn(*args, **kwargs)


Epoch: 2, Batch: 0, Loss: 3.1680
Epoch: 2, Batch: 50, Loss: 2.2051
Epoch: 2, Batch: 100, Loss: 2.1748
Epoch: 2, Batch: 150, Loss: 1.3638
Epoch: 2, Batch: 200, Loss: 1.9141
Epoch: 2, Batch: 250, Loss: 1.4236
Epoch: 2, Batch: 300, Loss: 1.0947
Epoch: 2, Batch: 350, Loss: 1.2034
Epoch: 2, Batch: 400, Loss: 1.5449
Epoch: 2, Batch: 450, Loss: 1.5439
Epoch: 2, Batch: 500, Loss: 1.3809
Epoch: 2, Batch: 550, Loss: 1.0718
Epoch: 2, Batch: 600, Loss: 1.7798
Epoch: 2, Batch: 650, Loss: 1.7734
Epoch: 2, Batch: 700, Loss: 1.3374
Epoch: 2, Batch: 750, Loss: 1.2886
Epoch: 2, Batch: 800, Loss: 2.4517
Epoch: 2, Batch: 850, Loss: 0.9268
Epoch: 2, Batch: 900, Loss: 1.5195
Epoch: 2, Batch: 950, Loss: 1.8691


  with autocast():


Epoch 3 - Train Loss: 1.6096, Val Loss: 2.1987, Val Accuracy: 0.2400, Tol Val Accuracy: 0.3220
New best model saved with validation accuracy: 0.2400

Loading best model for testing...


  with autocast():



Test Results - Loss: 4.7235, Accuracy: 0.0940, tol_test_accuracy: 0.1180
