In [1]:
%pip install trl numpy peft transformers datasets bitsandbytes huggingface_hub tqdm

Collecting trl
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.4-py3-none-any.whl.metadata (7.9 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from datasets import load_dataset, Dataset
from scipy.special import softmax
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix
from transformers import pipeline, set_seed, TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, DataCollatorWithPadding
from huggingface_hub import notebook_login, login
import json
import pandas as pd
import os
from tqdm import tqdm
import random

2024-05-28 09:18:11.307132: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-28 09:18:11.307255: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-28 09:18:11.398702: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Checkpoint identifier used by the tokenizer/model loading cell below.
# Exactly one `model_name` assignment should be active; the commented-out lines
# are alternative checkpoints kept for quick switching.
#model_name = 'meta-llama/Meta-Llama-3-8B'
#model_name = 'meta-llama/Llama-2-7b-hf'
model_name = 'meta-llama/Llama-2-13b-hf'
#model_name = 'meta-llama/Llama-2-70b-hf'
#model_path = "/kaggle/input/llama-3/transformers/8b-hf/1"

#model_name = "unsloth/llama-3-70b-bnb-4bit"

#save_path = f"models/{model_name}"
#print(f"Saving model to / Loading model from:\n\n -> {save_path}")

In [4]:
# Quantization configuration: load the model weights in 4-bit and use float16
# as the compute dtype. Passed to `from_pretrained` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    # Optional quantization type, currently disabled (the keyword is spelled `bnb_4bit_quant_type`).
    #bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
)

In [5]:
# Log in to the Hugging Face Hub; in a notebook this renders a token-entry widget.
# NOTE(review): presumably required because the meta-llama checkpoints are access-gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the BOS token as the padding token.
# NOTE(review): presumably the tokenizer ships without a pad token — confirm; EOS is the more common stand-in.
tokenizer.pad_token = tokenizer.bos_token

# Load the model with a 4-way sequence-classification head, quantized via `bnb_config`
# and placed on the available devices automatically.
# NOTE(review): the classification head on top of a base checkpoint is presumably
# freshly initialised (untrained) — verify before trusting any accuracy number.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    #model_path,
    num_labels=4,  # Adjust if necessary based on the specific task
    quantization_config=bnb_config,
    device_map="auto",
    #device_map=device_map,
    #load_in_8bit_fp32_cpu_offload=True,
)
# Keep the model config's pad token id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id

# Test the Model

In [None]:
# Load the HellaSwag dataset
# NOTE(review): trust_remote_code=True presumably allows the dataset's own loading
# script to run locally — confirm this is acceptable for the environment.
dataset = load_dataset('hellaswag', trust_remote_code=True)

# Print the structure of the dataset (train split only)
print(dataset['train'])

In [None]:
# Sampling configuration. Seeding `random` makes the drawn few-shot examples and
# test questions identical across kernel restarts, so accuracy is comparable between runs.
SAMPLING_SEED = 42
N_SHOT_EXAMPLES = 10
N_TEST_QUESTIONS = 25

random.seed(SAMPLING_SEED)

# Draw row indices and fetch only those rows, instead of converting every split
# into a full Python list just to pick a handful of entries.
train_split = dataset['train']
validation_split = dataset['validation']

# Sample the few-shot examples from the training set
train_examples = [
    train_split[row] for row in random.sample(range(len(train_split)), N_SHOT_EXAMPLES)
]

# Sample the test questions from the validation set
test_questions = [
    validation_split[row] for row in random.sample(range(len(validation_split)), N_TEST_QUESTIONS)
]

In [None]:
# Functions to create the prompt for the model from few-shot examples and a test question
def _format_question(example):
    """Render the context and the numbered (0-based) options of a single example."""
    parts = [f"Context: {example['ctx']}\n", "Options:\n"]
    parts.extend(f"Option {i}: {ending}\n\n" for i, ending in enumerate(example['endings']))
    return "".join(parts)


def create_inputs(ten_shot_examples, test_example):
    """
    Combines the few-shot examples with the test example into one prompt in Meta Llama 2 chat format.

    Args:
        ten_shot_examples: Iterable of solved examples; each item is a mapping with the keys
            'ctx' (context text), 'endings' (sequence of candidate endings) and
            'label' (index of the correct ending, anything accepted by ``int()``).
        test_example: Mapping with the keys 'ctx' and 'endings' for the question to be solved.
            Its 'label' is not read, so the answer is never leaked into the prompt.

    Returns:
        str: The prompt: a system section with instructions and the solved examples,
        followed by the task, which ends in an open "Correct Answer:" slot.
    """
    # Materialise once so generators are supported and the count can be reported in the prompt.
    ten_shot_examples = list(ten_shot_examples)
    num_examples = len(ten_shot_examples)

    example_parts = [f"{num_examples}-Shot Examples for guidance:\n" + "="*25]
    for idx, example in enumerate(ten_shot_examples):
        example_parts.append(f"\n Example {idx}:\n")
        example_parts.append(_format_question(example))
        example_parts.append(f"Correct Answer: Option {int(example['label'])}\n")
        example_parts.append("-"*25)
    examples = "".join(example_parts)

    task = "\n Task:\n" + _format_question(test_example) + "Correct Answer:"

    # Define the system prompt. The number of examples is interpolated so the instructions
    # always agree with what is actually shown (it used to claim a fixed 20).
    prompt = (
        f"""
        [INST] <<SYS>>
        You are an advanced AI model with the task to solve a multiple-choice test called 'HellaSwag'.\n
        You will be provided with a context and four possible endings. Your task is to predict the correct ending.\n
        As a guidance you will be provided with {num_examples} examples of the task before you start the test.\n
        These are the examples:\n\n
        {examples}
        \n\n
        With these examples in mind, select the option that most likely ends the following context: \n
        <</SYS>>
        {task} \n
        [/INST]
        """
    )

    return prompt

In [None]:
# Helper that previews the few-shot examples on stdout
def print_ten_shot_examples(ten_shot_examples):
    """
    Print a short preview of the few-shot examples.

    Only the first two entries of ``ten_shot_examples`` are shown. Examples,
    options and the correct answer are numbered starting at 1. A closing
    "[...]" marker signals that further examples were omitted.
    """
    wide_rule = "=" * 50
    thin_rule = "-" * 50

    print("10-Shot Examples added into the System Prompt:\n" + wide_rule)

    preview = ten_shot_examples[:2]  # Preview is limited to the first two examples
    for number, shot in enumerate(preview, start=1):
        print(f"Example {number}:")
        print(f"Context: {shot['ctx']}\n")
        for option_number, ending in enumerate(shot['endings'], start=1):
            print(f"Option {option_number}: {ending}\n")
        answer_number = int(shot['label']) + 1
        print(f"Correct Answer: Option {answer_number}\n")
        print(thin_rule)

    print("[...]\n" + wide_rule)  # Marks that more examples exist than were printed

In [None]:
# Print the 10-shot examples once
# (the helper only prints the first two of them as a preview)
print_ten_shot_examples(train_examples)

In [None]:
# Function to evaluate the model on the test set and print details
def evaluate_model(test_set, ten_shot_examples, model, tokenizer, max_length=4096, print_every=1):
    """
    Evaluates the model on a set of test examples and returns its accuracy.

    For every test example a few-shot prompt is built with ``create_inputs``,
    tokenized and passed through the model; the arg-max over the returned
    logits is taken as the predicted option index and compared to the
    example's 'label'.

    Args:
        test_set: Sized iterable of test examples; each item is a mapping with
            'ctx', 'endings' and 'label' (anything accepted by ``int()``).
        ten_shot_examples: Solved examples that are placed into every prompt.
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``logits`` attribute; row 0 of the logits
            is read as the per-option scores.
        tokenizer: Callable turning the prompt text into a mapping of tensors.
        max_length: Token budget passed to the tokenizer.
            NOTE(review): 4096 presumably matches the Llama-2 context window —
            adjust for other checkpoints.
        print_every: Print prompt, prediction and correct answer for every
            n-th example (1 = every example, 0 or None = never).

    Returns:
        float: Fraction of correctly answered examples in [0, 1];
        0.0 when ``test_set`` is empty.
    """
    total_count = len(test_set)
    if total_count == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    correct_count = 0
    # Models without a `device` attribute are fed the tensors as they are.
    device = getattr(model, "device", None)

    # Add a loading bar
    for idx, test_example in enumerate(tqdm(test_set, desc="Evaluating", unit="example")):
        input_text = create_inputs(ten_shot_examples, test_example)

        # Tokenize the input text. A small max_length would cut the end of the prompt
        # (where the actual task sits) when truncating from the right, hence the generous budget.
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Inference only: skip building the autograd graph to save memory.
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits.to(torch.float32)  # Cast logits to float32
        probabilities = torch.softmax(logits, dim=-1)
        predicted_label = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_label].item()
        true_label = int(test_example['label'])

        if print_every and (idx + 1) % print_every == 0:
            print(f"Example {idx + 1}:\n{input_text}\n")
            # Both labels use the same 0-based "Option N" notation as the prompt.
            print(f"Predicted Class: Option {predicted_label} with confidence {confidence:.2f}\n")
            print(f"Correct Answer: Option {true_label}\n")
            print("-" * 50 + "\n")

        if predicted_label == true_label:
            correct_count += 1

    return correct_count / total_count

In [None]:
# Evaluate the model on the sampled validation questions, using the sampled
# training examples as few-shot context, and report the accuracy as a percentage
accuracy = evaluate_model(test_questions, train_examples, model, tokenizer)
print(f"Accuracy: {accuracy * 100:.2f}%")