In [1]:
import os

# Restrict CUDA to device index 0 for this kernel session. This is set in the
# first cell, ahead of the torch / unsloth imports that happen in later cells.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [2]:
!pip install -U tokenizers;
!pip install -U transformers;
!pip install -U peft;
!pip install -U bitsandbytes;
!pip install huggingface_hub ipython "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git";



Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-44pqmfmn/unsloth_c906e5f9ec654738944a09ffa1568850
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-44pqmfmn/unsloth_c906e5f9ec654738944a09ffa1568850
  Resolved https://github.com/unslothai/unsloth.git to commit 4e570be9ae4ced8cdc64e498125708e34942befc
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting xformers@ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab]@ git+https://github.com/unslothai/unsloth.git)


  Using cached https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl (211.8 MB)




In [3]:
import pandas as pd

def determine_preference(row):
    """Translate a row's winner flags into a single integer class label.

    The flags are inspected in a fixed priority order and the first one that
    equals 1 decides the label:

        winner_model_a -> 0
        winner_model_b -> 1
        winner_tie     -> 2

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the three
            flag names above.

    Returns:
        The integer label, or None when none of the three flags equals 1.
    """
    label_by_flag = (
        ('winner_model_a', 0),
        ('winner_model_b', 1),
        ('winner_tie', 2),
    )
    for flag_name, label in label_by_flag:
        # Later flags are only looked up when the earlier ones did not match.
        if row[flag_name] == 1:
            return label
    return None

# Read the raw training table, then derive one integer class label per row
# from its winner_* flag columns (see determine_preference).
training_data = pd.read_csv('./Data/train.csv')
preference_labels = training_data.apply(determine_preference, axis=1)
training_data['preference'] = preference_labels

In [4]:
import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, log_loss, classification_report
from peft import PeftModel

class ModelEvaluator:
    """Score a PEFT-adapted, 4-bit causal LM as a preference classifier.

    The language-model head of the loaded checkpoint is swapped for a
    ``num_labels``-way linear layer, so the logits at a sequence position are
    read as class scores (0 = model A preferred, 1 = model B preferred,
    2 = tie, matching the ``preference`` column built elsewhere in the notebook).

    Two evaluation paths are offered:
      * ``evaluate_on_sample`` -- row-by-row prediction over a DataFrame.
      * ``evaluate_model``     -- batched evaluation through ``transformers.Trainer``.
    """

    # Columns that generate_prompt reads from each example.
    _PROMPT_FIELDS = ('prompt', 'response_a', 'response_b')

    def __init__(self, config, num_labels=3):
        """Load model, tokenizer and adapter, and build the Trainer.

        Args:
            config: dict with the keys ``checkpoint``, ``peft_model_path``,
                ``output_dir`` and ``per_device_eval_batch_size``.
            num_labels: width of the replacement output layer.
        """
        self.config = config
        self.num_labels = num_labels

        # Load the base model and tokenizer (4-bit weights, fp16 compute dtype).
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=config['checkpoint'],
            dtype=torch.float16,
            load_in_4bit=True,
            device_map="auto"
        )

        # Also used below as the tokenizer truncation length.
        self.model.config.max_position_embeddings = 2048

        # Replace the vocabulary-sized output layer with a num_labels-wide one.
        # This layer is freshly (randomly) initialised here.
        # NOTE(review): presumably the adapter loaded below restores trained
        # weights for this head -- confirm, otherwise predictions are random.
        self.model.lm_head = torch.nn.Linear(
            in_features=self.model.lm_head.in_features,
            out_features=num_labels,
            bias=False,
        )

        # Load PEFT model
        self.model = PeftModel.from_pretrained(self.model, config['peft_model_path'])

        # Move the model to CUDA (the replacement head above is created on CPU).
        self.model = self.model.to('cuda')

        self.training_args = TrainingArguments(
            output_dir=self.config["output_dir"],
            per_device_eval_batch_size=self.config["per_device_eval_batch_size"],
            logging_dir='./logs',
            fp16=True,
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
        )

    def generate_prompt(self, data_point):
        """Render the instruction prompt for one example.

        Args:
            data_point: mapping with ``prompt``, ``response_a`` and
                ``response_b`` entries.

        Returns:
            The prompt string fed to the tokenizer. The text (including its
            indentation) is model input and is kept exactly as is.
        """
        return f"""Analyze the conversation between two chatbots (model_a and model_b) and their corresponding responses (response_a and response_b) to a given prompt. Determine which model provided the more preferred response based on the human preference label (Preference). Return the predicted preference as one of three labels: 'winner_model_a', 'winner_model_b', or 'winner_tie', along with the logits for each label.

        Prompt: {data_point['prompt']}
        Model A Response: {data_point['response_a']}
        Model B Response: {data_point['response_b']}
        """

    def _build_prompts(self, batch):
        """Render one prompt per example from a column-oriented batch.

        Args:
            batch: mapping of column name -> sequence of values, as passed to a
                batched ``Dataset.map`` callback.

        Returns:
            List of prompt strings, one per example, in batch order.
        """
        fields = self._PROMPT_FIELDS
        # Iterating the batch itself would yield column names, so the needed
        # columns are zipped together to recover per-example rows.
        per_example_values = zip(*(batch[field] for field in fields))
        return [
            self.generate_prompt(dict(zip(fields, values)))
            for values in per_example_values
        ]

    def compute_metrics(self, eval_preds):
        """Compute accuracy and log-loss from Trainer predictions.

        Args:
            eval_preds: object exposing ``predictions`` (numpy logits whose last
                axis indexes the classes) and ``label_ids``.

        Returns:
            dict with ``acc`` and ``log_loss``.
        """
        logits = eval_preds.predictions
        labels = eval_preds.label_ids
        preds = logits.argmax(-1)
        probs = torch.from_numpy(logits).float().softmax(-1).numpy()

        # Name the full class set explicitly: without it log_loss infers the
        # classes from y_true and raises when an eval slice lacks one of them.
        class_ids = list(range(probs.shape[-1]))
        loss = log_loss(y_true=labels, y_pred=probs, labels=class_ids)
        acc = accuracy_score(y_true=labels, y_pred=preds)
        return {"acc": acc, "log_loss": loss}

    def evaluate_model(self, dataset, description="Evaluation"):
        """Tokenize ``dataset`` and evaluate it through the Trainer.

        Args:
            dataset: object with a ``map(fn, batched=True)`` method whose
                batches provide ``prompt``, ``response_a``, ``response_b`` and
                ``labels`` columns.
            description: label used when printing the results.

        Returns:
            The metrics returned by ``Trainer.evaluate``.

        NOTE(review): Trainer.evaluate forwards ``labels`` to the model and
        hands compute_metrics whatever logits come back. With the replacement
        head installed in __init__ those are presumably per-token
        (batch, seq, num_labels) scores, while compute_metrics treats them as
        one row per example -- confirm the shapes before relying on this path.
        """
        def preprocess_and_generate_prompt(batch):
            prompts = self._build_prompts(batch)
            # Plain Python lists are returned: the map result is stored in the
            # dataset, and the Trainer moves each batch to the device itself.
            tokenized = self.tokenizer(
                prompts,
                padding=True,
                truncation=True,
                max_length=self.model.config.max_position_embeddings,
            )
            return {**tokenized, 'prompt': prompts, 'labels': batch['labels']}

        dataset = dataset.map(preprocess_and_generate_prompt, batched=True)

        eval_results = self.trainer.evaluate(eval_dataset=dataset)
        print(f"{description} results:", eval_results)
        return eval_results

    def predict_preference(self, data_point):
        """Predict the class index for a single example.

        Args:
            data_point: mapping with ``prompt``, ``response_a`` and
                ``response_b`` entries.

        Returns:
            int: argmax over the class logits at the last sequence position.
        """
        generated_prompt = self.generate_prompt(data_point)
        inputs = self.tokenizer(
            generated_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.model.config.max_position_embeddings  # Truncate to the model's max length
        ).to('cuda')

        with torch.no_grad():
            model_output = self.model(**inputs)
            logits = model_output.logits

        # A single unpadded prompt is encoded, so the final position is the
        # last prompt token; its scores are used as the class logits.
        preference_logits = logits[:, -1, :]

        preference = torch.argmax(preference_logits, dim=-1).item()
        return preference

    def evaluate_on_sample(self, sample_data):
        """Predict every row of ``sample_data`` and report accuracy.

        Args:
            sample_data: DataFrame with ``preference``, ``prompt``,
                ``response_a`` and ``response_b`` columns.

        Returns:
            float: fraction of rows whose prediction equals ``preference``.
            The value is also printed as a percentage.
        """
        actual_preferences = []
        predicted_preferences = []

        for _, row in sample_data.iterrows():
            actual_preferences.append(row['preference'])
            predicted_preferences.append(self.predict_preference(row))

        accuracy = accuracy_score(actual_preferences, predicted_preferences)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        return accuracy

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-08-03 10:24:12.099982: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Settings consumed by ModelEvaluator.__init__.
config = {
    # Base model location, passed as model_name to FastLanguageModel.from_pretrained.
    "checkpoint": "./Llama",
    # Passed to TrainingArguments(output_dir=...).
    "output_dir": "./Llama/results",
    # Passed to TrainingArguments(per_device_eval_batch_size=...).
    "per_device_eval_batch_size": 4,
    # Adapter location, passed to PeftModel.from_pretrained.
    "peft_model_path": "./Llama/Llama_prompt"
}

In [8]:
# Shuffle every row with a fixed seed so the draw is repeatable, renumber the
# index from 0, and keep the first 50 shuffled rows as the evaluation sample.
shuffled_data = (
    training_data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
sample_data = shuffled_data.iloc[:50]

In [9]:
# Build the evaluator from `config` (loads the model, tokenizer and adapter),
# then predict each row of the 50-row sample; the accuracy is printed.
model_evaluator = ModelEvaluator(config)
model_evaluator.evaluate_on_sample(sample_data)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA A16. Max memory: 14.542 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.22.post7. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Accuracy: 36.00%
