In [None]:
%%capture
!pip install jsonlines
!pip install unsloth==2024.11.7
!unzip /content/model.zip

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options; the same three values are reused for the base model load further down.
max_seq_length = 2048  # Any value can be chosen; RoPE scaling is handled internally by Unsloth.
dtype = None  # None = auto-detect (float16 for Tesla T4 / V100, bfloat16 for Ampere+).
load_in_4bit = True  # 4-bit quantization to cut memory usage; may be set to False.

# Fine-tuned checkpoint: "lora_model" is the model that was used for training
# (presumably extracted from /content/model.zip by the setup cell; confirm).
fined_model, fined_tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

In [None]:
# Base (not fine-tuned) Llama 3.1 8B Instruct, loaded with the same options as the
# fine-tuned checkpoint so the two are compared under identical settings.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

In [None]:
from unsloth.chat_templates import get_chat_template

# Give both tokenizers the same "llama-3.1" chat template (base tokenizer first,
# then the fine-tuned one) so prompts are rendered identically for either model.
tokenizer, fined_tokenizer = [
    get_chat_template(tok, chat_template="llama-3.1")
    for tok in (tokenizer, fined_tokenizer)
]

In [None]:
import jsonlines
from tqdm import tqdm

def inferences(model):
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

    predictions = []
    references = []

    with jsonlines.open("/content/validation_set.jsonl", "r") as reader:
        for line in tqdm(list(reader)):
            inputs = tokenizer.apply_chat_template(
                line["conversations"][:2],
                tokenize = True,
                add_generation_prompt = True, # Must add for generation
                return_tensors = "pt",
            ).to("cuda")
            inputs_length = inputs.shape[1]

            outputs = model.generate(input_ids = inputs, max_new_tokens = 2056,
                      use_cache = True, temperature = 0.0, do_sample=False)


            results = tokenizer.batch_decode(outputs[:, inputs_length:])[0].removesuffix("<|eot_id|>")
            predictions.append(results)
            references.append(line["conversations"][-1]["content"])

    return predictions, references

In [None]:
fined_predictions, fined_references = inferences(fined_model)
predictions, references = inferences(model)

In [None]:
import pickle

# Persist both runs (fine-tuned and base) in a single pickle file. The path is
# relative, so the file is created in the notebook's current working directory.
with open('predictions_references.pkl', 'wb') as pkl_out:
    pickle.dump(
        dict(
            fined_predictions=fined_predictions,
            fined_references=fined_references,
            predictions=predictions,
            references=references,
        ),
        pkl_out,
    )


In [None]:
from google.colab import files

# File to fetch from the Colab VM. The pickle is written with a relative path, so this
# absolute path assumes the notebook's working directory is /content.
# Point it at a different file to download something else.
file_path = '/content/predictions_references.pkl'

# Start the browser download.
files.download(file_path)