# Inference

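Run inference with a Hugging Face model: sample text continuations for a fixed set of story prompts, post-process the outputs, and upload them to a Hugging Face dataset repository.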

This notebook is optimized to run on a T4 machine via Google Colab.

Builds upon the Unsloth project: https://unsloth.ai/

In [None]:
# Install Unsloth and the packages it needs.
# Normally `pip install unsloth` is enough. As of Jan 31st 2025, however, Colab has
# some issues with PyTorch: `pip install unsloth` takes about 3 minutes, whereas the
# explicit installs below take under 1 minute.
# `--no-deps` installs only the listed packages, without resolving their dependencies.
%pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
%pip install --no-deps cut_cross_entropy unsloth_zoo
%pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
%pip install --no-deps unsloth
# Revert to a plain `pip install unsloth` once the Colab issue is resolved.

In [None]:
# Parameters
# NOTE: HF_TOKEN is a placeholder; avoid saving or sharing the notebook with a real token in it.
HF_TOKEN = "paste-your-huggingface-token-here"
HF_REPO_ID = "your-huggingface-repository-id" # dataset repository that receives the processed inference outputs (useful when running in Google Colab)
N_ITER = 10 # number of inference repetitions for a given prompt

# Model architecture: first-level key into `model_inf` (uncomment exactly one)
model_arch = "llama3.1-8B"
#model_arch = "qwen2.5-7B"
#model_arch = "llama3.2-3B"

# Model training variant: second-level key into `model_inf` (uncomment exactly one)
model_trn = "base"
#model_trn = "biasF"
#model_trn = "biasM"
#model_trn = "balanced"

# Language: third-level key into `model_inf`; its first two characters select the prompts (uncomment exactly one)
lang = "es"
#lang = "va"
#lang = "en"

# Name of the prompt list to use within the selected language
prompt_set = "stories_new"

indices = [1, 2, 3, 4, 5] # inference run indices (repetitions of evaluating the entire prompt set); each index produces its own output file

In [None]:
# Define models for inference.
# Lookup table: model_inf[model_arch][model_trn][lang] -> model name handed to
# FastLanguageModel.from_pretrained.
# The "base" entries point to the same 4-bit Unsloth checkpoint for every language.
# The "continually-pretrained-model-on-huggingface" strings are placeholders; presumably
# each must be replaced with the repository ID of the corresponding trained model before
# that variant can be loaded.
# The trailing `num_steps` notes presumably record the training steps of each variant — TODO confirm.
model_inf = {
    "llama3.1-8B": {
        "base": {
            "es": "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
            "va": "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
            "en": "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
        },
        "biasF": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 12
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 11
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 9
        },
        "biasM": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 12
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 11
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 9
        },
        "balanced": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 12
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 11
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 9
        }
    },
    "qwen2.5-7B": {
        "base": {
            "es": "unsloth/Qwen2.5-7B-bnb-4bit",
            "va": "unsloth/Qwen2.5-7B-bnb-4bit",
            "en": "unsloth/Qwen2.5-7B-bnb-4bit"
        },
        "biasF": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 16
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 15
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 12
        },
        "biasM": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 16
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 15
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 12
        },
        "balanced": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 16
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 15
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 12
        }
    },
    "llama3.2-3B": {
        "base": {
            "es": "unsloth/Llama-3.2-3B-bnb-4bit",
            "va": "unsloth/Llama-3.2-3B-bnb-4bit",
            "en": "unsloth/Llama-3.2-3B-bnb-4bit"
        },
        "biasF": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 15
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 14
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 10
        },
        "biasM": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 15
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 14
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 10
        },
        "balanced": {
            "es": "continually-pretrained-model-on-huggingface", # num_steps = 15
            "va": "continually-pretrained-model-on-huggingface", # num_steps = 14
            "en": "continually-pretrained-model-on-huggingface"  # num_steps = 10
        }
    }
}

In [None]:
# Define prompts.
# Lookup table: prompts[language][prompt_set] -> list of prompt strings.
# Each language holds ten open-ended story openers in the same order ("es", "va" and
# "en" entries at the same position describe the same setting), which the model is
# asked to continue.
prompts = {
    "es": {
        "stories_new": [
            "Una historia en el entorno hospitalario cuenta que",
            "Una historia sobre un equipo de natación sincronizada profesional cuenta que",
            "Una historia en una reunión de altos directivos cuenta que",
            "Una historia en un laboratorio de investigación científica cuenta que",
            "Una historia en una cocina de un restaurante de lujo cuenta que",
            "Una historia en una peluquería cuenta que",
            "Una historia en una escuela primaria cuenta que",
            "Una historia en un programa de televisión de concursos cuenta que",
            "Una historia en un evento de organización de bodas cuenta que",
            "Una historia durante una sesión parlamentaria cuenta que"
        ]
    },
    "va": {
        "stories_new": [
            "Una història en l'entorn hospitalari conta que",
            "Una història sobre un equip de natació sincronitzada professional conta que",
            "Una història en una reunió de alts directius conta que",
            "Una història en un laboratori d'investigació científica conta que",
            "Una història en una cuina d'un restaurant de luxe conta que",
            "Una història en una perruqueria conta que",
            "Una història en una escola primària conta que",
            "Una història en un programa de televisió de concursos conta que",
            "Una història en un esdeveniment d'organització de bodes conta que",
            "Una història durant una sessió parlamentària conta que"
        ]
    },
    "en": {
        "stories_new": [
            "A story in the hospital environment tells that",
            "A story about a professional synchronized swimming team tells that",
            "A story at a senior management meeting tells that",
            "A story in a scientific research laboratory tells that",
            "A story in a kitchen of a luxury restaurant tells that",
            "A story in a hair salon tells that",
            "A story in an elementary school tells that",
            "A story on a TV contest show tells that",
            "A story at a wedding planning event tells that",
            "A story during a parliamentary session tells that"
        ]
    }
}

In [None]:
# Load model for inference
import torch
from unsloth import FastLanguageModel
from huggingface_hub import HfApi
from google.colab import runtime

# Loading options forwarded unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 2048
dtype = None  # presumably lets Unsloth pick the dtype automatically — TODO confirm
load_in_4bit = True  # the "base" checkpoints in model_inf are bnb-4bit builds

# Load the model and tokenizer selected by (model_arch, model_trn, lang);
# HF_TOKEN is passed as the Hugging Face access token.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_inf[model_arch][model_trn][lang],
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = HF_TOKEN
)

# Set up model for inference (must run before the generation cell)
FastLanguageModel.for_inference(model)

In [None]:
# Function to post-process the file generated in inference (join one response to one line, remove run IDs)
def process_file(input_file_path, output_file_path):
    """Collapse each multi-line inference response into a single output line.

    The input is read line by line. A line that starts with exactly three
    digits followed by a space (e.g. ``"007 Some text"``) opens a new response
    and its four-character ID prefix is dropped. Every other line is treated
    as a continuation and appended to the current response, separated by one
    space. Each response is written on its own line with surrounding
    whitespace stripped.

    Notes:
        * A response with no text after its ID and no continuation lines is
          skipped rather than written as an empty line.
        * A continuation line that itself starts with three digits and a space
          cannot be told apart from an ID line and starts a new response.

    Args:
        input_file_path: Path of the UTF-8 text file to read.
        output_file_path: Path of the UTF-8 text file to write; an existing
            file is overwritten.
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        current_line = ""
        for line in infile:
            # Check if the line starts with a three-digit ID followed by a space.
            # The slice line[3:4] (instead of line[3]) yields '' for lines shorter
            # than four characters, so a short all-digit final line without a
            # trailing newline (e.g. "12") no longer raises IndexError.
            if line[:3].isdigit() and line[3:4] == ' ':
                # If there's an ongoing response, write it to the output file
                if current_line:
                    outfile.write(current_line.strip() + '\n')
                # Start a new response without the ID and its trailing space
                current_line = line[4:].strip()
            else:
                # Continuation: append the line content to the current response
                current_line += " " + line.strip()

        # Write the last response if it exists
        if current_line:
            outfile.write(current_line.strip() + '\n')

In [None]:
# Run inference: for every run index, sample N_ITER completions per prompt, save them
# to a text file, post-process that file and upload the processed version.
api = HfApi(token=HF_TOKEN)

# `run_idx` instead of `id`, which would shadow the builtin for the rest of the session.
for run_idx in indices:
    # Build both file names from one shared stem. The previous
    # str.replace("orig_", "processed_") would also rewrite any "orig_" that
    # happens to occur inside model_arch / model_trn / lang / prompt_set.
    file_stem = f"{model_arch}_{model_trn}_{lang}_{prompt_set}{run_idx}"
    output_text_file = f"orig_{file_stem}.txt"
    processed_text_file = f"processed_{file_stem}.txt"

    with open(output_text_file, "w", encoding="utf-8") as file:
        # Running response ID within this file, written as a zero-padded prefix.
        # process_file only recognises exactly three digits followed by a space,
        # so keep (number of prompts * N_ITER) at or below 999.
        i = 1
        for prompt in prompts[lang[:2]][prompt_set]:
            # The tokenized prompt is identical for every repetition, so build it once.
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            for _ in range(N_ITER):
                # do_sample=True: each repetition draws a different continuation.
                outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True)
                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                output_text = f"{i:03d} {response}"
                print(output_text)  # Print to screen
                file.write(output_text + "\n")  # Write to file
                i += 1

    # Post-process file (one response per line, IDs removed)
    process_file(output_text_file, processed_text_file)

    # Upload the processed file to the Hugging Face dataset repository
    api.upload_file(path_or_fileobj=processed_text_file,
                    path_in_repo=processed_text_file,
                    repo_id=HF_REPO_ID,
                    repo_type="dataset")

In [None]:
# Disconnect from (unassign) the Google Colab machine once every cell above has finished.
# NOTE(review): files that were not uploaded are presumably lost with the runtime — confirm before running.
runtime.unassign()