# Sentence Splitter: Out-of-Domain Evaluation (Generative Models)

In [None]:
!pip install --upgrade pip
!pip install transformers==4.56.1 evaluate==0.4.5 torch==2.7.0 unsloth==2025.9.1 ipywidgets==8.1.7 numpy==2.3.2 pandas==2.3.2 datasets==3.6.0 jupyter==1.1.1

Import the libraries

In [None]:
import random
import numpy as np
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
from datasets import load_dataset

Before proceeding, make the run as deterministic as possible:

In [None]:
RANDOM_STATE = 777

def set_seed(seed=RANDOM_STATE, total_determinism=False):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and, when available,
    all CUDA devices) and switches cuDNN to deterministic mode with
    auto-tuning (``benchmark``) disabled.

    Args:
        seed: Integer seed applied to all generators.
        total_determinism: When True, additionally calls
            ``torch.use_deterministic_algorithms(True)``. This can make the
            run slower, so it is off by default.
    """
    import os  # local import: only needed for the total_determinism branch

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if total_determinism:
        # cuBLAS needs a fixed workspace configuration, otherwise CUDA matmuls
        # raise a RuntimeError once deterministic algorithms are enforced.
        # setdefault keeps any value the user already exported.
        # NOTE(review): the variable must be set before cuBLAS is first used,
        # so call this before any CUDA work is done.
        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
        torch.use_deterministic_algorithms(True)

# Set the seed for reproducibility. total_determinism is left off because
# use_deterministic_algorithms can make the run slower.
set_seed(RANDOM_STATE)

Reuse the `Prompt` class from the training notebook.
Here only the `question` method will be invoked.

In [None]:
class Prompt:
    """Builds the chat messages used to ask a model to split Italian text.

    The instance stores the raw text; the methods return either the plain
    user instruction or a list of ``{"role": ..., "content": ...}`` messages.
    """

    # System message shared by `question` and `conversation`.
    _SYSTEM_MESSAGE = "Sei un esperto di linguistica italiana specializzato nella segmentazione delle frasi."

    def __init__(self, input_text):
        self.input_text = input_text

    def instruction(self):
        """Return the user instruction followed by the text to split."""
        return (
            "Dividi il seguente testo italiano in frasi. Per favore rispondi con una frase per riga. Grazie.\n"
            "\n"
            f"Testo: {self.input_text}\n"
        )

    def question(self):
        """Return the system + user messages, i.e. the prompt without an answer."""
        return [
            {"role": "system", "content": self._SYSTEM_MESSAGE},
            {"role": "user", "content": self.instruction()},
        ]

    def conversation(self, output_text):
        """Return the full exchange: system, user and the assistant reply ``output_text``."""
        messages = self.question()
        messages.append({"role": "assistant", "content": output_text})
        return messages

In [None]:
def load_model(model_name):
    """Load a fine-tuned checkpoint from the ``fax4ever`` namespace for inference.

    Args:
        model_name: Repository name without the ``fax4ever/`` prefix.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the value returned
        by ``FastLanguageModel.for_inference``.
    """
    repo_id = 'fax4ever/' + model_name
    loaded_model, tokenizer = FastLanguageModel.from_pretrained(
        repo_id,
        max_seq_length=512,
        dtype=None,
        load_in_4bit=True,  # load the weights 4-bit quantized
    )
    # Switch the freshly loaded model to Unsloth's inference mode.
    inference_model = FastLanguageModel.for_inference(loaded_model)
    return inference_model, tokenizer

def use_model(model, tokenizer, input_text):
    """Ask ``model`` to split ``input_text`` into sentences and stream the answer.

    The text is wrapped in the chat prompt built by ``Prompt.question()``,
    rendered with the tokenizer's chat template, tokenized and passed to
    ``model.generate``. The generated tokens are also printed as they are
    produced through a ``TextStreamer``.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must provide a chat template.
        input_text: Plain text to be split into sentences.

    Returns:
        The value returned by ``model.generate``.
    """
    prompt_text = tokenizer.apply_chat_template(
        Prompt(input_text).question(),
        tokenize = False,
        add_generation_prompt = True, # Must add for generation
        enable_thinking = False, # Disable thinking
    )

    # The rendered chat template already contains the special tokens it needs;
    # letting the tokenizer add them again can duplicate them (e.g. a second BOS).
    encoded = tokenizer(
        prompt_text,
        return_tensors = "pt",
        add_special_tokens = False,
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"

    return model.generate(
        **encoded,
        max_new_tokens = 512,
        temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
        streamer = TextStreamer(tokenizer, skip_prompt = True),
    )

The LLM-based models we fine-tuned are:

1. Minerva-7B-instruct-v1.0-sentence-splitter
2. qwen3-4b-unsloth-bnb-4bit-sentence-splitter
3. mistral-7b-instruct-v0.3-bnb-4bit-sentence-splitter
4. meta-llama-3.1-8b-instruct-unsloth-bnb-4bit-sentence-splitter

In [None]:
# Load the fine-tuned model to evaluate; pass any other name from the list above to switch.
model, tokenizer = load_model("Minerva-7B-instruct-v1.0-sentence-splitter")

In [None]:
# Use the "-128" variant of the out-of-domain dataset: with
# fax4ever/sentence-splitter-ood-192 the model produces more than 512 tokens,
# which exceeds the max_new_tokens = 512 limit used in use_model.
dataset_dict = load_dataset("fax4ever/sentence-splitter-ood-128")

In [None]:
def words_to_sequence(words):
    """Join a list of tokens into a single string of readable text.

    Tokens are joined with single spaces, then the space before
    ``, . ? ! : ;`` and the space after an apostrophe are removed.

    Args:
        words: Iterable of string tokens.

    Returns:
        The reconstructed text as one string.
    """
    # (spaced form, tight form) pairs, applied in this exact order.
    spacing_fixes = (
        (" ,", ","),
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" :", ":"),
        (" ;", ";"),
        ("' ", "'"),
    )
    sequence = " ".join(words)
    for spaced, tight in spacing_fixes:
        sequence = sequence.replace(spaced, tight)
    return sequence

In [None]:
# Smoke test: run the model on the first test example only (note the `break` at the end).
for batch in dataset_dict["test"].iter(batch_size=1):
    # Presumably each column of a batch is a list with one entry per example;
    # with batch_size=1 the [0] picks the only example — TODO confirm.
    words = batch["tokens"][0]
    golden_labels = batch["labels"][0]  # reference labels; not used yet in this cell

    # Rebuild plain text from the tokens, generate the split and move the result to the CPU.
    output = use_model(model, tokenizer, words_to_sequence(words)).cpu()
    # Decode the first returned sequence back into a string.
    text = tokenizer.decode(output[0])
    break