# Sentence splitter using an LLM

Install the required libraries in your virtual environment:

In [None]:
# Upgrade pip first, then install the packages this notebook relies on.
# NOTE(review): `transformers` and `trl` are imported in the next cell but are not
# listed here; presumably they are pulled in as dependencies of `unsloth` — confirm.
!pip install --upgrade pip
!pip install torch numpy pandas datasets jupyter unsloth

Import all required libraries.

We do this first to fail fast in case additional packages need to be installed in the virtual environment.

In [None]:
import os
import random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from unsloth import FastLanguageModel
from transformers import TextStreamer
from trl import SFTTrainer, SFTConfig
import pandas as pd

Verify that a hardware accelerator is available.

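This notebook requires a CUDA-capable GPU. The cell below also sets the Hugging Face token (`HF_TOKEN`) that is used later to push the dataset and the model to the Hub; replace the placeholder with your own token.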

In [None]:
# Token read later by the `push_to_hub` calls; replace the placeholder with a real one.
# NOTE: this assignment overwrites any HF_TOKEN already present in the environment.
os.environ['HF_TOKEN'] = 'PUT_YOUR_TOKEN_HERE'
# Last expression of the cell, so the notebook displays whether CUDA is usable.
torch.cuda.is_available()

Before proceeding, make the run as deterministic as possible:

In [None]:
RANDOM_STATE = 777


def set_seed(seed=RANDOM_STATE, total_determinism=False):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every random number generator.
        total_determinism: When True, additionally enable
            ``torch.use_deterministic_algorithms(True)``. This can slow
            training down and makes PyTorch raise on operations that have no
            deterministic implementation.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seeds every visible GPU, the current one included.
        torch.cuda.manual_seed_all(seed)

    # Always pick deterministic cuDNN kernels and disable the autotuner,
    # whose kernel choice may vary from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if total_determinism:
        # With deterministic algorithms enabled, cuBLAS-backed CUDA ops raise
        # unless this variable is set; keep any value the user already chose.
        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
        torch.use_deterministic_algorithms(True)


set_seed(RANDOM_STATE)  # total_determinism is left off: use_deterministic_algorithms can make training slower

## Part one: Create the dataset

For the LLM portion of the project, start from the dataset already created for the embedding model: [fax4ever/manzoni-192](https://huggingface.co/datasets/fax4ever/manzoni-192).

To see how this dataset is built from the CSV files, refer to `colabs/sentence_splitter_embeddings.ipynb`.

In this setting we do not need labels for each word; instead, we need conversations for training and validation.

In [None]:
SIZE = 192 # Number of words per example (sized for the encoder model's input); also selects the `manzoni-{SIZE}` / `llm-manzoni-{SIZE}` dataset names used below

def words_to_sentences(words):
    """Join tokens into running text and glue punctuation to its neighbour.

    Tokens are joined with single spaces; then the space before
    ``, . ? ! : ;`` and the space after an apostrophe are removed.

    Args:
        words: Iterable of string tokens.

    Returns:
        The detokenized text as a single string.
    """
    text = " ".join(words)
    # Applied one after the other, in this exact order.
    for spaced, tight in (
        (" ,", ","),
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" :", ":"),
        (" ;", ";"),
        ("' ", "'"),
    ):
        text = text.replace(spaced, tight)
    return text

def create_conversations(examples):
    """Turn a batch of token/label examples into question/answer text pairs.

    Args:
        examples: Batch mapping with a ``tokens`` column (lists of words) and a
            ``labels`` column (one label per word, ``1`` marking the last word
            of a sentence).

    Returns:
        A dict with ``input_text`` (each example as running text) and
        ``output_text`` (the same example as a numbered list, one sentence
        per line).
    """
    input_texts = []
    output_texts = []

    for words, flags in zip(examples['tokens'], examples['labels']):
        input_texts.append(words_to_sentences(words))

        # Start a new chunk right after every end-of-sentence word.
        chunks = [[]]
        for word, flag in zip(words, flags):
            chunks[-1].append(word)
            if flag == 1:
                chunks.append([])

        # Only the final chunk can be empty (when the last word closes a sentence).
        numbered = [
            f"{position}. {words_to_sentences(chunk)}"
            for position, chunk in enumerate((c for c in chunks if c), start=1)
        ]
        output_texts.append("\n".join(numbered))

    return {"input_text": input_texts, "output_text": output_texts}

# Download the word-level dataset (the `tokens` / `labels` columns are read by create_conversations).
dataset_dict = load_dataset(f"fax4ever/manzoni-{SIZE}")
# Add the `input_text` / `output_text` columns, processing the examples in batches.
llm_dataset_dict = dataset_dict.map(create_conversations, batched = True)
# Publish the result to the Hub.
# NOTE(review): presumably needs a token with write access to the `fax4ever` namespace — confirm.
llm_dataset_dict.push_to_hub(f"fax4ever/llm-manzoni-{SIZE}", token=os.getenv("HF_TOKEN"))

The result is published as a Hugging Face dataset, so standard Hugging Face APIs apply.

Conversations are expressed as questions (`input_text`) and answers (`output_text`).

Alternatively, simply load the dataset from Hugging Face:

In [None]:
# Load the already-published question/answer dataset instead of rebuilding it.
llm_dataset_dict = load_dataset(f"fax4ever/llm-manzoni-{SIZE}")

## Part two: Create the prompts

In this phase we create prompts from the question/answer pairs in the dataset.
Following an object-oriented approach, we define a class to produce each prompt:

In [None]:
class Prompt:
    """Builds the chat messages that ask the model to split a text into sentences."""

    # System message shared by the training and the inference prompts.
    _SYSTEM_MESSAGE = "Sei un esperto di linguistica italiana specializzato nella segmentazione delle frasi."

    def __init__(self, input_text):
        """Store the text that has to be split into sentences."""
        self.input_text = input_text

    def instruction(self):
        """Return the user instruction with the stored text embedded in it."""
        return (
            "Dividi il seguente testo italiano in frasi. "
            "Per favore rispondi con una frase per riga. Grazie.\n"
            "\n"
            f"Testo: {self.input_text}\n"
        )

    def conversation(self, output_text):
        """Return system + user + assistant messages, with `output_text` as the assistant's answer."""
        return self.question() + [
            {"role": "assistant", "content": output_text},
        ]

    def question(self):
        """Return only the system and user messages, i.e. the prompt without an answer."""
        return [
            {"role": "system", "content": self._SYSTEM_MESSAGE},
            {"role": "user", "content": self.instruction()},
        ]

The `conversation` method produces a full question/answer conversation and is used to fine‑tune the model.
The `question` method produces only the question prompt and will be used for inference later in the notebook.

In [None]:
def create_conversations(examples):
    """Build one full chat conversation per question/answer pair of a batch.

    Args:
        examples: Batch mapping with parallel ``input_text`` and
            ``output_text`` columns.

    Returns:
        A dict whose ``conversations`` entry holds, for every pair, the
        message list produced by ``Prompt.conversation``.
    """
    pairs = zip(examples["input_text"], examples["output_text"])
    return {
        "conversations": [
            Prompt(question_text).conversation(answer_text)
            for question_text, answer_text in pairs
        ],
    }


# Add a `conversations` column (system/user/assistant messages) to the dataset, batch by batch.
conversations = llm_dataset_dict.map(create_conversations, batched = True)

## Part three: Parameter‑efficient fine‑tuning

We load the base model quantized to 4 bits and then attach a LoRA (Low‑Rank Adaptation) adapter,
so that the LLM can be fine‑tuned with modest resources.

In [None]:
LLM_MODEL = "unsloth/Qwen3-4B"
BASE_MODEL_NAME = "Qwen3-4B"

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=LLM_MODEL,    # a larger variant (e.g. the 14B) can be used here
    max_seq_length=2048,     # context length; a longer one uses more memory
    load_in_4bit=True,       # 4-bit weights use much less memory
    load_in_8bit=False,      # 8-bit would be a bit more accurate at 2x the memory
    full_finetuning=False,   # no full fine-tuning: a LoRA adapter is attached below
    # pass token="hf_..." here when loading a gated model
)

# Wrap the model with a LoRA adapter on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,                # LoRA rank; any value > 0, suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,       # best set to the rank or to twice the rank
    lora_dropout=0,      # any value is supported, 0 is the optimized path
    bias="none",         # any value is supported, "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=RANDOM_STATE,
    use_rslora=False,    # rank-stabilized LoRA is available but not used
    loftq_config=None,   # LoftQ is available but not used
)

We need to convert the conversation templates into the canonical format for this model.
We will use the model’s tokenizer to do this.
From this, we will create the final dataset used for supervised fine‑tuning.

In [None]:
# Render every conversation with the model's chat template, as text rather than token ids.
chat_dataset = conversations.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["conversations"], tokenize=False)})

# Each split is exposed as a dataset with a single "text" column.
train_formatted_chats = pd.Series(chat_dataset['train']['formatted_chat'], name="text")
train_dataset = Dataset.from_pandas(pd.DataFrame(train_formatted_chats))

validation_formatted_chats = pd.Series(chat_dataset['validation']['formatted_chat'], name="text")
# Must be built from the validation chats; building it from the training
# series would make the evaluation set a copy of the training set.
validation_dataset = Dataset.from_pandas(pd.DataFrame(validation_formatted_chats))

Finally, train the model and save it remotely.

In [None]:
# Supervised fine-tuning on the "text" column of the chat-formatted datasets.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    # NOTE(review): no evaluation strategy is configured below, so this split is
    # presumably never evaluated during training — confirm whether that is intended.
    eval_dataset = validation_dataset,
    args = SFTConfig(
        dataset_text_field = "text",
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,  # ~500-2000 or 10-20% of the total steps
        num_train_epochs = 10,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = RANDOM_STATE, # Same seed as set_seed() and the LoRA adapter, instead of an unrelated literal
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
# Run the fine-tuning loop configured on the trainer.
trainer.train()

In [None]:
# Hub repository that receives the fine-tuned model and its tokenizer.
trained_model_name = f"{BASE_MODEL_NAME}-sentence-splitter"
model_checkpoint = f"fax4ever/{trained_model_name}"

# Read the token once; a missing HF_TOKEN raises KeyError before anything is pushed.
hf_token = os.environ['HF_TOKEN']
for artifact in (model, tokenizer):
    artifact.push_to_hub(model_checkpoint, token=hf_token)

## Part four: Inference

In [None]:
# Sample passage to segment.
input_text = """Non era un legno di lusso, ma un semplice pezzo
da catasta, di quelli che d’inverno si mettono nelle
stufe e nei caminetti per accendere il fuoco e per riscaldare le stanze.
Non so come andasse, ma il fatto gli è che un bel
giorno questo pezzo di legno capitò nella bottega
di un vecchio falegname, il quale aveva nome mastr’Antonio, se non che tutti lo chiamavano maestro
Ciliegia, per via della punta del suo naso, che era
sempre lustra e paonazza, come una ciliegia matura.
Appena maestro Ciliegia ebbe visto quel pezzo di
legno, si rallegrò tutto; e dandosi una fregatina di
mani per la contentezza, borbottò a mezza voce:
"Questo legno è capitato a tempo; voglio servirmene per fare una gamba di tavolino."
"""
# Collapse the hard-wrapped lines into one paragraph; stripping each line keeps
# stray leading/trailing spaces of the literal out of the prompt.
input_text = " ".join(line.strip() for line in input_text.splitlines())

# `Prompt.question()` already returns one conversation (a list of messages), so
# it is passed as-is: wrapping it in another list would turn the call into a
# batch of conversations and make `question` a list instead of a string.
question = tokenizer.apply_chat_template(
    Prompt(input_text).question(),
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

# Stream the generated answer to the cell output; the returned ids are not needed.
_ = model.generate(
    **tokenizer(question, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)