# Smol LM fine tuning for summarization

In [None]:
# Install the notebook's dependencies into the environment of the running kernel
# (%pip targets the kernel's interpreter, whereas !pip uses whichever pip is first on PATH).
%pip install -q transformers==4.54.1 datasets==4.0.0 trl==0.20.0 peft==0.16.0 torch torchsummary

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch
import os
from datasets import load_dataset
from transformers import pipeline
import json

2025-07-31 19:49:27.037426: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753991367.052885    1336 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753991367.057725    1336 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-31 19:49:27.073083: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f'Device is:{device}')

# The compute capability can only be queried when a CUDA device is present;
# calling it unconditionally raises on MPS / CPU-only machines.
# NOTE(review): FlashAttention-2 is documented as requiring compute capability
# 8.0 or newer, so the wording of the message below looks inverted -- confirm.
if device == "cuda":
    print(f'Type of card: {torch.cuda.get_device_capability()[0]}. (8 and above does not support flash attention)')

Device is:cuda
Type of card: 8. (8 and above does not support flash attention)


## Summarization datasets

In the previous notebook, we introduced the SmolLM models and did our first fine-tuning. Here we increase the complexity a bit by fine-tuning the model to summarize a text (email, chat, conversation, article...). We'll use the "smol-summarize" subset, which is large enough (about 100k observations) to make for an interesting exercise.

In [3]:
# Hub identifiers for the base model and the dataset (plus the dataset configuration to load)
model_name = "HuggingFaceTB/SmolLM2-360M"
dataset_name = "HuggingFaceTB/smoltalk"
config_name = "smol-summarize"

# Local cache folders derived from the identifiers above
model_cache_dir = model_name.rsplit('/')[-1]  # last path component of the model id
dataset_cache_dir = "".join(dataset_name.split('/')) + "_" + config_name

# Where the fine-tuned model is written
output_dir = "./sft_text_summary_360"

In [4]:
# Load the model and tokenizer.
# The model is placed on the `device` detected earlier in the notebook instead of a
# hard-coded 'cuda', so the cell also runs on MPS or CPU-only machines.

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    cache_dir=model_cache_dir,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
    cache_dir=model_cache_dir,
)
# TRL helper that returns the (model, tokenizer) pair prepared for chat-formatted messages
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

In [5]:
# Print the configuration of the loaded model to inspect its settings
print(model.config)

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 960,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "is_llama_config": true,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 15,
  "num_hidden_layers": 32,
  "num_key_value_heads": 5,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": false,
  "rope_scaling": null,
  "rope_theta": 100000,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.54.1",
  "use_cache": true,
  "vocab_size": 49152
}



In [6]:
# Load the selected dataset configuration, caching the files in a local folder

ds = load_dataset(
    path=dataset_name,
    name=config_name,
    cache_dir=dataset_cache_dir,
)
ds

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 96356
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 5072
    })
})

In [7]:
# Display a single training record (index 300) to see how a sample is structured
ds['train'][300]

{'messages': [{'content': 'Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns.',
   'role': 'system'},
  {'content': "By . Jill Reilly . PUBLISHED: . 04:02 EST, 29 April 2013 . | . UPDATED: . 09:59 EST, 29 April 2013 . A powerful explosion has damaged a building in the centre of the Czech capital, Prague, injuring up to 40 people. Authorities say they believe some people are buried in the rubble. Police spokesman Tomas Hulan says it is not certain what caused the blast in Divadelni Street, but it was likely a natural gas explosion . The street was covered with rubble and has been sealed off by police who have also evacuated people from nearby buildings and closed a wide area around the explosion site. Injured: A powerful explosion has damaged a building in the centre of the Czech capital Prague with people feared buried in the rubble . Cause: Police said it is not immediat

Let's stop here. The dataset is constructed with the following message structure:
- system content: system prompt explaining the summarization task
- user content: the content to be summarized
- assistant content: the summarized content (i.e. our target / reference)

Let's run inference on the original model for a summarization task

In [8]:

def generate_summary(dataset, n, system_prompt, sample_type='test'):
    """Run the current model on one dataset sample with the given system prompt.

    Args:
        dataset: Mapping from split name to samples; each sample holds a
            'messages' list whose item at index 1 supplies the text sent as
            the user message.
        n: Index of the sample inside the selected split.
        system_prompt: Instruction sent as the system message.
        sample_type: Name of the split to read the sample from (default 'test').

    Returns:
        The text-generation pipeline output serialized as a JSON string
        (indented with 4 spaces).
    """
    # Uses the notebook-level `model` and `tokenizer`, so the result reflects
    # whatever state the model is in when the function is called.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    user_text = dataset[sample_type][n]['messages'][1].get('content')
    # Use the `system_prompt` argument rather than a notebook global, so the
    # function does not depend on hidden kernel state.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_text},
    ]
    return json.dumps(pipe(messages), indent=4)

In [9]:
# Index of the test-set sample to summarize
n = 3000

# The two instruction styles to try on the same sample
summarization_prompts = (
    "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns.",
    'Extract and present the main key point of the input text in one very short sentence, including essential details like dates or locations if necessary.',
)

for prompt_idx, system_prompt_summarize in enumerate(summarization_prompts):
    if prompt_idx > 0:
        # Separator printed between consecutive results
        print('\n')
    print(generate_summary(ds, n, system_prompt_summarize))

Device set to use cuda
Device set to use cuda


[
    {
        "generated_text": [
            {
                "role": "system",
                "content": "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns."
            },
            {
                "role": "user",
                "content": "While suspended star striker Wayne Rooney was recovering from a hair transplant, England's hopes of automatic qualification for Euro 2012 suffered a blow on Saturday as Switzerland claimed at 2-2 draw at London's Wembley Stadium. The balding Manchester United player, who was booked in March's win over Wales to trigger the ban, revealed on Twitter before kickoff that he had used his time off to visit a hair specialist. \"Just to confirm to all my followers I have had a hair transplant. I was going bald at 25, why not. I'm delighted with the result,\" he wrote on the social networking website. \"It's still a bit bruised and s

If you look at the inference results, the model is clearly not able to follow the system instructions to summarize the user prompt text.
It generates text that takes the context of the user message into consideration, but it does not summarize it.
So let's fine tune it !

## Supervised Fine Tuning

We follow the Hugging Face "recipes" for configuring the fine-tuning of SmolLM2: https://github.com/huggingface/alignment-handbook/tree/main/recipes/smollm

In [10]:
# Configure trainer

training_args = SFTConfig(
    output_dir=output_dir,
    # Training length, batch size and learning rate
    max_steps=1000,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    # How often (in steps) to log, evaluate and checkpoint
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=400,
    # Request the MPS backend only when it is the detected device
    use_mps_device=(device == "mps"),
    # packing=True,  # Only used when flash-attention-2 is used
)

# Initialize trainer: train on the 'train' split, evaluate on the 'test' split

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
)

In [11]:
# Start training with the trainer configured in the previous cell
trainer.train()

# Save the fine-tuned model into `output_dir`
trainer.save_model(output_dir)

Step,Training Loss,Validation Loss
200,1.6888,1.778591
400,1.6897,1.747302
600,1.7349,1.732996
800,1.5217,1.720447
1000,1.6304,1.714934


OK, now let's try the previous example again with the new fine-tuned model. Note that the example we use comes from the test set, so it was not used for training.

In [12]:
# Same test-set sample as before training, so the outputs can be compared
n = 3000

# The two instruction styles to try on the same sample
summarization_prompts = (
    "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns.",
    'Extract and present the main key point of the input text in one very short sentence, including essential details like dates or locations if necessary.',
)

for prompt_idx, system_prompt_summarize in enumerate(summarization_prompts):
    if prompt_idx > 0:
        # Separator printed between consecutive results
        print('\n')
    print(generate_summary(ds, n, system_prompt_summarize))

Device set to use cuda
Device set to use cuda


[
    {
        "generated_text": [
            {
                "role": "system",
                "content": "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns."
            },
            {
                "role": "user",
                "content": "While suspended star striker Wayne Rooney was recovering from a hair transplant, England's hopes of automatic qualification for Euro 2012 suffered a blow on Saturday as Switzerland claimed at 2-2 draw at London's Wembley Stadium. The balding Manchester United player, who was booked in March's win over Wales to trigger the ban, revealed on Twitter before kickoff that he had used his time off to visit a hair specialist. \"Just to confirm to all my followers I have had a hair transplant. I was going bald at 25, why not. I'm delighted with the result,\" he wrote on the social networking website. \"It's still a bit bruised and s

Now the summary makes sense! It is a great, concise summary which is totally relevant to the topic.

# Conclusion

In less than 30 minutes of training, we modified a small model to produce concise and clear summaries of many different types of conversations.

Next, we explore the PEFT technique QLoRA!

## Other resources

Kudos to this author, who did a great fine-tuning of SmolLM for summarization:
https://github.com/KasperGroesLudvigsen/summarization