<a href="https://colab.research.google.com/github/gourangasatapathyvit/learnfinetuning/blob/main/cnn_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install torch tensorboard
%pip install transformers datasets accelerate evaluate trl protobuf sentencepiece
from google.colab import userdata
from datasets import load_dataset
from huggingface_hub import login
hf_token = userdata.get('HF_TOKEN')
login(hf_token)

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting trl
  Downloading trl-0.22.1-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.22.1-py3-none-any.whl (544 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl, evaluate
Successfully installed evaluate-0.4.5 trl-0.22.1


In [2]:
base_model = "google/gemma-3-270m-it" # @param ["google/gemma-3-270m-it","google/gemma-3-1b-it","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it"] {"allow-input":true}
checkpoint_dir = "outputs" #@param {type:"string"}
learning_rate = 5e-5 #@param {type:"number"}

In [3]:
dataset_name = "cnn_dailymail"
dataset = load_dataset(dataset_name, '3.0.0')

# Select the first 200 records for training and evaluation
train_dataset = dataset['train'].select(range(200))
eval_dataset = dataset['test'].select(range(200))

def format_prompt(sample):
    return f"Summarize the following article:\n\n{sample['article']}\n\nSummary:\n{sample['highlights']}"

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [4]:
import torch
from trl import SFTConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="eager"
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

torch_dtype = model.dtype


args = SFTConfig(

    output_dir=checkpoint_dir,              # directory to save and repository id
    max_length=512,                         # max sequence length for model and packing of the dataset
    packing=False,                          # Groups multiple samples in the dataset into a single sequence
    num_train_epochs=5,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training
    gradient_checkpointing=False,           # Caching is incompatible with gradient checkpointing
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=1,                        # log every step
    save_strategy="epoch",                  # save checkpoint every epoch
    eval_strategy="epoch",                     # evaluate checkpoint every epoch
    learning_rate=learning_rate,            # learning rate
    fp16=True if torch_dtype == torch.float16 else False,   # use float16 precision
    bf16=True if torch_dtype == torch.bfloat16 else False,  # use bfloat16 precision
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    dataset_kwargs={
        "add_special_tokens": False, # Template with special tokens
        "append_concat_token": True, # Add EOS token as separator token between examples
    }
)

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [5]:
train_dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 200
})

In [6]:
from trl import SFTTrainer

# Create Trainer object
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    formatting_func=format_prompt
)

Applying formatting function to train dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

# Save the final model again to the Hugging Face Hub
trainer.save_model()

Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,2.9628,3.189556,3.026867,97778.0,0.396822
2,2.4387,3.301338,2.585357,195556.0,0.391509
3,1.7703,3.519738,2.23978,293334.0,0.381006
4,1.5337,3.976436,1.873538,391112.0,0.364904


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,2.9628,3.189556,3.026867,97778.0,0.396822
2,2.4387,3.301338,2.585357,195556.0,0.391509
3,1.7703,3.519738,2.23978,293334.0,0.381006
4,1.5337,3.976436,1.873538,391112.0,0.364904
5,1.1059,4.656392,1.516176,488890.0,0.346634


No files have been modified since last commit. Skipping to prevent empty commit.


In [20]:
from transformers import pipeline

# Load the trained model (from your output directory)
inference_model = AutoModelForCausalLM.from_pretrained("outputs")
inference_tokenizer = AutoTokenizer.from_pretrained("outputs")

# Create a text-generation pipeline
generator = pipeline("text-generation", model=inference_model, tokenizer=inference_tokenizer)

# Ask a question
question = "zimbawe news"
response = generator(
    question,
    # max_new_tokens=128,      # Use max_new_tokens instead of max_length
    do_sample=True,
    truncation=True          # Explicitly enable truncation
)
print(response[0]['generated_text'])

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


zimbawe news agency:

Tanzania's top-ranked mathematics teacher passed away on Friday, leaving behind a legacy of innovation and integrity, Tanzanian Minister of Education Kamzea Mwanini said. Immigration Minister Timothy Wasifo told the nation's press on Friday that Mwanini, a 52-year-old, was killed in his study in the capital, Gome, after being shot in the head.) The Minister, who was on a routine medical checkup, said the bullet came from a bear. "It was a serious injury, and we went to the hospital to see if it was a fracture or something more serious," he said. "The Minister is survived by his wife, his daughter and his two sons." Mwanini is the seventh Rwandan to earn degrees in mathematics, a science degree, from the University of Nairobi. He has been recognized globally for his contributions to education in Tanzania and Tanzania as a whole. Speaking to the press on Friday evening, Mwanini said: "I want to pay my respects to my teachers, my fellow students, my colleagues and my

In [None]:
# from transformers import pipeline

# question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
# generator = pipeline("text-generation", model="samairtimer/gemma-3-270m-it-blr-slang", device="cuda")
# output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
# print(output["generated_text"])


from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("samairtimer/gemma-3-270m-it-blr-slang")
tokenizer = AutoTokenizer.from_pretrained("samairtimer/gemma-3-270m-it-blr-slang")

# Create a text-generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Ask a question (provide a prompt)
prompt = "Summarize the following article:\n\nYour article text here\n\nSummary:\n"
response = generator(prompt, max_new_tokens=128)

print(response[0]['generated_text'])



In [23]:
zz = "Tanzanian"
for i, sample in enumerate(train_dataset):
    if zz in sample['article'].lower() or zz in sample['highlights'].lower():
        print(f"Record {i+1}:")
        print(sample)
        print("-" * 50) # Print a separator for readability