# Finetuning Insurance Conditions Model

Detta Colab-notebook guidar dig genom:
1. Installera beroenden  
2. Ladda dataset samt initiera modell och tokenizer  
3. Tokenisering & förberedelser  
4. Finetuning med Hugging Face 🤗 Transformers  
5. Utvärdering  


In [None]:
# 1. Installera beroenden (kör bara en gång)
!pip install transformers datasets accelerate


## Ladda och inspektera dataset


In [None]:
from datasets import load_dataset

# Read the JSONL file with the generic "json" loader, taking the "train"
# split directly, then print the first record for a quick inspection.
ds = load_dataset(
    "json",
    data_files="../data/dataset.jsonl",
    split="train",
)
first_record = ds[0]
print(first_record)


## 2. Initiera modell och tokenizer


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier shared by both from_pretrained() calls below.
model_name = "birgermoell/t5-base-swedish"

# Load the seq2seq model and its tokenizer from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


## 3. Förbehandling


In [None]:
def preprocess(ex):
    """Tokenize the "input"/"output" fields and attach loss-ready labels.

    Args:
        ex: A mapping with "input" and "output" entries. Each entry is either
            a single string or a list of strings (the batched form produced
            by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for ``ex["input"]`` (truncated/padded to 512
        tokens) with an added "labels" entry built from ``ex["output"]``
        (truncated/padded to 128 tokens). Padding positions in the labels
        are set to -100.
    """
    inputs = tokenizer(ex["input"], truncation=True, padding="max_length", max_length=512)
    outputs = tokenizer(ex["output"], truncation=True, padding="max_length", max_length=128)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the loss ignores; leaving the pad token id in
        # place would train the model to predict padding.
        return [-100 if tok == pad_id else tok for tok in token_ids]

    label_ids = outputs["input_ids"]
    if isinstance(ex["output"], str):
        # Single example: input_ids is one flat list of token ids.
        inputs["labels"] = _mask_padding(label_ids)
    else:
        # Batched call: input_ids is one list of token ids per example.
        inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    return inputs

# Apply preprocess to the whole dataset; batched=True hands it batches of
# examples rather than one example per call.
tokenized = ds.map(function=preprocess, batched=True)


## 4. Finetuning med 🤗 Trainer


In [None]:
from transformers import Trainer, TrainingArguments

# Run configuration: output locations first, then the training schedule,
# then logging and checkpointing cadence.
training_args = TrainingArguments(
    output_dir="../models/t5-mvp",
    logging_dir="../models/logs",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized)

# Start training.
trainer.train()

# Save the final model and then the tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("../models/t5-final")


## 5. Enkel utvärdering


In [None]:
# Exempel på inference
from transformers import pipeline
# Build a generation pipeline from the saved final model directory.
gen = pipeline(
    "text2text-generation",
    model="../models/t5-final",
    tokenizer="../models/t5-final",
)

# NOTE(review): ds is the same dataset the model was trained on, so this is a
# quick sanity check rather than a held-out evaluation.
sample = ds[1]
print("INPUT:", sample["input"])
prediction = gen(sample["input"], max_length=128)[0]["generated_text"]
print("PRED:", prediction)
print("TRUE:", sample["output"])
