In [None]:
from datasets import load_dataset

test_size = 0.1

# Load the press releases from the local parquet file; everything lands in
# the "train" split, which is then divided into train/test with a fixed seed
# so the split is reproducible.
dataset = load_dataset("parquet", data_files="bmw_press_releases.parquet")
dataset = dataset["train"].train_test_split(test_size=test_size, seed=42)

# Build a single "text" field per example: title, blank line, then content.
dataset = dataset.map(lambda x: {"text": f"{x['title']}\n\n{x['content']}"})

# `remove_columns` returns a new DatasetDict instead of modifying in place,
# so the result must be assigned back; otherwise later cells still see the
# raw columns.
dataset = dataset.remove_columns(["title", "content", "link", "description"])

# Display the final splits (only the "text" feature should remain).
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 18
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2
    })
})

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

model_name="Qwen/Qwen3-1.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: A batch from `dataset.map(..., batched=True)`; only its
            "text" entry is read.

    Returns:
        The tokenizer output for the batch. `truncation=True` is passed
        without `max_length`, so the limit is whatever the tokenizer falls
        back to (presumably its configured model max length - confirm it
        fits in memory).
    """
    return tokenizer(examples["text"], truncation=True)

# Drop every input column while tokenizing so the result contains only the
# tokenizer outputs. Raw string columns cannot be padded/tensorized by a
# collator, so they should not travel with the tokenized dataset.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
tokenized_dataset = tokenized_dataset.with_format("torch")

# NOTE: this collator only pads batches; it does not add `labels`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Full (not subsampled) splits, shuffled with a fixed seed for reproducibility.
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42)
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42)

Map: 100%|██████████| 18/18 [00:01<00:00, 14.58 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 13.25 examples/s]


In [None]:
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the checkpoint together with its language-modeling head. The bare
# `AutoModel` backbone only returns hidden states and never a loss, so the
# Trainer would have nothing to optimize.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Causal-LM collator (mlm=False): pads each batch and derives `labels` from
# `input_ids`, which is what lets the model compute a training/eval loss.
# A padding-only collator would leave `labels` missing.
lm_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy`; the transformers
    # releases that support Qwen3 no longer accept the old keyword.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    data_collator=lm_data_collator,
)

trainer.train()



In [None]:
# Evaluate the trained model on the held-out split and show the result.
eval_metrics = trainer.evaluate()
eval_metrics