<a href="https://colab.research.google.com/github/emredeveloper/Transformers--General-AI/blob/main/SLM_%2B_COT_FINETUNE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

In [3]:
class ConversationDataset(Dataset):
    """Map-style dataset of tokenized "Human:/Assistant:" transcripts.

    Each item of ``dataset`` must provide a ``'chosen'`` list of turns, where
    every turn is a mapping with ``'role'`` and ``'content'`` keys. All
    conversations are formatted and tokenized eagerly in ``__init__`` and
    kept in memory in ``self.examples``.

    Args:
        dataset: Iterable of conversation records (see above).
        tokenizer: Callable tokenizer that accepts ``truncation``,
            ``max_length``, ``padding`` and ``return_tensors`` and returns a
            mapping with batched ``"input_ids"`` and ``"attention_mask"``
            tensors. A pad token must already be configured because every
            example is padded to ``max_length``.
        max_length: Fixed sequence length each example is truncated or
            padded to.
    """

    # Label value skipped by the loss (the default ``ignore_index`` of
    # ``torch.nn.CrossEntropyLoss``).
    IGNORE_INDEX = -100

    def __init__(self, dataset, tokenizer, max_length=512):
        self.examples = []

        print("Processing conversations...")
        for item in tqdm(dataset):
            conversation = self._format_conversation(item['chosen'])

            # Tokenize to a fixed length so examples can be stacked directly.
            encodings = tokenizer(
                conversation,
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors="pt"
            )

            self.examples.append(self._build_example(encodings))

    @staticmethod
    def _format_conversation(turns):
        """Render turns as one "Human: ...\\nAssistant: ...\\n" string.

        Turns whose role is ``'user'`` are labelled ``Human``; every other
        role is labelled ``Assistant``.
        """
        return "".join(
            f"{'Human' if turn['role'] == 'user' else 'Assistant'}: {turn['content']}\n"
            for turn in turns
        )

    @classmethod
    def _build_example(cls, encodings):
        """Turn a batch-of-one tokenizer output into a training example.

        Labels are a copy of ``input_ids`` with every padded position
        (``attention_mask == 0``) replaced by ``IGNORE_INDEX`` so that padding
        never contributes to the loss, whichever collator is used downstream.
        """
        input_ids = encodings["input_ids"][0]
        attention_mask = encodings["attention_mask"][0]

        labels = input_ids.clone()
        labels[attention_mask == 0] = cls.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

    def __len__(self):
        """Return the number of tokenized conversations."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example dict (input_ids, attention_mask, labels) at ``idx``."""
        return self.examples[idx]

In [11]:
def main():
    """Fine-tune SmolLM-135M on a small LongTalk-CoT subset and save the result.

    Loads the ``kenhktsui/longtalk-cot-v0.1`` dataset, keeps at most the first
    1,000 training rows, splits them 90/10 into train/validation, fine-tunes
    ``HuggingFaceTB/SmolLM-135M`` with the Hugging Face ``Trainer`` and writes
    the model and tokenizer to ``./fine_tuned_smolLM``.

    Returns:
        tuple: ``(model, tokenizer, trainer)`` so that callers (for example
        later notebook cells) can keep using the trained objects, which would
        otherwise be lost as locals of this function.
    """
    # Local import: only needed for the TrainingArguments compatibility shim.
    import inspect

    # Load dataset
    print("Loading dataset...")
    dataset = load_dataset("kenhktsui/longtalk-cot-v0.1")

    # Use a small subset (for testing)
    subset_size = min(1000, len(dataset['train']))
    dataset['train'] = dataset['train'].select(range(subset_size))

    # Load model and tokenizer
    print("Loading model and tokenizer...")
    model_name = "HuggingFaceTB/SmolLM-135M"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Padding to a fixed length requires a pad token; reuse EOS if none is set.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

    # Split dataset
    print("Splitting dataset...")
    train_size = int(0.9 * len(dataset['train']))
    val_size = len(dataset['train']) - train_size
    train_split, val_split = torch.utils.data.random_split(
        dataset['train'],
        [train_size, val_size]
    )

    # Prepare datasets
    print("Preparing training dataset...")
    train_dataset = ConversationDataset(train_split, tokenizer)
    print("Preparing validation dataset...")
    eval_dataset = ConversationDataset(val_split, tokenizer)

    # The evaluation-strategy argument was renamed from `evaluation_strategy`
    # to `eval_strategy` in newer transformers releases; use whichever name
    # the installed version accepts.
    init_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in init_params
        else "evaluation_strategy"
    )

    # Training arguments
    # With ~900 examples, an effective batch of 16 and 3 epochs the run has
    # only about 170 optimizer steps, so step-based settings of 500 would
    # never finish warm-up, never evaluate and never checkpoint. Evaluate and
    # save once per epoch and express warm-up as a fraction of the run.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        gradient_accumulation_steps=4,
        # Only request fp16 mixed precision when a CUDA device is present.
        fp16=torch.cuda.is_available(),
        report_to="none",
        **{eval_strategy_key: "epoch"},
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    )

    # Start training
    trainer.train()

    # Save model
    model_save_path = "./fine_tuned_smolLM"
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    return model, tokenizer, trainer

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Run the fine-tuning pipeline only when this file is executed as a script
# (or as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Reload the fine-tuned model and tokenizer from the directory main() saved
# them to. `trainer`, `model` and `tokenizer` are locals of main() and are not
# defined at module scope, so saving through `trainer` here would raise a
# NameError (and main() has already written this directory anyway).
model_save_path = "./fine_tuned_smolLM"
model = AutoModelForCausalLM.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# Test generation function
def generate_response(prompt, model, tokenizer, max_length=100):
    """Sample one completion for ``prompt`` and return it as decoded text.

    Args:
        prompt: Text to condition generation on.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt
            and decode the generated ids.
        max_length: Maximum total sequence length passed to ``generate``
            (prompt tokens included).

    Returns:
        str: The decoded first generated sequence with special tokens
        removed. The sequence returned by ``generate`` is decoded whole, so
        the text normally starts with the prompt itself.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # A model that was just trained on GPU stays there; move the inputs to the
    # same device to avoid a CPU/GPU tensor mismatch inside generate().
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs.get("attention_mask"),
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=pad_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Smoke-test generation with a single prompt in the training transcript
# format and show the prompt next to the sampled output.
test_prompt = "Human: What is 1 + 1?\nAssistant:"
response = generate_response(test_prompt, model, tokenizer)
print(
    f"Prompt: {test_prompt}",
    f"Response: {response}",
    sep="\n",
)