In [None]:
!pip install unsloth trl transformers datasets peft bitsandbytes accelerate huggingface_hub

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset, concatenate_datasets
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments
import torch, os, re

# ANSI-coloured startup banner; each entry is printed on its own line and the
# empty strings produce the blank separator lines.
banner_lines = (
    "\033[38;5;160m___________________________________________________\033[0m",
    "",
    "\033[38;5;160m      ###       #######     #######    ###     ### \033[0m",
    "\033[38;5;160m    ### ###     ##        ###     ###  ######  ### \033[0m",
    "\033[38;5;160m   ###   ###    #######   ###     ###  ###  ## ### \033[0m",
    "\033[38;5;160m  ###     ###   ##        ###     ###  ###   ##### \033[0m",
    "\033[38;5;160m ##         ##  #######     #######    ###     ### \033[0m",
    "\033[38;5;160m_FINETUNE WITH LORA (Unsloth)______________________\033[0m",
    "",
)
for banner_line in banner_lines:
    print(banner_line)

# Report which device torch can see: "cuda" when available, otherwise "cpu".
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"\033[1;93m[TRAINER]\033[0m Device found: {device}")

# Output folder for checkpoints and exported artifacts; created up front.
save_directory = "./aeon_lora"
os.makedirs(save_directory, exist_ok=True)

# Base checkpoint that the LoRA adapter is trained on.
model_name = "HuggingFaceTB/SmolLM2-360M"


In [None]:
# Compute dtype for the base model: bfloat16 where the GPU supports it,
# float16 otherwise. Passed as a torch.dtype object rather than a string,
# because some Unsloth releases assert that `dtype` is a torch dtype.
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

# Load the base model and its tokenizer with load_in_4bit=True and a
# 2048-token maximum sequence length.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=2048,
    dtype=dtype,
    load_in_4bit=True,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Wrap the model with LoRA adapters (rank 16, alpha 16, no dropout, no bias
# terms) on the attention and MLP projection modules listed below.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj",
                    "gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)


In [None]:

# Persona text used as the system turn of every training example.
# The word "tone" was previously misspelled "tne", which put the typo into
# every formatted example.
system_prompt = (
    "You are Aeon, a helpful, curious, and friendly AI assistant. "
    "Your name is always Aeon. You were created by Gustavo Kuklinski. "
    "You are not the user. Never claim to be the user. "
    "Maintain a warm, chatty, and engaging tone in all conversations. "
    "Be naturally conversational while providing helpful responses."
)


def format_aeon(example):
    """Build a system/user/assistant conversation from one instruction row.

    Args:
        example: Mapping with ``'instruction'`` and ``'response'`` entries.

    Returns:
        ``{'text': [...]}`` where the list holds three ``{"role", "content"}``
        dicts: the module-level ``system_prompt``, the instruction as the
        user turn, and the response as the assistant turn.
    """
    turns = (
        ("system", system_prompt),
        ("user", example['instruction']),
        ("assistant", example['response']),
    )
    return {'text': [{"role": speaker, "content": content} for speaker, content in turns]}

def format_text(example):
    """Build a system + user conversation asking the model to read a passage.

    Args:
        example: Mapping with ``'text'`` (the passage) and ``'title'`` entries.

    Returns:
        ``{'text': [...]}`` with two ``{"role", "content"}`` dicts: the
        module-level ``system_prompt`` and a user turn embedding the passage
        followed by its title. No assistant turn is produced.
    """
    passage = example['text']
    source_title = example['title']
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {
        "role": "user",
        "content": f"Read the following: {passage}\n\nfrom:{source_title}\n\n",
    }
    return {'text': [system_turn, user_turn]}


def flatten_messages(example):
    """Collapse a list of chat messages into one plain-text string.

    Each message becomes a line of the form ``[ROLE] content`` (role
    upper-cased) terminated by a newline; lines keep their original order.

    Args:
        example: Mapping whose ``'text'`` entry is an iterable of
            ``{"role", "content"}`` dicts.

    Returns:
        ``{"text": <flattened string>}``; an empty string when there are no
        messages.
    """
    rendered = [
        f"[{turn['role'].upper()}] {turn['content']}\n"
        for turn in example['text']
    ]
    return {"text": "".join(rendered)}

def tokenize_function(example, max_length=2024):
    """Tokenize the ``text`` field and build labels that ignore padding.

    The text is truncated and padded to exactly ``max_length`` tokens using
    the module-level ``tokenizer``. Labels are the input ids with every
    padding position (attention mask 0) replaced by -100, the index the loss
    ignores. Masking uses the attention mask rather than the pad token id
    because the pad token is the EOS token, and real EOS tokens must remain
    valid targets.

    NOTE(review): whether the trainer's data collator keeps these labels or
    rebuilds its own depends on the installed TRL version — confirm.
    NOTE(review): the default of 2024 differs from the 2048 used as the
    model's max_seq_length; confirm which value is intended.

    Args:
        example: Mapping whose ``"text"`` entry is a single string or a list
            of strings (the batched form used by ``Dataset.map``).
        max_length: Fixed sequence length to truncate and pad to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry, shaped like
        ``"input_ids"``.
    """
    tokenized = tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )

    def _mask_padding(ids, mask):
        # Keep real tokens as targets; padded positions become -100.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']

    if isinstance(example["text"], str):
        # Single example: ids and mask are flat lists.
        tokenized['labels'] = _mask_padding(input_ids, attention_mask)
    else:
        # Batched call: one list of ids / mask per example.
        tokenized['labels'] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]

    return tokenized

# Load the source datasets, convert each row to a chat-style message list,
# merge them, flatten the messages to plain text and tokenize the result.
try:
    # Passages wrapped as system + user "read this" conversations.
    aeon_books_data = load_dataset('gustavokuklinski/aeon-books', split='train')
    aeon_books_ds = aeon_books_data.map(format_text, remove_columns=aeon_books_data.column_names)

    # NOTE(review): this loads the same 'gustavokuklinski/aeon-books' repository
    # as above, so those examples are added twice. The variable name suggests a
    # separate movies/TV dataset was intended — confirm the correct dataset id.
    aeon_moviestv_data = load_dataset('gustavokuklinski/aeon-books', split='train')
    aeon_moviestv_ds = aeon_moviestv_data.map(format_text, remove_columns=aeon_moviestv_data.column_names)

    # Instruction/response pairs formatted as full three-turn conversations.
    aeon_train_data = load_dataset('gustavokuklinski/aeon', split='train')
    aeon_train_ds = aeon_train_data.map(format_aeon, remove_columns=aeon_train_data.column_names)

    # Merge all sources, then replace each message list with a single string.
    combined_train_ds = concatenate_datasets([aeon_books_ds, aeon_moviestv_ds, aeon_train_ds])
    combined_train_ds = combined_train_ds.map(flatten_messages)

    print(f"\033[1;32m[DATASET]:\033[0m Tokenizing Dataset: {len(combined_train_ds)} examples")
    # Batched tokenization; the raw text column is dropped from the output.
    tokenized_dataset = combined_train_ds.map(tokenize_function, batched=True, remove_columns=["text"])

    print(f"\033[1;32m[DATASET]:\033[0m Combined training set: {len(combined_train_ds)} examples")


except FileNotFoundError:
    print("\033[1;91m[ERROR]\033[0m Dataset file not found. Please ensure the paths are correct.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, and the traceback identifying the missing dataset would be lost.
    raise


In [None]:
# Trainer hyperparameters. Effective batch size per device is
# 4 sequences x 8 accumulation steps; training runs for 300 optimizer steps
# with a checkpoint written to `save_directory` every 50 steps.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    warmup_steps=10,
    max_steps=300,
    learning_rate=2e-4,
    logging_steps=10,
    # Mixed precision must match the dtype the model was loaded with:
    # bf16 where supported, otherwise fp16. Previously fp16 was never enabled,
    # so float16 models trained without any mixed-precision flag. The same
    # capability check as the model-loading cell is used for consistency.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    save_strategy="steps",
    save_steps=50,
    output_dir=save_directory,
    # Empty list: no experiment-tracking integrations are enabled.
    report_to=[],
    # Keep every dataset column instead of dropping the ones the trainer
    # considers unused.
    remove_unused_columns=False
)

# Supervised fine-tuning trainer over the pre-tokenized dataset; sequence
# packing is turned off.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    packing=False,
)


In [None]:
# Run the fine-tuning loop configured above; as the last expression in the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()

In [None]:
# Write the trained model first and then the tokenizer into the output
# directory, using each object's save_pretrained method.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print("\033[1;32m[SUCCESS]\033[0m LoRA fine-tuning completed and saved.")

In [None]:
# Export the model to GGUF once per quantization method, in this order:
# q8_0 first, then f16. Both exports target the same output directory.
for gguf_quantization in ("q8_0", "f16"):
    model.save_pretrained_gguf(
        save_directory,
        tokenizer,
        quantization_method=gguf_quantization,
    )

print("\033[1;32m[GGUF]\033[0m Converted to GGUF.")