In [1]:
! pip install -q accelerate  peft bitsandbytes transformers trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [18]:
import torch
import os
from datasets import load_dataset, Dataset
from google.colab import userdata
from transformers import (AutoModelForCausalLM,
AutoTokenizer,BitsAndBytesConfig,
HfArgumentParser, TrainingArguments,
logging, pipeline ,GemmaTokenizer)
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer , SFTConfig

In [9]:
# Copy the Hugging Face access token from Colab's secret store into the process
# environment. 'HF-token' is the key the other cells of this notebook read.
# HF_TOKEN is exported as well: it is the variable the Hugging Face libraries
# consult on their own when no explicit `token=` argument is passed.
_hf_token = userdata.get('HF-token')
if not _hf_token:
    # Fail here with a clear message instead of a confusing error in a later cell.
    raise RuntimeError("Colab secret 'HF-token' is empty or not accessible from this notebook")
os.environ['HF-token'] = _hf_token
os.environ['HF_TOKEN'] = _hf_token

In [11]:
# 4-bit loading settings: NF4 quantization type, compute dtype torch.float16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Hub identifier of the base checkpoint used throughout the notebook.
model_id = "google/gemma-2b"


In [32]:
# Both downloads authenticate with the token stored under 'HF-token'.
_hub_token = os.environ['HF-token']

# Tokenizer for the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=_hub_token)

# Base model, loaded with `quantization_config`; the device_map {"": 0}
# assigns the whole model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=_hub_token,
    device_map={"": 0},
    quantization_config=quantization_config,
)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [33]:
# Generate a continuation for a sample English prompt with the loaded model.
input_text = "Write me a poem about Machine Learning."

# NOTE: despite its name, `input_ids` holds the whole tokenizer output (it is
# unpacked as keyword arguments into generate() below), moved to "cuda".
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt.to("cuda")

outputs = model.generate(**input_ids)
generated_text = tokenizer.decode(outputs[0])
print(generated_text)

KeyboardInterrupt: 

In [None]:
# LoRA adapter settings (rank 16, causal-LM task) for the listed module names;
# this object is handed to the trainer as its `peft_config`.
Lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [19]:
import pandas as pd

# Download the poetry QA dataset and convert its train split to a DataFrame.
dds = load_dataset("kakooch/persian-poetry-qa")
ds = dds["train"].to_pandas()

# Draw a reproducible 50,000-row sample (fixed random_state) and keep only the
# two columns used for fine-tuning.
df_sampled = ds.sample(n=50000, random_state=42)
df = df_sampled.loc[:, ["question", "answer"]]

# Persist the sample without the index column.
df.to_csv("drive/MyDrive/QLORA/50k_poem_sample.csv", index=False)

In [24]:
# Convert the sampled frame to a `Dataset` and upload it to the Hub.
# `df` keeps the row labels of the original frame (it was produced by .sample()),
# so without preserve_index=False that index would be uploaded as an extra column
# next to "question" and "answer".
dataset = Dataset.from_pandas(df, preserve_index=False)
dataset.push_to_hub("Erfan3940/50k_persian_poem_author", token=os.environ['HF-token'])

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              : 100%|##########| 3.55MB / 3.55MB            

CommitInfo(commit_url='https://huggingface.co/datasets/Erfan3940/50k_persian_poem_author/commit/f5bbecf2397f85eaec56bfa723972b68a3638a9a', commit_message='Upload dataset', commit_description='', oid='f5bbecf2397f85eaec56bfa723972b68a3638a9a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Erfan3940/50k_persian_poem_author', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Erfan3940/50k_persian_poem_author'), pr_revision=None, pr_num=None)

In [25]:
# Preview the first five sampled rows.
df.head(n=5)

Unnamed: 0,question,answer
943845,یک نمونه از شعر فردوسی,برهنه کشانش ببر تا بچاه \t که در چاه بین آنک د...
401204,یک نمونه از شعر حکیم سبزواری,چو افتاده ما را که کام دگرها \t اگر از تو گاهی...
878491,یک نمونه از شعر غروی اصفهانی,ای ملامت گر از اسرار قدر بیخبری \t که دچار غم ...
667922,یک نمونه از شعر صائب تبریزی,زسختی قابل اصلاح نبود دل ترا، ورنه \t ازین دری...
745830,یک نمونه از شعر عبید زاکانی,چو من به فکر فرو رفته و روان کرده \t دبیر چرخ ...


In [26]:
def formatting(example):
    """Build the training text(s) from the "answer" and "question" fields.

    The previous implementation indexed both fields with ``[0]``. For a batch
    (columns as lists) that silently kept only the first row; for a single
    example (columns as strings) it kept only the first character.

    Args:
        example: Mapping with "answer" and "question" entries. Either a single
            row (both values are strings) or a batch (both values are
            equal-length sequences of strings).
            NOTE(review): whether the trainer calls this per row or per batch
            depends on the installed TRL version -- both are handled here.

    Returns:
        A single formatted string for a single row, or a list with one
        formatted string per row for a batch.
    """
    answers = example['answer']
    questions = example['question']

    # Single (non-batched) example: the columns are plain strings.
    if isinstance(answers, str):
        return f"متن شعر : {answers}\n شاعر : {questions}"

    # Batched example: emit one text per row instead of only the first row.
    return [
        f"متن شعر : {answer}\n شاعر : {question}"
        for answer, question in zip(answers, questions)
    ]

In [29]:
# Load the uploaded sample back from the Hub, authenticating with the stored token.
myds = load_dataset(
    "Erfan3940/50k_persian_poem_author",
    token=os.environ['HF-token'],
)

README.md:   0%|          | 0.00/359 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Supervised fine-tuning of the loaded model with LoRA adapters.
trainer = SFTTrainer(
    # Pass the already-loaded `model` object. The previous `model=model_id`
    # passed only the checkpoint name, so the trainer would build its own model
    # and the one loaded with `quantization_config` was never trained.
    model=model,
    train_dataset=myds["train"],
    peft_config=Lora_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=TrainingArguments(
        # One sequence per device step, gradients accumulated over 4 steps.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        # Short run: stop after 100 optimizer steps.
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="drive/MyDrive/QLORA/gemma3",
        optim="paged_adamw_8bit"
    ),
    # Builds the training text from the "answer"/"question" columns.
    formatting_func=formatting,
    max_seq_length=256,
    )

In [None]:
# Run the training loop of the `trainer` built in the previous cell.
trainer.train()

In [None]:
# Prompt the fine-tuned model with the opening of a poem, in the same
# "متن شعر : ..." layout that the training texts use.
text = "متن شعر : چو افتاده ما را که کام دگرها"
device = "cuda:0"  # was misspelled "coda:0" and never used

# Tokenize the prompt defined in THIS cell (previously the stale English
# `input_text` from an earlier cell was tokenized, and the result was ignored).
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate with the model held by the trainer, so the trained adapters are used
# regardless of how the trainer obtained its model.
outputs = trainer.model.generate(**inputs)

# skip_special_tokens belongs to decode(); passing it to print() raises TypeError.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
import json

# Read the notebook
with open('drive/MyDrive/Colab Notebooks/Gemma3_finetuning.ipynb', 'r', encoding='utf-8') as f:
    notebook = json.load(f)

# Remove problematic widgets metadata at the notebook level (no-op when absent)
notebook.get('metadata', {}).pop('widgets', None)

# Remove it from every individual cell as well. A single pop covers both
# populated and empty entries; the former second check for empty widget arrays
# could never run because the key had already been deleted.
for cell in notebook.get('cells', []):
    cell.get('metadata', {}).pop('widgets', None)

# Save the cleaned notebook. ensure_ascii=False keeps non-ASCII text (e.g. the
# Persian strings in this notebook) readable instead of \uXXXX escapes; the file
# is opened as UTF-8, so this is safe.
with open('your_notebook_cleaned.ipynb', 'w', encoding='utf-8') as f:
    json.dump(notebook, f, indent=1, ensure_ascii=False)