In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA RTX A1000 6GB Laptop GPU. Num GPUs = 1. Max memory: 6.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
messages = [
    {"role": "user", "content": "Islam nedir"},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 256, pad_token_id = tokenizer.eos_token_id )



Islam (İslam) is a monotheistic religion that originated in the 7th century with the prophet Muhammad in Arabia. The word "Islam" means "submission" or "surrender" to God. It is based on the Quran, the holy book of Islam, which is considered the word of God as revealed to Muhammad.

Islam is the second-largest religion in the world, with approximately 1.8 billion followers, known as Muslims. Muslims believe in one God, Allah, who is the same God worshiped by Jews and Christians. They also believe in the prophets of the Abrahamic tradition, including Abraham, Moses, Jesus, and Muhammad.

The Five Pillars of Islam are the fundamental practices of the religion:

1. Shahada (Declaration of Faith): The declaration that there is no god but Allah, and that Muhammad is the messenger of Allah.
2. Salat (Prayer): Muslims are required to perform daily prayers, facing towards the holy city of Mecca.
3. Zakat (Charity): Muslims are encouraged to give a portion of their wealth to the poor and needy.

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [10]:
from datasets import  load_dataset
from google.colab import files

uploaded = files.upload()  # Seçilen dosyaları yükler


Saving merged_output.csv to merged_output.csv


In [11]:
import pandas as pd
df = pd.read_csv("merged_output.csv")
df.head()

Unnamed: 0,question,answer,question_type
0,İmamın terk ettiği durumlarda cemaat ne yapar ...,İmam selâmı terk ederse veya teşrik tekbirini ...,Factual
1,Beşinci rekata kalkıldığında imamın hareketler...,Eğer imam dördüncü rekattan sonra oturmuşsa ve...,Analytical
2,"Bir kimse, imam rükû halindeyken namaza katılm...",Eğer ilk saflara geçmek rekât kaçırma riskini ...,Procedural
3,Namaz kılanın önünden geçmek ne zaman mekruhtu...,"Namaz kılanın önünden geçmek, önünde perde, ağ...",Factual
4,Hanefi mezhebine göre Şafii bir imama uyup nam...,"Evet, Hanefi mezhebinde olan bir kişi Şafii me...",Factual


In [12]:
# convert df to a dataset
from datasets import Dataset
mydataset = Dataset.from_pandas(df)
mydataset


Dataset({
    features: ['question', 'answer', 'question_type'],
    num_rows: 1010
})

In [17]:
dataset = load_dataset("vicgalle/alpaca-gpt4", split="train")
print(dataset.column_names)
dataset = dataset.select(range(100))

README.md:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

(…)-00000-of-00001-6ef3991c06080e14.parquet:   0%|          | 0.00/48.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

['instruction', 'input', 'output', 'text']


In [18]:
from unsloth import  to_sharegpt
dataset = to_sharegpt(
    dataset,
    merged_prompt = "[[\nYour input is:\n{input}]]",
    output_column_name="output",
    conversation_extension=3,

    )

Merging columns:   0%|          | 0/100 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/100 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/100 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/100 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/100 [00:00<?, ? examples/s]

Extending conversations:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
from unsloth import  to_sharegpt
mydataset = to_sharegpt(
    mydataset,
    merged_prompt = "[[\nYour input is:\n{question}]]",
    output_column_name="answer",
    conversation_extension=3,

    )


Merging columns:   0%|          | 0/1010 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/1010 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1010 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1010 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1010 [00:00<?, ? examples/s]

Extending conversations:   0%|          | 0/1010 [00:00<?, ? examples/s]

In [19]:
print(mydataset.column_names)
print(dataset.column_names)

['conversations']
['conversations']


In [20]:
from unsloth import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
mydataset = standardize_sharegpt(mydataset)

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/1010 [00:00<?, ? examples/s]

In [21]:
dataset.column_names,
mydataset.column_names,

(['conversations'],)

In [22]:
print(dataset)
print(mydataset)

Dataset({
    features: ['conversations'],
    num_rows: 100
})
Dataset({
    features: ['conversations'],
    num_rows: 1010
})


In [28]:
chat_template = """Below is an instruction that describes some tasks. Write responses that appropriately complete each request.

### Instruction:
{INPUT}


### Response:
{OUTPUT}"""

from unsloth import  apply_chat_template
dataset = apply_chat_template(
    dataset,
    tokenizer= tokenizer,
    chat_template = chat_template,
  )

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [30]:
chat_template = """Below is an instruction that describes some tasks. Write responses that appropriately complete each request.

### Instruction:
{INPUT}


### Response:
{OUTPUT}"""

from unsloth import  apply_chat_template
dataset = apply_chat_template(
    dataset,
    tokenizer= tokenizer,
    chat_template = chat_template,
  )
mydataset = apply_chat_template(
    mydataset,
    tokenizer= tokenizer,
    chat_template = chat_template,
  )

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/1010 [00:00<?, ? examples/s]

In [31]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = mydataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 120,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1010 [00:00<?, ? examples/s]

In [32]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,010 | Num Epochs = 1 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.5493
2,2.5327
3,2.581
4,2.4903
5,2.4032
6,2.2869
7,2.2567
8,1.9695
9,1.9747
10,1.9639


In [44]:
## neue Versuch mit fine-getuned model
def generate_antwort(input,model):
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
  messages = [
      {"role": "user", "content": input},
  ]
  input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt = True,
      return_tensors = "pt").to("cuda")

  from transformers import TextStreamer
  text_streamer = TextStreamer(tokenizer, skip_prompt = True)
  _ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 256, pad_token_id = tokenizer.eos_token_id )


save model in hf

In [39]:
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
  try:
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')

  except:
    print("error")
    HF_TOKEN = None

if HF_TOKEN:
  print("success")
else:
    print("error")


success


In [40]:
model.save_pretrained("hocagpt_lora_llama-3-8b")
tokenizer.save_pretrained("hocagpt_lora_llama-3-8b")
model.push_to_hub("coskun45/hocagpt-lora-Llama-3.8", token=HF_TOKEN)
tokenizer.push_to_hub("coskun45/hocagpt-lora-Llama-3.8", token=HF_TOKEN)

README.md:   0%|          | 0.00/593 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/coskun45/hocagpt-lora-Llama-3.8


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [41]:
# using finegetuned model from hf

max_seq_length = 2048
dtype = None
load_in_4bit = True

if True:
  from unsloth import FastLanguageModel
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = "coskun45/hocagpt-lora-Llama-3.8",
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
  )
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
  pass

==((====))==  Unsloth 2025.3.15: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

In [46]:
generate_antwort("Iman nedir",model)

İman, İslam dininin en temel kavramlarından biri olup, Allah'a olan inancı, O'nun varlığını, birliğini, isimlerini ve sıfatlarını, peygamberlerini, kitaplarını ve ahiret gününü kabul etmeyi kapsar.

İmanın, İslam'ın temelini oluşturan dört esasından biri olduğu belirtilmektedir. Bu dört esas, İslâmî ilimlerde 'iman' kavramının anlamlı bir şekilde ifade edilmesi için yeterli görülmüştür.<|eot_id|>
