In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Import libaries

In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

Load model

In [2]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,

)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Load English data

In [4]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Load Bangla data

In [12]:

# Load Bangla QA Dataset from Hugging Face
# Using the "Bangla Question Answer Pair 70K Dataset" as an example
bangla_qa_dataset = load_dataset("rasheduzzaman/Bangla_question_answer_pair_70K_dataset", split="train[:10%]")  # Loading a subset

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
# Update preprocessing for the detected dataset structure
def preprocess_bangla_data(examples):
    instructions = ["Answer the following question:"] * len(examples["input"])
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input_text, output_text in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input_text, output_text) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply the preprocessing
bangla_qa_dataset = bangla_qa_dataset.map(preprocess_bangla_data, batched=True)



In [13]:
bangla_qa_dataset

Dataset({
    features: ['input', 'output', 'text'],
    num_rows: 8107
})

In [14]:
# Check the first few rows to verify the text column
print(bangla_qa_dataset[:5])


{'input': ['\nকোন আইনের অধীনে রাজশাহী সিটি কর্পোরেশন গঠিত হয়েছে?\n', '\nকোন ওয়ার্ডের ভোটার তালিকায় নাম থাকলে কোন নির্বাচনে ভোট দেওয়া যায়?\n', '\nভোটার তালিকায় নাম থাকলেই কি ভোট দেওয়া যায়?\n', '\nকোন আইনের অধীনে নির্দিষ্টকরণ করা হয়?\n', '\nনির্দিষ্টকরণের জন্য অর্থ কোথা থেকে উত্তোলন করা হয়?\n'], 'output': ['\nরাজশাহী সিটি কর্পোরেশন আইন, ১৯৮৭\n\n', '\nসেই ওয়ার্ডের কমিশনার এবং মেয়র নির্বাচনে\n\n', '\nনা, নামটি আপাততঃ লিপিবদ্ধ থাকতে হবে', '\nনির্দিষ্টকরণ (অগ্রিম মঞ্জুরী দান) আইন, ১৯৯২\n\n', '\nসংযুক্ত তহবিল\n\n'], 'text': ['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nAnswer the following question:\n\n### Input:\n\nকোন আইনের অধীনে রাজশাহী সিটি কর্পোরেশন গঠিত হয়েছে?\n\n\n### Response:\n\nরাজশাহী সিটি কর্পোরেশন আইন, ১৯৮৭\n\n<|end_of_text|>', 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that

In [16]:
from datasets import concatenate_datasets
# Combine Alpaca Dataset with Bangla QA Dataset
combined_dataset = concatenate_datasets([dataset, bangla_qa_dataset])

# Verify Combined Dataset
print(combined_dataset)

Dataset({
    features: ['output', 'input', 'instruction', 'text'],
    num_rows: 59867
})


We now add LoRA adapters so we only need to update 1 to 10% of all

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

Model training

In [20]:
# Train the Model
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=combined_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        #fp16=not torch.cuda.is_bfloat16_supported(),
        #bf16=torch.cuda.is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

trainer.train()


Map (num_proc=2):   0%|          | 0/59867 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 59,867 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.7891
2,1.5176
3,1.5171
4,1.2579
5,1.5283
6,1.2792
7,1.1211
8,1.1403
9,1.1199
10,1.0215


TrainOutput(global_step=60, training_loss=0.9798799554506937, metrics={'train_runtime': 448.5013, 'train_samples_per_second': 1.07, 'train_steps_per_second': 0.134, 'total_flos': 5967338487349248.0, 'train_loss': 0.9798799554506937, 'epoch': 0.008017638805371818})

In [21]:
# Sample Bangla and English Inputs
test_samples = [
    {"instruction": "Answer the following question:",
     "input": "ccc",
     "expected_output": "ঢাকা"},
    {"instruction": "Answer the following question:",
     "input": "What is the capital of Bangladesh?",
     "expected_output": "Dhaka"},
]


Inference

In [28]:
# Alpaca prompt format
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Enable inference
FastLanguageModel.for_inference(model)

# Bangla Example
bangla_prompt = alpaca_prompt.format(
    "বাংলাদেশের রাজধানীর নাম কী?",  # Instruction
    "",                           # Input
    ""                            # Leave blank for generation
)

# English Example
english_prompt = alpaca_prompt.format(
    "What is the national flower of Bangladesh?",  # Instruction
    "",                                            # Input
    ""                                             # Leave blank for generation
)

# Tokenize the inputs
inputs = tokenizer([bangla_prompt, english_prompt], return_tensors="pt", padding=True).to("cuda")

# Generate outputs
outputs = model.generate(**inputs, max_new_tokens=50, use_cache=True)

# Decode the outputs
generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print results
for i, text in enumerate(generated_texts):
    print(f"Generated Response {i + 1}:")
    print(text)
    print("\n" + "="*50 + "\n")


Generated Response 1:
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
বাংলাদেশের রাজধানীর নাম কী?

### Input:


### Response:
বাংলাদেশের রাজধানীর নাম হল ঢাকা।


Generated Response 2:
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is the national flower of Bangladesh?

### Input:


### Response:
The national flower of Bangladesh is the Shapla (or Water Lily). It is a beautiful flower that grows in ponds and lakes, and is a symbol of peace and purity.


