In [None]:
# Cell 1: Install & Setup (This takes about 2-3 mins)
# Installs Unsloth and the fine-tuning stack into the notebook runtime.
# Lines starting with `!` are notebook shell escapes, so this cell only runs
# inside a notebook (not as a plain .py script).
print("üöÄ Installing Unsloth (This makes training 2x faster)...")
# Unsloth is installed from the current head of its GitHub repository,
# i.e. not from a pinned release.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Companion libraries. --no-deps tells pip not to install their own dependencies.
# NOTE(review): the recorded run reports "Unsloth 2025.12.8" and resolves xformers
# to a source tarball (0.0.26.post1); confirm these upper bounds are still wanted.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

from unsloth import FastLanguageModel
import torch

# 1. Configuration
# These three values are passed straight to FastLanguageModel.from_pretrained below;
# max_seq_length is also reused by the SFTTrainer in the training step.
max_seq_length = 2048
dtype = None  # presumably lets the loader pick the dtype for the GPU -- TODO confirm
load_in_4bit = True  # request the 4-bit quantised load (matches the "-bnb-4bit" checkpoint)

# 2. Load the Base Model (Llama 3 8B)
# Returns both the model and its tokenizer; every later cell relies on these two
# notebook-level names (`model`, `tokenizer`) still being in memory.
print("üì• Downloading Llama-3-8B Base Model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", # We use the 4-bit version to fit in free Colab
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# 3. Add LoRA Adapters (This is the "Brain Surgery" part)
# Rebinds `model` to the adapter-wrapped model. In the recorded run this left
# 41,943,040 of 8,072,204,288 parameters trainable (0.52%) and patched 32 layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    # The attention projections (q/k/v/o) and the MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,  # same value as r
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",  # recorded log: "Will smartly offload gradients to save VRAM!"
    random_state = 3407,  # same number is used as the trainer seed further down
    use_rslora = False,
    loftq_config = None,
)

# 4. Load YOUR Dataset
print("üìö Loading your Hindi Slang Dataset...")
from datasets import load_dataset

# Note: We are pointing specifically to YOUR repository and file
# Only the single file final_train.json from that Hub repository is read, as the
# "train" split. The recorded run shows 851 examples.
dataset = load_dataset("defnotutkarsh/hindi-slang-v1", data_files="final_train.json", split="train")

# 5. Format the Data for Llama-3
# Alpaca-style template with three positional `{}` slots, filled in this order:
# instruction, input, response. The inference cell and the Gradio chat function
# each carry their own copy of this exact text, so any edit here must be
# mirrored there or the model will be prompted differently than it was trained.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into Alpaca-style training strings.

    Intended for ``dataset.map(..., batched=True)``, where every value in
    ``examples`` is a list holding one entry per row.

    Args:
        examples: Mapping with "instruction", "input" and "output" columns.

    Returns:
        A dict with a single "text" column. Each entry is ``alpaca_prompt``
        filled with the row's fields, followed by the tokenizer's EOS token.
    """
    # Looked up once: the EOS string is the same for every row in the batch.
    eos_token = tokenizer.eos_token
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    texts = [
        alpaca_prompt.format(
            instruction,
            # A null "input" in the JSON arrives as None; without this it would
            # be formatted as the literal text "None" inside the prompt.
            "" if context is None else context,
            output,
        )
        + eos_token
        for instruction, context, output in rows
    ]
    return {"text": texts}

# Add the rendered "text" column. batched = True means the function receives
# lists of values (one list per column) rather than one row at a time.
dataset = dataset.map(formatting_prompts_func, batched = True)

# 6. Start Training!
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

print("üî• Starting Training... (This will take 30-60 mins)")

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text", # the column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 sequences per optimizer step
        warmup_steps = 5,
        # Full run: the recorded run reached ~2.8 epochs over 851 examples.
        # Lower this (e.g. to 60) for a quick smoke test.
        max_steps = 300,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on GPU support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # No experiment tracker. Without this, an installed wandb package makes
        # trainer.train() stop and wait for an interactive account prompt.
        report_to = "none",
    ),
)

trainer_stats = trainer.train()
print("‚úÖ Training Finished!")

üöÄ Installing Unsloth (This makes training 2x faster)...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-4s5evluf/unsloth_cec8e840a43945fe919b037e961ae51d
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-4s5evluf/unsloth_cec8e840a43945fe919b037e961ae51d
  Resolved https://github.com/unslothai/unsloth.git to commit 2eb6b0d5f363a60ed3792ea1f04250537ac66939
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting xformers<0.0.27
  Using cached xformers-0.0.26.post1.tar.gz (4.1 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trl<0.9.0
  Using cached trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Using cached trl-0.8.6-py3-none-a

Unsloth 2025.12.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


üìö Loading your Hindi Slang Dataset...


Map:   0%|          | 0/851 [00:00<?, ? examples/s]

üî• Starting Training... (This will take 30-60 mins)


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/851 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 851 | Num Epochs = 3 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.7085
2,1.59
3,1.6303
4,1.5096
5,1.3686
6,1.1805
7,0.8723
8,0.7157
9,0.6628
10,0.543




0,1
train/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñÜ‚ñá‚ñà‚ñÉ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÅ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ
train/learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñÜ‚ñÖ‚ñÖ‚ñÜ‚ñÑ‚ñÖ‚ñÑ‚ñÖ‚ñÉ‚ñÖ‚ñÑ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÇ‚ñÉ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÇ

0,1
total_flos,1.5835652895031296e+16
train/epoch,2.80751
train/global_step,300.0
train/grad_norm,0.44618
train/learning_rate,0.0
train/loss,0.1867
train_loss,0.308
train_runtime,1689.7987
train_samples_per_second,1.42
train_steps_per_second,0.178


‚úÖ Training Finished!


In [None]:
# Cell 3: SAVE THE V2 MODEL (Run this AFTER training finishes)
from google.colab import userdata

# 1. Read the write token from Colab Secrets instead of pasting it into the cell.
#    Add a secret named HF_TOKEN in the Secrets panel (key icon in the sidebar)
#    and grant this notebook access. A token typed into a cell is saved with the
#    notebook and leaks as soon as the notebook is shared or committed.
hf_token = userdata.get("HF_TOKEN")

# 2. V2 Name (The "Smarter" Version)
username = "defnotutkarsh"
model_name = "hindi-llama-3-slang-v2" # <--- Note the V2

print(f"üíæ Saving V2 model to {username}/{model_name}...")

# 3. Push to Hub
# `model` is the LoRA-wrapped model, so the upload is the adapter weights
# (the recorded run uploads adapter_model.safetensors), followed by the tokenizer.
model.push_to_hub(f"{username}/{model_name}", token=hf_token)
tokenizer.push_to_hub(f"{username}/{model_name}", token=hf_token)

print(f"‚úÖ Saved! The V2 model is live: https://huggingface.co/{username}/{model_name}")

üíæ Saving V2 model to defnotutkarsh/hindi-llama-3-slang-v2...


README.md:   0%|          | 0.00/580 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          |  556kB /  168MB            

Saved model to https://huggingface.co/defnotutkarsh/hindi-llama-3-slang-v2


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpltctz3wt/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

‚úÖ Saved! The V2 model is live: https://huggingface.co/defnotutkarsh/hindi-llama-3-slang-v2


In [None]:
# Cell 2: Inference (Testing the Model)
from unsloth import FastLanguageModel

# 1. Prepare for Chatting
# Uses the `model` / `tokenizer` objects left in memory by the training cell.
FastLanguageModel.for_inference(model)

# 2. Your Question (You can change this!)
hindi_prompt = "Bhai, software engineer banne ke liye kya roadmap follow karu?"

# 3. Format strictly as Alpaca
# Must stay identical to the template used for training.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# 4. Generate Answer
inputs = tokenizer(
    [
        alpaca_prompt.format(
            hindi_prompt, # The User Question
            "",           # Input (empty)
            "",           # Output (empty) - the model continues from here
        )
    ],
    return_tensors = "pt",
).to("cuda")

print("ü§î AI is thinking...")
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)

# 5. Clean up the output to just show the answer
# generate() returns the prompt tokens followed by the new tokens. Slicing off the
# prompt and letting the tokenizer drop special tokens avoids depending on a text
# marker or on one hard-coded end-of-text string.
prompt_length = inputs["input_ids"].shape[1]
result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)

print("\n" + result.strip())


ü§î AI is thinking...

Bhai, software engineer banne ke liye, aap ek computer science ki degree ya diploma le sakte hain, fir ek internship ya training program join kar sakte hain, aur fir ek company join kar sakte hain, jahan aap software engineer ke roop mein kaam kar sakte hain.


In [None]:
# Cell 3: Save to Hugging Face (PERMANENTLY)
from google.colab import userdata

# 1. Read the write token from Colab Secrets instead of pasting it into the cell.
#    Create the token at https://huggingface.co/settings/tokens, store it as a
#    secret named HF_TOKEN (key icon in the sidebar) and grant this notebook
#    access. A token typed into a cell is saved with the notebook and leaks as
#    soon as the notebook is shared or committed.
hf_token = userdata.get("HF_TOKEN")

# 2. Your Identity & Project Name
username = "defnotutkarsh"
model_name = "hindi-llama-3-slang-v1"

print(f"üíæ Saving model to {username}/{model_name}...")

# 3. Push to Hub
# `model` is the LoRA-wrapped model, so the upload is the adapter weights
# (the recorded run uploads adapter_model.safetensors), followed by the tokenizer.
model.push_to_hub(f"{username}/{model_name}", token=hf_token)
tokenizer.push_to_hub(f"{username}/{model_name}", token=hf_token)

print(f"‚úÖ Saved! View your model here: https://huggingface.co/{username}/{model_name}")


üíæ Saving model to defnotutkarsh/hindi-llama-3-slang-v1...


README.md:   0%|          | 0.00/580 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          |  560kB /  168MB            

Saved model to https://huggingface.co/defnotutkarsh/hindi-llama-3-slang-v1


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mp9v0u708s/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

‚úÖ Saved! View your model here: https://huggingface.co/defnotutkarsh/hindi-llama-3-slang-v1


In [None]:
# Cell 4: Launch the Chat App (Shareable Link)
import locale
# Replaces locale.getpreferredencoding for the whole process so it always reports
# UTF-8. It is patched before the shell escape below.
# NOTE(review): presumably the `!` shell escape is what trips over the locale -- confirm.
locale.getpreferredencoding = lambda: "UTF-8" # Fix for Colab locale bug
# -q keeps pip's output quiet; no Gradio version is pinned.
!pip install -q gradio

import gradio as gr
from unsloth import FastLanguageModel

# 1. Prepare Model for Chatting
# Relies on `model` from the training cell still being in memory; this cell
# does not load anything itself.
FastLanguageModel.for_inference(model)

# 2. The Chat Function
def chat_with_slang_ai(user_input):
    """Answer one question with the fine-tuned model.

    Args:
        user_input: The question text (wired to the Gradio textbox). ``None``
            is treated as an empty question.

    Returns:
        The generated reply as plain text, with the prompt and any special
        tokens removed and surrounding whitespace stripped.
    """
    # Guard against None so it is not formatted as the literal text "None".
    if user_input is None:
        user_input = ""

    # Format the prompt exactly like the training data
    prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}""".format(user_input, "", "")

    # Generate
    inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)

    # generate() returns the prompt tokens followed by the new ones. Keep only the
    # new tokens and let the tokenizer drop special tokens, rather than splitting
    # on a text marker and deleting one hard-coded end-of-text string.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens = True).strip()

# 3. The User Interface
# Components are created inside the Blocks context in the order written here,
# so keep the statement order when editing.
# NOTE(review): the recorded output echoes the `with gr.Blocks(theme=...)` line the
# way Python prints a warning's source line; check whether the installed Gradio
# version warns about passing `theme=` here.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# üáÆüá≥ Hindi Slang AI (V2)")
    # The "300 steps" text is hard-coded; update it if max_steps in the training cell changes.
    gr.Markdown("This AI has been fine-tuned for 300 steps to speak like a real 'Bhai'. Ask it anything!")

    with gr.Row():
        user_box = gr.Textbox(label="Apna Sawaal Pucho", placeholder="Bhai, coding start kaise karu?")

    output_box = gr.Textbox(label="AI Ka Jawaab")
    btn = gr.Button("Submit / Bhejo", variant="primary")

    # Clicking the button sends the textbox contents to chat_with_slang_ai and
    # writes the returned string into the output box.
    btn.click(chat_with_slang_ai, inputs=user_box, outputs=output_box)

# 4. Launch!
print("üöÄ Launching App...")
# share=True requests a public link; the recorded run printed a *.gradio.live URL
# and a notice that the link expires in 1 week.
demo.launch(share=True)

  with gr.Blocks(theme=gr.themes.Soft()) as demo:


üöÄ Launching App...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b8ba2794b4f2fefe7e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


