In [1]:
!pip install transformers datasets accelerate peft bitsandbytes torch

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.1


In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

# Load the base model (choose based on available VRAM)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Change as needed
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model in 8-bit via bitsandbytes (saves memory).
# An explicit quantization config is required for that: with only
# torch_dtype=float16 the checkpoint is loaded as plain fp16 and nothing is
# quantized, even though the later k-bit preparation step expects it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    use_cache=False,  # generation cache is not needed while training
    torch_dtype=torch.float16,  # dtype for the modules that are not quantized
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# Trade compute for memory: recompute activations during the backward pass.
model.gradient_checkpointing_enable()


tokenizer_config.json:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [3]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA hyper-parameters: rank-4 adapters attached to the attention query
# projections only, with no trainable bias terms.
lora_settings = dict(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
lora_config = LoraConfig(**lora_settings)

# Freeze base model: no original weight should receive gradients.
model.requires_grad_(False)

# Run PEFT's k-bit preparation helper first, then wrap the result with the
# LoRA adapters described by `lora_config`.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [4]:
# Fetch GSM8K (configuration "main"); use 'math' for more complex problems
dataset = load_dataset(path="gsm8k", name="main")


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [5]:
# Inspect the loaded splits and their column names.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [6]:
def format_prompt(example):
    """Build the training text for one record.

    Args:
        example: Mapping with a ``"question"`` and an ``"answer"`` entry
            (the two GSM8K columns).

    Returns:
        A dict with a single ``"text"`` key holding the instruction, the
        question and the response section, separated by blank lines.
    """
    sections = [
        "Solve the following math problem with step-by-step reasoning.",
        "Question: {}".format(example['question']),
        "### Response: Let's think step by step. {}".format(example['answer']),
    ]
    return {"text": "\n\n".join(sections)}

# Add the formatted "text" column to every split
dataset = dataset.map(function=format_prompt)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [7]:
# Hold out 10% of the training split for evaluation during fine-tuning.
# The fixed seed makes the split identical after a kernel restart.
# NOTE: this rebinds `dataset`, so re-running only this cell would split the
# already-reduced train split again; re-run from the loading cell instead.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)  # 90% train, 10% eval

In [8]:
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of formatted prompts into fixed-length sequences.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the prompt strings.
        max_length: Target sequence length. Longer texts are truncated and
            shorter ones are padded up to this length.

    Returns:
        The tokenizer's output for the batch.
    """
    # No return_tensors here: inside a batched Dataset.map the result is
    # written to the dataset's own storage, and the data collator builds the
    # tensors at training time.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Fixed-length padding needs a pad token. Fall back to EOS when the tokenizer
# does not define one, so the cell also works after changing `model_name`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize all splits
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,  # Remove original columns
)

Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

In [9]:
from transformers import TrainingArguments

# Training configuration.
# - The run length is set in optimizer steps (max_steps); num_train_epochs is
#   omitted because max_steps takes precedence over it.
# - eval_steps is set explicitly; left unset it falls back to logging_steps,
#   i.e. a full evaluation pass every 10 steps.
# - save_steps stays a multiple of eval_steps, as load_best_model_at_end requires.
# - push_to_hub=True makes the Trainer contact the Hub when it is constructed,
#   so a valid Hub login is needed before that cell runs.
training_args = TrainingArguments(
    output_dir="./deepmath-7b-l",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # 4 x 8 = 32 sequences per optimizer step
    gradient_checkpointing=True,
    warmup_steps=500,
    max_steps=1000,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    eval_steps=100,
    learning_rate=2e-4,
    hub_always_push=True,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,
    save_on_each_node=True,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    disable_tqdm=True,
    bf16=False,
    fp16=True,
    optim="paged_adamw_8bit",
    push_to_hub=True,
)




In [10]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login. The training arguments use
# push_to_hub=True, so this must succeed before the Trainer is created.
notebook_login()  # For notebooks

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# Sanity check: the held-out split should expose only the tokenizer's output
# columns, since the original columns were removed during tokenization.
tokenized_dataset["test"]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 748
})

In [12]:
from transformers import TrainerCallback

class LoggingCallback(TrainerCallback):
    """Print a one-line progress message every time the Trainer logs."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the training loss if the log entry has one, else the raw logs.

        Args:
            args, control, **kwargs: Accepted for the callback interface; unused.
            state: Object providing ``global_step`` for the message prefix.
            logs: Optional mapping of logged values.
        """
        step = state.global_step
        loss = None if logs is None else logs.get('loss')

        # Entries without a 'loss' value are printed in full.
        if loss is None:
            print(f"Step {step}: Logs = {logs}")
            return

        print(f"Step {step}: Loss = {loss:.3f}")

In [13]:
from transformers import Trainer, DataCollatorForLanguageModeling


# The data was already padded during tokenization, so only fill in a pad token
# when the tokenizer has none. Overwriting an existing, different pad token
# here would make the collator mask a token id other than the one that was
# actually used for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal-LM collation (mlm=False): labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    pad_to_multiple_of=8,  # lengths in multiples of 8 suit fp16 GPU kernels
    mlm=False,
    return_tensors="pt"
)

# NOTE: training_args has push_to_hub=True, so constructing the Trainer
# contacts the Hugging Face Hub and fails with HTTP 401 without a valid login.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    callbacks=[LoggingCallback()]
)


HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-679eda00-6b3ae1f80fa387a10ae937c8;a0b51fd2-efe6-4344-8a9d-51e629538908)

Invalid username or password.

In [None]:
# Start training
# Runs the fine-tuning loop with the Trainer built above; the two prints only
# mark the start and end of the run in the cell output.
print("Training started...")
trainer.train()
print("Training finished!")

In [None]:
# `model` is the PEFT-wrapped model returned by get_peft_model. The directory
# written here is the one later passed to PeftModel.from_pretrained to reload
# the LoRA adapters.
model.save_pretrained("./deepmath-l")

In [None]:
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Merge LoRA adapters: attach the saved adapters to the fresh base model, then
# fold them into the base weights.
merged_model = PeftModel.from_pretrained(base_model, "./deepmath-l")
merged_model = merged_model.merge_and_unload()

# Test inference
prompt = "Solve the following math problem with step-by-step reasoning.\n\nQuestion: What is 15% of 200?\n\n### Response:"
# Put the inputs on the device the model was actually placed on
# (device_map="auto") instead of assuming a CUDA device.
inputs = tokenizer(prompt, return_tensors="pt").to(merged_model.device)
outputs = merged_model.generate(**inputs, max_length=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Save the full merged model.
# `merged_model` is the merged copy built in the inference cell above (fresh
# base model + adapters loaded from "./deepmath-l"). The live training `model`
# is deliberately NOT merged here: merge_and_unload() modifies the PEFT model
# in place and strips its LoRA layers, which would leave no adapters to upload
# when `model.push_to_hub(...)` runs afterwards.
merged_model.save_pretrained("./deepmath-m")
tokenizer.save_pretrained("./deepmath-m")

In [None]:
from huggingface_hub import HfApi

# Push adapters
# `model` must still carry its LoRA layers when this runs (i.e. it must not
# have been merged/unloaded beforehand).
adapter_push_options = {
    "repo_id": "codewithdark/deepmath-7b-l",
    "private": False,  # False -> public repository
    "commit_message": "Added LoRA adapters for math reasoning",
}
model.push_to_hub(**adapter_push_options)

In [None]:
# Push merged model and tokenizer to the same Hub repository
merged_repo_id = "codewithdark/deepmath-7b-m"

merged_model.push_to_hub(
    repo_id=merged_repo_id,
    private=False,
    commit_message="Full merged model for math QA",
)

tokenizer.push_to_hub(
    repo_id=merged_repo_id,
    commit_message="Tokenizer for math model",
)