<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/openr1_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is based on the Hugging Face Open-R1 project: https://github.com/huggingface/open-r1

In [None]:
!pip install transformers --upgrade -q # Ensure transformers library is up-to-date
!pip install --upgrade accelerate -q
!pip install bitsandbytes --quiet # Install bitsandbytes library
!pip install peft --quiet         # Install PEFT library
!pip install datasets --quiet
!pip install colab-env --quiet

In [3]:
# Show the attached GPU, the driver/CUDA versions and the current GPU memory usage.
!nvidia-smi

Sun Jan 26 11:22:49 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              49W / 400W |   6691MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [42]:
!pip install colab-env --quiet

import warnings

warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")


import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

login(
  token=access_token_write,
  add_to_git_credential=True
)

In [None]:
# Install condacolab; the next cell relies on the `conda` command being available.
!pip install -q condacolab
import condacolab
condacolab.install()
# Restart runtime here
# NOTE(review): condacolab.install() is documented to restart the kernel on its
# own — confirm before restarting manually. Either way, Python state created by
# earlier cells is gone after the restart, so those cells must be re-run.

In [None]:
# Update Conda (optional but recommended)
!conda update -n base -c defaults conda

# Create and activate conda environment
!conda create -n openr1 python=3.11 -y
!conda activate openr1

# Clone the Open-R1 repository:
!git clone https://github.com/huggingface/open-r1.git

# Change to project directory
%cd /content/open-r1

# Install necessary packages
!pip install -e ".[dev]"
!pip install vllm==0.6.6.post1 -q
!pip install vllm==0.6.6.post1 --extra-index-url https://download.pytorch.org/whl/cu121 -q


# Unset WANDB_DISABLED if it exists
import os
if 'WANDB_DISABLED' in os.environ:
    del os.environ['WANDB_DISABLED']


In [22]:
def _flatten_conversation(conv):
    """Join the "content" of every message in ``conv`` with single spaces.

    Messages that are falsy or have no truthy "content" contribute an empty
    string (so they still produce a separator). Returns "" when ``conv`` is
    empty/None or when no message carries any content.
    """
    if not conv or not any(msg and msg.get("content") for msg in conv):
        return ""
    return " ".join(
        str(msg["content"]) if msg and msg.get("content") else ""
        for msg in conv
    )


def preprocess_function(examples):
    """Flatten each conversation to one string and tokenize the whole batch.

    Written for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with a "messages" entry holding one
            conversation per example; each conversation is a list of message
            dicts (entries may be ``None``) with an optional "content" value.

    Returns:
        The result of calling the module-level ``tokenizer`` on the flattened
        texts with ``padding="max_length"``, ``truncation=True``,
        ``max_length=512`` and ``return_tensors="pt"``.

    Note:
        ``tokenizer`` is read from the enclosing (notebook) namespace and must
        be assigned before this function is called.
    """
    # str.join always yields a str, so no per-item type check is required.
    all_texts = [_flatten_conversation(conv) for conv in examples["messages"]]

    # input_ids / attention_mask are model inputs, not trainable parameters, so
    # nothing here needs requires_grad.
    return tokenizer(
        all_texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

TRAIN-DATASET

In [34]:
# Imported here so the cell also works on a fresh kernel: no earlier cell
# imports these names.
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. Create a smaller dataset (first 1000 examples of the train split)
dataset = load_dataset("HuggingFaceH4/Bespoke-Stratos-17k")
smaller_dataset = dataset["train"].select(range(1000))

# 2. Load the tokenizer. preprocess_function reads this module-level
#    `tokenizer`, so it has to be assigned before .map() runs.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B-Instruct")

# 3. Apply preprocessing to smaller_dataset
processed_dataset = smaller_dataset.map(
    preprocess_function,
    batched=True,
    # Drop every original column except 'system', 'conversations' and 'messages'.
    remove_columns=[col for col in smaller_dataset.column_names if col not in ['system', 'conversations', 'messages']],
    num_proc=1,  # Adjust if needed
    load_from_cache_file=False  # Always recompute instead of reusing a cached result
)

# 4. Save the preprocessed dataset (loaded again by the TRAINER cell)
processed_dataset.save_to_disk("/content/gdrive/MyDrive/datasets/preprocessed_smaller_dataset")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

EVAL-DATASET

In [37]:
from datasets import load_dataset

# Original columns that survive preprocessing; everything else is dropped.
columns_to_keep = ('system', 'conversations', 'messages')

# Step 1: fetch the source dataset.
dataset = load_dataset("HuggingFaceH4/Bespoke-Stratos-17k")

# Step 2: hold out train rows 1000..1199 (the 200 rows following the first
# 1000) as the evaluation set.
eval_dataset = dataset["train"].select(range(1000, 1200))

# Step 3: tokenize with the same preprocess_function used for training data.
columns_to_drop = [name for name in eval_dataset.column_names if name not in columns_to_keep]
processed_eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_drop,
    num_proc=1,
    load_from_cache_file=False,
)

# Step 4: persist the result (loaded again by the TRAINER cell).
processed_eval_dataset.save_to_disk("/content/gdrive/MyDrive/datasets/preprocessed_eval_dataset")

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

TRAINER

In [45]:
from transformers import TrainingArguments, BitsAndBytesConfig
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

import colab_env
import torch

%cd /content/open-r1/src/open_r1/
import sft


# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-Math-1.5B-Instruct"
device = "cuda" # the device to load the model onto

# Quantization config for 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True, #This is required to load Qwen models.
    quantization_config=bnb_config # Apply 4-bit quantization
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the dataset
train_dataset = load_from_disk("/content/gdrive/MyDrive/datasets/preprocessed_smaller_dataset")

# Convert the dataset to PyTorch tensors but no need to ensure requires_grad
# input_ids and attention_mask are not supposed to have gradients
train_dataset = train_dataset.with_format("torch")


# Load the preprocessed evaluation dataset
eval_dataset = load_from_disk("/content/gdrive/MyDrive/datasets/preprocessed_eval_dataset")


eval_dataset = eval_dataset.with_format("torch")

training_args = TrainingArguments(
    output_dir="/content/open-r1/data/Qwen2.5-1.5B-Open-R1-Distill",
    report_to="none",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,  # Added
    gradient_accumulation_steps=2,  # Changed to 2
    learning_rate=1e-5,  # Reduced learning rate (optional)
    num_train_epochs=3,
    gradient_checkpointing=True,
    fp16=False,  # Disable fp16/bf16 mixed precision
    logging_steps=5,  # Added
    eval_strategy="steps", # Added
    eval_steps=100,  # Added
)


# Apply PEFT to the model
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj"
    ],  # Reverted to your original target modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


# Initialize the trainer with training parameters and the model
trainer = sft.SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Pass the eval_dataset
    args=training_args,
)

# Before starting training
for name, param in model.named_parameters():
    if param.dtype == torch.bfloat16:  # Check if parameter is in bfloat16
        param.data = param.data.type(torch.float32)  # Convert to float32
        param.requires_grad = True

# Start the training
trainer.train()

/content/open-r1/src/open_r1
trainable params: 2,179,072 || all params: 1,545,893,376 || trainable%: 0.1410




Step,Training Loss,Validation Loss
100,1.2455,1.461247
200,1.423,1.334779
300,1.8676,1.254277
400,1.1744,1.201787
500,1.3364,1.161853
600,1.2464,1.132646
700,0.9503,1.110456
800,0.7486,1.094128
900,0.8232,1.081746
1000,0.7085,1.072135


TrainOutput(global_step=1500, training_loss=1.12103054300944, metrics={'train_runtime': 1438.4586, 'train_samples_per_second': 2.086, 'train_steps_per_second': 1.043, 'total_flos': 1.209618137088e+16, 'train_loss': 1.12103054300944, 'epoch': 3.0})

In [None]:
# Print the configuration of `model` (assigned in the TRAINER cell).
print(model.config)

In [2]:
# Check that Git LFS is available by printing its version (this does not install it).
!git-lfs --version

git-lfs/3.0.2 (GitHub; linux amd64; go 1.18.1)


In [None]:
!pip install datasets --quiet

OPEN-R1 FRAMEWORK

In [None]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer # Needed for the tokenizer loaded in step 3


#from google.colab import drive

# 1. Mount Google Drive (if not already mounted) — currently disabled
#drive.mount('/content/gdrive')

# 2. Create a smaller dataset (first 10 examples of the train split)
dataset = load_dataset("HuggingFaceH4/Bespoke-Stratos-17k")
smaller_dataset = dataset["train"].select(range(10))

# 3. Load the tokenizer (preprocess_function reads this module-level `tokenizer`)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B-Instruct")  # Load your tokenizer

# 4. Apply preprocessing to smaller_dataset
#    (preprocess_function must already be defined by its cell above)
processed_dataset = smaller_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=[col for col in smaller_dataset.column_names if col not in ['system', 'conversations', 'messages']],  # Remove original columns except the ones you need
    num_proc=1,  # Adjust if needed
    load_from_cache_file=False  # Disable cache if needed
)



# 5. Save the preprocessed dataset
# NOTE(review): this is the same path the TRAIN-DATASET cell writes its
# 1000-example dataset to, so running this cell replaces it with the
# 10-example subset — confirm that is intended.
processed_dataset.save_to_disk("/content/gdrive/MyDrive/datasets/preprocessed_smaller_dataset")  # Choose a save path


# 6. Run training with the saved dataset path
# NOTE(review): evaluation is requested (--eval_strategy steps) but only a single
# dataset without a separate evaluation split is saved above — verify that
# sft.py can load this directory and find an evaluation split.
!accelerate launch --config_file=/content/open-r1/configs/zero3.yaml --num_processes=1 /content/open-r1/src/open_r1/sft.py \
    --report_to none \
    --model_name_or_path Qwen/Qwen2.5-Math-1.5B-Instruct \
    --dataset_name /content/gdrive/MyDrive/datasets/preprocessed_smaller_dataset \
    --output_dir /content/open-r1/data/Qwen2.5-1.5B-Open-R1-Distill \
    --learning_rate 2.0e-5 \
    --num_train_epochs 1 \
    --packing \
    --max_seq_length 4096 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --gradient_checkpointing \
    --bf16 \
    --logging_steps 5 \
    --eval_strategy steps \
    --eval_steps 100

In [None]:
# Switch into the directory that holds open-r1's sft.py and import it as a module
# (presumably relying on the working directory being on the import path).
# The TRAINER cell performs the same two steps.
%cd /content/open-r1/src/open_r1/
import sft