<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Finetune_deepseek_Essential_web_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset used in this notebook: [EssentialAI/essential-web-v1.0](https://huggingface.co/datasets/EssentialAI/essential-web-v1.0)

In [None]:
# --- 1. Set Up Your Environment ---
# Installs the fine-tuning stack into the runtime. None of the packages is
# version-pinned (and `-U` upgrades to the newest release), so re-running this
# cell at a later date may resolve to different versions.
!pip install scikit-learn -q # For potential evaluation metrics (optional)
!pip install -U transformers -q
!pip install -U datasets -q
!pip install -U accelerate -q
!pip install -U peft -q
!pip install -U trl -q # For SFTTrainer
!pip install -U bitsandbytes -q
!pip install unsloth -q # Recommended for speed and efficiency
# Unsloth is installed a second time here, from the GitHub repository, on top of
# the release installed by the previous line; --no-deps leaves the already
# installed dependencies untouched.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # For latest Unsloth

# colab_env is imported by the next cell.
!pip install colab-env -q

In [None]:
import colab_env

In [None]:
from datasets import load_dataset

# Load in streaming mode (streaming=True) instead of downloading the corpus up front.
raw_dataset = load_dataset("EssentialAI/essential-web-v1.0", streaming=True)
# Keep a handle on the "train" split; the following cells read from this stream.
data_stream = raw_dataset["train"]

In [4]:
# Iterate through examples
# Print the first five records to see which fields a streamed example carries.
for example in data_stream.take(5):
    print(example)


{'id': -3908994749044929748, 'text': "Wednesday, May 25, 2011\n\nNEVER GROW UP\n\n\n\nThese are two persons I am always happy to meet, not only because they have amazing style.\n\nThe bubbles were in front of Monki's Helsinki store last Saturday, when Monki and Weekday were celebrating their one year in Helsinki.\nI was an extra at Monki party in the evening and I apologize for spilling those two beers onto somebodys clothes!\nI think I'm tired or something, today I just fell on the ground at work very smoothly.\nI guess there wasn't a chair there...my workmates didn't even notice before I was like, well, I'm on the ground, oops.\nWell, well, I hope everyone else is paying attention, I will soon follow!\n\n\n\nWednesday, May 18, 2011\n\nSHOOK ONES\n\n\n\n\nFirst photo Karoliina Niemenkari, second Sinikka Konttinen.\n\nSorry guys for not updating or reading your great blogs!\nIt's all good in the hood, I'm interning the whole summer at a pr office that's in a really posh street in Helsi

In [5]:
# --- 4. Prepare the Training Dataset ---
print("Loading and preparing EssentialAI/essential-web-v1.0 dataset...")

# Proof-of-concept sizes: a handful of examples is enough to exercise the pipeline.
eval_set_size, train_set_size = 2, 10

# Every split below is carved out of the streaming "train" split.
raw_dataset = data_stream

# The first `eval_set_size` examples are held out for evaluation ...
eval_dataset = raw_dataset.take(eval_set_size)
# ... and the same held-out examples double as the inspection set.
test_dataset = eval_dataset

# Training data is the next `train_set_size` examples after the held-out ones.
train_dataset = raw_dataset.skip(eval_set_size).take(train_set_size)

Loading and preparing EssentialAI/essential-web-v1.0 dataset...


In [6]:
print("Dataset preparation complete.")
print("\n" + "="*70 + "\n")

# Show the examples held in test_dataset.
# test_dataset is already bounded by take(eval_set_size); the guard below mirrors
# that bound so the loop stops after exactly eval_set_size examples. (The previous
# guard, `i >= 2`, would have let a third example through.)
for i, example in enumerate(test_dataset):
    print(f"Example {i+1}: {example}")
    if i + 1 >= eval_set_size:
        break

print("\n" + "="*70 + "\n")

Dataset preparation complete.


Example 1: {'id': -3908994749044929748, 'text': "Wednesday, May 25, 2011\n\nNEVER GROW UP\n\n\n\nThese are two persons I am always happy to meet, not only because they have amazing style.\n\nThe bubbles were in front of Monki's Helsinki store last Saturday, when Monki and Weekday were celebrating their one year in Helsinki.\nI was an extra at Monki party in the evening and I apologize for spilling those two beers onto somebodys clothes!\nI think I'm tired or something, today I just fell on the ground at work very smoothly.\nI guess there wasn't a chair there...my workmates didn't even notice before I was like, well, I'm on the ground, oops.\nWell, well, I hope everyone else is paying attention, I will soon follow!\n\n\n\nWednesday, May 18, 2011\n\nSHOOK ONES\n\n\n\n\nFirst photo Karoliina Niemenkari, second Sinikka Konttinen.\n\nSorry guys for not updating or reading your great blogs!\nIt's all good in the hood, I'm interning the whole summer at a pr off

In [7]:
# Count the eval examples by iterating the bounded stream, without building a list.
print(f"Eval sample size: {sum(1 for _ in eval_dataset)}")

Eval sample size: 2


In [8]:
# Count the training examples by iterating the bounded stream, without building a list.
print(f"Train sample size: {sum(1 for _ in train_dataset)}")

Train sample size: 10


In [None]:
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import AutoTokenizer # Import AutoTokenizer just in case
import torch

# Iterate through examples
# Print the first five raw records (before any filtering or formatting) so the
# available fields are visible in the cell output.
print("Inspecting first 5 examples from the raw dataset stream:")
for i, example in enumerate(data_stream.take(5)):
    print(f"Example {i+1}: {example}")
print("\n" + "="*70 + "\n")

# %%
# --- 4. Prepare the Training Dataset ---
# Relies on raw_dataset, eval_set_size and train_set_size defined by an earlier cell.
print("Loading and preparing EssentialAI/essential-web-v1.0 dataset...")


# Helper shared by the filter and the formatter below.
def _has_value(example, key):
    """Return True when `key` is present in `example` with a value that is neither None nor ""."""
    return key in example and example[key] is not None and example[key] != ""


# Define a filter function to check for required keys and non-empty values
def filter_essential_web_example(example):
    """Decide whether a streamed example carries usable training content.

    The records printed earlier in this notebook are keyed by `id` and `text`;
    they have no `prompt`/`chosen` fields, so requiring those two keys rejected
    every record. An example is now kept when it has either:

    * a non-empty `prompt` AND a non-empty `chosen` (chat-style pair), or
    * a non-empty `text` (raw document).

    Args:
        example: Mapping holding one dataset record.

    Returns:
        bool: True if the example should be kept, False otherwise.
    """
    is_chat_pair = _has_value(example, "prompt") and _has_value(example, "chosen")
    return is_chat_pair or _has_value(example, "text")


# Define the formatting function
def format_essential_web_example(example):
    """Populate `example["text"]` with the string the trainer should consume.

    * Chat-style pairs (`prompt` + `chosen`) are rendered through the global
      `tokenizer`'s chat template. If the tokenizer has not been loaded yet, or
      the template fails, the error is printed and `text` is set to "" so the
      post-map filter can drop the example.
    * Raw documents keep their existing `text` unchanged. Previously such
      records raised a KeyError on `example["prompt"]`.
    * Examples with neither get `text` set to "".

    Args:
        example: Mapping holding one dataset record; it is modified in place.

    Returns:
        The same mapping, with a `text` entry set.
    """
    if not (_has_value(example, "prompt") and _has_value(example, "chosen")):
        # Raw-document path: nothing to template, keep the text as it is.
        example["text"] = example["text"] if _has_value(example, "text") else ""
        return example

    messages = [
        {"role": "user", "content": example["prompt"]},
        {"role": "assistant", "content": example["chosen"]}
    ]

    # Apply the chat template. Handle potential errors during template application
    try:
        # The tokenizer is a notebook-level global created when the model is loaded.
        if 'tokenizer' not in globals():
             raise ValueError("Tokenizer is not available in format_essential_web_example function scope.")

        formatted_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_special_tokens=True # Include chat template special tokens
        )
        example["text"] = formatted_text

        # Warn when the template produced nothing usable (strip() also catches whitespace-only results)
        if not example["text"].strip():
             print(f"Warning: Empty or whitespace-only 'text' generated after formatting for example with prompt: {example['prompt'][:100]}...") # Print snippet of prompt

    except Exception as e:
        # Best effort: report the problem and blank the text so the example is filtered out later.
        print(f"Error applying chat template to example: {e}")
        print(f"Messages: {messages}")
        example["text"] = ""

    return example

def _prepare_split(raw_split):
    """Drop unusable examples, format the rest, then drop any whose formatted text is empty."""
    usable = raw_split.filter(filter_essential_web_example)
    formatted = usable.map(format_essential_web_example, batched=False)
    return formatted.filter(lambda x: x.get("text", "") != "")


# Carve the raw subsets out of the stream *before* filtering and mapping, so the
# split boundaries are defined on raw examples by eval_set_size / train_set_size.
eval_dataset_raw = raw_dataset.take(eval_set_size)
train_dataset_raw = raw_dataset.skip(eval_set_size).take(train_set_size)

# Each raw subset goes through the same filter -> map -> filter pipeline.
eval_dataset = _prepare_split(eval_dataset_raw)
train_dataset = _prepare_split(train_dataset_raw)

# test_dataset is only used for inspection and is built from the eval split.
# eval_dataset_raw already holds at most eval_set_size examples, so take(10)
# cannot yield more than that.
test_dataset_raw = eval_dataset_raw.take(10)
test_dataset = _prepare_split(test_dataset_raw)


print("Dataset preparation complete.")
print("\n" + "="*70 + "\n")

# Print the formatted "text" of each example left in test_dataset after filtering
# and mapping. test_dataset is derived from the eval split, so at most
# eval_set_size examples can appear even though up to 10 are requested.
print("Inspecting first 10 examples in the filtered and formatted test dataset:")
try:
    # Use .take(10) on the streaming test_dataset
    for i, example in enumerate(test_dataset.take(10)):
        print(f"--- Test Example {i+1} ---")
        # Print the full formatted text field
        print(example.get("text", "No 'text' field found"))
        # You could also print other keys if needed, but 'text' is what the trainer uses
        # print(f"Original Prompt: {example.get('prompt', 'N/A')}")
        # print(f"Original Chosen: {example.get('chosen', 'N/A')}")
    print("\n" + "="*70 + "\n")

# Inspection is best effort: any failure is reported and the cell carries on.
except Exception as e:
     print(f"Could not iterate through test_dataset for inspection: {e}")
     print("\n" + "="*70 + "\n")


# Print eval sample size for estimation purposes
# list() iterates the whole eval stream and holds every example in memory, which
# is acceptable only because the eval set is tiny.
# NOTE(review): earlier cells iterate the same stream objects several times and
# still get results, which suggests iteration restarts rather than exhausting
# the stream — confirm against the datasets documentation.
try:
    eval_size_list = list(eval_dataset)
    print(f"Eval sample size (after filtering/mapping): {len(eval_size_list)}")
    # You can uncomment this if you need to re-inspect eval_dataset after the count
    # eval_dataset = raw_dataset.take(eval_set_size).filter(filter_essential_web_example).map(format_essential_web_example, batched=False).filter(lambda x: x.get("text", "") != "")
except Exception as e:
    # Counting is informational only; a failure is reported, not raised.
    print(f"Could not determine eval sample size: {e}")

# The training-set count is intentionally not printed here.
# print(f"Train sample size (after filtering/mapping): {len(list(train_dataset))}")


# Optional: Print a formatted example from the training set to check
print("\n--- Example of formatted training data (first element) ---")
try:
    # Take the first example *after* mapping and filtering
    # .take(1) bounds the stream to one element; next(iter(...)) pulls it out and
    # raises StopIteration when no example is left after filtering.
    first_formatted_example = next(iter(train_dataset.take(1)))
    print(first_formatted_example.get("text", "No 'text' field found"))
    print("\n" + "="*70 + "\n")

# An empty training stream is reported separately from other failures.
except StopIteration:
     print("train_dataset is empty after mapping and filtering.")
     print("\n" + "="*70 + "\n")
except Exception as e:
    print(f"Could not print example from train_dataset after mapping and filtering: {e}")
    print("\n" + "="*70 + "\n")


print("Dataset preparation complete (filtered and formatted).")

# --- 2. Load the Model and Tokenizer ---
# NOTE(review): this step runs after the dataset-preparation code above in the
# same cell, although the formatting function reads the global `tokenizer`
# created here.
print("Loading DeepSeek-R1 model and tokenizer...")
# Sequence-length budget passed to the model loader (and later to the trainer)
max_seq_length = 4096 # Adjust if your combined input/output is longer
# Use torch.bfloat16 if supported, otherwise torch.float16 (fp16)
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
load_in_4bit = True # Enable 4-bit quantization for memory efficiency


# Returns the model together with its tokenizer; both become notebook-level globals.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B", # Recommended for fine-tuning
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
print("Model and tokenizer loaded.")

# Ensure the tokenizer has a pad_token set, using eos_token if pad_token is None
# This is often needed for batching and generation
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer pad_token was None, set to eos_token: {tokenizer.pad_token}")

# --- 3. Apply LoRA Adapters ---
# Wraps the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
print("Applying LoRA adapters...")
model = FastLanguageModel.get_peft_model(
    model,
    r=16, # Rank of the LoRA matrices (common values: 8, 16, 32, 64)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # Modules that receive adapters (presumably the attention and MLP projections — verify against the model)
    lora_alpha=16, # Scaling factor for LoRA weights
    lora_dropout=0, # No dropout on the LoRA layers
    bias="none", # Or "all", "lora_only"
    use_gradient_checkpointing=True, # Recommended for memory saving
    random_state=3407, # Fixed seed; the same value is used as the TrainingArguments seed
    use_rslora=False,
    loftq_config=None,
)
print("LoRA adapters applied.")

print("\n" + "="*70 + "\n")


In [None]:
# --- 5. Set Up and Configure the Trainer ---
print("Setting up SFTTrainer...")
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from datasets import load_dataset # Import load_dataset here

# Re-load the dataset stream specifically for the trainer, so the trainer gets
# its own stream object, independent of the ones inspected in earlier cells.
print("Re-loading dataset streams for SFTTrainer...")
# Load the "train" split from the start, again in streaming mode
raw_dataset_full_stream = load_dataset("EssentialAI/essential-web-v1.0", streaming=True)["train"]

In [None]:
import torch
# Helper shared by the filter and the formatter below.
def _has_value(example, key):
    """Return True when `key` is present in `example` with a value that is neither None nor ""."""
    return key in example and example[key] is not None and example[key] != ""


# Define the filter function
def filter_essential_web_example(example):
    """Decide whether a streamed example carries usable training content.

    The records printed earlier in this notebook are keyed by `id` and `text`;
    they have no `prompt`/`chosen` fields. Requiring those two keys rejected
    every record, and because this filter runs on the unbounded stream before
    `take`, nothing was ever yielded downstream. An example is now kept when it
    has either a non-empty `prompt` AND `chosen` (chat-style pair) or a
    non-empty `text` (raw document).

    Args:
        example: Mapping holding one dataset record.

    Returns:
        bool: True if the example should be kept, False otherwise.
    """
    is_chat_pair = _has_value(example, "prompt") and _has_value(example, "chosen")
    return is_chat_pair or _has_value(example, "text")


# Define the formatting function (reads the notebook-level `tokenizer` created when the model was loaded)
def format_essential_web_example(example):
    """Populate `example["text"]` with the string the trainer should consume.

    * Chat-style pairs (`prompt` + `chosen`) are rendered through the global
      `tokenizer`'s chat template. On failure, or when the result is empty or
      whitespace-only, `text` is set to "" so the post-map filter drops it.
    * Raw documents keep their existing `text` unchanged. Previously such
      records raised a KeyError on `example["prompt"]`.
    * Examples with neither get `text` set to "".

    Template failures are reported through `warnings.warn` instead of being
    swallowed silently. With Python's default warning filter, a repeated
    message is shown once, not once per example.

    Args:
        example: Mapping holding one dataset record; it is modified in place.

    Returns:
        The same mapping, with a `text` entry set.
    """
    import warnings  # local import so this cell does not depend on another cell's imports

    if not (_has_value(example, "prompt") and _has_value(example, "chosen")):
        # Raw-document path: nothing to template, keep the text as it is.
        example["text"] = example["text"] if _has_value(example, "text") else ""
        return example

    messages = [
        {"role": "user", "content": example["prompt"]},
        {"role": "assistant", "content": example["chosen"]}
    ]
    try:
        # Ensure tokenizer is available. If not, raise so the handler below blanks the text.
        if 'tokenizer' not in globals():
             raise ValueError("Tokenizer is not available in format_essential_web_example function scope.")
        formatted_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_special_tokens=True
        )
        example["text"] = formatted_text
        if not example["text"].strip():
             example["text"] = "" # Normalise whitespace-only output so the later filter drops it
    except Exception as e:
        # Best effort: surface the problem, then blank the text so the example is filtered out.
        warnings.warn(f"Error applying chat template to example: {e}")
        example["text"] = ""
    return example


# Build the processing pipeline on the raw stream first:
# keep usable examples -> format them -> drop anything whose text came out empty.
processed_dataset_stream = (
    raw_dataset_full_stream
    .filter(filter_essential_web_example)
    .map(format_essential_web_example, batched=False)
    .filter(lambda x: x.get("text", "") != "")
)

# Then split the processed stream with take/skip, which are sequential on a
# streaming dataset: train skips the first eval_set_size processed examples and
# takes the next train_set_size, while eval takes those first eval_set_size.
train_dataset_for_trainer = processed_dataset_stream.skip(eval_set_size).take(train_set_size)
eval_dataset_for_trainer = processed_dataset_stream.take(eval_set_size)

print("Dataset streams re-loaded and prepared for trainer.")

# Define TrainingArguments - ADJUSTED FOR POC
# TUNE THESE PARAMETERS FOR SPEED AND PERFORMANCE
per_device_train_batch_size = 4  # Increase if GPU memory allows; start small (1 or 2) and grow gradually.
gradient_accumulation_steps = 2  # Effective batch = per-device batch x accumulation steps.

# The training split is a streaming dataset without a length, so the Trainer
# cannot turn `num_train_epochs` into a number of optimizer steps; it requires
# `max_steps` to be given explicitly. One pass over `train_set_size` examples
# takes ceil(train_set_size / effective batch) steps.
# NOTE(review): assumes a single device — divide further for multi-GPU runs.
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
max_train_steps = max(1, -(-train_set_size // effective_batch_size))  # ceiling division, at least 1 step

training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=5, # Shorter warmup is fine for POC.
    max_steps=max_train_steps, # Explicit step budget; needed because the dataset is streamed.
    num_train_epochs=1, # Kept for reference; a positive max_steps takes precedence.
    learning_rate=2e-4, # Standard LoRA LR. Adjusting this won't directly affect step speed much.
    fp16= (dtype == torch.float16), # Mixed precision follows the dtype chosen at model load time.
    bf16= (dtype == torch.bfloat16), # bf16 is generally preferred if supported.
    logging_steps=50, # Logging adds a small overhead. Increase for less frequent logging.
    output_dir="./sft_results", # Saving output directory.
    optim="adamw_8bit", # Recommended with 4-bit quantization.
    seed=3407,
    save_steps=10000, # High value avoids frequent checkpointing during the run.
    save_total_limit=1, # Keep few saves.
    eval_strategy="steps",
    eval_steps=10000, # High value avoids frequent evaluation, which takes time.
    load_best_model_at_end=False, # No need to load best model for speed.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none", # Disables reporting overhead.
)

import time
# Build the trainer and time how long construction takes.
# NOTE(review): the recorded output of this cell ends right after the
# "Starting SFTTrainer initialization..." line below, i.e. the completion
# message was never printed in the saved run.
print("Starting SFTTrainer initialization...")
start_time = time.time()

# NOTE(review): `tokenizer`, `dataset_text_field` and `max_seq_length` are passed
# directly to SFTTrainer; confirm the installed TRL release still accepts them here.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # Pass the streams built in this cell from the re-loaded dataset
    train_dataset=train_dataset_for_trainer,
    eval_dataset=eval_dataset_for_trainer, # Use eval_dataset for evaluation during training
    dataset_text_field="text", # Use the formatted 'text' field
    max_seq_length=max_seq_length, # Ensure this is appropriate for your data and model's context window.
                                   # Longer sequences require more memory and computation.
    args=training_args, # Pass the defined TrainingArguments object
)

end_time = time.time()
print(f"SFTTrainer initialization complete. Time taken: {end_time - start_time:.2f} seconds.")
print("SFTTrainer configured.")

# --- 6. Start Training ---
# Runs the training loop. Failures are reported rather than re-raised, so
# `train_result` is only defined when trainer.train() returns normally.
print("Starting training...")
try:
    # Removed the manual check
    # Start the training process
    # **The actual training time is determined by the dataset size, training arguments, and hardware.**
    train_result = trainer.train()
    print("Training complete.")
# A StopIteration here is treated as "the training stream yielded nothing".
except StopIteration:
    print("Training dataset is empty after filtering and mapping. Cannot start training.")
except Exception as e:
    print(f"An error occurred during training: {e}")
    # Print the full traceback for the caught exception
    import traceback
    traceback.print_exc()

Dataset streams re-loaded and prepared for trainer.
Starting SFTTrainer initialization...
