In [None]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import json
from typing import Optional, Dict, Any

if torch.cuda.is_available():
    device = "cuda"
    print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "mps"
    print("Using Apple MPS")
else:
    device = "cpu"
    print("Using CPU - you will need to use a GPU to train models")

# Authenticate with Hugging Face (optional, for private models)
from huggingface_hub import login
login()  # Uncomment if you need to access private models

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x155545317c70>>
Traceback (most recent call last):
  File "/home/hice1/dk305/scratch/envs/ml/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [2]:
# Load both base and instruct models for comparison
base_model_name = "HuggingFaceTB/SmolLM3-3B-Base"
instruct_model_name = "HuggingFaceTB/SmolLM3-3B"


# Load tokenizers
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
instruct_tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)


# Load models (use smaller precision for memory efficiency)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    dtype=torch.bfloat16,
    device_map="auto"
)

instruct_model = AutoModelForCausalLM.from_pretrained(
    instruct_model_name,
    dtype=torch.bfloat16,
    device_map="auto"
)

print("Models loaded successfully!")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/182 [00:00<?, ?B/s]

Models loaded successfully!


In [4]:
# Create different types of conversations to test
conversations = {
    "simple_qa": [
        {"role": "user", "content": "What is machine learning?"},
    ],
    
    "with_system": [
        {"role": "system", "content": "You are a helpful AI assistant specialized in explaining technical concepts clearly."},
        {"role": "user", "content": "What is machine learning?"},
    ],
    
    "multi_turn": [
        {"role": "system", "content": "You are a math tutor."},
        {"role": "user", "content": "What is calculus?"},
        {"role": "assistant", "content": "Calculus is a branch of mathematics that deals with rates of change and accumulation of quantities."},
        {"role": "user", "content": "Can you give me a simple example?"},
    ],
    
    "reasoning_task": [
        {"role": "user", "content": "Solve step by step: If a train travels 120 miles in 2 hours, what is its average speed?"},
    ]
}

for conv_type, messages in conversations.items():
    print(f"--- {conv_type.upper()} ---")
    
    # Format without generation prompt (for completed conversations)
    formatted_complete = instruct_tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=False
    )
    
    # Format with generation prompt (for inference)
    formatted_prompt = instruct_tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    print("Complete conversation format:")
    print(formatted_complete)
    print("\nWith generation prompt:")
    print(formatted_prompt)
    print("\n" + "="*50 + "\n")

--- SIMPLE_QA ---
Complete conversation format:
<|im_start|>system
## Metadata

Knowledge Cutoff Date: June 2025
Today Date: 26 November 2025
Reasoning Mode: /think

## Custom Instructions

You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current 

In [6]:
test_prompt = "Explain quantum computing in simple terms."
base_inputs = base_tokenizer(test_prompt, return_tensors="pt").to(device)

# Prepare the prompt for instruct model (with chat template)
instruct_messages = [{"role": "user", "content": test_prompt}]
instruct_formatted = instruct_tokenizer.apply_chat_template(
    instruct_messages, 
    tokenize=False, 
    add_generation_prompt=True
)
instruct_inputs = instruct_tokenizer(instruct_formatted, return_tensors="pt").to(device)


In [10]:
instruct_inputs.input_ids.shape

torch.Size([1, 256])

In [19]:
# Generate responses
print("=== Model comparison ===\n")

print("ðŸ¤– BASE MODEL RESPONSE:")
with torch.no_grad():
    base_outputs = base_model.generate(
        **base_inputs,
        max_new_tokens=150,
        temperature=0.7,
        do_sample=True,
        pad_token_id=base_tokenizer.eos_token_id
    )
    base_response = base_tokenizer.decode(base_outputs[0], skip_special_tokens=True)
    print(base_response[len(test_prompt):])  # Show only the generated part


print("\n" + "="*50)
print("Instruct model response:")
with torch.no_grad():
    instruct_outputs = instruct_model.generate(
        **instruct_inputs,
        max_new_tokens=150,
        temperature=0.7,
        do_sample=True,
        pad_token_id=instruct_tokenizer.eos_token_id
    )
    instruct_response = instruct_tokenizer.decode(instruct_outputs[0], skip_special_tokens=True)
    # Extract only the assistant's response
    assistant_start = instruct_response.find("assistant\n") + len("assistant\n")
    assistant_response = instruct_response[assistant_start:].split("<|im_end|>")[0]
    print(assistant_response)

=== Model comparison ===

ðŸ¤– BASE MODEL RESPONSE:
 A 10-year-old could understand it. For example, say that 1 quantum bit (qubit) is equivalent to 1 bit. So, you have 8 qubits (or 8 bits) that can give you 2^8, or 256 different values. This is a huge increase in power. One qubit is equivalent to 2 bits, and 2 qubits are equivalent to 4 bits, so 8 qubits are equivalent to 256 bits. This is why quantum computers can solve complex problems that classical computers cannot.
In this video, I will explain the basics of quantum computing. Quantum computing is still in its infancy, but it has the potential to revolutionize the way we store and process information

Instruct model response:
<think>
Okay, so I need to explain quantum computing in simple terms. Let me start by recalling what I know about classical computing first. Classical computers use bits, right? They're either 0 or 1, and they process information using logic gates like AND, OR, NOT. They're good for things like checking pass

In [20]:
# Test SmolLM3's reasoning capabilities
reasoning_prompts = [
    "What is 15 Ã— 24? Show your work.",
    "A recipe calls for 2 cups of flour for 12 cookies. How much flour is needed for 30 cookies?",
    "If I have $50 and spend $18.75 on lunch and $12.30 on a book, how much money do I have left?"
]

print("=== TESTING REASONING CAPABILITIES ===\n")

for i, prompt in enumerate(reasoning_prompts, 1):
    print(f"Problem {i}: {prompt}")
    
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = instruct_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = instruct_tokenizer(formatted_prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = instruct_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.3,  # Lower temperature for more consistent reasoning
            do_sample=True,
            pad_token_id=instruct_tokenizer.eos_token_id
        )
        response = instruct_tokenizer.decode(outputs[0], skip_special_tokens=True)
        assistant_start = response.find("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")
        assistant_response = response[assistant_start:].split("<|im_end|>")[0]
        print(f"Answer: {assistant_response}")
    
    print("\n" + "-"*50 + "\n")

=== TESTING REASONING CAPABILITIES ===

Problem 1: What is 15 Ã— 24? Show your work.
Answer: nowledge Cutoff Date: June 2025
Today Date: 26 November 2025
Reasoning Mode: /think

## Custom Instructions

You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of 

In [17]:
base_response

'Explain quantum computing in simple terms. What are the advantages and disadvantages of quantum computing? What are the various types of quantum computers? How is it different from classical computing? Explain the basic concepts of quantum computing and its applications with examples.\nQuantum computing is a computing paradigm that utilizes quantum-mechanical phenomena, such as superposition and entanglement, to process information. This is in contrast to classical computing, which relies on binary digits, or bits, to process information.\nThe advantages of quantum computing include:\nâ€¢ Exponential speedup over classical computing for certain problems, such as factoring large numbers and simulating quantum systems.\nâ€¢ Ability to solve complex problems that are currently intractable with classical computing.\nThe disadvantages of quantum computing include:\nâ€¢ High error rates and difficulty in'

In [21]:
# Load and explore the SmolTalk2 dataset
print("=== EXPLORING SMOLTALK2 DATASET ===\n")
# Load the SFT subset
dataset_dict = load_dataset("HuggingFaceTB/smoltalk2", "SFT")
print(f"Total splits: {len(dataset_dict)}")
print(f"Available splits: {list(dataset_dict.keys())}")
print(f"Number of total rows: {sum([dataset_dict[d].num_rows for d in dataset_dict])}")
print(f"Dataset structure: {dataset_dict}")

=== EXPLORING SMOLTALK2 DATASET ===



README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/124 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/113 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/113 [00:00<?, ?it/s]

SFT/LongAlign_64k_Qwen3_32B_yarn_131k_th(â€¦):   0%|          | 0.00/135M [00:00<?, ?B/s]

SFT/LongAlign_64k_Qwen3_32B_yarn_131k_th(â€¦):   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/113 [00:00<?, ?files/s]

SFT/OpenThoughts3_1.2M_think-00000-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00001-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00002-of-00(â€¦):   0%|          | 0.00/288M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00003-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00004-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00005-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00006-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00007-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00008-of-00(â€¦):   0%|          | 0.00/288M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00009-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00010-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00011-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00012-of-00(â€¦):   0%|          | 0.00/286M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00013-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00014-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00015-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00016-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00017-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00018-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00019-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00020-of-00(â€¦):   0%|          | 0.00/287M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00021-of-00(â€¦):   0%|          | 0.00/288M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00022-of-00(â€¦):   0%|          | 0.00/282M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00023-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00024-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00025-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00026-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00027-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00028-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00029-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00030-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00031-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00032-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00033-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00034-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00035-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00036-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00037-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00038-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00039-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00040-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00041-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00042-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00043-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00044-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00045-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00046-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00047-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00048-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00049-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00050-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00051-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00052-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00053-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00054-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00055-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00056-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00057-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00058-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00059-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00060-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00061-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00062-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00063-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00064-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00065-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00066-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00067-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00068-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00069-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00070-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00071-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00072-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00073-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00074-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00075-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00076-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00077-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00078-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00079-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00080-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00081-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00082-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00083-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00084-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00085-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00086-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00087-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00088-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00089-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00090-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00091-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00092-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00093-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00094-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00095-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00096-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00097-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00098-of-00(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00099-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00100-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00101-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00102-of-00(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00103-of-00(â€¦):   0%|          | 0.00/158M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00104-of-00(â€¦):   0%|          | 0.00/150M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00105-of-00(â€¦):   0%|          | 0.00/149M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00106-of-00(â€¦):   0%|          | 0.00/150M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00107-of-00(â€¦):   0%|          | 0.00/150M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00108-of-00(â€¦):   0%|          | 0.00/150M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00109-of-00(â€¦):   0%|          | 0.00/152M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00110-of-00(â€¦):   0%|          | 0.00/150M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00111-of-00(â€¦):   0%|          | 0.00/151M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_think-00112-of-00(â€¦):   0%|          | 0.00/150M [00:00<?, ?B/s]

SFT/aya_dataset_Qwen3_32B_think-00000-of(â€¦):   0%|          | 0.00/32.6M [00:00<?, ?B/s]

SFT/multi_turn_reasoning_if_think-00000-(â€¦):   0%|          | 0.00/178M [00:00<?, ?B/s]

SFT/s1k_1.1_think-00000-of-00001.parquet:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

SFT/smolagents_toolcalling_traces_think-(â€¦):   0%|          | 0.00/81.8M [00:00<?, ?B/s]

SFT/smoltalk_everyday_convs_reasoning_Qw(â€¦):   0%|          | 0.00/6.33M [00:00<?, ?B/s]

SFT/smoltalk_multilingual8_Qwen3_32B_thi(â€¦):   0%|          | 0.00/264M [00:00<?, ?B/s]

SFT/smoltalk_multilingual8_Qwen3_32B_thi(â€¦):   0%|          | 0.00/265M [00:00<?, ?B/s]

SFT/smoltalk_multilingual8_Qwen3_32B_thi(â€¦):   0%|          | 0.00/265M [00:00<?, ?B/s]

SFT/smoltalk_multilingual8_Qwen3_32B_thi(â€¦):   0%|          | 0.00/264M [00:00<?, ?B/s]

SFT/smoltalk_systemchats_Qwen3_32B_think(â€¦):   0%|          | 0.00/64.9M [00:00<?, ?B/s]

SFT/table_gpt_Qwen3_32B_think-00000-of-0(â€¦):   0%|          | 0.00/32.9M [00:00<?, ?B/s]

SFT/LongAlign_64k_context_lang_annotated(â€¦):   0%|          | 0.00/199M [00:00<?, ?B/s]

SFT/Mixture_of_Thoughts_science_no_think(â€¦):   0%|          | 0.00/63.5M [00:00<?, ?B/s]

SFT/OpenHermes_2.5_no_think-00000-of-000(â€¦):   0%|          | 0.00/164M [00:00<?, ?B/s]

SFT/OpenHermes_2.5_no_think-00001-of-000(â€¦):   0%|          | 0.00/159M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_no_think_no_think(â€¦):   0%|          | 0.00/245M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_no_think_no_think(â€¦):   0%|          | 0.00/121M [00:00<?, ?B/s]

SFT/OpenThoughts3_1.2M_no_think_no_think(â€¦):   0%|          | 0.00/218M [00:00<?, ?B/s]

SFT/hermes_function_calling_v1_no_think-(â€¦):   0%|          | 0.00/10.8M [00:00<?, ?B/s]

SFT/smoltalk_multilingual_8languages_lan(â€¦):   0%|          | 0.00/158M [00:00<?, ?B/s]

SFT/smoltalk_multilingual_8languages_lan(â€¦):   0%|          | 0.00/159M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_everyday_conversati(â€¦):   0%|          | 0.00/899k [00:00<?, ?B/s]

SFT/smoltalk_smollm3_explore_instruct_re(â€¦):   0%|          | 0.00/5.34M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_magpie_ultra_n(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_magpie_ultra_n(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_magpie_ultra_n(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_magpie_ultra_n(â€¦):   0%|          | 0.00/230M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_magpie_ultra_n(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_magpie_ultra_n(â€¦):   0%|          | 0.00/231M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_rewrite_no_thi(â€¦):   0%|          | 0.00/38.5M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_smol_summarize_no_t(â€¦):   0%|          | 0.00/117M [00:00<?, ?B/s]

SFT/smoltalk_smollm3_systemchats_30k_no_(â€¦):   0%|          | 0.00/47.2M [00:00<?, ?B/s]

SFT/table_gpt_no_think-00000-of-00001.pa(â€¦):   0%|          | 0.00/12.6M [00:00<?, ?B/s]

SFT/tulu_3_sft_personas_instruction_foll(â€¦):   0%|          | 0.00/33.2M [00:00<?, ?B/s]

SFT/xlam_traces_no_think-00000-of-00001.(â€¦):   0%|          | 0.00/30.6M [00:00<?, ?B/s]

Generating LongAlign_64k_Qwen3_32B_yarn_131k_think split:   0%|          | 0/7526 [00:00<?, ? examples/s]

Generating OpenThoughts3_1.2M_think split:   0%|          | 0/1133524 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [22]:
# Function to process different dataset formats
def process_qa_dataset(examples, question_col, answer_col):
    """Process Q&A datasets into chat format"""
    processed = []
    
    for question, answer in zip(examples[question_col], examples[answer_col]):
        messages = [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
        processed.append(messages)
    
    return {"messages": processed}

def process_instruction_dataset(examples):
    """Process instruction-following datasets"""
    processed = []
    
    for instruction, response in zip(examples["instruction"], examples["response"]):
        messages = [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": response}
        ]
        processed.append(messages)
    
    return {"messages": processed}

# Example: Process GSM8K math dataset
print("=== PROCESSING GSM8K DATASET ===\n")

gsm8k = load_dataset("openai/gsm8k", "main", split="train[:100]")  # Small subset for demo
print(f"Original GSM8K example: {gsm8k[0]}")

# Convert to chat format
def process_gsm8k(examples):
    processed = []
    for question, answer in zip(examples["question"], examples["answer"]):
        messages = [
            {"role": "system", "content": "You are a math tutor. Solve problems step by step."},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
        processed.append(messages)
    return {"messages": processed}

gsm8k_processed = gsm8k.map(process_gsm8k, batched=True, remove_columns=gsm8k.column_names)
print(f"Processed example: {gsm8k_processed[0]}")

=== PROCESSING GSM8K DATASET ===



README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Original GSM8K example: {'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Processed example: {'messages': [{'content': 'You are a math tutor. Solve problems step by step.', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}, {'content': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72', 'role': 'assistant'}]}


In [23]:
# Function to apply chat templates to processed datasets
def apply_chat_template_to_dataset(dataset, tokenizer):
    """Apply chat template to dataset for training"""
    
    def format_messages(examples):
        formatted_texts = []
        
        for messages in examples["messages"]:
            # Apply chat template
            formatted_text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False  # We want the complete conversation
            )
            formatted_texts.append(formatted_text)
        
        return {"text": formatted_texts}
    
    return dataset.map(format_messages, batched=True)

# Apply to our processed GSM8K dataset
gsm8k_formatted = apply_chat_template_to_dataset(gsm8k_processed, instruct_tokenizer)
print("=== FORMATTED TRAINING DATA ===")
print(gsm8k_formatted[0]["text"])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

=== FORMATTED TRAINING DATA ===
<|im_start|>system
## Metadata

Knowledge Cutoff Date: June 2025
Today Date: 26 November 2025
Reasoning Mode: /think

## Custom Instructions

You are a math tutor. Solve problems step by step.

<|im_start|>user
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72<|im_end|>



### Finetuning

In [26]:
# Import required libraries for fine-tuning
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch
import wandb  # Optional: for experiment tracking

# Initialize Weights & Biases (optional)
# wandb.init(project="smollm3-finetuning")

# Load SmolLM3 base model for fine-tuning
model_name = "HuggingFaceTB/SmolLM3-3B-Base"
new_model_name = "SmolLM3-Custom-SFT"

print(f"Loading {model_name}...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
tokenizer.padding_side = "right"  # Padding on the right for generation

print(f"Model loaded! Parameters: {model.num_parameters():,}")

Loading HuggingFaceTB/SmolLM3-3B-Base...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded! Parameters: 3,075,098,624


In [27]:
# Load and prepare training dataset
print("=== PREPARING DATASET ===\n")

# Option 1: Use SmolTalk2 (recommended for beginners)
dataset = load_dataset("HuggingFaceTB/smoltalk2", "SFT")
train_dataset = dataset["smoltalk_everyday_convs_reasoning_Qwen3_32B_think"].select(range(1000))  # Use subset for faster training

# Option 2: Use your own processed dataset from Exercise 2
# train_dataset = gsm8k_formatted.select(range(500))

print(f"Training examples: {len(train_dataset)}")
print(f"Example: {train_dataset[0]}")

# Prepare the dataset for SFT
def format_chat_template(example):
    """Format the messages using the chat template"""
    if "messages" in example:
        # SmolTalk2 format
        messages = example["messages"]
    else:
        # Custom format - adapt as needed
        messages = [
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["response"]}
        ]
    
    # Apply chat template
    text = instruct_tokenizer.apply_chat_template(
        messages, 
        tokenize=False,
        add_generation_prompt=False
    )
    return {"text": text}

# Apply formatting
formatted_dataset = train_dataset.map(format_chat_template)
formatted_dataset = formatted_dataset.remove_columns(
    [col for col in formatted_dataset.column_names if col != "text"]
)
print(f"Formatted example: {formatted_dataset[0]['text'][:200]}...")

=== PREPARING DATASET ===



Resolving data files:   0%|          | 0/124 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/113 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/113 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/113 [00:00<?, ?files/s]

Generating LongAlign_64k_Qwen3_32B_yarn_131k_think split:   0%|          | 0/7526 [00:00<?, ? examples/s]

Generating OpenThoughts3_1.2M_think split:   0%|          | 0/1133524 [00:00<?, ? examples/s]

Generating aya_dataset_Qwen3_32B_think split:   0%|          | 0/15222 [00:00<?, ? examples/s]

Generating multi_turn_reasoning_if_think split:   0%|          | 0/28217 [00:00<?, ? examples/s]

Generating s1k_1.1_think split:   0%|          | 0/835 [00:00<?, ? examples/s]

Generating smolagents_toolcalling_traces_think split:   0%|          | 0/9079 [00:00<?, ? examples/s]

Generating smoltalk_everyday_convs_reasoning_Qwen3_32B_think split:   0%|          | 0/2057 [00:00<?, ? examplâ€¦

Generating smoltalk_multilingual8_Qwen3_32B_think split:   0%|          | 0/244736 [00:00<?, ? examples/s]

Generating smoltalk_systemchats_Qwen3_32B_think split:   0%|          | 0/27436 [00:00<?, ? examples/s]

Generating table_gpt_Qwen3_32B_think split:   0%|          | 0/13201 [00:00<?, ? examples/s]

Generating LongAlign_64k_context_lang_annotated_lang_6_no_think split:   0%|          | 0/6249 [00:00<?, ? exaâ€¦

Generating Mixture_of_Thoughts_science_no_think split:   0%|          | 0/86110 [00:00<?, ? examples/s]

Generating OpenHermes_2.5_no_think split:   0%|          | 0/384900 [00:00<?, ? examples/s]

Generating OpenThoughts3_1.2M_no_think_no_think split:   0%|          | 0/435193 [00:00<?, ? examples/s]

Generating hermes_function_calling_v1_no_think split:   0%|          | 0/8961 [00:00<?, ? examples/s]

Generating smoltalk_multilingual_8languages_lang_5_no_think split:   0%|          | 0/254047 [00:00<?, ? exampâ€¦

Generating smoltalk_smollm3_everyday_conversations_no_think split:   0%|          | 0/2260 [00:00<?, ? exampleâ€¦

Generating smoltalk_smollm3_explore_instruct_rewriting_no_think split:   0%|          | 0/30391 [00:00<?, ? exâ€¦

Generating smoltalk_smollm3_smol_magpie_ultra_no_think split:   0%|          | 0/406843 [00:00<?, ? examples/sâ€¦

Generating smoltalk_smollm3_smol_rewrite_no_think split:   0%|          | 0/53262 [00:00<?, ? examples/s]

Generating smoltalk_smollm3_smol_summarize_no_think split:   0%|          | 0/96061 [00:00<?, ? examples/s]

Generating smoltalk_smollm3_systemchats_30k_no_think split:   0%|          | 0/33997 [00:00<?, ? examples/s]

Generating table_gpt_no_think split:   0%|          | 0/13203 [00:00<?, ? examples/s]

Generating tulu_3_sft_personas_instruction_following_no_think split:   0%|          | 0/29970 [00:00<?, ? examâ€¦

Generating xlam_traces_no_think split:   0%|          | 0/59962 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/105 [00:00<?, ?it/s]

Training examples: 1000
Example: {'messages': [{'content': 'Hi there', 'role': 'user'}, {'content': '<think>\nOkay, the user sent "Hi there". That\'s a friendly greeting. I should respond in a welcoming way. Let me check the guidelines. I need to be helpful, keep the conversation going, and maybe ask how I can assist them. Let me make sure the tone is warm and approachable. Alright, something like "Hello! How can I assist you today?" That should work. Let me confirm there\'s no typo and it\'s in a natural, conversational style.\n</think>\n\nHello! How can I assist you today?', 'role': 'assistant'}, {'content': "I'm looking for a healthy breakfast idea. What's a good option?", 'role': 'user'}, {'content': "<think>\nOkay, the user is asking for a healthy breakfast idea. Let me think about what makes a breakfast healthy. It should be balanced, providing a mix of nutrients like protein, fiber, healthy fats, and some carbs. Let me brainstorm some options.\n\nMaybe start with a classic like 

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Formatted example: <|im_start|>system
## Metadata

Knowledge Cutoff Date: June 2025
Today Date: 26 November 2025
Reasoning Mode: /think

## Custom Instructions

You are a helpful AI assistant named SmolLM, trained by Hu...


In [None]:
# Configure training parameters
training_config = SFTConfig(
    # Model and data
    output_dir=f"./{new_model_name}",
    dataset_text_field="text",
    max_length=2048,
    
    # Training hyperparameters
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    num_train_epochs=1,  # Start with 1 epoch
    max_steps=500,  # Limit steps for demo
    
    # Optimization
    warmup_steps=50,
    weight_decay=0.01,
    optim="adamw_torch",
    
    # Logging and saving
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    save_total_limit=2,
    
    # Memory optimization
    dataloader_num_workers=0,
    group_by_length=True,  # Group similar length sequences
    
    # Hugging Face Hub integration
    push_to_hub=True,  # Set to True to upload to Hub
    hub_model_id=f"dingusagar/{new_model_name}",
    
    # Experiment tracking
    report_to=["wandb"],  # Use wandb for experiment tracking
    run_name=f"{new_model_name}-training",
)

print("Training configuration set!")
print(f"Effective batch size: {training_config.per_device_train_batch_size * training_config.gradient_accumulation_steps}")

Error in callback <bound method _WandbInit._pre_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x15512e75bbb0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 1550f5e29e40, raw_cell="# Configure training parameters
training_config = .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ondemand-ice.pace.gatech.edu/home/hice1/dk305/scratch/llm-finetuning-course/smol-exercise.ipynb#X22sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


KeyboardInterrupt: 

Training configuration set!
Effective batch size: 16


In [None]:
# LoRA configuration with PEFT
from peft import LoraConfig

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],   # REQUIRED

)

# Create SFTTrainer with LoRA enabled
from trl import SFTTrainer

lora_trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,  # dataset with a "text" field or messages + dataset_text_field in config
    args=training_config,
    peft_config=peft_config,  # << enable LoRA
)

print("Starting LoRA trainingâ€¦")
lora_trainer.train()



Starting LoRA trainingâ€¦


[34m[1mwandb[0m: Currently logged in as: [33mdingusagar[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
10,1.5008


KeyboardInterrupt: 

socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.s

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x15512e75bbb0>> (for post_run_cell), with arguments args (<ExecutionResult object at 15512e831300, execution_count=34 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 15512e830b50, raw_cell="# LoRA configuration with PEFT
from peft import Lo.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ondemand-ice.pace.gatech.edu/home/hice1/dk305/scratch/llm-finetuning-course/smol-exercise.ipynb#X23sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.


ConnectionResetError: Connection lost

socket.send() raised exception.
socket.send() raised exception.


socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.send() raised exception.
socket.s