**SmolLM2 135M No Modifications**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Specify the model checkpoint
checkpoint = "HuggingFaceTB/SmolLM2-135M"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# `device_map="auto"` already decides where the weights live (GPU when one is
# available, otherwise CPU). Do not chain `.to(...)` after this call: it is
# redundant on a single device, and when the map spans several devices moving
# a dispatched model leaves the weights and the dispatch hooks disagreeing
# about placement.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",              # Automatically map layers to GPU if available
    torch_dtype=torch.bfloat16,     # Use bfloat16 for better memory usage
)

# Function to generate text
def generate_text(prompt, max_length=100):
    """Generate a continuation of ``prompt`` with the module-level model.

    Args:
        prompt: Text to continue.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (the total sequence length limit, prompt tokens included).

    Returns:
        The first returned sequence decoded to text, special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the tokenizer's mask explicitly: generate() cannot infer one
        # when the pad token falls back to the EOS token.
        attention_mask=inputs["attention_mask"],
        # Choosing the pad token here avoids the implicit
        # "Setting `pad_token_id` to `eos_token_id`" fallback and its warning.
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        num_return_sequences=1,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a prompt: generate_text() is called with its default max_length
# and the decoded text is printed.
prompt = "Gravity is"
output = generate_text(prompt)
print(output)

# Display memory footprint (optional)
# The value returned by get_memory_footprint() is divided by 1e6 and labelled
# "MB" in the printed line.
print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")

tokenizer_config.json:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Gravity is the force that holds the Earth and the Moon together.

The Moon is a satellite of the Earth. It is a rocky body that orbits the Earth. The Moon is the only natural satellite of the Earth.

The Moon is a satellite of the Earth. It is a rocky body that orbits the Earth. The Moon is the only natural satellite of the Earth.

The Moon is a satellite of the Earth. It is a rocky body that orbits the Earth. The
Memory footprint: 269.03 MB


**With Differential Attention**

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# Differential Attention Layer
class DifferentialAttention(nn.Module):
    """Differential attention over the last two dimensions of the input.

    Two softmax attention maps are built from separate halves of the query
    and key projections. The second map, scaled by a learnable scalar, is
    subtracted from the first and the result is applied to the values.

    NOTE: ``heads`` is stored but never used; attention is computed as a
    single head of width ``d_model``. No causal or padding mask is applied.

    Args:
        d_model: Size of the last dimension of the input (and of the output).
        heads: Number of attention heads (recorded only).
        lambda_init: Initial value of the learnable subtraction weight.
    """

    def __init__(self, d_model, heads, lambda_init=0.8):
        super().__init__()
        self.heads = heads
        self.scale = 1 / (d_model ** 0.5)
        self.lambda_init = lambda_init
        # Queries and keys are projected to twice the width so they can be
        # split into the two halves used by the two attention maps.
        self.W_q = nn.Linear(d_model, d_model * 2)
        self.W_k = nn.Linear(d_model, d_model * 2)
        self.W_v = nn.Linear(d_model, d_model)
        # float(): an integer lambda_init would create an integer tensor,
        # which cannot be wrapped in a Parameter that requires gradients.
        self.lambda_param = nn.Parameter(torch.tensor(float(lambda_init)))

    def forward(self, x):
        """Apply differential attention to ``x``.

        Args:
            x: Tensor whose last dimension is ``d_model``; the second-to-last
                dimension is the one attended over.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Project input for query, key, and value
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        # Split for differential attention
        Q1, Q2 = Q.chunk(2, dim=-1)
        K1, K2 = K.chunk(2, dim=-1)

        # Compute scaled dot-product attention
        scores1 = torch.matmul(Q1, K1.transpose(-2, -1)) * self.scale
        scores2 = torch.matmul(Q2, K2.transpose(-2, -1)) * self.scale

        # Differential attention: the weights can be negative and the rows do
        # not sum to one, because one softmax is subtracted from another.
        diff_attention = torch.softmax(scores1, dim=-1) - self.lambda_param * torch.softmax(scores2, dim=-1)
        output = torch.matmul(diff_attention, V)
        return output

# Model with Differential Attention
class SmolLM2WithDiffAttention(nn.Module):
    """Wrap a causal LM and add a ``DifferentialAttention`` layer after its backbone.

    NOTE(review): ``generate`` delegates straight to the wrapped model, so the
    differential-attention layer only takes part in ``forward`` and does not
    influence generated text. The layer is also newly initialised here, not
    loaded from the checkpoint.

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoModelForCausalLM.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.base_model = AutoModelForCausalLM.from_pretrained(model_name)
        # Reuse the config that was loaded with the model rather than
        # fetching the same file a second time.
        config = self.base_model.config
        self.diff_attention = DifferentialAttention(d_model=config.hidden_size, heads=config.num_attention_heads)

    def forward(self, input_ids):
        """Return the backbone's hidden states passed through differential attention.

        The result is a hidden-state tensor, not vocabulary logits: the LM
        head of the wrapped model is not applied.
        """
        # Look the headless backbone up through `base_model_prefix` instead of
        # hard-coding an attribute name: Llama-style checkpoints expose it as
        # `.model`, and have no `.transformer` attribute.
        backbone = getattr(self.base_model, self.base_model.base_model_prefix)
        hidden_states = backbone(input_ids)[0]
        return self.diff_attention(hidden_states)

    # Update the generate method to accept additional keyword arguments
    def generate(self, input_ids, **kwargs):
        """Forward ``input_ids`` and any keyword arguments to the wrapped model's ``generate``."""
        return self.base_model.generate(input_ids=input_ids, **kwargs)

# Initialize model and tokenizer
# These module-level names are read by generate_text() defined below.
checkpoint = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Only the checkpoint name is passed; no dtype or device is chosen here.
model = SmolLM2WithDiffAttention(checkpoint)

# Generate text using Differential Attention with attention mask
def generate_text(prompt, max_length=100):
    """Generate a continuation of ``prompt`` with the module-level wrapper model.

    Args:
        prompt: Text to continue.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (the total sequence length limit, prompt tokens included).

    Returns:
        The first returned sequence decoded to text, special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate text with model
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        # Use the mask produced by the tokenizer instead of overwriting it
        # with all ones, so real padding information is never discarded.
        attention_mask=inputs["attention_mask"],
        # Choosing the pad token here avoids the implicit
        # "Setting `pad_token_id` to `eos_token_id`" fallback and its warning.
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Run a sample prompt
# generate_text() is called with its default max_length; the decoded text is
# kept in `output` and printed.
prompt = "Gravity is"
output = generate_text(prompt)
print(output)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Gravity is the force that holds the Earth and the Moon together.

The Moon is a satellite of the Earth. It is a rocky body that orbits the Earth. The Moon is the only natural satellite of the Earth.

The Moon is the only natural satellite of the Earth. It is a rocky body that orbits the Earth. The Moon is the only natural satellite of the Earth.

The Moon is the only natural satellite of the Earth. It is a rocky body that orbits


**With Entropy Based Differential Attention**

In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# Differential Attention Layer with Entropix
class DifferentialAttention(nn.Module):
    """Differential attention whose rows are scaled by the entropy of the first map.

    Two softmax attention maps are built from separate halves of the query
    and key projections. The second map, scaled by a learnable scalar, is
    subtracted from the first. Each row of the result is then multiplied by
    the entropy of the corresponding row of the first map before being
    applied to the values.

    NOTE: ``heads`` is stored but never used; attention is computed as a
    single head of width ``d_model``. No causal or padding mask is applied.

    Args:
        d_model: Size of the last dimension of the input (and of the output).
        heads: Number of attention heads (recorded only).
        lambda_init: Initial value of the learnable subtraction weight.
    """

    def __init__(self, d_model, heads, lambda_init=0.8):
        super().__init__()
        self.heads = heads
        self.scale = 1 / (d_model ** 0.5)
        self.lambda_init = lambda_init
        # Queries and keys are projected to twice the width so they can be
        # split into the two halves used by the two attention maps.
        self.W_q = nn.Linear(d_model, d_model * 2)
        self.W_k = nn.Linear(d_model, d_model * 2)
        self.W_v = nn.Linear(d_model, d_model)
        # float(): an integer lambda_init would create an integer tensor,
        # which cannot be wrapped in a Parameter that requires gradients.
        self.lambda_param = nn.Parameter(torch.tensor(float(lambda_init)))

    def compute_entropy(self, attention_scores):
        """Return the entropy of ``softmax(attention_scores)`` along the last axis.

        The reduced axis is kept with size 1 so the result broadcasts against
        the attention map it was computed from.
        """
        probs = torch.softmax(attention_scores, dim=-1)
        log_probs = torch.log(probs + 1e-9)  # Avoid log(0)
        entropy = -torch.sum(probs * log_probs, dim=-1, keepdim=True)  # Compute entropy
        return entropy

    def forward(self, x):
        """Apply entropy-weighted differential attention to ``x``.

        Args:
            x: Tensor whose last dimension is ``d_model``; the second-to-last
                dimension is the one attended over.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Project input for query, key, and value
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        # Split for differential attention
        Q1, Q2 = Q.chunk(2, dim=-1)
        K1, K2 = K.chunk(2, dim=-1)

        # Compute scaled dot-product attention
        scores1 = torch.matmul(Q1, K1.transpose(-2, -1)) * self.scale
        scores2 = torch.matmul(Q2, K2.transpose(-2, -1)) * self.scale

        # Differential attention with entropy modulation (entropix)
        diff_attention = torch.softmax(scores1, dim=-1) - self.lambda_param * torch.softmax(scores2, dim=-1)

        # Calculate entropy-based weight for entropix: one scalar per query
        # row, broadcast across that row's attention weights.
        entropy_weight = self.compute_entropy(scores1)
        entropix_attention = diff_attention * entropy_weight  # Apply entropy as a weighting factor

        # Apply attention to values
        output = torch.matmul(entropix_attention, V)
        return output

# Model with Differential Attention and Entropix
class SmolLM2WithDiffAttention(nn.Module):
    """Wrap a causal LM and add an entropy-weighted ``DifferentialAttention`` layer.

    NOTE(review): ``generate`` delegates straight to the wrapped model, so the
    differential-attention layer only takes part in ``forward`` and does not
    influence generated text. The layer is also newly initialised here, not
    loaded from the checkpoint.

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoModelForCausalLM.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.base_model = AutoModelForCausalLM.from_pretrained(model_name)
        # Reuse the config that was loaded with the model rather than
        # fetching the same file a second time.
        config = self.base_model.config
        self.diff_attention = DifferentialAttention(d_model=config.hidden_size, heads=config.num_attention_heads)

    def forward(self, input_ids):
        """Return the backbone's hidden states passed through the attention layer.

        The result is a hidden-state tensor, not vocabulary logits: the LM
        head of the wrapped model is not applied.
        """
        # Look the headless backbone up through `base_model_prefix` instead of
        # hard-coding an attribute name: Llama-style checkpoints expose it as
        # `.model`, and have no `.transformer` attribute.
        backbone = getattr(self.base_model, self.base_model.base_model_prefix)
        hidden_states = backbone(input_ids)[0]
        return self.diff_attention(hidden_states)

    # Update the generate method to accept attention_mask and other kwargs
    def generate(self, input_ids, **kwargs):
        """Forward ``input_ids`` and any keyword arguments to the wrapped model's ``generate``."""
        return self.base_model.generate(input_ids=input_ids, **kwargs)

# Initialize model and tokenizer
# These module-level names are read by generate_text() defined below.
checkpoint = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Only the checkpoint name is passed; no dtype or device is chosen here.
model = SmolLM2WithDiffAttention(checkpoint)

# Generate text using Differential Attention with Entropix
def generate_text(prompt, max_length=100):
    """Generate a continuation of ``prompt`` with the module-level wrapper model.

    Args:
        prompt: Text to continue.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (the total sequence length limit, prompt tokens included).

    Returns:
        The first returned sequence decoded to text, special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs["input_ids"],
        # Use the mask produced by the tokenizer instead of overwriting it
        # with all ones, so real padding information is never discarded.
        attention_mask=inputs["attention_mask"],
        # Choosing the pad token here avoids the implicit
        # "Setting `pad_token_id` to `eos_token_id`" fallback and its warning.
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Run a sample prompt
# generate_text() is called with its default max_length; the decoded text is
# kept in `output` and printed.
prompt = "Gravity is"
output = generate_text(prompt)
print(output)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Gravity is the force that holds the Earth and the Moon together.

The Moon is a satellite of the Earth. It is a rocky body that orbits the Earth. The Moon is the only natural satellite of the Earth.

The Moon is the only natural satellite of the Earth. It is a rocky body that orbits the Earth. The Moon is the only natural satellite of the Earth.

The Moon is the only natural satellite of the Earth. It is a rocky body that orbits


**Mixture of Experts**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.nn.functional as F

# Specify the model checkpoint
checkpoint = "HuggingFaceTB/SmolLM2-135M"

# Load the tokenizer (shared across all models)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load three versions of the model
# NOTE(review): all three copies come from the same checkpoint with the same
# arguments, so they hold identical weights.
# `device_map="auto"` already decides where each copy lives. Do not chain
# `.to(...)` after it: that is redundant on a single device, and when the map
# spans several devices moving a dispatched model leaves the weights and the
# dispatch hooks disagreeing about placement.
models = [
    AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    for _ in range(3)
]

# Helper function to calculate entropy
def calculate_entropy(logits):
    """Return the mean entropy of ``softmax(logits)`` as a scalar tensor.

    The entropy is taken along the last axis and then averaged over every
    remaining position.
    """
    distribution = F.softmax(logits, dim=-1)
    # The small epsilon keeps log() finite where a probability is exactly zero.
    weighted_log = distribution * (distribution + 1e-9).log()
    per_position = -weighted_log.sum(dim=-1)
    return per_position.mean()

# Function to generate text using MoE with entropy-based weighted averaging
def generate_text_moe(prompt, max_length=100):
    """Decode a continuation of ``prompt`` from an entropy-weighted ensemble.

    At every step each model in the module-level ``models`` list scores the
    sequence generated so far. The next-token logits are combined with
    weights inversely proportional to each model's entropy (lower entropy ->
    higher weight), and the arg-max token is appended. Decoding stops at
    ``max_length`` tokens or when the EOS token is produced.

    Args:
        prompt: Text to continue.
        max_length: Maximum total sequence length in tokens, prompt included.

    Returns:
        The prompt plus continuation decoded to text, special tokens removed.
    """
    device = models[0].device
    generated = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    eos_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        # The whole sequence is re-scored each step (no KV cache) to keep the
        # ensemble logic simple.
        while generated.shape[1] < max_length:
            step_logits = []
            step_entropies = []
            for expert in models:
                outputs = expert(generated.to(expert.device), return_dict=True)
                # Only the last position predicts the next token. Upcast so
                # the softmax/entropy are not computed in reduced precision.
                logits = outputs.logits[:, -1, :].float().to(device)
                step_logits.append(logits)
                step_entropies.append(calculate_entropy(logits))

            # Convert entropies to weights (lower entropy -> higher weight).
            # The clamp prevents a division by zero for a one-hot distribution.
            inverse = 1.0 / torch.stack(step_entropies).clamp_min(1e-9)
            weights = inverse / inverse.sum()

            # Weighted sum of logits based on entropy scores
            weighted_logits = sum(w * logits for w, logits in zip(weights, step_logits))

            next_token = torch.argmax(weighted_logits, dim=-1, keepdim=True)
            generated = torch.cat((generated, next_token), dim=1)

            if eos_token_id is not None and bool((next_token == eos_token_id).all()):
                break

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Test with a prompt
# generate_text_moe() is called with its default max_length.
prompt = "Gravity is"
output = generate_text_moe(prompt)
print(output)

# Display memory footprint for each model (optional)
# The loop rebinds the module-level name `model` to each entry of `models`;
# the footprint value is divided by 1e6 and labelled "MB".
for i, model in enumerate(models):
    print(f"Memory footprint of model {i+1}: {model.get_memory_footprint() / 1e6:.2f} MB")

ctor is the
Memory footprint of model 1: 269.03 MB
Memory footprint of model 2: 269.03 MB
Memory footprint of model 3: 269.03 MB


**With Entropix**

In this code, we add an entropy-aware sampling function that will:

* Track the entropy of possible next tokens.
* Dynamically adjust sampling behavior based on the entropy level.
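* Use chain-of-thought-style multi-path decoding: generate several independent candidate continuations (one after another in this implementation) and keep the one whose average entropy is closest to the target value.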

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM

# Define a function to calculate entropy
def calculate_entropy(probs):
    """Return the entropy of ``probs`` along the last axis.

    ``probs`` is used as given (no softmax is applied here). The small
    epsilon keeps ``log`` finite where a probability is exactly zero.
    """
    log_probs = (probs + 1e-10).log()
    return -(probs * log_probs).sum(dim=-1)

# Custom Entropix Sampling Function
def entropix_sampling(logits, entropy_threshold=1.5):
    """Pick the next token, sampling only when the distribution is uncertain.

    Args:
        logits: Next-token logits. The entropy must reduce to a single
            element, because it is used as an ``if`` condition and converted
            with ``.item()``.
        entropy_threshold: Entropy above which a token is sampled instead of
            taken greedily.

    Returns:
        A ``(token, entropy)`` pair: the chosen token id as a tensor with a
        trailing dimension of size 1, and the entropy as a Python float.
    """
    distribution = F.softmax(logits, dim=-1)
    uncertainty = calculate_entropy(distribution)

    # Above the threshold: draw from the distribution (high uncertainty).
    # Otherwise: take the single most likely token (low uncertainty).
    next_token = (
        torch.multinomial(distribution, num_samples=1)
        if uncertainty > entropy_threshold
        else distribution.argmax(dim=-1, keepdim=True)
    )

    return next_token, uncertainty.item()

# Parallel Chain-of-Thought Decoding
def entropix_chain_of_thought(model, tokenizer, prompt, max_length=100, num_paths=3, entropy_threshold=1.5,
                              target_entropy=1.5):
    """Generate several candidate continuations and return the most "stable" one.

    Each path is decoded token by token with ``entropix_sampling``. The path
    whose average per-token entropy is closest to ``target_entropy`` is
    decoded and returned.

    Args:
        model: Causal LM called as ``model(input_ids=...)`` and exposing
            ``.logits`` on its output.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        prompt: Text to continue.
        max_length: Maximum number of new tokens generated per path.
        num_paths: Number of candidate paths to generate (at least 1).
        entropy_threshold: Passed to ``entropix_sampling`` for every token.
        target_entropy: Average entropy a path should be closest to in order
            to be selected.

    Returns:
        The selected path (prompt included) decoded to text, special tokens
        removed.

    Raises:
        ValueError: If ``num_paths`` is smaller than 1.
    """
    if num_paths < 1:
        raise ValueError("num_paths must be at least 1")

    # Keep the prompt on the same device as the model's weights.
    prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)
    generated_paths = []
    entropies = []

    # Inference only: without no_grad every step would add to an autograd
    # graph that grows with the sequence.
    with torch.no_grad():
        # Generate multiple reasoning paths
        for _ in range(num_paths):
            generated = prompt_ids
            path_entropy = []

            for _step in range(max_length):
                outputs = model(input_ids=generated)
                logits = outputs.logits[:, -1, :]  # Get logits of the last token
                sampled_token, entropy = entropix_sampling(logits, entropy_threshold)

                path_entropy.append(entropy)
                generated = torch.cat((generated, sampled_token), dim=1)

                # Stop if end of sequence token is generated
                if sampled_token.item() == tokenizer.eos_token_id:
                    break

            generated_paths.append(generated)
            # Average entropy for path; a path with no generated tokens
            # (max_length == 0) is ranked last instead of dividing by zero.
            entropies.append(sum(path_entropy) / len(path_entropy) if path_entropy else float("inf"))

    # Select the path whose average entropy is closest to the target
    # (the first such path wins ties).
    best_path_idx = min(range(len(entropies)), key=lambda idx: abs(entropies[idx] - target_entropy))
    best_path = generated_paths[best_path_idx]

    return tokenizer.decode(best_path[0], skip_special_tokens=True)

# Initialize model and tokenizer
checkpoint = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Only the checkpoint name is passed; no dtype or device is chosen here.
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Generate text using Entropix-style reasoning
# Defaults are used for max_length, num_paths and entropy_threshold.
# NOTE(review): the sampling helper draws with torch.multinomial and no seed
# is set in this cell, so the printed text presumably differs between runs.
prompt = "Gravity is"
output = entropix_chain_of_thought(model, tokenizer, prompt)
print(output)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Gravity is external to us. We observe it in a physical form, conducting the action/reaction of magnets toward lighter objects and pulling away or accentuating those opposing objects'.

    to be computer-detected exists in the nervous system. Such analysis and detection has been responsible for much novel and advanced medical forensics.

    Although we do not know what is labeled solely as weightlessness about our body, I notice enough that the fact that we feel weightless. Of course we feel weightless whether we really are there
