In [1]:
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F


class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization"""
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Calculate RMS
        rms = torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)
        # Normalize and scale
        x_norm = x / rms
        return self.weight * x_norm


def precompute_freqs_cis(dim, max_seq_len, theta=100000.0):
    """Precompute the frequency tensor for complex exponentials (RoPE)"""
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(max_seq_len, dtype=torch.float32)
    freqs = torch.outer(t, freqs).float()
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
    return freqs_cis


def apply_rotary_emb(x, freqs_cis):
    """Apply rotary embeddings to input tensors"""
    # x shape: (B, n_heads, T, head_dim)
    # Reshape x to treat pairs of values as complex numbers
    x_float = x.float()
    x_reshaped = x_float.reshape(*x_float.shape[:-1], -1, 2)  # (B, n_heads, T, head_dim//2, 2)
    x_complex = torch.view_as_complex(x_reshaped)  # (B, n_heads, T, head_dim//2)

    # Reshape freqs_cis to match
    freqs_cis = freqs_cis.unsqueeze(0).unsqueeze(0)  # (1, 1, T, head_dim//2)

    # Apply rotation
    x_rotated = x_complex * freqs_cis

    # Convert back to real
    x_out = torch.view_as_real(x_rotated)  # (B, n_heads, T, head_dim//2, 2)
    x_out = x_out.reshape(*x_out.shape[:-2], -1)  # (B, n_heads, T, head_dim)

    return x_out.type_as(x)


class GroupedQueryAttention(nn.Module):
    """Grouped Query Attention with RoPE"""

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        self.n_head = config.n_head  # Number of query heads
        self.n_kv_head = config.n_kv_head  # Number of key-value heads
        self.head_dim = config.head_dim
        self.n_embd = config.n_embd

        # Q projection: one per query head
        self.q_proj = nn.Linear(config.n_embd, config.n_head * config.head_dim, bias=config.attention_bias)
        # K, V projections: one per key-value head
        self.k_proj = nn.Linear(config.n_embd, config.n_kv_head * config.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(config.n_embd, config.n_kv_head * config.head_dim, bias=config.attention_bias)
        # Output projection
        self.o_proj = nn.Linear(config.n_head * config.head_dim, config.n_embd, bias=config.attention_bias)

        # Dropout
        self.attn_dropout = nn.Dropout(config.attention_dropout)

        # Causal mask
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x, freqs_cis):
        B, T, C = x.size()  # batch, seq_len, embedding_dim

        # Project to Q, K, V
        q = self.q_proj(x)  # (B, T, n_head * head_dim)
        k = self.k_proj(x)  # (B, T, n_kv_head * head_dim)
        v = self.v_proj(x)  # (B, T, n_kv_head * head_dim)

        # Reshape for multi-head attention
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)  # (B, n_head, T, head_dim)
        k = k.view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)  # (B, n_kv_head, T, head_dim)
        v = v.view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)  # (B, n_kv_head, T, head_dim)

        # Apply RoPE to Q and K
        freqs_cis_current = freqs_cis[:T]
        q = apply_rotary_emb(q, freqs_cis_current)
        k = apply_rotary_emb(k, freqs_cis_current)

        # Expand K and V to match number of query heads (grouped query attention)
        # Each KV head serves n_head // n_kv_head query heads
        n_rep = self.n_head // self.n_kv_head
        if n_rep > 1:
            k = k.unsqueeze(2).expand(B, self.n_kv_head, n_rep, T, self.head_dim).reshape(B, self.n_head, T, self.head_dim)
            v = v.unsqueeze(2).expand(B, self.n_kv_head, n_rep, T, self.head_dim).reshape(B, self.n_head, T, self.head_dim)

        # Compute attention scores
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        # Apply attention to values
        y = att @ v  # (B, n_head, T, head_dim)

        # Concatenate heads
        y = y.transpose(1, 2).contiguous().view(B, T, self.n_head * self.head_dim)

        # Output projection
        y = self.o_proj(y)
        return y


class SwiGLU(nn.Module):
    """SwiGLU MLP as used in LLaMA"""

    def __init__(self, config):
        super().__init__()
        self.gate_proj = nn.Linear(config.n_embd, config.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(config.n_embd, config.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.mlp_bias)

    def forward(self, x):
        # SwiGLU: silu(gate) * up
        gate = F.silu(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)


class Block(nn.Module):
    """Transformer block with pre-normalization"""

    def __init__(self, config):
        super().__init__()
        self.input_layernorm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
        self.self_attn = GroupedQueryAttention(config)
        self.post_attention_layernorm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
        self.mlp = SwiGLU(config)

    def forward(self, x, freqs_cis):
        # Pre-norm architecture with residual connections
        x = x + self.self_attn(self.input_layernorm(x), freqs_cis)
        x = x + self.mlp(self.post_attention_layernorm(x))
        return x


@dataclass
class SmolLM2Config:
    block_size: int = 8192  # max sequence length
    vocab_size: int = 49152  # number of tokens
    n_layer: int = 30  # number of layers
    n_head: int = 9  # number of query heads
    n_kv_head: int = 3  # number of key-value heads
    n_embd: int = 576  # embedding dimension
    intermediate_size: int = 1536  # MLP hidden dimension
    head_dim: int = 64  # dimension per head
    rms_norm_eps: float = 1e-5
    rope_theta: float = 100000.0
    attention_bias: bool = False
    mlp_bias: bool = False
    attention_dropout: float = 0.0
    tie_word_embeddings: bool = True


class SmolLM2(nn.Module):
    """SmolLM2 Language Model"""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.model = nn.ModuleDict(dict(
            embed_tokens = nn.Embedding(config.vocab_size, config.n_embd),
            layers = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            norm = RMSNorm(config.n_embd, eps=config.rms_norm_eps),
        ))

        # LM head (output projection to vocabulary)
        if config.tie_word_embeddings:
            self.lm_head = None  # Will use embed_tokens.weight
        else:
            self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Precompute RoPE frequencies
        self.freqs_cis = precompute_freqs_cis(
            config.head_dim,
            config.block_size,
            config.rope_theta
        )

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"

        # Move freqs_cis to same device as input
        device = idx.device
        freqs_cis = self.freqs_cis.to(device)

        # Token embeddings
        x = self.model.embed_tokens(idx)  # (B, T, n_embd)

        # Forward through transformer blocks
        for block in self.model.layers:
            x = block(x, freqs_cis)

        # Final layer norm
        x = self.model.norm(x)

        # Project to vocabulary
        if self.config.tie_word_embeddings:
            logits = F.linear(x, self.model.embed_tokens.weight)
        else:
            logits = self.lm_head(x)

        # Calculate loss if targets provided
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type='HuggingFaceTB/SmolLM2-135M'):
        """Loads pretrained SmolLM2 model weights from huggingface"""
        from transformers import AutoModelForCausalLM
        print(f"loading weights from pretrained model: {model_type}")

        # Load HuggingFace model
        model_hf = AutoModelForCausalLM.from_pretrained(model_type)

        # Create config from HF model config
        config = SmolLM2Config(
            vocab_size=model_hf.config.vocab_size,
            block_size=model_hf.config.max_position_embeddings,
            n_layer=model_hf.config.num_hidden_layers,
            n_head=model_hf.config.num_attention_heads,
            n_kv_head=model_hf.config.num_key_value_heads,
            n_embd=model_hf.config.hidden_size,
            intermediate_size=model_hf.config.intermediate_size,
            head_dim=model_hf.config.head_dim,
            rms_norm_eps=model_hf.config.rms_norm_eps,
            rope_theta=model_hf.config.rope_theta,
            attention_bias=model_hf.config.attention_bias,
            mlp_bias=model_hf.config.mlp_bias,
            tie_word_embeddings=model_hf.config.tie_word_embeddings,
        )

        # Create our model
        model = cls(config)

        # Copy weights
        sd = model.state_dict()
        sd_hf = model_hf.state_dict()

        # Keys to skip (buffers created by our implementation)
        skip_keys = ['freqs_cis', 'bias']  # Skip RoPE freqs and causal mask

        # Weight mapping
        for key in sd.keys():
            # Skip buffers that are created by our implementation
            if any(skip_key in key for skip_key in skip_keys):
                continue

            # Map our keys to HF keys
            hf_key = key

            if key in sd_hf:
                with torch.no_grad():
                    sd[key].copy_(sd_hf[hf_key])
            else:
                print(f"Warning: {key} not found in pretrained model")

        return model




In [None]:
# Example usage
if __name__ == "__main__":
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    print(f"using device: {device}")

    # Create model from scratch
    config = SmolLM2Config()
    model = SmolLM2(config)
    model.to(device)

    # Test with random input
    B, T = 4, 32
    x = torch.randint(0, config.vocab_size, (B, T)).to(device)
    y = torch.randint(0, config.vocab_size, (B, T)).to(device)

    logits, loss = model(x, targets=y)
    print(f"logits shape: {logits.shape}")
    print(f"loss: {loss.item()}")

    # Count parameters
    n_params = sum(p.numel() for p in model.parameters())
    print(f"number of parameters: {n_params/1e6:.2f}M")

In [3]:
if __name__ == "__main__":
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    print(f"using device: {device}")

    # Create model from scratch
    config = SmolLM2Config()
    model = SmolLM2.from_pretrained('HuggingFaceTB/SmolLM2-135M')
    # model = SmolLM2(config)
    model.to(device)

    # Test with random input
    B, T = 4, 32
    x = torch.randint(0, config.vocab_size, (B, T)).to(device)
    y = torch.randint(0, config.vocab_size, (B, T)).to(device)

    logits, loss = model(x, targets=y)
    print(f"logits shape: {logits.shape}")
    print(f"loss: {loss.item()}")

    # Count parameters
    n_params = sum(p.numel() for p in model.parameters())
    print(f"number of parameters: {n_params/1e6:.2f}M")

using device: cuda
loading weights from pretrained model: HuggingFaceTB/SmolLM2-135M


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


logits shape: torch.Size([4, 32, 49152])
loss: 13.789576530456543
number of parameters: 134.52M


In [4]:
model = SmolLM2.from_pretrained('HuggingFaceTB/SmolLM2-135M')
print(model)

loading weights from pretrained model: HuggingFaceTB/SmolLM2-135M
SmolLM2(
  (model): ModuleDict(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x Block(
        (input_layernorm): RMSNorm()
        (self_attn): GroupedQueryAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): RMSNorm()
        (mlp): SwiGLU(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
        )
      )
    )
    (norm): RMSNorm()
  )
)


In [2]:
import os
import time
import torch
import tiktoken
#from model import SmolLM2, SmolLM2Config
from transformers import AutoTokenizer, AutoModelForCausalLM


# Clear any existing CUDA errors
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

# Set matmul precision for faster training on modern GPUs
torch.set_float32_matmul_precision('high')

# ============================================================================
# DataLoader
# ============================================================================
class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B
        self.T = T

        # at init load tokens from disk and store them in memory
        with open('input.txt', 'r') as f:
            text = f.read()
        tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
        tokens = tokenizer.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f'loaded {len(self.tokens)} tokens')
        print(f'1 epoch = {len(self.tokens) // (B * T)} batches')

        # state
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position: self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T) # inputs
        y = (buf[1:]).view(B, T) # targets
        # advance the position in the tensor
        self.current_position += B*T
        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y


# ============================================================================
# Checkpoint utilities
# ============================================================================

def save_checkpoint(model, optimizer, step, loss, filepath):
    """Save model checkpoint"""
    checkpoint = {
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'config': model.config,
    }
    torch.save(checkpoint, filepath)
    print(f"\nCheckpoint saved to {filepath}")


def load_checkpoint(filepath, model, optimizer=None):
    """Load model checkpoint"""
    checkpoint = torch.load(filepath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    step = checkpoint['step']
    loss = checkpoint['loss']
    print(f"\nCheckpoint loaded from {filepath}")
    print(f"Resuming from step {step}, loss {loss:.4f}\n")
    return step, loss


# ============================================================================
# Setup
# ============================================================================

# Device setup
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"Using device: {device}")

# Model config
config = SmolLM2Config(
    block_size=1024,
    vocab_size=49152,
    n_layer=30,  # Smaller for faster training
    n_head=9,
    n_kv_head=3,
    n_embd=576,
    intermediate_size=1536,
    head_dim=64,
)

# Initialize model
model = SmolLM2(config)
model.to(device)

# Count parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {n_params/1e6:.2f}M\n")

# Initialize dataloader
train_loader = DataLoaderLite(B=4, T=256)

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1)

# ============================================================================
# PHASE 1: Train for 5000 steps
# ============================================================================

print("=" * 80)
print("PHASE 1: Training for 5000 steps")
print("=" * 80)

max_steps = 5000
checkpoint_path = 'smollm2_checkpoint_5000.pt'

for step in range(max_steps):
    t0 = time.time()

    # Get batch
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)

    # Forward pass with autocast for mixed precision
    optimizer.zero_grad()
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        logits, loss = model(x, y)

    # Backward pass
    loss.backward()

    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Optimizer step
    optimizer.step()

    # Synchronize for accurate timing
    if device == 'cuda':
        torch.cuda.synchronize()

    t1 = time.time()
    dt = (t1 - t0) * 1000
    tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)

    # Print progress
    if step % 100 == 0 or step == max_steps - 1:
        print(f'step {step:4d} | loss: {loss.item():.4f} | dt: {dt:6.2f}ms | tok/sec: {tokens_per_sec:8.2f}')

print(f'\nFinal loss: {loss.item():.4f}')

# Save checkpoint
save_checkpoint(model, optimizer, max_steps, loss.item(), checkpoint_path)

print("=" * 80)
print("PHASE 1 Complete!")
print("=" * 80)


Using device: cuda
Model parameters: 134.52M



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (341094 > 8192). Running this sequence through the model will result in indexing errors


loaded 341094 tokens
1 epoch = 333 batches
PHASE 1: Training for 5000 steps
step    0 | loss: 10.9473 | dt: 1005.10ms | tok/sec:  1018.81
step  100 | loss: 6.2206 | dt: 484.31ms | tok/sec:  2114.33
step  200 | loss: 5.2624 | dt: 477.98ms | tok/sec:  2142.35
step  300 | loss: 5.0609 | dt: 481.78ms | tok/sec:  2125.46
step  400 | loss: 4.9098 | dt: 481.93ms | tok/sec:  2124.80
step  500 | loss: 5.0130 | dt: 483.20ms | tok/sec:  2119.21
step  600 | loss: 5.0317 | dt: 480.63ms | tok/sec:  2130.53
step  700 | loss: 4.7445 | dt: 482.32ms | tok/sec:  2123.07
step  800 | loss: 5.0885 | dt: 482.35ms | tok/sec:  2122.94
step  900 | loss: 4.9827 | dt: 481.53ms | tok/sec:  2126.55
step 1000 | loss: 4.6949 | dt: 481.95ms | tok/sec:  2124.69
step 1100 | loss: 4.3046 | dt: 482.58ms | tok/sec:  2121.92
step 1200 | loss: 3.9141 | dt: 477.03ms | tok/sec:  2146.62
step 1300 | loss: 4.0214 | dt: 482.11ms | tok/sec:  2124.01
step 1400 | loss: 3.2829 | dt: 485.81ms | tok/sec:  2107.82
step 1500 | loss: 4.31

UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL __main__.SmolLM2Config was not an allowed global by default. Please use `torch.serialization.add_safe_globals([__main__.SmolLM2Config])` or the `torch.serialization.safe_globals([__main__.SmolLM2Config])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [None]:
# ============================================================================
# load checkpoint, train 50 more steps
# ============================================================================

print("\n" + "=" * 80)
print("PHASE 2: Loading checkpoint and training for 50 more steps")
print("=" * 80)

# Recreate model and optimizer
model = SmolLM2(config)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1)

# Load checkpoint
start_step, last_loss = load_checkpoint(checkpoint_path, model, optimizer)

# Train for 50 more steps
additional_steps = 50
final_checkpoint_path = 'smollm2_checkpoint_5050.pt'

for step in range(start_step, start_step + additional_steps):
    t0 = time.time()

    # Get batch
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)

    # Forward pass with autocast
    optimizer.zero_grad()
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        logits, loss = model(x, y)

    # Backward pass
    loss.backward()

    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Optimizer step
    optimizer.step()

    # Synchronize for accurate timing
    if device == 'cuda':
        torch.cuda.synchronize()

    t1 = time.time()
    dt = (t1 - t0) * 1000
    tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)

    # Print every step for these 50 steps
    print(f'step {step:4d} | loss: {loss.item():.4f} | dt: {dt:6.2f}ms | tok/sec: {tokens_per_sec:8.2f}')

print(f'\nFinal loss: {loss.item():.4f}')

# Save final checkpoint
save_checkpoint(model, optimizer, start_step + additional_steps, loss.item(), final_checkpoint_path)

print("\n" + "=" * 80)
print("PHASE 2 Complete!")
print(f"Training finished at step {start_step + additional_steps}")
print("=" * 80)