<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/NL_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# --- 1. The Deep Optimizer (Higher Level / Slow Update) ---
# This simulates the "Learning-to-Learn" mechanism.
class DeepOptimizer:
    """
    The Outer Loop: Learns the optimal 'consolidation factor' based on long-term performance.
    """
    def __init__(self, slow_update_rate=0.01):
        # The parameter to be learned by the Optimizer:
        # The optimal 'consolidation factor' for preserving knowledge.
        self.consolidation_factor = 0.95
        self.meta_rate = slow_update_rate # Rate at which the Optimizer learns

    def get_factor(self):
        """Provides the learned consolidation rule to the lower-level memory system."""
        return self.consolidation_factor

    def update(self, long_term_loss):
        """
        Adjusts the consolidation factor based on the average long-term performance loss.
        If long_term_loss is high, the optimizer learns to increase the factor (consolidate more).
        """
        # We model the update: If the average loss is low (good), the factor moves towards 1.0.
        # We use (1.0 - loss) as a proxy for 'goodness' of performance over time.
        self.consolidation_factor += self.meta_rate * (1.0 - long_term_loss)

        # Ensure the factor stays between reasonable bounds (0.5 and 1.0)
        self.consolidation_factor = np.clip(self.consolidation_factor, 0.5, 1.0)
        return self.consolidation_factor

# --- 2. The Continuum Memory System (Lower Level / Fast Update) ---
# This simulates the core NL architecture with multi-time-scale modules.
class NLMemorySystem:
    """
    The Inner Loop: Contains two memory modules with different update frequencies.
    """
    def __init__(self, initial_factor):
        # FAST Memory: Simulates quick adaptation (high update frequency)
        self.fast_memory = 0.5
        self.fast_rate = 0.5

        # SLOW Memory: Simulates consolidated knowledge (low update frequency)
        self.slow_memory = 0.5
        self.slow_rate = 0.05

        # The consolidation rule is provided by the Deep Optimizer
        self.consolidation_factor = initial_factor

    def predict(self):
        """The model's output is a blending of its memories."""
        return (self.fast_memory + self.slow_memory) / 2

    # Inner Loop (Fast Task Update)
    def fast_update(self, error):
        """Updates the fast memory module every step based on immediate error."""
        self.fast_memory -= self.fast_rate * error

    # Outer Loop Rule Application (Slow Consolidation)
    def consolidate_knowledge(self, task_error, new_factor):
        """
        Applies the consolidation rule determined by the Deep Optimizer.
        This is the mechanism that mitigates catastrophic forgetting.
        """
        self.consolidation_factor = new_factor

        # 1. Update SLOW Memory (Core Knowledge Consolidation)
        # This update is slow and steady.
        self.slow_memory -= self.slow_rate * task_error

        # 2. Modulate FAST Memory
        # The consolidation_factor (learned by the Deep Optimizer) controls
        # how much fast knowledge is 'dampened' or preserved before the next cycle.
        self.fast_memory *= self.consolidation_factor

# --- 3. Nested Training Process Simulation ---
def nested_learning_demo(epochs=20):
    optimizer = DeepOptimizer()
    model = NLMemorySystem(initial_factor=optimizer.get_factor())

    TARGET = 10.0
    CONSOLIDATION_FREQUENCY = 4

    print("--- Nested Learning Simulation Start (Target: 10.0) ---")

    cumulative_loss = 0.0

    for i in range(1, epochs + 1):
        # --- INNER LOOP (Task Execution: FAST Time Scale) ---
        target = TARGET
        prediction = model.predict()
        task_error = prediction - target
        absolute_error = np.abs(task_error)

        # 1. Update the Learner's Fast Memory based on immediate error
        model.fast_update(task_error)

        # 2. Accumulate Loss for the Outer Loop (The meta-signal)
        cumulative_loss += absolute_error

        # --- OUTER LOOP (Meta-Learning: SLOW Time Scale) ---
        if i % CONSOLIDATION_FREQUENCY == 0:
            avg_loss = cumulative_loss / CONSOLIDATION_FREQUENCY

            # --- Deep Optimizer Update ---
            # The Deep Optimizer learns from the sustained performance signal.
            new_factor = optimizer.update(avg_loss)

            # --- CMS Rule Application ---
            # The model applies the new learned consolidation rule for slow consolidation.
            model.consolidate_knowledge(task_error, new_factor)

            print(f"\n--- Epoch {i} (Deep Optimizer Update) ---")
            print(f"Prediction: {model.predict():.4f}")
            print(f"Avg Loss (Meta-Signal): {avg_loss:.4f}")
            print(f"Learned Consolidation Factor: {new_factor:.6f}")

            # Reset cumulative loss
            cumulative_loss = 0.0

    print("\n--- Final Results ---")
    print(f"Final Prediction: {model.predict():.4f}")
    print(f"Final Learned Consolidation Factor (Learned Rule): {optimizer.get_factor():.6f}")

# Run the simulation
nested_learning_demo()

--- Nested Learning Simulation Start (Target: 10.0) ---

--- Epoch 4 (Deep Optimizer Update) ---
Prediction: 6.3866
Avg Loss (Meta-Signal): 6.4941
Learned Consolidation Factor: 0.895059

--- Epoch 8 (Deep Optimizer Update) ---
Prediction: 7.8771
Avg Loss (Meta-Signal): 2.4701
Learned Consolidation Factor: 0.880358

--- Epoch 12 (Deep Optimizer Update) ---
Prediction: 8.2407
Avg Loss (Meta-Signal): 1.4512
Learned Consolidation Factor: 0.875845

--- Epoch 16 (Deep Optimizer Update) ---
Prediction: 8.3222
Avg Loss (Meta-Signal): 1.2026
Learned Consolidation Factor: 0.873819

--- Epoch 20 (Deep Optimizer Update) ---
Prediction: 8.3329
Avg Loss (Meta-Signal): 1.1470
Learned Consolidation Factor: 0.872350

--- Final Results ---
Final Prediction: 8.3329
Final Learned Consolidation Factor (Learned Rule): 0.872350


In [2]:
import numpy as np
import torch
from torch import nn

# --- 1. The Deep Optimizer (Meta-Learning / Outer Loop) ---
class DeepOptimizer:
    """
    Simulates the slow-time-scale meta-optimizer.
    It learns a meta-parameter (e.g., consolidation factor) based on long-term loss.
    """
    def __init__(self, meta_learning_rate=0.01):
        # This parameter represents a learned rule (e.g., how much to consolidate)
        self.consolidation_factor = torch.tensor(0.95, requires_grad=True)
        self.meta_optimizer = torch.optim.SGD([self.consolidation_factor], lr=meta_learning_rate)

    def get_factor(self):
        """Returns the current learned consolidation rule."""
        return self.consolidation_factor.item()

    def update(self, long_term_loss):
        """
        Updates the consolidation factor based on the sustained long-term performance loss.
        (In a real scenario, this involves second-order derivatives or reinforcement learning.)
        """
        self.meta_optimizer.zero_grad()

        # Conceptual Meta-Loss: We define the Meta-Loss as (1.0 - Loss)
        # to encourage the factor to change when the average loss is high.
        meta_loss = (1.0 - long_term_loss) * self.consolidation_factor # Example proxy loss
        meta_loss.backward()
        self.meta_optimizer.step()

        # Clamp the factor to keep it within sensible bounds
        self.consolidation_factor.data.clamp_(0.5, 1.0)
        return self.get_factor()

# --- 2. The NL Transformer Feed-Forward Network (CMS) ---
class NLTransformerFFN(nn.Module):
    """
    Simulates the Continuum Memory System (CMS) component of a Transformer.
    It contains parallel Fast and Slow memory modules with separate optimization.
    """
    def __init__(self, d_model):
        super().__init__()

        # --- SLOW MEMORY MODULE (Consolidation) ---
        self.slow_linear = nn.Linear(d_model, d_model)

        # --- FAST MEMORY MODULE (Adaptation) ---
        self.fast_linear = nn.Linear(d_model, d_model)

        # Optimizer for the Fast Module (Inner Loop)
        self.fast_optimizer = torch.optim.Adam(self.fast_linear.parameters(), lr=1e-3)

        # Optimizer for the Slow Module (Outer Loop / Consolidation)
        self.slow_optimizer = torch.optim.Adam(self.slow_linear.parameters(), lr=1e-5)

    def forward(self, x):
        """The output is the combination of fast and slow memory modules."""
        fast_out = torch.relu(self.fast_linear(x))
        slow_out = torch.relu(self.slow_linear(x))
        return fast_out + slow_out

    # Inner Loop Update (FAST)
    def fast_update(self, loss_grad):
        """Updates the fast memory module based on task gradient."""
        self.fast_optimizer.zero_grad()
        # In a real model, we would backprop through the whole model.
        # Here we conceptually update the fast module using the overall loss gradient.
        # This is the standard, frequent Transformer weight update.
        self.fast_optimizer.step() # Requires loss.backward() to run first

    # Outer Loop Update (SLOW / Consolidation)
    def slow_update(self, consolidation_factor):
        """Updates the slow memory module and applies the consolidation rule."""
        # 1. Update SLOW Memory (lock in consolidated knowledge)
        # This step is conceptually driven by the Meta-Loss in a real NL model,
        # but here we use the small slow_optimizer rate to represent consolidation.
        self.slow_optimizer.step() # Requires a backward pass based on Meta-Loss

        # 2. Apply Deep Optimizer's Rule (Consolidation Factor)
        # The factor modulates the Fast Memory to prevent its parameters from straying too far
        # from the Slow Memory's consolidated state, mitigating forgetting.
        for param in self.fast_linear.parameters():
            param.data.mul_(consolidation_factor)


# --- 3. Nested Training Simulation ---

def nl_transformer_demo(epochs=20, inner_steps=10):
    # Hyperparameters for simulation
    D_MODEL = 64
    CONSOLIDATION_FREQUENCY = 5

    # 1. Initialize Components
    deep_optimizer = DeepOptimizer()
    nl_ffn = NLTransformerFFN(D_MODEL)

    # Simulated Target Data (e.g., an encoded sequence)
    target_data = torch.randn(1, D_MODEL)

    print("--- Conceptual NL-Transformer Training Start ---")

    cumulative_loss_history = []

    for epoch in range(1, epochs + 1):
        avg_loss_for_meta = 0.0

        for step in range(inner_steps):
            # --- INNER LOOP (FAST TIME SCALE) ---

            # Simulated Input (sequence data)
            input_data = torch.randn(1, D_MODEL)

            # Forward Pass and Task Loss (Simulated)
            prediction = nl_ffn(input_data)
            task_loss = nn.MSELoss()(prediction, target_data)

            # Backpropagation for Gradients
            task_loss.backward(retain_graph=True) # Retain graph for simplicity

            # 1. FAST UPDATE (Standard Transformer Update)
            nl_ffn.fast_update(task_loss)

            avg_loss_for_meta += task_loss.item()

        # --- OUTER LOOP (SLOW TIME SCALE / CONSOLIDATION) ---

        avg_loss_for_meta /= inner_steps
        cumulative_loss_history.append(avg_loss_for_meta)

        if epoch % CONSOLIDATION_FREQUENCY == 0:

            # 2. DEEP OPTIMIZER UPDATE
            # The Meta-Optimizer learns from the sustained performance (avg_loss)
            learned_factor = deep_optimizer.update(torch.tensor(avg_loss_for_meta))

            # 3. SLOW UPDATE / CONSOLIDATION
            # The Slow Memory consolidates knowledge, guided by the learned factor
            nl_ffn.slow_update(learned_factor)

            print(f"\n--- Epoch {epoch} (CONSOLIDATION) ---")
            print(f"Avg Task Loss (Meta-Signal): {avg_loss_for_meta:.4f}")
            print(f"Learned Consolidation Factor: {learned_factor:.6f}")
            print(f"Fast Module Norm: {nl_ffn.fast_linear.weight.norm().item():.4f}")
            print(f"Slow Module Norm: {nl_ffn.slow_linear.weight.norm().item():.4f}")

    print("\n--- NL-Transformer Simulation Complete ---")
    print(f"Final Learned Consolidation Factor (Deep Optimizer's Rule): {deep_optimizer.get_factor():.6f}")

# Run the simulation
nl_transformer_demo()

--- Conceptual NL-Transformer Training Start ---

--- Epoch 5 (CONSOLIDATION) ---
Avg Task Loss (Meta-Signal): 1.6427
Learned Consolidation Factor: 0.956427
Fast Module Norm: 4.4180
Slow Module Norm: 4.6270

--- Epoch 10 (CONSOLIDATION) ---
Avg Task Loss (Meta-Signal): 1.5223
Learned Consolidation Factor: 0.961650
Fast Module Norm: 4.2486
Slow Module Norm: 4.6268

--- Epoch 15 (CONSOLIDATION) ---
Avg Task Loss (Meta-Signal): 1.6075
Learned Consolidation Factor: 0.967725
Fast Module Norm: 4.1114
Slow Module Norm: 4.6266

--- Epoch 20 (CONSOLIDATION) ---
Avg Task Loss (Meta-Signal): 1.5365
Learned Consolidation Factor: 0.973090
Fast Module Norm: 4.0008
Slow Module Norm: 4.6264

--- NL-Transformer Simulation Complete ---
Final Learned Consolidation Factor (Deep Optimizer's Rule): 0.973090
