In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os, numpy as np
import sys

class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable gain.

    Args:
        dim: Size of the last (feature) axis; one gain value is learned per feature.
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Return `x` scaled by the reciprocal RMS of its last axis and by `weight`."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms * self.weight

def apply_mr_rope(q, k):
    """Rotate `q` and `k` with a shared rotary position table.

    Positions are read from axis 1 of `q` and the table width from the last
    axis of `q`; the same cos/sin tables are applied to both inputs through
    `rotate_half`.

    Returns:
        Tuple `(q_out, k_out)` of rotated tensors.
    """
    seq_len, width = q.shape[1], q.shape[-1]
    # Rates fall geometrically from 1 to 1/10000 across width // 2 entries.
    log_rates = torch.linspace(0, -np.log(10000), width // 2)
    rates = torch.exp(log_rates).to(q.device)
    positions = torch.arange(seq_len, device=q.device).float()
    angles = torch.outer(positions, rates)
    # Tile the table twice along the feature axis so it spans both halves
    # that rotate_half swaps.
    cos_table = torch.cos(angles).repeat(1, 2)
    sin_table = torch.sin(angles).repeat(1, 2)

    def _rotate(t):
        return (t * cos_table) + (rotate_half(t) * sin_table)

    return _rotate(q), _rotate(k)

def rotate_half(x):
    """Split the last axis in two chunks and return `(-second, first)` concatenated."""
    first, second = torch.chunk(x, 2, dim=-1)
    return torch.cat([second.neg(), first], dim=-1)

class SovereignTransformer(nn.Module):
    """Experimental routed-expert language model with a liquid-style state update.

    Every layer pass normalises the hidden state, routes each token to exactly
    one expert, builds a gated context from the mean of the layer's key/value
    latents, moves the hidden state toward that context, and applies a SwiGLU
    feed-forward with a residual connection. The norm, router, latent and
    feed-forward modules are shared by all layer passes; only `tau` is per layer.

    Args:
        vocab_size: Number of token ids.
        emb: Hidden width.
        layers: Number of passes through the shared layer body.
        n_experts: Number of query/output expert pairs.
    """

    def __init__(self, vocab_size, emb=128, layers=3, n_experts=4):
        super().__init__()
        self.emb, self.layers, self.n_experts = emb, layers, n_experts
        self.kv_dim = emb // 4
        self.z_dim = int(8/3 * emb)
        self.n_buckets = 32
        self.lsh_proj = nn.Parameter(torch.randn(self.kv_dim, self.n_buckets))
        self.lru_gate = nn.Linear(emb, emb)
        self.embedding = nn.Embedding(vocab_size, emb)
        self.norm = RMSNorm(emb)
        self.kv_latent = nn.Linear(emb, self.kv_dim * 2)
        self.q_experts = nn.ModuleList([nn.Linear(emb, self.kv_dim) for _ in range(n_experts)])
        self.o_experts = nn.ModuleList([nn.Linear(self.kv_dim, emb) for _ in range(n_experts)])
        self.tau = nn.Parameter(torch.ones(layers, emb))
        self.dt = 0.1
        self.ambient_solidify = nn.Parameter(torch.ones(1))
        self.w_up = nn.Linear(emb, self.z_dim)
        self.w_gate = nn.Linear(emb, self.z_dim)
        self.w_down = nn.Linear(self.z_dim, emb)
        self.head_t1 = nn.Linear(emb, vocab_size, bias=False)
        self.head_t2 = nn.Linear(emb, vocab_size)
        self.pred_state = nn.Linear(emb, emb)
        self.router = nn.Linear(emb, n_experts)
        # Weight tying: the next-token head shares the embedding matrix.
        self.head_t1.weight = self.embedding.weight

    def hashed_mla(self, q, k, v):
        """Attention restricted to keys that fall in the same projection bucket.

        Queries and keys are assigned the argmax bucket of their projection
        through `lsh_proj`; scores between different buckets are masked out
        before the softmax. This helper is not called by `forward`.
        NOTE(review): no causal mask is applied here.
        """
        q_b = torch.argmax(q @ self.lsh_proj, dim=-1)
        k_b = torch.argmax(k @ self.lsh_proj, dim=-1)
        bucket_mask = (q_b.unsqueeze(-1) == k_b.unsqueeze(-2))
        attn = (q @ k.transpose(-2, -1)) / (self.kv_dim**0.5)
        attn = attn.masked_fill(~bucket_mask, -1e9)
        return F.softmax(attn, dim=-1) @ v

    def forward(self, x, lru_state=None, cache=None, tau=1.0):
        """Run the model on a batch of token ids.

        Args:
            x: Integer tensor of shape (B, T).
            lru_state: Optional (B, emb) state blended into the key/value input;
                zeros when omitted. It is read but never updated here.
            cache: Optional list with one `(k, v)` pair per layer; when given,
                the new latents are appended to it along the time axis.
            tau: Gumbel-softmax temperature used for routing in training mode.

        Returns:
            Tuple `(logits_t1, logits_t2, pred_state, h, new_cache, all_route_probs)`
            where `new_cache` holds detached `(k, v)` per layer and
            `all_route_probs` holds the one-hot routing tensor of every layer.
        """
        B, T = x.shape
        h = self.embedding(x)
        new_cache = []
        all_route_probs = []
        if lru_state is None:
            lru_state = torch.zeros(B, self.emb, device=x.device)
        # Loop-invariant: the recurrent state does not change between layers.
        lru_state_expanded = lru_state.unsqueeze(1).expand(-1, T, -1)
        for l in range(self.layers):
            h = self.norm(h)
            h_in = h
            logits = self.router(h)
            if self.training:
                # Straight-through sample: one-hot forward, soft gradient backward.
                route_soft = F.gumbel_softmax(logits, tau=tau, hard=True)
            else:
                # Deterministic routing, same shape as the training-mode sample.
                indices = logits.argmax(-1)
                route_soft = F.one_hot(indices, num_classes=self.n_experts).float()
            all_route_probs.append(route_soft)
            lambda_lru = torch.sigmoid(self.lru_gate(h))
            h_mem = lambda_lru * h + (1 - lambda_lru) * lru_state_expanded
            kv = self.kv_latent(h_mem)
            k, v = torch.chunk(kv, 2, dim=-1)
            if cache is not None:
                prev_k, prev_v = cache[l]
                k = torch.cat([prev_k, k], dim=1)
                v = torch.cat([prev_v, v], dim=1)
            new_cache.append((k.detach(), v.detach()))

            # Pooled latents do not depend on the expert, so compute them once.
            # NOTE(review): the mean runs over every time step (future ones
            # included) and the context is also averaged across the batch, so
            # this path is neither causal nor per-sequence.
            k_mean = k.mean(dim=1).view(B, -1)
            v_pooled = v.mean(dim=1).view(B, -1).mean(dim=0)

            h_flat = h.view(-1, self.emb)
            route_flat = route_soft.view(-1, self.n_experts)
            expert_accumulation = torch.zeros_like(h_flat)
            for e_idx in range(self.n_experts):
                gate_col = route_flat[:, e_idx]
                mask = gate_col.bool()
                if not mask.any():
                    continue
                q_e = self.q_experts[e_idx](h_flat[mask])
                attn_score = torch.sigmoid((q_e @ k_mean.T).mean(dim=-1, keepdim=True))
                ctx = attn_score * v_pooled
                # Multiply by the routing weight (1 in the forward pass) so the
                # straight-through gradient reaches the router; using the sample
                # only as a boolean mask leaves the router without any gradient.
                expert_out = self.o_experts[e_idx](ctx) * gate_col[mask].unsqueeze(-1)
                expert_accumulation[mask] = expert_out.to(h.dtype)
            combined_context = expert_accumulation.view(B, T, self.emb)

            # Euler step of dh/dt = (context - h) / exp(tau) with a learnable step scale.
            effective_dt = self.dt * torch.sigmoid(self.ambient_solidify)
            dh_dt = (-h + combined_context) / torch.exp(self.tau[l])
            h_liquid = h + dh_dt * effective_dt
            gate = self.w_gate(h_liquid)
            up = self.w_up(h_liquid)
            h_ff = F.silu(gate) * up
            h = h_in + self.w_down(h_ff)
        # Computed once after the last layer; also defined when layers == 0.
        logits_t1 = self.head_t1(h) / (self.emb ** 0.5)
        return logits_t1, self.head_t2(h), self.pred_state(h), h, new_cache, all_route_probs

    @torch.no_grad()
    def generate(self, prompt, tokenizer, max_new_tokens=20, temperature=0.7, top_k=4):
        """Sample a continuation of `prompt` and print it token by token.

        Args:
            prompt: Text to condition on.
            tokenizer: Object with `encode(str) -> list[int]` and `decode(list[int]) -> str`.
            max_new_tokens: Number of tokens to sample.
            temperature: Softmax temperature; clamped to a tiny positive value.
            top_k: Number of highest-scoring tokens kept for sampling; clamped
                to the range [1, vocabulary size].

        The model is switched to eval mode while sampling and its previous
        training/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            # Use the model's own device instead of a module-level global.
            model_device = next(self.parameters()).device
            tokens = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long, device=model_device)
            print("Sovereign: ", end="", flush=True)
            if tokens.shape[1] == 0:
                # Nothing to condition on; indexing the last position would fail.
                print()
                return
            temperature = max(float(temperature), 1e-8)
            for _ in range(max_new_tokens):
                # Re-run on the trailing window each step; the kv cache is not
                # reused, so no extra cached forward pass is needed.
                idx_cond = tokens[:, -128:]
                logits, _, _, _, _, _ = self.forward(idx_cond, cache=None)
                next_token_logits = logits[:, -1, :] / temperature
                k_eff = max(1, min(int(top_k), next_token_logits.size(-1)))
                v, _ = torch.topk(next_token_logits, k_eff)
                next_token_logits[next_token_logits < v[:, [-1]]] = -float('Inf')
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                word = tokenizer.decode(next_token[0].tolist())
                print(word, end="", flush=True)
                tokens = torch.cat((tokens, next_token), dim=1)
            print()
        finally:
            self.train(was_training)

class BPETokenizer:
    """Byte-level byte-pair-encoding tokenizer.

    Ids 0-255 are the raw bytes; every learned merge gets the next free id
    starting at 256.

    Args:
        vocab_size: Target vocabulary size; at most `vocab_size - 256` merges are learned.
    """

    def __init__(self, vocab_size=2048):
        self.vocab_size = vocab_size
        self.merges = {}  # (left_id, right_id) -> merged id
        self.vocab = {i: bytes([i]) for i in range(256)}  # id -> byte string

    @staticmethod
    def _merge(tokens, pair, idx):
        """Return `tokens` with each left-to-right, non-overlapping `pair` replaced by `idx`."""
        merged, j, last = [], 0, len(tokens) - 1
        while j < len(tokens):
            if j < last and (tokens[j], tokens[j + 1]) == pair:
                merged.append(idx)
                j += 2
            else:
                merged.append(tokens[j])
                j += 1
        return merged

    def train(self, text):
        """Learn merges from `text` and return its final token sequence.

        Any previously learned merges are discarded first: ids are assigned
        from 256 upward on every call, so keeping old entries would leave
        stale pairs pointing at overwritten vocabulary slots.
        """
        self.merges = {}
        self.vocab = {i: bytes([i]) for i in range(256)}
        tokens = list(text.encode("utf-8"))
        for i in range(self.vocab_size - 256):
            stats = {}
            for pair in zip(tokens, tokens[1:]):
                stats[pair] = stats.get(pair, 0) + 1
            if not stats:
                break  # fewer than two tokens left, nothing to merge
            # Most frequent pair; ties go to the pair seen first.
            pair = max(stats, key=stats.get)
            idx = 256 + i
            self.merges[pair] = idx
            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]]
            tokens = self._merge(tokens, pair, idx)
        return tokens

    def encode(self, text):
        """Encode `text` to ids by applying learned merges, earliest-learned first."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            known = [p for p in zip(tokens, tokens[1:]) if p in self.merges]
            if not known:
                break
            pair = min(known, key=self.merges.get)
            tokens = self._merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Decode ids to text; unknown ids become `<?>`, invalid UTF-8 is replaced."""
        return b"".join(self.vocab.get(i, b"<?>") for i in ids).decode("utf-8", errors="replace")

def get_batch(dataset, seq_length, batch_size):
    """Sample random training windows with next-token and next-next-token targets.

    Args:
        dataset: Sliceable sequence of token ids (list or 1-D array).
        seq_length: Number of tokens per window.
        batch_size: Number of windows to draw (with replacement).

    Returns:
        Tuple `(x, y1, y2)` of long tensors shaped (batch_size, seq_length),
        where `y1` is `x` shifted by one position and `y2` by two.

    Raises:
        ValueError: If `dataset` has fewer than `seq_length + 2` tokens.
    """
    # A window starting at s needs tokens up to s + seq_length + 1, so valid
    # starts are 0 .. len - seq_length - 2 inclusive.
    n_starts = len(dataset) - seq_length - 1
    if n_starts <= 0:
        raise ValueError(
            f"dataset has {len(dataset)} tokens; need at least {seq_length + 2} "
            f"for seq_length={seq_length}"
        )
    starts = torch.randint(n_starts, (batch_size,)).tolist()

    def _windows(offset):
        return torch.stack([
            torch.tensor(dataset[s + offset:s + offset + seq_length], dtype=torch.long)
            for s in starts
        ])

    return _windows(0), _windows(1), _windows(2)

def print_sovereign_report(step, loss, experts_usage):
    """Print a short training status report.

    Args:
        step: Current training step.
        loss: Scalar loss; its reciprocal is shown as "Convergence"
            (`inf` when the loss is exactly zero).
        experts_usage: Per-expert token counts; their standard deviation is
            shown as the balance figure (0 means perfectly even usage).
    """
    convergence = float('inf') if loss == 0 else 1 / loss
    print("\n--- SOVEREIGN PERFORMANCE REPORT ---")
    print(f"Step: {step} | Convergence: {convergence:.4f}")
    # The value is a standard deviation, so the label says so.
    print(f"Expert Balance (Std): {np.std(experts_usage):.2f}")
    print("Hardware Status: Stable (Ryzen 3 Optimized)")
    print("------------------------------------\n")

# --- Configuration ---
# NOTE(review): machine-specific absolute path; it can be overridden with the
# SOVEREIGN_CORPUS environment variable.
PATH = os.environ.get('SOVEREIGN_CORPUS', 'C:\\Users\\user\\Downloads\\BNCCorpus.txt')
VOCAB_SIZE, EMBED_DIM, LAYERS, N_EXPERTS = 2048, 128, 3, 4
SEQ_LEN, BATCH_SIZE, BASE_LR, MIN_LR = 64, 8, 5e-4, 5e-5
INIT_TAU, MIN_TAU, MAX_STEPS, WARMUP = 1.0, 0.1, 1001, 500

# --- Data: train the tokenizer on the first 100k characters and encode them ---
tokenizer = BPETokenizer(VOCAB_SIZE)
try:
    with open(PATH, 'r', encoding='utf-8') as f:
        text = f.read()[:100000].lower()
    tokenizer.train(text)
    data = np.array(tokenizer.encode(text))
except Exception as e:
    # Say why we are stopping instead of exiting silently.
    print(f"!! Could not prepare training data from {PATH}: {e!r}")
    sys.exit(1)

# --- Model, optimizer and (CUDA only) gradient scaler ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt = SovereignTransformer(VOCAB_SIZE, EMBED_DIM, LAYERS, N_EXPERTS).to(device)
optimizer = torch.optim.AdamW(gpt.parameters(), lr=BASE_LR, weight_decay=0.01)
scaler = torch.amp.GradScaler('cuda') if torch.cuda.is_available() else None

# --- Checkpoint management ---
checkpoint_path = "sovereign_core.pt"
print("\n[SOVEREIGN CORE MANAGEMENT]")
action = input("Command: [c]ontinue, [i]nterface/chat, or [d]elete? ").lower()

if action == 'd':
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)
        print("!!! CORE PURGED: Starting fresh. !!!")
    start_step = 0
    mode = 'train'

elif action == 'i':
    if os.path.exists(checkpoint_path):
        print("--- Loading Sovereign for Dialogue... ---")
        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load checkpoints written by this notebook.
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
        gpt.load_state_dict(checkpoint['model_state_dict'])
        gpt.eval()
        mode = 'chat'
    else:
        print("!! No Core found to chat with. Please train first. !!")
        sys.exit()

elif action == 'c':
    if os.path.exists(checkpoint_path):
        # Same loader settings as the chat branch: checkpoints written by
        # earlier runs may hold NumPy scalars in the optimizer state (the cosine
        # schedule produced np.float64 learning rates), which the restricted
        # weights-only loader may refuse.
        # NOTE(review): only load checkpoints written by this notebook.
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
        gpt.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # The saved step was already applied before the checkpoint was written,
        # so resume with the following one.
        start_step = checkpoint['step'] + 1
        mode = 'train'
        print(f"-> Resuming training from step {start_step}")
    else:
        print("? No checkpoint found. Starting fresh.")
        start_step = 0
        mode = 'train'
else:
    print("--- Initializing New Training ---")
    start_step = 0
    mode = 'train'

if mode == 'chat':
    print("\n" + "="*30)
    print(" SOVEREIGN INTERFACE ACTIVE ")
    print(" (Type 'exit' to close) ")
    print("="*30)
    while True:
        u = input("\nYou: ").strip()
        if u.lower() in ['exit', 'q']: break
        # Generate with a slightly lower temperature for logic
        gpt.generate(u, tokenizer, max_new_tokens=50, temperature=0.7)
elif mode == 'train':
    gpt.train()
    for i in range(start_step, MAX_STEPS):
        # Linear warmup followed by cosine decay from BASE_LR to MIN_LR.
        if i < WARMUP:
            curr_lr = BASE_LR * (i / WARMUP)
        else:
            progress = (i - WARMUP) / (MAX_STEPS - WARMUP)
            curr_lr = MIN_LR + 0.5 * (BASE_LR - MIN_LR) * (1 + np.cos(np.pi * progress))
        # Plain float keeps NumPy scalars out of the optimizer state (and so
        # out of saved checkpoints).
        curr_lr = float(curr_lr)
        # Routing temperature is annealed on every step, warmup included;
        # assigning it only after warmup left it undefined on the first steps.
        curr_tau = INIT_TAU - (INIT_TAU - MIN_TAU) * (i / MAX_STEPS)
        for param_group in optimizer.param_groups:
            param_group['lr'] = curr_lr
        xb, yb1, yb2 = get_batch(data, SEQ_LEN, BATCH_SIZE)
        xb, yb1, yb2 = xb.to(device), yb1.to(device), yb2.to(device)
        context = torch.amp.autocast(device_type='cuda', dtype=torch.float16) if torch.cuda.is_available() else torch.amp.autocast(device_type='cpu', dtype=torch.bfloat16)
        with context:
            l1, l2, ph, hf, _, routes = gpt(xb, tau=curr_tau)
            loss_t1 = F.cross_entropy(l1.view(-1, VOCAB_SIZE), yb1.view(-1))
            # NOTE(review): with MAX_STEPS = 1001 the last step is 1000, so this
            # branch never runs and the auxiliary heads are never trained.
            if i > 1000:
                loss_t2 = F.cross_entropy(l2[:, :-1].reshape(-1, VOCAB_SIZE), yb2[:, :-1].reshape(-1))
                loss_dream = F.mse_loss(ph[:, :-1], hf[:, 1:].detach())
                total_loss = loss_t1 + (0.1 * loss_t2) + (0.01 * loss_dream)
            else:
                total_loss = loss_t1
        optimizer.zero_grad()
        if scaler:
            scaler.scale(total_loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            total_loss.backward()
            optimizer.step()
        if i % 50 == 0:
            # Tokens routed to each expert in the last layer of this batch.
            usage = routes[-1].sum(dim=(0, 1)).cpu().detach().float().numpy()
            print_sovereign_report(i, total_loss.item(), usage)
        if i % 500 == 0 and i > start_step:
            # Write to a temporary file first so an interrupted save cannot
            # corrupt the existing checkpoint.
            temp_path = checkpoint_path + ".tmp"
            torch.save({
                'step': i,
                'model_state_dict': gpt.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': total_loss.item(),
            }, temp_path)
            os.replace(temp_path, checkpoint_path)
            print(f"--- Checkpoint Solidified at Step {i} ---")

In [None]:
import sys, os
import numpy as np

# --- 1. TOKENIZER (BPE) ---
class BPETokenizer:
    """Byte-level byte-pair-encoding tokenizer.

    Ids 0-255 are the raw bytes; every learned merge gets the next free id
    starting at 256.

    Args:
        vocab_size: Target vocabulary size; at most `vocab_size - 256` merges are learned.
    """

    def __init__(self, vocab_size=1024):
        self.vocab_size = vocab_size
        self.merges = {}  # (left_id, right_id) -> merged id
        self.vocab = {i: bytes([i]) for i in range(256)}  # id -> byte string

    @staticmethod
    def _merge(tokens, pair, idx):
        """Return `tokens` with each left-to-right, non-overlapping `pair` replaced by `idx`."""
        merged, j, last = [], 0, len(tokens) - 1
        while j < len(tokens):
            if j < last and (tokens[j], tokens[j + 1]) == pair:
                merged.append(idx)
                j += 2
            else:
                merged.append(tokens[j])
                j += 1
        return merged

    def train(self, text):
        """Learn merges from `text` and return its final token sequence.

        Any previously learned merges are discarded first: ids are assigned
        from 256 upward on every call, so keeping old entries would leave
        stale pairs pointing at overwritten vocabulary slots.
        """
        self.merges = {}
        self.vocab = {i: bytes([i]) for i in range(256)}
        tokens = list(text.encode("utf-8"))
        for i in range(self.vocab_size - 256):
            stats = {}
            for pair in zip(tokens, tokens[1:]):
                stats[pair] = stats.get(pair, 0) + 1
            if not stats:
                break  # fewer than two tokens left, nothing to merge
            # Most frequent pair; ties go to the pair seen first.
            pair = max(stats, key=stats.get)
            idx = 256 + i
            self.merges[pair] = idx
            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]]
            tokens = self._merge(tokens, pair, idx)
        return tokens

    def encode(self, text):
        """Encode `text` to ids by applying learned merges, earliest-learned first."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            known = [p for p in zip(tokens, tokens[1:]) if p in self.merges]
            if not known:
                break
            pair = min(known, key=self.merges.get)
            tokens = self._merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Decode ids to text; unknown ids become `<?>`, invalid UTF-8 is replaced."""
        return b"".join(self.vocab.get(i, b"<?>") for i in ids).decode("utf-8", errors="replace")

# --- 2. THE SOVEREIGN GPT CLASS ---
class SovereignGPT:
    """NumPy-only experimental language model with hand-written gradients.

    Weights live in the dict `self.W`; `forward` produces next-token logits,
    `backward` returns a dict of gradients with the same keys as `self.W`,
    and `update` applies an Adam-style step with weight decay.

    NOTE(review): `backward` is a hand-derived approximation, not an exact
    gradient of `forward` (see the notes inside it).

    Args:
        v_size: Vocabulary size.
        emb: Hidden width.
        layers: Number of layers.
        b_size: Context length used by `get_batch` and `generate`.
        n_experts: Number of expert weight sets per layer.
        cycles: Maximum number of recurrent passes per layer; `forward`
            expects at least 1.
    """

    def __init__(self, v_size, emb, layers, b_size, n_experts, cycles):
        self.v_size, self.emb, self.layers, self.b_size = v_size, emb, layers, b_size
        self.n_experts = n_experts
        self.dt = 0.5  # step size of the liquid update in forward()
        self.cycles = cycles

        self.W = {}
        self.init_weights()
        # Adam first/second-moment buffers, one per weight tensor.
        self.m = {k: np.zeros_like(v, dtype=np.float32) for k, v in self.W.items()}
        self.v = {k: np.zeros_like(v, dtype=np.float32) for k, v in self.W.items()}
        self.t = 0  # number of update() calls so far

    def init_weights(self):
        """Fill `self.W` with randomly initialised weights."""
        # Embedding
        self.W['E'] = np.random.randn(self.v_size, self.emb) * np.sqrt(1/self.emb)
        # Router (projection to experts)
        self.W['router'] = np.random.randn(self.emb, self.n_experts) * 0.02

        hidden = int(8/3 * self.emb)

        for l in range(self.layers):
            self.W[f'W_kv_up_l{l}'] = np.random.randn(self.emb, self.emb * 2) * np.sqrt(1/self.emb)
            for e in range(self.n_experts):
                self.W[f'Wq_e{e}_l{l}'] = np.random.randn(self.emb, self.emb) * 0.02
                self.W[f'Wo_e{e}_l{l}'] = np.random.randn(self.emb, self.emb) * 0.02
                self.W[f'tau_e{e}_l{l}'] = np.exp(np.random.uniform(-1, 1, (1, self.emb)))

            # SwiGLU feed-forward weights
            self.W[f'W_gate_l{l}'] = np.random.randn(self.emb, hidden) * np.sqrt(2/self.emb)
            self.W[f'W_up_l{l}'] = np.random.randn(self.emb, hidden) * np.sqrt(2/self.emb)
            self.W[f'W_down_l{l}'] = np.random.randn(hidden, self.emb) * np.sqrt(2/hidden)

        # Shared heads are created once, outside the layer loop, so they also
        # exist when layers == 0 and are not re-drawn for every layer.
        # 1. Predictive state head (predicts the next hidden state vector)
        self.W['W_pred_state'] = np.random.randn(self.emb, self.emb) * 0.02
        # 2. Multi-token heads (next token and the token after it)
        self.W['W_head_t1'] = np.random.randn(self.emb, self.v_size) * 0.02
        self.W['W_head_t2'] = np.random.randn(self.emb, self.v_size) * 0.02
        # 3. Recurrence confidence gate (decides when to stop cycling)
        self.W['W_recur_gate'] = np.random.randn(self.emb, 1) * 0.02

    def rms_norm(self, x):
        """Divide `x` by the root mean square of its last axis (no learned gain)."""
        return x * (np.mean(x**2, axis=-1, keepdims=True) + 1e-6)**-0.5

    def softmax(self, x):
        """Numerically stable softmax over the last axis."""
        x_max = np.max(x, axis=-1, keepdims=True)
        e_x = np.exp(x - x_max)
        return e_x / (e_x.sum(axis=-1, keepdims=True) + 1e-10)

    def get_batch(self, data, batch_size):
        """Draw `batch_size` random windows of length `b_size` and their next-token targets."""
        ix = np.random.randint(0, len(data) - self.b_size, batch_size)
        x = np.array([data[i:i+self.b_size] for i in ix])
        y = np.array([data[i+1:i+self.b_size+1] for i in ix])
        return x, y

    def forward(self, idx, return_cache=False):
        """Compute next-token logits for a (B, T) array of token ids.

        Args:
            idx: Integer array of shape (B, T).
            return_cache: When true, also return the dict of intermediate
                activations that `backward` consumes.

        Returns:
            `logits` of shape (B, T, v_size), or `(logits, cache)`.
        """
        B, T = idx.shape
        h = self.rms_norm(self.W['E'][idx])
        cache = {'h_0': h.copy()}

        # One causal mask shared by every sequence in the batch.
        causal_mask = np.triu(np.ones((T, T)), k=1) * -1e9

        for l in range(self.layers):
            for _ in range(self.cycles):
                h_in = h.copy()
                h_flat = h.reshape(-1, self.emb)  # (B*T, emb)

                # Routing decision; it is recorded for backward, while the
                # forward math below always uses expert 0's weights.
                route_logits = h_flat @ self.W['router']
                expert_idx = np.argmax(route_logits, axis=-1)
                cache[f'expert_idx_l{l}'] = expert_idx

                # 1. Keys/values for the whole batch
                kv = h @ self.W[f'W_kv_up_l{l}']
                k, v = np.split(kv, 2, axis=-1)  # each (B, T, emb)

                # 2. Queries
                q = (h_flat @ self.W[f'Wq_e0_l{l}']).reshape(B, T, self.emb)

                # 3. Causal attention over the batch
                attn = (q @ k.transpose(0, 2, 1)) / np.sqrt(self.emb)
                attn += causal_mask
                sm = self.softmax(attn)
                context = (sm @ v) @ self.W[f'Wo_e0_l{l}']  # (B, T, emb)
                context_flat = context.reshape(-1, self.emb)

                # 4. Liquid update: Euler step of dh/dt = (context - h) / tau
                h_liquid = h_flat + ((-h_flat + context_flat) / self.W[f'tau_e0_l{l}']) * self.dt

                # 5. SwiGLU feed-forward with a tanh-squashed residual
                g = h_flat @ self.W[f'W_gate_l{l}']
                gate = g * (1 / (1 + np.exp(-g)))
                z = h_liquid @ self.W[f'W_up_l{l}']
                h_filtered = (z * gate) @ self.W[f'W_down_l{l}']
                h = np.tanh(h_in + h_filtered.reshape(B, T, self.emb))

                # Stop cycling early once the mean gate output is high enough.
                confidence = 1 / (1 + np.exp(-(h @ self.W['W_recur_gate'])))
                if np.mean(confidence) > 0.8: break

            # Per-layer activations of the last executed cycle, for backward.
            cache[f'h_{l+1}'] = h.copy()
            cache[f'context_l{l}'] = context_flat
            cache[f'h_liq_l{l}'] = h_liquid
            cache[f'gate_l{l}'] = gate
            cache[f'z_l{l}'] = z

        # Heads are applied once to the final hidden state (previously they
        # were recomputed after every layer and undefined for layers == 0).
        logits = h @ self.W['W_head_t1']        # next token
        logits_t2 = h @ self.W['W_head_t2']     # token after next
        pred_h_next = h @ self.W['W_pred_state']  # predicted next hidden state

        cache['pred_h_next'] = pred_h_next
        cache['logits_t2'] = logits_t2

        if return_cache: return logits, cache
        return logits

    def backward(self, targets, logits, cache):
        """Compute (approximate) gradients for every weight in `self.W`.

        Args:
            targets: Integer array (B, T) of next-token ids.
            logits: Output of `forward` for the same batch, shape (B, T, V).
            cache: Cache dict returned by `forward(..., return_cache=True)`.

        Returns:
            Dict with the same keys and shapes as `self.W`.
        """
        B, T, V = logits.shape
        probs = self.softmax(logits)

        # Focal weighting: higher gamma puts more weight on hard tokens.
        gamma = 2.0
        target_indices = np.arange(B*T)
        flat_targets = targets.flatten()

        # Probability assigned to the correct token.
        pt = probs.reshape(-1, V)[target_indices, flat_targets]
        f_weight = (1 - pt) ** gamma

        d_logits = probs.copy()
        # d_logits is contiguous, so this reshape is a view and the update lands.
        d_logits.reshape(-1, V)[target_indices, flat_targets] -= 1
        # One weight per (batch, position); a (B*T, 1) shape does not
        # broadcast against (B, T, V) when B > 1.
        d_logits *= f_weight.reshape(B, T, 1)
        d_logits /= (B * T)

        grads = {k: np.zeros_like(v) for k, v in self.W.items()}

        h_final = cache[f'h_{self.layers}']
        # (B*T, emb) rows in the same order as the flattened error signals.
        # transpose(0, 2, 1).reshape(emb, -1) mixes the batch and feature axes
        # when B > 1, so the flat view is transposed instead.
        h_final_flat = h_final.reshape(-1, self.emb)

        # NOTE(review): this treats E like an output projection although the
        # head is not tied to the embedding; kept as in the original design.
        grads['E'] = (d_logits.transpose(0, 2, 1) @ h_final).sum(0)

        # T+2 head: the target for position t is targets[:, t+1]; the last
        # position has no target and contributes no gradient.
        probs_t2 = self.softmax(cache['logits_t2'])
        d_logits_t2 = probs_t2.copy()
        # Index directly: subtracting through `[:, :-1].reshape(...)` writes
        # into a temporary copy when B > 1 and is silently lost.
        b_ix = np.arange(B)[:, None]
        t_ix = np.arange(T - 1)[None, :]
        d_logits_t2[b_ix, t_ix, targets[:, 1:]] -= 1
        d_logits_t2[:, -1] = 0
        d_logits_t2 /= (B * T)

        # Multi-token head gradients
        grads['W_head_t1'] = h_final_flat.T @ d_logits.reshape(-1, V)
        grads['W_head_t2'] = h_final_flat.T @ d_logits_t2.reshape(-1, V)

        # Predictive state: error = predicted h at t minus actual h at t+1.
        d_pred_h = np.zeros_like(h_final)
        d_pred_h[:, :-1] = cache['pred_h_next'][:, :-1] - h_final[:, 1:]
        grads['W_pred_state'] = h_final_flat.T @ d_pred_h.reshape(-1, self.emb)

        # Error flowing back into the layers is the sum over all heads.
        dh = (d_logits @ self.W['W_head_t1'].T) + (d_logits_t2 @ self.W['W_head_t2'].T) + (d_pred_h @ self.W['W_pred_state'].T)

        # Hidden expansion size (must match init_weights).
        z_tube_dim = int(8/3 * self.emb)

        for l in reversed(range(self.layers)):
            h_curr, h_prev = cache[f'h_{l+1}'], cache[f'h_{l}']
            h_prev_flat = h_prev.reshape(-1, self.emb)
            dh_flat = dh.reshape(-1, self.emb)
            expert_idx = cache[f'expert_idx_l{l}']

            # Backprop through tanh
            dtanh = dh_flat * (1 - h_curr.reshape(-1, self.emb)**2)

            # 1. Down-projection of the SwiGLU block
            z_flat = cache[f'z_l{l}'].reshape(-1, z_tube_dim)
            gate_flat = cache[f'gate_l{l}'].reshape(-1, z_tube_dim)
            grads[f'W_down_l{l}'] = (z_flat * gate_flat).T @ dtanh

            # Error signal flowing back into the two branches
            delementwise = dtanh @ self.W[f'W_down_l{l}'].T

            # 2. Split into the up-projection and gate branches
            dz_filtered = delementwise * gate_flat
            d_gate = delementwise * z_flat

            # Up-projection of the liquid state
            h_liq_flat = cache[f'h_liq_l{l}'].reshape(-1, self.emb)
            grads[f'W_up_l{l}'] = h_liq_flat.T @ dz_filtered

            # Gate branch (SiLU derivative).
            # NOTE(review): uses the layer input h_prev; with cycles > 1 the
            # gate in forward saw the previous cycle's output instead.
            x_g = h_prev_flat @ self.W[f'W_gate_l{l}']
            sig = 1 / (1 + np.exp(-x_g))
            dx_g = d_gate * (sig * (1 + x_g * (1 - sig)))
            grads[f'W_gate_l{l}'] = h_prev_flat.T @ dx_g

            # 3. Router gradient (heuristic, small scale)
            d_route = dtanh @ self.W['router']
            grads['router'] += h_prev_flat.T @ d_route * 1e-4

            # 4. Error signal for the liquid state (back through W_up)
            dh_liquid = dz_filtered @ self.W[f'W_up_l{l}'].T

            # NOTE(review): heuristic per-expert updates; forward only uses
            # expert 0's weights.
            ctx = cache[f'context_l{l}']
            for e in range(self.n_experts):
                mask = (expert_idx == e)

                # Rows of the tokens routed to this expert
                d_target = dh_liquid[mask]
                ctx_selected = ctx[mask]
                h_prev_selected = h_prev_flat[mask]

                grads[f'Wo_e{e}_l{l}'] += ctx_selected.T @ d_target
                grads[f'Wq_e{e}_l{l}'] += h_prev_selected.T @ d_target

            # Shared key/value projection (heuristic)
            dh_reshaped = dh_liquid.reshape(B, T, self.emb)
            dh_up = np.concatenate([dh_reshaped, dh_reshaped], axis=-1)
            grads[f'W_kv_up_l{l}'] = (h_prev.transpose(0, 2, 1) @ dh_up).sum(0)

            # Error for the layer below: residual path plus the key projection.
            dh = dh_reshaped + (dh_reshaped @ self.W[f'W_kv_up_l{l}'][:, :self.emb].T)

            grads['W_recur_gate'] += (h_final_flat.T @ dh_flat).sum(axis=1, keepdims=True) * 0.01

        return grads

    def update(self, grads, lr_max, warmup, wd):
        """Apply one Adam-style step with linear warmup and weight decay.

        Args:
            grads: Dict of gradients keyed like `self.W`.
            lr_max: Learning rate reached after `warmup` calls.
            warmup: Number of calls over which the rate ramps up linearly.
            wd: Weight-decay coefficient, applied to every tensor in `self.W`.
        """
        self.t += 1
        lr = lr_max * min(1.0, self.t / warmup)
        for k in self.W:
            g = np.clip(grads[k], -1.0, 1.0)  # element-wise gradient clipping

            # Adam moment estimates with bias correction
            self.m[k] = 0.9 * self.m[k] + 0.1 * g
            self.v[k] = 0.999 * self.v[k] + 0.001 * (g**2)

            m_hat = self.m[k] / (1 - 0.9**self.t)
            v_hat = self.v[k] / (1 - 0.999**self.t)

            # Update with weight decay
            update_val = lr * (m_hat / (np.sqrt(v_hat) + 1e-8) + wd * self.W[k])
            self.W[k] -= update_val

    def generate(self, prompt, tok, length=50, tmp=0.7, k=40, p=0.9):
        """Sample `length` tokens after `prompt` and print them as they are drawn.

        Args:
            prompt: Conditioning text (lower-cased before encoding).
            tok: Tokenizer with `encode` and `decode`.
            length: Number of tokens to sample.
            tmp: Temperature.
            k: Top-k cutoff (0 disables it; values above the vocabulary size
                are clamped).
            p: Nucleus (top-p) threshold; 1.0 disables it.
        """
        ids = tok.encode(prompt.lower())
        if not ids:
            # Nothing to condition on; forward needs at least one token.
            return
        for _ in range(length):
            context = np.array(ids[-self.b_size:]).reshape(1, -1)
            logits = self.forward(context)[0, -1, :].astype(np.float32)

            # 1. Repetition penalty on recently used tokens
            for prev_id in set(ids[-15:]):
                logits[prev_id] -= 2.0

            # 2. Temperature scaling
            logits /= (tmp + 1e-10)

            # 3. Top-k filter
            if k > 0:
                k_eff = min(k, logits.size)
                threshold = np.partition(logits, -k_eff)[-k_eff]
                logits[logits < threshold] = -1e4

            probs = np.exp(logits - np.max(logits))
            probs /= (probs.sum() + 1e-10)

            # 4. Top-p filter: keep the smallest prefix whose mass exceeds p
            if p < 1.0:
                sorted_indices = np.argsort(probs)[::-1]
                sorted_probs = probs[sorted_indices]
                cumulative_probs = np.cumsum(sorted_probs)
                to_remove = cumulative_probs > p
                # Shift by one so the token that crosses the threshold is kept.
                to_remove[1:] = to_remove[:-1].copy()
                to_remove[0] = False
                probs[sorted_indices[to_remove]] = 0
                probs /= (probs.sum() + 1e-10)

            next_id = int(np.random.choice(len(probs), p=probs))
            ids.append(next_id)
            print(tok.decode([next_id]), end="", flush=True)

# --- 3. EXECUTION ---
# NOTE(review): machine-specific absolute path.
path = 'C:\\Users\\user\\Downloads\\BNCCorpus.txt'
tokenizer = BPETokenizer(1024)
try:
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()[:100000]
except OSError as err:
    # Only a failure to open/read the file is reported as missing data; errors
    # in tokenizer training surface with their own traceback.
    print(f"Data not found. ({err})"); sys.exit()
tokenizer.train(text)
data = np.array(tokenizer.encode(text))

# Init model with Old-style config
gpt = SovereignGPT(v_size=1024, emb=384, layers=3, b_size=128, n_experts=1, cycles=3)

print("Training Sovereign System (Old Hierarchy + New Logic)...")
for i in range(301):
    xb, yb = gpt.get_batch(data, batch_size=4)
    logits, cache = gpt.forward(xb, return_cache=True)
    grads = gpt.backward(yb, logits, cache)

    # Clip each gradient tensor to unit L2 norm (per tensor, not a global norm)
    # so a single exploding tensor cannot dominate the step.
    for k in grads:
        gnorm = np.linalg.norm(grads[k])
        if gnorm > 1.0:
            grads[k] = grads[k] * (1.0 / gnorm)

    # Cosine-decayed peak learning rate; update() adds its own linear warmup.
    lr = 0.001 * (0.5 * (1 + np.cos(np.pi * i / 1000)))
    gpt.update(grads, lr, warmup=100, wd=0.1)

    if i % 25 == 0:
        # Mean cross-entropy of the batch, from the logits computed before the update.
        flat_targets = yb.reshape(-1)
        token_probs = gpt.softmax(logits.reshape(-1, gpt.v_size))
        loss = -np.mean(np.log(token_probs[np.arange(flat_targets.size), flat_targets] + 1e-9))
        print(f"\nStep {i} | Loss: {loss:.4f}")
        gpt.generate("The ", tokenizer, length=7)
        print("\n" + "-"*30)

print("\n--- Sovereign Chat ---")
while True:
    u = input("\nSovereign> ").strip()
    if u.lower() in ['q', 'exit']: break
    if not u:
        # An empty prompt gives the model nothing to condition on.
        continue
    gpt.generate(u, tokenizer)

In [None]:
import sys, os
import numpy as np

# --- TOKENIZER (BPE) ---
class BPETokenizer:
    """Byte-level byte-pair-encoding tokenizer.

    Ids 0-255 are the raw bytes; every learned merge gets the next free id
    starting at 256.

    Args:
        vocab_size: Target vocabulary size; at most `vocab_size - 256` merges are learned.
    """

    def __init__(self, vocab_size=512):
        self.vocab_size = vocab_size
        self.merges = {}  # (left_id, right_id) -> merged id
        self.vocab = {i: bytes([i]) for i in range(256)}  # id -> byte string

    @staticmethod
    def _merge(tokens, pair, idx):
        """Return `tokens` with each left-to-right, non-overlapping `pair` replaced by `idx`."""
        merged, j, last = [], 0, len(tokens) - 1
        while j < len(tokens):
            if j < last and (tokens[j], tokens[j + 1]) == pair:
                merged.append(idx)
                j += 2
            else:
                merged.append(tokens[j])
                j += 1
        return merged

    def train(self, text):
        """Learn merges from `text` and return its final token sequence.

        Any previously learned merges are discarded first: ids are assigned
        from 256 upward on every call, so keeping old entries would leave
        stale pairs pointing at overwritten vocabulary slots.
        """
        self.merges = {}
        self.vocab = {i: bytes([i]) for i in range(256)}
        tokens = list(text.encode("utf-8"))
        for i in range(self.vocab_size - 256):
            stats = {}
            for pair in zip(tokens, tokens[1:]):
                stats[pair] = stats.get(pair, 0) + 1
            if not stats:
                break  # fewer than two tokens left, nothing to merge
            # Most frequent pair; ties go to the pair seen first.
            pair = max(stats, key=stats.get)
            idx = 256 + i
            self.merges[pair] = idx
            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]]
            tokens = self._merge(tokens, pair, idx)
        return tokens

    def encode(self, text):
        """Encode `text` to ids by applying learned merges, earliest-learned first."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            known = [p for p in zip(tokens, tokens[1:]) if p in self.merges]
            if not known:
                break
            pair = min(known, key=self.merges.get)
            tokens = self._merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Decode ids to text; raises KeyError for an unknown id, invalid UTF-8 is replaced."""
        return b"".join(self.vocab[idx] for idx in ids).decode("utf-8", errors="replace")

# --- MODEL ---
class GPT:
    """NumPy decoder-only transformer with a hand-written backward pass and AdamW-style updates.

    Each layer is: RMSNorm -> grouped-query attention with rotary position
    embeddings -> residual add -> RMSNorm -> SwiGLU MLP -> residual add.
    The output projection reuses the embedding matrix ``W['E']`` (tied weights).
    Parameters are stored as float16; softmax, norms and optimizer state are
    computed in float32.
    """

    def __init__(self, v_size, emb, h, groups, layers, b_size):
        """Allocate parameters, RoPE tables and optimizer state.

        Args:
            v_size: Vocabulary size.
            emb: Embedding / residual-stream width.
            h: Number of query heads (``head_dim = emb // h``).
            groups: Number of key/value heads; each serves ``h // groups`` query heads.
            layers: Number of transformer layers.
            b_size: Maximum sequence length covered by the RoPE tables.
        """
        self.v_size, self.emb, self.h, self.layers, self.b_size = v_size, emb, h, layers, b_size
        self.groups, self.head_dim = groups, emb // h
        self.heads_per_group = h // groups

        # Strict upper triangle: mask[i, j] is True when key j lies after query i.
        self.mask = np.triu(np.ones((b_size, b_size)), 1).astype(bool)
        # Rotary tables, one row per position and one column per feature pair.
        inv = 1.0 / (10000 ** (np.arange(0, self.head_dim, 2) / self.head_dim))
        freqs = np.outer(np.arange(b_size), inv)
        self.cos_pre, self.sin_pre = np.cos(freqs).astype(np.float16), np.sin(freqs).astype(np.float16)

        self.W, self.kv_cache = {}, [None] * layers
        self.init_w()
        # Adam step counter and first/second moment estimates.
        self.t = 0
        self.m = {k: np.zeros_like(v) for k, v in self.W.items()}
        self.v = {k: np.zeros_like(v) for k, v in self.W.items()}

    def init_w(self):
        """Fill ``self.W`` with freshly initialised float16 parameters.

        Keys: ``E`` (embedding, also the output head) and, per layer ``l``,
        ``g1{l}``/``g2{l}`` (RMSNorm gains), ``Wq{l}``/``Wk{l}``/``Wv{l}``/``Wo{l}``
        (attention) and ``Wf1{l}`` (gate), ``Wf3{l}`` (up), ``Wf2{l}`` (down).
        """
        self.W['E'] = (np.random.randn(self.v_size, self.emb) * 0.02).astype(np.float16)
        for l in range(self.layers):
            self.W[f'g1{l}'] = np.ones((1,1,self.emb), dtype=np.float16)
            self.W[f'g2{l}'] = np.ones((1,1,self.emb), dtype=np.float16)

            std = np.sqrt(2.0 / (2 * self.emb))
            self.W[f'Wq{l}'] = (np.random.randn(self.emb, self.emb) * std).astype(np.float16)
            self.W[f'Wk{l}'] = (np.random.randn(self.emb, self.groups * self.head_dim) * std).astype(np.float16)
            self.W[f'Wv{l}'] = (np.random.randn(self.emb, self.groups * self.head_dim) * std).astype(np.float16)
            self.W[f'Wo{l}'] = (np.random.randn(self.emb, self.emb) * std).astype(np.float16)

            hidden = int(8/3 * self.emb)
            self.W[f'Wf1{l}'] = (np.random.randn(self.emb, hidden) * np.sqrt(2/self.emb)).astype(np.float16)
            self.W[f'Wf2{l}'] = (np.random.randn(hidden, self.emb) * np.sqrt(2/hidden)).astype(np.float16)
            self.W[f'Wf3{l}'] = (np.random.randn(self.emb, hidden) * np.sqrt(2/self.emb)).astype(np.float16)

    def rms_norm(self, x, g, out=None):
        """Scale `x` to unit root-mean-square along the last axis and multiply by gain `g`.

        The statistics are computed in float32. The result is written into
        `out` (allocated like `x` when omitted) and a float16 copy is returned.
        """
        if out is None: out = np.empty_like(x)
        msq = np.mean(x.astype(np.float32)**2, axis=-1, keepdims=True)
        out[:] = (g * (x.astype(np.float32) * (msq + 1e-6)**-0.5)).astype(np.float16)
        return out.astype(np.float16)

    def softmax(self, x):
        """Softmax over the last axis, computed in float32 and returned as float16."""
        x_32 = x.astype(np.float32)
        e_x = np.exp(x_32 - np.max(x_32, axis=-1, keepdims=True))
        out = e_x / (e_x.sum(axis=-1, keepdims=True) + 1e-10)
        return out.astype(np.float16)

    def apply_rope(self, x, pos_idx, rev=False):
        """Rotate adjacent feature pairs of `x` by the angles stored for `pos_idx`.

        Args:
            x: Array whose last axis is ``head_dim`` and whose second-to-last
                axis has one entry per position in `pos_idx`.
            pos_idx: Absolute positions; each must be below ``b_size``.
            rev: Rotate by the negated angle (the inverse rotation, used to
                carry gradients back through RoPE).
        """
        c, s = self.cos_pre[pos_idx], self.sin_pre[pos_idx]
        if rev: s = -s
        x1, x2 = x[..., 0::2], x[..., 1::2]
        out = np.empty_like(x)
        out[..., 0::2] = x1 * c - x2 * s
        out[..., 1::2] = x1 * s + x2 * c
        return out

    def get_batch(self, data, batch_size):
        """Sample `batch_size` random windows of ``b_size`` tokens from `data`.

        Returns:
            ``(x, y)`` int32 arrays of shape ``(batch_size, b_size)`` where ``y``
            is ``x`` shifted one token ahead. ``len(data)`` must exceed ``b_size``.
        """
        ix = np.random.randint(0, len(data) - self.b_size, batch_size)
        x = np.empty((batch_size, self.b_size), dtype=np.int32)
        y = np.empty((batch_size, self.b_size), dtype=np.int32)
        for i, idx in enumerate(ix):
            x[i] = data[idx : idx + self.b_size]
            y[i] = data[idx + 1 : idx + self.b_size + 1]
        return x, y

    def forward(self, idx, start_pos=0, use_cache=False):
        """Run the model on token ids and return logits plus saved activations.

        Args:
            idx: Integer array of shape ``(B, T)``.
            start_pos: Absolute position of the first token in `idx`.
            use_cache: When True, keys/values are appended to ``self.kv_cache``
                (reset whenever ``start_pos == 0``) so that later calls only
                need to pass the new tokens.

        Returns:
            ``(logits, cache)``: logits of shape ``(B, T, v_size)`` and a dict
            of intermediate activations in the layout `backward` expects.
        """
        B, T = idx.shape

        # Token ids -> embedding vectors: (B, T) -> (B, T, emb)
        x = self.W['E'][idx].astype(np.float16)

        pos_idx = np.arange(start_pos, start_pos + T)
        cache = {'idx': idx, 'x_0': x.copy()}

        for l in range(self.layers):
            ln1 = self.rms_norm(x, self.W[f'g1{l}'])

            # Project the normalised input to queries, keys and values.
            # Q:   (B, T, emb) -> (B, h, T, head_dim)
            # K/V: (B, T, emb) -> (B, groups, T, head_dim)
            q = (ln1 @ self.W[f'Wq{l}']).reshape(B, T, self.h, self.head_dim).transpose(0, 2, 1, 3).astype(np.float16)
            k = (ln1 @ self.W[f'Wk{l}']).reshape(B, T, self.groups, self.head_dim).transpose(0, 2, 1, 3).astype(np.float16)
            v = (ln1 @ self.W[f'Wv{l}']).reshape(B, T, self.groups, self.head_dim).transpose(0, 2, 1, 3).astype(np.float16)

            q, k = self.apply_rope(q, pos_idx), self.apply_rope(k, pos_idx)

            if use_cache:
                if self.kv_cache[l] is None or start_pos == 0: self.kv_cache[l] = (k, v)
                else:
                    k = np.concatenate([self.kv_cache[l][0], k], axis=2)
                    v = np.concatenate([self.kv_cache[l][1], v], axis=2)
                    self.kv_cache[l] = (k, v)

            # Repeat each K/V head for the query heads in its group (GQA):
            # (B, groups, T_k, head_dim) -> (B, h, T_k, head_dim)
            k_rep = np.repeat(k, self.heads_per_group, axis=1)
            v_rep = np.repeat(v, self.heads_per_group, axis=1)

            # Scaled dot-product scores: (B, h, T, T_k)
            sc = (q @ k_rep.transpose(0, 1, 3, 2)) / np.sqrt(self.head_dim)

            # Causal mask: a query may only see keys at or before its own
            # absolute position. It is applied on the cached path too, so a
            # multi-token prompt prefill stays causal; for a single new token
            # nothing is masked.
            n_keys = k.shape[2]
            key_pos = np.arange(start_pos + T - n_keys, start_pos + T)
            sc[:, :, key_pos[None, :] > pos_idx[:, None]] = -65500

            # Softmax over keys, computed in float32.
            sc_32 = sc.astype(np.float32)
            e_x = np.exp(sc_32 - np.max(sc_32, axis=-1, keepdims=True))
            attn = (e_x / (e_x.sum(axis=-1, keepdims=True) + 1e-10)).astype(np.float16)

            # Weighted sum of values, heads merged back: (B, T, emb)
            out_att = (attn @ v_rep).transpose(0, 2, 1, 3).reshape(B, T, self.emb)

            # Output projection added onto the residual stream.
            x += out_att @ self.W[f'Wo{l}']

            ln2 = self.rms_norm(x, self.W[f'g2{l}'])

            # SwiGLU MLP: down( swish(gate) * up )
            gate = ln2 @ self.W[f'Wf1{l}']
            up = ln2 @ self.W[f'Wf3{l}']

            gate_32 = gate.astype(np.float32)
            act_gate = (gate_32 / (1.0 + np.exp(-gate_32))).astype(np.float16)

            x += (act_gate * up) @ self.W[f'Wf2{l}']
            cache[f'l{l}'] = (ln1, q, k, v, attn, out_att, ln2, gate, up, act_gate)

        cache['x_final'] = x
        # Tied output head: logits = x @ E^T
        return x @ self.W['E'].T, cache

    def backward(self, yb, logits, cache):
        """Back-propagate mean cross-entropy through an uncached forward pass.

        Args:
            yb: Target ids, shape ``(B, T)``.
            logits: Logits returned by ``forward`` for the matching inputs.
            cache: Activation dict returned by the same ``forward`` call
                (made with ``start_pos=0`` and ``use_cache=False``).

        Returns:
            Dict of float32 gradients keyed like ``self.W``.

        NOTE(review): gradients pass through both RMSNorm layers unchanged (the
        norm Jacobian is not applied) and the gains ``g1*``/``g2*`` never
        receive a gradient, so the result is an approximation.
        """
        B, T = yb.shape
        grads = {k: np.zeros_like(v, dtype=np.float32) for k, v in self.W.items()}
        logits_32 = logits.astype(np.float32)
        pp_sum = np.exp(logits_32 - np.max(logits_32, axis=-1, keepdims=True))
        probs = pp_sum / (pp_sum.sum(axis=-1, keepdims=True) + 1e-10)
        probs[np.arange(B)[:, None], np.arange(T), yb] -= 1

        # d(loss)/d(logits) = (softmax - one_hot) / (B * T), shape (B, T, v_size)
        dl = (probs / (B * T)).astype(np.float16)

        # Output-head part of the tied embedding gradient: (v_size, emb)
        grads['E'] += (dl.transpose(0, 2, 1) @ cache['x_final']).sum(0)

        # Gradient entering the residual stream: (B, T, emb)
        dx = (dl @ self.W['E']).astype(np.float16)
        pos_idx = np.arange(T)
        kv_width = self.groups * self.head_dim

        for l in reversed(range(self.layers)):
            ln1, q, k, v, attn, out_att, ln2, gate, up, act_gate = cache[f'l{l}']

            # --- SwiGLU MLP ---
            # Gradient w.r.t. the MLP hidden product: (B, T, hidden)
            df2 = dx @ self.W[f'Wf2{l}'].T

            # NOTE(review): the forward pass feeds (act_gate * up) into Wf2,
            # but only act_gate is used as the input here — confirm intent.
            grads[f'Wf2{l}'] += act_gate.reshape(-1, act_gate.shape[-1]).T @ dx.reshape(-1, self.emb)

            # Derivative of swish(g) = g * sigmoid(g) is sig * (1 + g * (1 - sig)).
            sig = 1 / (1 + np.exp(-gate.astype(np.float32)))
            dgate32 = (df2.astype(np.float32) * up) * (sig * (1 + gate.astype(np.float32) * (1 - sig)))
            dgate = dgate32.astype(np.float16)
            dup = (df2 * act_gate).astype(np.float16)

            grads[f'Wf3{l}'] += ln2.reshape(-1, self.emb).T @ dup.reshape(-1, dup.shape[-1])
            grads[f'Wf1{l}'] += ln2.reshape(-1, self.emb).T @ dgate.reshape(-1, dgate.shape[-1])

            # Add the MLP branch's gradient to the residual stream.
            dx += (dgate @ self.W[f'Wf1{l}'].T) + (dup @ self.W[f'Wf3{l}'].T)

            # --- Attention ---
            grads[f'Wo{l}'] += out_att.reshape(-1, self.emb).T @ dx.reshape(-1, self.emb)

            # Gradient w.r.t. per-head attention output: (B, h, T, head_dim)
            dout = (dx @ self.W[f'Wo{l}'].T).reshape(B, T, self.h, self.head_dim).transpose(0, 2, 1, 3)

            # Value gradient, summed over the query heads sharing a K/V head.
            dv = (attn.transpose(0, 1, 3, 2) @ dout).reshape(B, self.groups, self.heads_per_group, T, self.head_dim).sum(2)

            # Softmax backward on the attention weights.
            da = (dout @ np.repeat(v, self.heads_per_group, axis=1).transpose(0, 1, 3, 2))
            da_f32 = da.astype(np.float32)
            attn_f32 = attn.astype(np.float32)
            ds = attn_f32 * (da_f32 - (attn_f32 * da_f32).sum(-1, keepdims=True))
            ds = ds.astype(np.float16)

            # Gradients for pre-RoPE Q and K: undo the rotation with rev=True.
            dq = self.apply_rope(ds @ np.repeat(k, self.heads_per_group, axis=1), pos_idx, rev=True) / np.sqrt(self.head_dim)
            dk = self.apply_rope(ds.transpose(0, 1, 3, 2) @ q, pos_idx, rev=True).reshape(B, self.groups, self.heads_per_group, T, self.head_dim).sum(2) / np.sqrt(self.head_dim)

            # Flatten to (B*T, width) for the projection-weight gradients.
            ln1_flat = ln1.reshape(-1, self.emb)
            dq_flat = dq.transpose(0, 2, 1, 3).reshape(-1, self.emb)
            dk_flat = dk.transpose(0, 2, 1, 3).reshape(-1, kv_width)
            dv_flat = dv.transpose(0, 2, 1, 3).reshape(-1, kv_width)

            grads[f'Wq{l}'] += ln1_flat.T @ dq_flat
            grads[f'Wk{l}'] += ln1_flat.T @ dk_flat
            grads[f'Wv{l}'] += ln1_flat.T @ dv_flat

            # ln1 feeds the Q, K and V projections, so all three paths add to
            # the residual gradient (previously only the Q path was included).
            d_ln1 = (dq_flat @ self.W[f'Wq{l}'].T
                     + dk_flat @ self.W[f'Wk{l}'].T
                     + dv_flat @ self.W[f'Wv{l}'].T)
            dx += d_ln1.reshape(B, T, self.emb)

        # Input-embedding part of the tied gradient: scatter-add by token id.
        np.add.at(grads['E'], cache['idx'], dx)
        return grads

    def update(self, grads, step, lr_max, warmup, wd):
        """Apply one Adam step with decoupled weight decay to every parameter.

        Args:
            grads: Gradients keyed like ``self.W``; clipped element-wise to [-1, 1].
            step: Schedule step: linear warm-up over `warmup` steps multiplied
                by a linear decay over 4000 steps that is floored at 0.1.
            lr_max: Peak learning rate.
            warmup: Number of warm-up steps (must be non-zero).
            wd: Weight-decay coefficient, applied to every entry of ``self.W``.

        Every 25th call prints the mean and std of each gradient.
        """
        self.t += 1
        lr = lr_max * min(1.0, step/warmup) * max(0.1, 1.0 - (step-warmup)/4000)
        for k in self.W:
            g = np.clip(grads[k], -1.0, 1.0)

            self.m[k] = (0.9 * self.m[k].astype(np.float32) + 0.1 * g)
            self.v[k] = (0.999 * self.v[k].astype(np.float32) + 0.001 * (g**2))

            # Bias-corrected moment estimates.
            mh = self.m[k] / (1 - 0.9**self.t)
            vh = self.v[k] / (1 - 0.999**self.t)

            update_val = lr * (mh / (np.sqrt(vh) + 1e-8) + wd * self.W[k].astype(np.float32))
            self.W[k] = (self.W[k].astype(np.float32) - update_val).astype(np.float16)

        if self.t % 25 == 0:
            for k, g in grads.items():
                print(f"{k} | Mean: {np.mean(g):.6f} | Std: {np.std(g):.6f}")

    def generate(self, prompt, tok, length=30, tmp=0.7, k=40, p=0.9):
        """Sample up to `length` tokens after `prompt`, printing each as it is drawn.

        Args:
            prompt: Text to condition on.
            tok: Tokenizer providing ``encode(str) -> list[int]`` and ``decode(list[int]) -> str``.
            length: Maximum number of tokens to sample.
            tmp: Softmax temperature.
            k: Top-k cut-off (values above the vocabulary size are clamped; 0 disables it).
            p: Nucleus (top-p) threshold; values >= 1.0 disable it.

        Returns:
            Prompt ids followed by the sampled ids. An empty prompt returns an
            empty list. Sampling stops early once position ``b_size`` is
            reached, because the RoPE tables end there; a prompt longer than
            ``b_size`` is conditioned on its last ``b_size`` tokens only.
        """
        ids = tok.encode(prompt)
        self.kv_cache = [None] * self.layers
        if len(ids) == 0:
            return ids  # nothing to condition on

        curr_ids = np.array(ids[-self.b_size:]).reshape(1, -1)
        pos = 0  # absolute position of the first token fed in the next call

        for _ in range(length):
            if pos + curr_ids.shape[1] > self.b_size:
                break
            logits, _ = self.forward(curr_ids, start_pos=pos, use_cache=True)
            pos += curr_ids.shape[1]
            logits = logits[0, -1, :].astype(np.float32) / (tmp + 1e-10)

            # Top-k: push everything below the k-th largest logit far down.
            top_k = min(k, logits.size)
            if top_k > 0: logits[logits < np.partition(logits, -top_k)[-top_k]] = -1e4
            probs = np.exp(logits - np.max(logits))
            probs /= (probs.sum() + 1e-10)

            if p < 1.0:
                # Nucleus sampling: keep the smallest prefix of the sorted
                # distribution whose cumulative mass exceeds p.
                si = np.argsort(probs)[::-1]; sp = probs[si]; cp = np.cumsum(sp)
                ir = cp > p; ir[1:] = ir[:-1].copy(); ir[0] = False
                probs[si[ir]] = 0; probs /= (probs.sum() + 1e-10)

            next_id = int(np.random.choice(len(probs), p=probs))
            ids.append(next_id); curr_ids = np.array([[next_id]])
            print(tok.decode([next_id]), end="", flush=True)
        return ids

# --- CONFIG ---
# Input corpus plus the two artefacts this cell writes and reloads:
# model weights with optimizer moments (.npz) and the tokenized corpus (.npy).
# NOTE(review): absolute, machine-specific Windows paths — adjust before running elsewhere.
path = "C:\\Users\\user\\Downloads\\BNCCorpus.txt" 
weight_path = "C:\\Users\\user\\Downloads\\model_weights.npz"
token_path = "C:\\Users\\user\\Downloads\\tokenized_data.npy"
# Model shape, passed below as GPT(vocab_size, embed, heads, groups, n_layers, block_size):
# block_size is the context length and groups is the number of key/value heads.
vocab_size, block_size, embed, groups, heads, n_layers = 384, 128, 64, 2, 4, 3
# Optimisation: peak learning rate, sequences per batch, warm-up steps,
# gradient-accumulation steps and weight-decay coefficient.
lr_max, batch_size, warmup, accum_steps, wd = 0.0001, 4, 200, 1, 0.1

# --- EXECUTION ---
# Load the corpus once (lower-cased, first 150000 characters); stop if it cannot be read.
try:
    with open(path, "r", encoding="utf-8") as f: text = f.read().lower()[:150000]
except (OSError, UnicodeDecodeError):
    print('No data...'); sys.exit()

# NOTE(review): tokenizer.train() is never called in this cell, so
# tokenizer.merges stays empty and encode() yields raw UTF-8 byte ids (< 256);
# ids 256..vocab_size-1 are never produced. Training the tokenizer here would
# invalidate previously saved token caches and weights, so it is left as a
# follow-up decision.
tokenizer = BPETokenizer(vocab_size)
if os.path.exists(token_path):
    token_data = np.load(token_path)
    print("Loaded pre-tokenized data.")
else:
    # `text` was already read above; no need to open the corpus a second time.
    print("Tokenizing text (this may take a few minutes once)...")
    token_data = np.array(tokenizer.encode(text), dtype=np.uint16)
    np.save(token_path, token_data)
    print("Tokens saved.")

# 90/10 train/validation split of the token stream.
n = int(0.9 * len(token_data))
train_data, val_data = token_data[:n], token_data[n:]

gpt = GPT(vocab_size, embed, heads, groups, n_layers, block_size)
if os.path.exists(weight_path):
    # The context manager closes the .npz archive once the arrays are read.
    with np.load(weight_path) as cp:
        for k in gpt.W:
            if k in cp: gpt.W[k] = cp[k]
            if f"m_{k}" in cp: gpt.m[k], gpt.v[k] = cp[f"m_{k}"], cp[f"v_{k}"]
        # Restore the Adam step counter so bias correction matches the loaded
        # moments; checkpoints written before this key existed simply lack it.
        if "adam_t" in cp: gpt.t = int(cp["adam_t"])
    print("Existing weights loaded.")
    # Only the first letter matters, so "T", "t" and "train" all select training.
    mode = input("\n[T]rain further or [C]hat only? ").strip().lower()[:1]

else:
    print("No weights found. Starting fresh training...")
    mode = 't'

if mode == 't':
    # token_data / train_data / val_data prepared above are reused as-is.
    acc_grads = {k: np.zeros_like(v, dtype=np.float32) for k, v in gpt.W.items()}

    for i in range(401):
        xb, yb = gpt.get_batch(train_data, batch_size)
        logits, cache = gpt.forward(xb)
        grads = gpt.backward(yb, logits, cache)
        for k in grads: acc_grads[k] += np.clip(grads[k], -1.0, 1.0)
        # Release the large activation and gradient dicts before the next step.
        del cache
        del grads

        print(f"Iteration {i} started...", end='\r')

        if (i + 1) % accum_steps == 0:
            for k in acc_grads: acc_grads[k] /= accum_steps
            gpt.update(acc_grads, i // accum_steps, lr_max, warmup, wd)
            for k in acc_grads: acc_grads[k].fill(0)

        if i % 20 == 0:
            xv, yv = gpt.get_batch(val_data, batch_size)
            v_logits, _ = gpt.forward(xv)
            # Log-softmax in float32: taking log of float16 probabilities lets
            # small values underflow to 0 and the loss become inf.
            v_logits = v_logits.astype(np.float32)
            v_logits -= v_logits.max(axis=-1, keepdims=True)
            v_logp = v_logits - np.log(np.exp(v_logits).sum(axis=-1, keepdims=True))
            v_loss = -np.mean(v_logp[np.arange(batch_size)[:,None], np.arange(block_size), yv])

            # Checkpoint: weights, Adam moments and the Adam step counter.
            save_dict = {**gpt.W, **{f"m_{k}": gpt.m[k] for k in gpt.W}, **{f"v_{k}": gpt.v[k] for k in gpt.W}, "adam_t": gpt.t}
            np.savez(weight_path, **save_dict)
            print(f"Step {i:5d} | Loss: {v_loss:.4f} ", flush=True)

print("\n--- Chatting ---")
while True:
    u = input("\n> ").strip()
    if u.lower() in ['q', 'exit']: break
    if not u: continue  # an empty prompt gives the model nothing to condition on
    gpt.generate(u, tokenizer,  length=60, tmp=0.8)
    print()

In [None]:
# Toy training set for the KNN demo: one row of three numeric features per game.
# NOTE(review): the meaning of the three feature columns is not stated here — confirm.
old_games = [
    [0.1, 0.8, 0.1],
    [0.9, 0.9, 0.9], 
    [0.8, 0.2, 0.5],
]
# Class label for each row of old_games; knn_predict reports label 0 as "Хит"
# and any other label as "Провал".
results = [0, 0, 1]

def knn_predict(new_data, dataset, targets, k=3):
    """Classify `new_data` by majority vote among its `k` nearest rows of `dataset`.

    Distance is Euclidean over the first ``len(new_data)`` features of each row.

    Args:
        new_data: Feature sequence of the sample to classify.
        dataset: Sequence of feature rows.
        targets: Class label for each row of `dataset`.
        k: Number of neighbours that vote.

    Returns:
        "Хит" when the winning label equals 0, otherwise "Провал".
    """
    n_features = len(new_data)

    def euclidean(row):
        return sum((new_data[j] - row[j]) ** 2 for j in range(n_features)) ** 0.5

    # Stable sort by distance keeps equally distant rows in dataset order.
    ranked = sorted(
        ((euclidean(dataset[i]), targets[i]) for i in range(len(dataset))),
        key=lambda entry: entry[0],
    )
    votes = [label for _, label in ranked[:k]]
    winner = max(set(votes), key=votes.count)
    if winner == 0:
        return "Хит"
    return "Провал"

# Classify one unseen game using only its single nearest neighbour (k=1).
test_game = [0.15, 0.85, 0.12]
print(f"Вердикт KNN: {knn_predict(test_game, old_games, results, k=1)}")

In [None]:
def tree_predict(game):
    """Classify a game with a fixed two-level decision tree.

    Args:
        game: Indexable features; only ``game[0]`` and ``game[1]`` are read.

    Returns:
        "Провал" unless ``game[1] > 0.8``; otherwise "Инди-хит" when
        ``game[0] < 0.3`` and "Блокбастер" when it is not.
    """
    if not game[1] > 0.8:
        return "Провал"
    return "Инди-хит" if game[0] < 0.3 else "Блокбастер"

# Run the hand-written tree on one sample: game[1] = 0.9 passes the first
# split (> 0.8) and game[0] = 0.1 passes the second (< 0.3).
new_game = [0.1, 0.9, 0.5]
print(f"Вердикт дерева: {tree_predict(new_game)}")

In [None]:
import random, math
# Linear soft-margin SVM fitted by per-sample subgradient descent on
#   C * max(0, 1 - y * (w.x + b)) + (l2_param / 2) * |w|^2 + l1_param * |w|_1
dataset = [[0.1, 0.8, 0.1], [0.9, 0.9, 0.9], [0.8, 0.2, 0.5]]
targets = [1, 1, -1]  # class labels in {+1, -1}, one per row of dataset
n_features = len(dataset[0])
weights = [random.uniform(-0.1, 0.1) for _ in range(n_features)]
bias = 0.0
lr = 0.01 * 2
C = 1.0
epochs = 1000
l1_param = 0.005
l2_param = 0.01

for epoch in range(epochs):
    for i, x in enumerate(dataset):
        condition = targets[i] * (sum(x[j] * weights[j] for j in range(n_features)) + bias)
        violated = not condition >= 1  # sample is inside the margin or misclassified
        for j in range(n_features):
            # Both penalties apply on every step; sign(0) is 0 so a weight
            # sitting exactly at zero is not pushed away by the L1 term.
            l1_sign = (weights[j] > 0) - (weights[j] < 0)
            step = l2_param * weights[j] + l1_param * l1_sign
            if violated:
                step -= C * x[j] * targets[i]  # hinge-loss subgradient
            weights[j] -= lr * step
        if violated:
            bias += lr * C * targets[i]  # the bias is not regularised

# Decision score of one unseen sample: w.x + b
test_game = [0.15, 0.85, 0.12]
result = sum(test_game[j] * weights[j] for j in range(n_features)) + bias
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^-z) without overflowing.

    For negative `z` the equivalent form e^z / (1 + e^z) is used, so that
    ``math.exp`` is only ever called with a non-positive argument and cannot
    raise ``OverflowError`` for large-magnitude inputs.
    """
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    e = math.exp(z)
    return e / (1 + e)
# Squash the raw SVM score into (0, 1) for display.
# NOTE(review): a sigmoid of an uncalibrated SVM score is only a rough
# confidence indicator, not a calibrated probability.
prob_hit = sigmoid(result)
print(f"SVM вердикт: {'Хит' if result > 0 else 'Провал'}")
print(f"Счет (Score): {result:.2f}")
print(f"Уверенность (Вероятность Хита): {prob_hit:.2%}")