<a href="https://colab.research.google.com/github/eka-smi/mcmc-nn-architecture-search-weather/blob/main/Project_random_walk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Пункт B**
---



In [None]:
import math
import random
import time
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any, Optional

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    The same ``seed`` is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator and all CUDA
    generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Fix all RNG seeds once at notebook start.
set_seed(42)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression: the notebook displays the chosen device as the cell output.
device

'cpu'

In [None]:
class MLP(nn.Module):
    """Fully connected network whose hidden-layer widths are given by ``arch``.

    Each hidden layer is ``Linear -> activation [-> Dropout]``; the network
    ends with a plain ``Linear`` layer mapping to ``output_dim``.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        arch: List[int],
        activation: str = "relu",
        dropout: float = 0.0
    ):
        """Build the layer stack.

        Args:
            input_dim: number of input features.
            output_dim: number of outputs of the final linear layer.
            arch: width of every hidden layer, in order.
            activation: one of "relu", "tanh", "gelu", "sigmoid"
                (case-insensitive); any other value falls back to ReLU.
            dropout: dropout probability; a Dropout layer is added after each
                activation only when this is greater than zero.
        """
        super().__init__()

        known_activations = {
            "relu": nn.ReLU,
            "tanh": nn.Tanh,
            "gelu": nn.GELU,
            "sigmoid": nn.Sigmoid
        }
        make_activation = known_activations.get(activation.lower(), nn.ReLU)

        # dims[k] is the input width of hidden layer k; dims[-1] feeds the head.
        dims = [input_dim, *arch]
        modules = []
        for fan_in, fan_out in zip(dims, arch):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(make_activation())
            if dropout > 0:
                modules.append(nn.Dropout(dropout))

        modules.append(nn.Linear(dims[-1], output_dim))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return its output."""
        return self.net(x)


In [None]:
# Quick visual check: build a small MLP and print its layer structure.
model = MLP(input_dim=10, output_dim=1, arch=[32, 16], activation="relu", dropout=0.1)
print(model)


MLP(
  (net): Sequential(
    (0): Linear(in_features=10, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=16, out_features=1, bias=True)
  )
)


In [None]:
@torch.no_grad()
def evaluate_loss(model: nn.Module, loader: DataLoader, loss_fn) -> float:
    """Return the per-sample average of ``loss_fn`` over all batches of ``loader``.

    The model is switched to eval mode and gradients are disabled. Returns 0.0
    when the loader yields no samples.
    """
    model.eval()

    weighted_sum = 0.0
    n_samples = 0

    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)
        batch_size = features.size(0)
        batch_loss = loss_fn(model(features), targets).item()

        # Weight each batch loss by its batch size so the final division
        # averages over samples rather than over batches.
        weighted_sum += batch_loss * batch_size
        n_samples += batch_size

    return weighted_sum / max(n_samples, 1)


In [None]:
def train_and_get_val_loss(
    arch: List[int],
    train_loader: DataLoader,
    val_loader: DataLoader,
    input_dim: int,
    output_dim: int,
    task: str = "regression",     # "regression" or "classification"
    activation: str = "relu",
    lr: float = 1e-3,
    epochs: int = 10,
    weight_decay: float = 0.0,
    dropout: float = 0.0,
    verbose: bool = False
) -> float:
    """Train a fresh MLP with the given hidden widths and return its validation loss.

    Args:
        arch: hidden-layer widths passed to ``MLP``.
        train_loader: batches used for optimisation.
        val_loader: batches used for the returned loss.
        input_dim, output_dim: input and output sizes of the network.
        task: "regression" (MSELoss) or "classification" (CrossEntropyLoss).
        activation, dropout: forwarded to ``MLP``.
        lr, weight_decay: Adam hyper-parameters.
        epochs: number of passes over ``train_loader``.
        verbose: when true, print the validation loss after the last epoch.

    Raises:
        ValueError: if ``task`` is neither "regression" nor "classification".
    """
    # 1) Build the model for this architecture.
    net = MLP(input_dim, output_dim, arch, activation=activation, dropout=dropout).to(device)

    # 2) Pick the loss for the task type.
    #    regression: MSE between predictions and targets;
    #    classification: cross-entropy (the original note says y should be a
    #    LongTensor of shape [batch] and the model output [batch, num_classes]).
    if task not in ("regression", "classification"):
        raise ValueError("task must be 'regression' or 'classification'")
    criterion = nn.MSELoss() if task == "regression" else nn.CrossEntropyLoss()

    # 3) Optimiser; weight_decay is passed straight to Adam.
    optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    # 4) Training loop.
    final_epoch = epochs - 1
    for epoch_idx in range(epochs):
        net.train()

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(net(inputs), targets)
            batch_loss.backward()
            optimizer.step()

        if verbose and epoch_idx == final_epoch:
            # Report only once, at the end, to keep the output short.
            val_loss = evaluate_loss(net, val_loader, criterion)
            print(f"[arch={arch}] epoch={epoch_idx+1}/{epochs}, val_loss={val_loss:.6f}")

    # Final validation score.
    return float(evaluate_loss(net, val_loader, criterion))


In [None]:
def propose_architecture(
    arch: List[int],
    min_units: int,
    max_units: int,
    step: int = 8
) -> List[int]:
    """
    Proposal move for Metropolis-Hastings.

    - pick a random layer i
    - try to change arch[i] by +/- step
    - if the result leaves [min_units, max_units], return the architecture
      unchanged (a "null move")

    Why this is convenient:
    - every real move has a reverse move with the same probability;
    - values are never clamped to min/max, which could produce a shorter step
      and break the symmetry when the bounds are not multiples of step.

    The input list is never modified; a new list is always returned.
    """
    proposal = arch.copy()

    layer_idx = random.randrange(len(proposal))   # which layer to change
    sign = random.choice([-1, +1])                # in which direction

    shifted = proposal[layer_idx] + sign * step

    # Only apply the move when it stays inside the bounds; otherwise the copy
    # is returned as is (the chain may stand still from time to time).
    if min_units <= shifted <= max_units:
        proposal[layer_idx] = int(shifted)

    return proposal


In [None]:
# Demo: print ten proposals generated from the same starting architecture.
arch = [32, 16, 64]
for _ in range(10):
    print(arch, "->", propose_architecture(arch, min_units=8, max_units=128, step=8))


[32, 16, 64] -> [32, 16, 56]
[32, 16, 64] -> [40, 16, 64]
[32, 16, 64] -> [24, 16, 64]
[32, 16, 64] -> [24, 16, 64]
[32, 16, 64] -> [32, 16, 56]
[32, 16, 64] -> [32, 16, 72]
[32, 16, 64] -> [24, 16, 64]
[32, 16, 64] -> [24, 16, 64]
[32, 16, 64] -> [24, 16, 64]
[32, 16, 64] -> [32, 16, 56]


In [None]:
@dataclass
class MCMCConfig:
    """
    Parameters of the MCMC architecture search.
    """
    K: int = 3                 # number of hidden layers (depth is fixed in part b)
    min_units: int = 8
    max_units: int = 128
    step: int = 8              # how many neurons a layer width changes per proposal
    iters: int = 30            # number of MCMC steps

    # Temperature:
    # small => almost only improvements are accepted (greedy search)
    # large => worse states are accepted often (more exploration)
    temperature: float = 0.05

    # Training budget per architecture (the main cost in wall-clock time)
    train_epochs: int = 10
    lr: float = 1e-3

    activation: str = "relu"
    dropout: float = 0.0
    weight_decay: float = 0.0

    task: str = "regression"   # or "classification"

    # Caching:
    # if an architecture repeats, reuse its stored val_loss instead of retraining.
    cache: bool = True


In [None]:
def metropolis_hastings_arch_search(
    train_loader: DataLoader,
    val_loader: DataLoader,
    input_dim: int,
    output_dim: int,
    init_arch: List[int],
    cfg: MCMCConfig,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Metropolis-Hastings search over MLP hidden-layer widths.

    Chain state:  arch = [n1, ..., nK]
    State energy: L(arch) = validation loss after a short training run

    Target "probability" of an architecture:
        pi(arch) proportional to exp( - L(arch) / T )

    With a symmetric proposal the standard Metropolis rule is used:
        alpha = min(1, exp( -(L_new - L_old)/T ))

    Args:
        train_loader, val_loader: data used to train / score every candidate.
        input_dim, output_dim: network input and output sizes.
        init_arch: starting architecture; its length must equal ``cfg.K``.
        cfg: search and training hyper-parameters.
        verbose: print the start state and one line per iteration.

    Returns:
        dict with keys "best_arch", "best_loss", "history" (one dict per
        iteration) and "loss_cache_size" (number of cached evaluations).

    Raises:
        AssertionError: if ``len(init_arch) != cfg.K``.
    """

    assert len(init_arch) == cfg.K, "init_arch –¥–æ–ª–∂–Ω–∞ –∏–º–µ—Ç—å –¥–ª–∏–Ω—É K (—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–Ω–Ω–∞—è –≥–ª—É–±–∏–Ω–∞ –¥–ª—è –ø—É–Ω–∫—Ç–∞ b)."

    # Validation losses already computed, keyed by architecture, so repeated
    # architectures do not have to be retrained.
    loss_cache: Dict[Tuple[int, ...], float] = {}

    def get_loss(arch: List[int]) -> float:
        """Return the validation loss for ``arch``, using the cache when enabled."""
        key = tuple(arch)

        if cfg.cache and key in loss_cache:
            return loss_cache[key]

        val_loss = train_and_get_val_loss(
            arch=arch,
            train_loader=train_loader,
            val_loader=val_loader,
            input_dim=input_dim,
            output_dim=output_dim,
            task=cfg.task,
            activation=cfg.activation,
            lr=cfg.lr,
            epochs=cfg.train_epochs,
            weight_decay=cfg.weight_decay,
            dropout=cfg.dropout,
            verbose=False
        )

        if cfg.cache:
            loss_cache[key] = val_loss

        return val_loss

    # Initialise the chain.
    current_arch = init_arch.copy()
    current_loss = get_loss(current_arch)

    # The best state seen so far is tracked separately from the chain state.
    best_arch = current_arch.copy()
    best_loss = current_loss

    history = []

    if verbose:
        print(f"Start: arch={current_arch}, val_loss={current_loss:.6f}")

    for t in range(cfg.iters):
        # 1) Propose a new state.
        proposed_arch = propose_architecture(
            current_arch,
            min_units=cfg.min_units,
            max_units=cfg.max_units,
            step=cfg.step
        )

        # 2) Evaluate the "energy" of the proposal.
        if proposed_arch == current_arch:
            # Null move (the step would have left the bounds): the state is
            # unchanged, so keep its loss. Retraining here (which happens when
            # the cache is disabled) would replace the current loss with a
            # fresh noisy sample of the same architecture.
            proposed_loss = current_loss
        else:
            proposed_loss = get_loss(proposed_arch)

        # 3) Acceptance probability.
        # delta <= 0 => not worse => always accept
        # delta > 0  => worse     => accept sometimes
        delta = proposed_loss - current_loss

        if delta <= 0:
            accept_prob = 1.0
        else:
            # exp(-delta/T): with a small T, worse states are almost never
            # accepted. The max() guards against division by zero.
            accept_prob = math.exp(-delta / max(cfg.temperature, 1e-12))

        # 4) Flip the coin.
        accepted = (random.random() < accept_prob)

        # 5) Update the chain.
        if accepted:
            current_arch = proposed_arch
            current_loss = proposed_loss

        # 6) Update the best state.
        if current_loss < best_loss:
            best_loss = current_loss
            best_arch = current_arch.copy()

        # 7) Log the step.
        history.append({
            "iter": t + 1,
            "current_arch": current_arch.copy(),
            "current_loss": float(current_loss),
            "proposed_arch": proposed_arch,
            "proposed_loss": float(proposed_loss),
            "accepted": accepted,
            "accept_prob": float(accept_prob),
            "best_arch": best_arch.copy(),
            "best_loss": float(best_loss),
        })

        if verbose:
            status = "ACCEPT ‚úÖ" if accepted else "reject ‚ùå"
            print(
                f"[{t+1:03d}] {status} | "
                f"prop={proposed_arch} loss={proposed_loss:.6f} | "
                f"cur={current_arch} loss={current_loss:.6f} | "
                f"best={best_arch} best_loss={best_loss:.6f} | "
                f"p={accept_prob:.3f}"
            )

    return {
        "best_arch": best_arch,
        "best_loss": best_loss,
        "history": history,
        "loss_cache_size": len(loss_cache),
    }


In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

def sanity_checks():
    """Run quick end-to-end checks of the part (b) building blocks.

    Covers, in order: MLP output shape, propose_architecture invariants,
    a small synthetic regression dataset, train_and_get_val_loss returning a
    finite float, and a short Metropolis-Hastings run. Any failed check
    raises AssertionError.
    """
    print("=== 1) –ü—Ä–æ–≤–µ—Ä–∫–∞ MLP: —Ñ–æ—Ä–º—ã –≤—Ö–æ–¥–∞/–≤—ã—Ö–æ–¥–∞ ===")
    model = MLP(input_dim=10, output_dim=3, arch=[32, 16], activation="relu", dropout=0.1).to(device)
    x = torch.randn(5, 10).to(device)
    yhat = model(x)
    print("model(x).shape =", tuple(yhat.shape))
    assert yhat.shape == (5, 3), "‚ùå –ù–µ–≤–µ—Ä–Ω–∞—è —Ñ–æ—Ä–º–∞ –≤—ã—Ö–æ–¥–∞ MLP"
    print("‚úÖ MLP —Ñ–æ—Ä–º—ã –æ–∫\n")

    print("=== 2) –ü—Ä–æ–≤–µ—Ä–∫–∞ propose_architecture: –æ–¥–∏–Ω —Å–ª–æ–π –º–µ–Ω—è–µ—Ç—Å—è –Ω–∞ ¬±step –∏ –≤ –≥—Ä–∞–Ω–∏—Ü–∞—Ö ===")
    arch0 = [32, 32, 32]
    min_u, max_u, step = 8, 128, 8

    for _ in range(200):
        arch1 = propose_architecture(arch0, min_units=min_u, max_units=max_u, step=step)
        assert len(arch1) == len(arch0), "‚ùå –î–ª–∏–Ω–∞ –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—ã –∏–∑–º–µ–Ω–∏–ª–∞—Å—å"
        assert all(min_u <= a <= max_u for a in arch1), "‚ùå –í—ã—à–ª–∏ –∑–∞ –≥—Ä–∞–Ω–∏—Ü—ã min/max"

        # Check: at most one element differs, and the difference is either 0 or exactly step
        diffs = [abs(a - b) for a, b in zip(arch0, arch1)]
        nonzero = [d for d in diffs if d != 0]
        assert len(nonzero) in [0, 1], "‚ùå –ú–µ–Ω—è–µ—Ç—Å—è –±–æ–ª—å—à–µ –æ–¥–Ω–æ–≥–æ —Å–ª–æ—è –∑–∞ —à–∞–≥"
        if len(nonzero) == 1:
            assert nonzero[0] == step, "‚ùå –®–∞–≥ –∏–∑–º–µ–Ω–µ–Ω–∏—è –Ω–µ —Ä–∞–≤–µ–Ω step"
    print("‚úÖ propose_architecture –æ–∫\n")

    print("=== 3) –ì–æ—Ç–æ–≤–∏–º –º–∞–ª–µ–Ω—å–∫–∏–π —Å–∏–Ω—Ç–µ—Ç–∏—á–µ—Å–∫–∏–π –¥–∞—Ç–∞—Å–µ—Ç –¥–ª—è —Ä–µ–≥—Ä–µ—Å—Å–∏–∏ (–±—ã—Å—Ç—Ä–æ) ===")
    # y = Xw + noise (a simple regression, so that training is stable)
    torch.manual_seed(0)
    N = 1200
    input_dim = 10
    output_dim = 1

    X = torch.randn(N, input_dim)
    true_w = torch.randn(input_dim, output_dim)
    y = X @ true_w + 0.1 * torch.randn(N, output_dim)

    # train/val split: first 900 rows for training, the remaining 300 for validation
    n_train = 900
    X_train, y_train = X[:n_train], y[:n_train]
    X_val, y_val     = X[n_train:], y[n_train:]

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=128, shuffle=True)
    val_loader   = DataLoader(TensorDataset(X_val, y_val), batch_size=256, shuffle=False)

    print("‚úÖ –î–∞—Ç–∞—Å–µ—Ç –≥–æ—Ç–æ–≤\n")

    print("=== 4) –ü—Ä–æ–≤–µ—Ä–∫–∞ train_and_get_val_loss: –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç –∫–æ–Ω–µ—á–Ω–æ–µ —á–∏—Å–ª–æ ===")
    loss1 = train_and_get_val_loss(
        arch=[32, 16, 8],
        train_loader=train_loader,
        val_loader=val_loader,
        input_dim=input_dim,
        output_dim=output_dim,
        task="regression",
        activation="relu",
        lr=1e-3,
        epochs=3,
        dropout=0.0,
        weight_decay=0.0,
        verbose=False
    )
    print("val_loss =", loss1)
    assert isinstance(loss1, float), "‚ùå val_loss –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å float"
    assert np.isfinite(loss1), "‚ùå val_loss nan/inf"
    print("‚úÖ train_and_get_val_loss –æ–∫\n")

    print("=== 5) –ü—Ä–æ–≤–µ—Ä–∫–∞ MCMC (Metropolis-Hastings): —à–∞–≥–∏, accept/reject, –∏—Å—Ç–æ—Ä–∏—è ===")
    cfg = MCMCConfig(
        K=3,
        min_units=8,
        max_units=128,
        step=8,
        iters=12,            # few iterations, to keep it fast
        temperature=0.005,    # can be changed
        train_epochs=3,      # few epochs, to keep it fast
        lr=1e-3,
        activation="relu",
        dropout=0.0,
        weight_decay=0.0,
        task="regression",
        cache=True
    )

    res = metropolis_hastings_arch_search(
        train_loader=train_loader,
        val_loader=val_loader,
        input_dim=input_dim,
        output_dim=output_dim,
        init_arch=[32, 32, 32],
        cfg=cfg,
        verbose=False
    )

    assert "best_arch" in res and "best_loss" in res and "history" in res, "‚ùå –ù–µ–ø–æ–ª–Ω—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç MCMC"
    assert len(res["history"]) == cfg.iters, "‚ùå –î–ª–∏–Ω–∞ history –Ω–µ —Ä–∞–≤–Ω–∞ iters"
    assert np.isfinite(res["best_loss"]), "‚ùå best_loss nan/inf"

    # Fraction of iterations whose proposal was accepted.
    accepts = [h["accepted"] for h in res["history"]]
    acc_rate = sum(accepts) / len(accepts)
    print("acceptance rate =", acc_rate)
    print("best_arch =", res["best_arch"])
    print("best_loss =", res["best_loss"])

    # best_loss must equal the minimum of the best_loss values recorded in the history
    best_losses = [h["best_loss"] for h in res["history"]]
    assert abs(res["best_loss"] - min(best_losses)) < 1e-12, "‚ùå best_loss –Ω–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –º–∏–Ω–∏–º—É–º–æ–º –ø–æ –∏—Å—Ç–æ—Ä–∏–∏"

    # sanity: accept_prob lies in [0,1]
    assert all(0.0 <= h["accept_prob"] <= 1.0 for h in res["history"]), "‚ùå accept_prob –≤–Ω–µ [0,1]"

    print("‚úÖ MCMC –æ–∫\n")

    print("üéâ –í–°–Å –ü–†–û–®–õ–û: –±–∞–∑–æ–≤–∞—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è –ø—É–Ω–∫—Ç–∞ (b) —Ä–∞–±–æ—Ç–∞–µ—Ç –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ.")

# Execute the checks when the cell runs; any failure raises AssertionError.
sanity_checks()


=== 1) –ü—Ä–æ–≤–µ—Ä–∫–∞ MLP: —Ñ–æ—Ä–º—ã –≤—Ö–æ–¥–∞/–≤—ã—Ö–æ–¥–∞ ===
model(x).shape = (5, 3)
‚úÖ MLP —Ñ–æ—Ä–º—ã –æ–∫

=== 2) –ü—Ä–æ–≤–µ—Ä–∫–∞ propose_architecture: –æ–¥–∏–Ω —Å–ª–æ–π –º–µ–Ω—è–µ—Ç—Å—è –Ω–∞ ¬±step –∏ –≤ –≥—Ä–∞–Ω–∏—Ü–∞—Ö ===
‚úÖ propose_architecture –æ–∫

=== 3) –ì–æ—Ç–æ–≤–∏–º –º–∞–ª–µ–Ω—å–∫–∏–π —Å–∏–Ω—Ç–µ—Ç–∏—á–µ—Å–∫–∏–π –¥–∞—Ç–∞—Å–µ—Ç –¥–ª—è —Ä–µ–≥—Ä–µ—Å—Å–∏–∏ (–±—ã—Å—Ç—Ä–æ) ===
‚úÖ –î–∞—Ç–∞—Å–µ—Ç –≥–æ—Ç–æ–≤

=== 4) –ü—Ä–æ–≤–µ—Ä–∫–∞ train_and_get_val_loss: –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç –∫–æ–Ω–µ—á–Ω–æ–µ —á–∏—Å–ª–æ ===
val_loss = 17.18493570963542
‚úÖ train_and_get_val_loss –æ–∫

=== 5) –ü—Ä–æ–≤–µ—Ä–∫–∞ MCMC (Metropolis-Hastings): —à–∞–≥–∏, accept/reject, –∏—Å—Ç–æ—Ä–∏—è ===
acceptance rate = 0.3333333333333333
best_arch = [40, 40, 32]
best_loss = 16.006267598470053
‚úÖ MCMC –æ–∫

üéâ –í–°–Å –ü–†–û–®–õ–û: –±–∞–∑–æ–≤–∞—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏—è –ø—É–Ω–∫—Ç–∞ (b) —Ä–∞–±–æ—Ç–∞–µ—Ç –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ.


# **Пункт C**

In [None]:
import os, re, math, random
from dataclasses import dataclass
from datetime import datetime, timedelta, date
from typing import List, Dict, Any, Tuple, Optional

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import importlib, subprocess, sys

def ensure_pkg(pkg: str):
    """Make sure ``pkg`` is importable, installing it with pip when it is not.

    Args:
        pkg: name used both for the import attempt and for ``pip install``.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with an error.
    """
    try:
        importlib.import_module(pkg)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "-q", "install", pkg])
        # The import system caches directory contents; refresh it so a package
        # installed while the interpreter is running can be imported afterwards.
        importlib.invalidate_caches()

# xlrd is installed up front for the .xls reading done later in the notebook.
ensure_pkg("xlrd")

def set_seed(seed: int = 42):
    """Apply ``seed`` to Python's ``random``, NumPy, PyTorch and all CUDA generators."""
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

# Fix all RNG seeds for this part of the notebook.
set_seed(42)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)


device: cpu


In [None]:
class MLP(nn.Module):
    """Feed-forward network: ``Linear -> activation [-> Dropout]`` per hidden layer,
    followed by a final ``Linear`` layer producing ``output_dim`` values."""

    def __init__(self, input_dim: int, output_dim: int, arch: List[int],
                 activation: str = "relu", dropout: float = 0.0):
        """Build the layer stack.

        Args:
            input_dim: number of input features.
            output_dim: size of the final linear layer's output.
            arch: hidden-layer widths, in order.
            activation: "relu", "tanh", "gelu" or "sigmoid" (case-insensitive);
                any other value falls back to ReLU.
            dropout: dropout probability; Dropout layers are added only when
                this is truthy and greater than zero.
        """
        super().__init__()
        activations_by_name = {
            "relu": nn.ReLU,
            "tanh": nn.Tanh,
            "gelu": nn.GELU,
            "sigmoid": nn.Sigmoid,
        }
        activation_cls = activations_by_name.get(activation.lower(), nn.ReLU)

        # widths[k] feeds hidden layer k; widths[-1] feeds the output layer.
        widths = [input_dim, *arch]
        stack = []
        for n_in, n_out in zip(widths, arch):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(activation_cls())
            if dropout and dropout > 0:
                stack.append(nn.Dropout(dropout))
        stack.append(nn.Linear(widths[-1], output_dim))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.net(x)


@torch.no_grad()
def evaluate_loss(model: nn.Module, loader: DataLoader, loss_fn) -> float:
    """Return the per-sample average of ``loss_fn`` over ``loader`` (0.0 if it is empty).

    The model is put in eval mode and gradients are disabled.
    """
    model.eval()
    loss_sum = 0.0
    seen = 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_n = inputs.size(0)
        batch_value = loss_fn(model(inputs), targets).item()
        # Weight by batch size so the result averages over samples, not batches.
        loss_sum += batch_value * batch_n
        seen += batch_n
    return loss_sum / max(seen, 1)


def train_and_get_val_loss(
    arch: List[int],
    train_loader: DataLoader,
    val_loader: DataLoader,
    input_dim: int,
    output_dim: int,
    lr: float = 1e-3,
    epochs: int = 10,
    weight_decay: float = 0.0,
    dropout: float = 0.0,
    activation: str = "relu",
    seed_for_arch: Optional[int] = None,
) -> float:
    """Train a fresh MLP with MSE loss and return its validation loss.

    Args:
        arch: hidden-layer widths passed to ``MLP``.
        train_loader: batches used for optimisation.
        val_loader: batches used for the returned loss.
        input_dim, output_dim: input and output sizes of the network.
        lr, weight_decay: Adam hyper-parameters.
        epochs: number of passes over ``train_loader``.
        dropout, activation: forwarded to ``MLP``.
        seed_for_arch: when not None, ``set_seed`` is called with it before
            the model is created (this reseeds the global generators).
    """
    if seed_for_arch is not None:
        set_seed(seed_for_arch)

    net = MLP(input_dim, output_dim, arch, activation=activation, dropout=dropout).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    for _epoch in range(epochs):
        net.train()
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(net(inputs), targets)
            batch_loss.backward()
            optimizer.step()

    final_val_loss = evaluate_loss(net, val_loader, criterion)
    return float(final_val_loss)


def propose_architecture(arch: List[int], min_units: int, max_units: int, step: int = 8) -> List[int]:
    """Return a copy of ``arch`` with one random layer width moved by +/- ``step``.

    If the moved width would leave ``[min_units, max_units]`` the copy is
    returned unchanged (a null move). The input list is never modified.
    """
    candidate = arch.copy()
    layer = random.randrange(len(candidate))
    sign = random.choice([-1, +1])
    moved = candidate[layer] + sign * step
    # Apply the move only when it stays inside the allowed range.
    if min_units <= moved <= max_units:
        candidate[layer] = int(moved)
    return candidate


@dataclass
class MCMCConfig:
    """Hyper-parameters for the Metropolis-Hastings architecture search."""
    K: int = 3                   # required length of the initial architecture
    min_units: int = 8           # lower bound passed to propose_architecture
    max_units: int = 128         # upper bound passed to propose_architecture
    step: int = 8                # width change per proposal
    iters: int = 30              # number of search iterations
    temperature: float = 0.005   # T in the acceptance rule exp(-delta / T)

    # Training settings forwarded to train_and_get_val_loss.
    train_epochs: int = 10
    lr: float = 1e-3
    weight_decay: float = 1e-4
    dropout: float = 0.1
    activation: str = "relu"

    cache: bool = True           # reuse the stored loss of an already evaluated architecture
    seed_base: int = 12345       # base value for the per-architecture training seed


def metropolis_hastings_arch_search(
    train_loader: DataLoader,
    val_loader: DataLoader,
    input_dim: int,
    output_dim: int,
    init_arch: List[int],
    cfg: MCMCConfig,
    verbose: bool = True
) -> Dict[str, Any]:
    """Metropolis-Hastings search over MLP hidden-layer widths.

    The chain state is an architecture ``[n1, ..., nK]`` and its energy is the
    validation loss after a short, per-architecture seeded training run. A
    proposal is accepted with probability
    ``min(1, exp(-(L_new - L_old) / T))``.

    Args:
        train_loader, val_loader: data used to train / score every candidate.
        input_dim, output_dim: network input and output sizes.
        init_arch: starting architecture; its length must equal ``cfg.K``.
        cfg: search and training hyper-parameters.
        verbose: print the start state and one line per iteration.

    Returns:
        dict with keys "best_arch", "best_loss", "history" (one dict per
        iteration) and "loss_cache_size" (number of cached evaluations).

    Raises:
        AssertionError: if ``len(init_arch) != cfg.K``.
    """

    assert len(init_arch) == cfg.K, "init_arch –¥–æ–ª–∂–Ω–∞ –∏–º–µ—Ç—å –¥–ª–∏–Ω—É K"

    loss_cache: Dict[Tuple[int, ...], float] = {}

    def get_loss(a: List[int]) -> float:
        """Return the validation loss of ``a``, training it unless it is cached."""
        key = tuple(a)
        if cfg.cache and key in loss_cache:
            return loss_cache[key]

        # Per-architecture training seed.
        # NOTE(review): this relies on hash() of an int tuple being stable
        # between runs - confirm if cross-interpreter reproducibility matters.
        seed_for_arch = cfg.seed_base + (abs(hash(key)) % 100000)

        # Training reseeds the global `random` module through set_seed(). The
        # chain draws its proposals and accept/reject coins from that same
        # generator, so its state is saved here and restored afterwards;
        # otherwise every fresh evaluation would reset the chain's randomness
        # to a fixed function of the architecture.
        chain_rng_state = random.getstate()
        try:
            val_loss = train_and_get_val_loss(
                arch=a,
                train_loader=train_loader,
                val_loader=val_loader,
                input_dim=input_dim,
                output_dim=output_dim,
                lr=cfg.lr,
                epochs=cfg.train_epochs,
                weight_decay=cfg.weight_decay,
                dropout=cfg.dropout,
                activation=cfg.activation,
                seed_for_arch=seed_for_arch
            )
        finally:
            random.setstate(chain_rng_state)

        if cfg.cache:
            loss_cache[key] = val_loss
        return val_loss

    cur_arch = init_arch.copy()
    cur_loss = get_loss(cur_arch)

    # The best state seen so far is tracked separately from the chain state.
    best_arch = cur_arch.copy()
    best_loss = cur_loss

    history = []
    if verbose:
        print(f"Start: arch={cur_arch}, val_loss={cur_loss:.6f}")

    for t in range(cfg.iters):
        prop_arch = propose_architecture(cur_arch, cfg.min_units, cfg.max_units, cfg.step)
        if prop_arch == cur_arch:
            # Null move (the step would have left the bounds): the state is
            # unchanged, so reuse its loss instead of retraining it.
            prop_loss = cur_loss
        else:
            prop_loss = get_loss(prop_arch)

        # Not worse => always accept; worse => accept with exp(-delta / T).
        delta = prop_loss - cur_loss
        if delta <= 0:
            acc_prob = 1.0
        else:
            # The max() guards against division by zero.
            acc_prob = math.exp(-delta / max(cfg.temperature, 1e-12))

        accepted = (random.random() < acc_prob)
        if accepted:
            cur_arch, cur_loss = prop_arch, prop_loss

        if cur_loss < best_loss:
            best_arch, best_loss = cur_arch.copy(), cur_loss

        history.append({
            "iter": t+1,
            "cur_arch": cur_arch.copy(),
            "cur_loss": float(cur_loss),
            "prop_arch": prop_arch.copy(),
            "prop_loss": float(prop_loss),
            "acc_prob": float(acc_prob),
            "accepted": bool(accepted),
            "best_arch": best_arch.copy(),
            "best_loss": float(best_loss),
        })

        if verbose:
            tag = "ACCEPT ‚úÖ" if accepted else "reject ‚ùå"
            print(f"[{t+1:03d}] {tag} | prop={prop_arch} {prop_loss:.6f} | cur={cur_arch} {cur_loss:.6f} | best={best_arch} {best_loss:.6f} | p={acc_prob:.3f}")

    return {
        "best_arch": best_arch,
        "best_loss": best_loss,
        "history": history,
        "loss_cache_size": len(loss_cache),
    }


In [None]:
# Colab-only helper for uploading a file from the local machine.
from google.colab import files

# Opens the upload widget; the result is keyed by file name.
uploaded = files.upload()
# Use the first key of the upload result as the rp5 export path.
rp5_path = next(iter(uploaded.keys()))
print("Uploaded:", rp5_path)


def _norm_cell(x) -> str:
    """Return ``x`` as a trimmed string with whitespace runs collapsed to one space.

    Missing values (as judged by ``pd.isna``) become the empty string.
    Non-breaking spaces and newlines are treated as ordinary spaces.
    """
    if pd.isna(x):
        return ""
    text = str(x).replace("\xa0", " ").replace("\n", " ")
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def read_rp5_any(path: str) -> pd.DataFrame:
    """Load an rp5 export as a raw, header-less DataFrame.

    1) First try to read the file as a real Excel workbook.
    2) If that fails (the file may be an HTML table saved with an .xls
       extension), fall back to ``pd.read_html`` and take the first table.

    Args:
        path: path to the downloaded file.

    Returns:
        The table with no header row applied (``header=None``).

    Raises:
        RuntimeError: if both readers fail; the message contains both
            underlying errors and the HTML error is attached as the cause.
    """
    try:
        df0 = pd.read_excel(path, header=None)
        print("Loaded as Excel (.xls). shape:", df0.shape)
        return df0
    except Exception as e_excel:
        # Deliberately broad: any Excel failure triggers the HTML fallback.
        try:
            df0 = pd.read_html(path, header=None)[0]
            print("Loaded as HTML-table disguised as xls. shape:", df0.shape)
            return df0
        except Exception as e_html:
            # Chain explicitly so the traceback shows the underlying failure
            # as the direct cause of this error.
            raise RuntimeError(
                "–ù–µ —É–¥–∞–ª–æ—Å—å –ø—Ä–æ—á–∏—Ç–∞—Ç—å —Ñ–∞–π–ª –Ω–∏ –∫–∞–∫ Excel, –Ω–∏ –∫–∞–∫ HTML.\n"
                f"Excel error: {repr(e_excel)}\n"
                f"HTML error: {repr(e_html)}"
            ) from e_html


def build_table_from_rp5(df0: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw rp5 sheet into a table with proper column names.

    rp5 puts several comment rows above the data. The header row is located by
    searching for the two "local time" keywords (first in the first column,
    then anywhere in the first 500 rows); that row becomes the column names
    and everything below it becomes the data.

    Args:
        df0: raw sheet as returned by ``read_rp5_any``.

    Returns:
        DataFrame of the rows below the header with unique column names,
        all-empty rows dropped and the index reset.

    Raises:
        RuntimeError: if no header row is found; the message includes a
            preview of the first 12 rows and 8 columns.
    """
    header_row = None

    # 1) Search the first column (the most reliable indicator).
    for i in range(len(df0)):
        first = _norm_cell(df0.iloc[i, 0]).lower()
        if ("–º–µ—Å—Ç–Ω–æ–µ" in first) and ("–≤—Ä–µ–º—è" in first):
            header_row = i
            break

    # 2) Fallback: the keywords may not be strictly in the first column.
    if header_row is None:
        for i in range(min(len(df0), 500)):
            row = " ".join(_norm_cell(v).lower() for v in df0.iloc[i].tolist())
            if ("–º–µ—Å—Ç–Ω–æ–µ" in row) and ("–≤—Ä–µ–º—è" in row):
                header_row = i
                break

    if header_row is None:
        # Show the first rows so the failure can be diagnosed.
        preview = df0.iloc[:12, :8].copy()
        # DataFrame.applymap is deprecated in favour of DataFrame.map; check
        # the class (not the instance) so a column named "map" cannot shadow it.
        if hasattr(pd.DataFrame, "map"):
            preview = preview.map(_norm_cell)
        else:
            preview = preview.applymap(_norm_cell)
        raise RuntimeError("–ù–µ –Ω–∞—à—ë–ª —Å—Ç—Ä–æ–∫—É –∑–∞–≥–æ–ª–æ–≤–∫–æ–≤ —Å '–ú–µ—Å—Ç–Ω–æ–µ –≤—Ä–µ–º—è'. –í–æ—Ç –ø–µ—Ä–≤—ã–µ 12 —Å—Ç—Ä–æ–∫ (8 –∫–æ–ª–æ–Ω–æ–∫):\n" + str(preview))

    headers = [_norm_cell(v) for v in df0.iloc[header_row].tolist()]
    # Fill in empty names with a positional placeholder.
    headers = [h if h not in ("", "nan", "None") else f"col_{j}" for j, h in enumerate(headers)]

    # Make the names unique: repeats get a numeric suffix. A suffixed name
    # that is already taken (e.g. headers "T", "T", "T_1") is skipped, so the
    # result never contains duplicates.
    seen = {}
    taken = set()
    uniq = []
    for h in headers:
        k = seen.get(h, 0)
        name = h if k == 0 else f"{h}_{k}"
        while name in taken:
            k += 1
            name = f"{h}_{k}"
        seen[h] = k + 1
        taken.add(name)
        uniq.append(name)

    data = df0.iloc[header_row + 1:].copy()
    data.columns = uniq
    data = data.dropna(how="all").reset_index(drop=True)

    print("Header row index:", header_row)
    print("Columns (first 20):", list(data.columns)[:20])
    return data


# Read the uploaded file and turn it into a table with named columns.
df0 = read_rp5_any(rp5_path)
df_raw = build_table_from_rp5(df0)

# Notebook display of the first rows.
df_raw.head(5)


Saving 27612.23.12.2020.22.12.2025.1.0.0.ru.utf8.00000000.xls to 27612.23.12.2020.22.12.2025.1.0.0.ru.utf8.00000000 (1).xls
Uploaded: 27612.23.12.2020.22.12.2025.1.0.0.ru.utf8.00000000 (1).xls
Loaded as Excel (.xls). shape: (14609, 29)
Header row index: 6
Columns (first 20): ['–ú–µ—Å—Ç–Ω–æ–µ –≤—Ä–µ–º—è –≤ –ú–æ—Å–∫–≤–µ (–í–î–ù–•)', 'T', 'Po', 'P', 'Pa', 'U', 'DD', 'Ff', 'ff10', 'ff3', 'N', 'WW', 'W1', 'W2', 'Tn', 'Tx', 'Cl', 'Nh', 'H', 'Cm']


Unnamed: 0,–ú–µ—Å—Ç–Ω–æ–µ –≤—Ä–µ–º—è –≤ –ú–æ—Å–∫–≤–µ (–í–î–ù–•),T,Po,P,Pa,U,DD,Ff,ff10,ff3,...,Cm,Ch,VV,Td,RRR,tR,E,Tg,E',sss
0,22.12.2025 21:00,-4.3,746.6,760.9,1.6,62,"–í–µ—Ç–µ—Ä, –¥—É—é—â–∏–π —Å —Å–µ–≤–µ—Ä–æ-—Å–µ–≤–µ—Ä–æ-–∑–∞–ø–∞–¥–∞",4,,10.0,...,"–í—ã—Å–æ–∫–æ–∫—É—á–µ–≤—ã—Ö, –≤—ã—Å–æ–∫–æ—Å–ª–æ–∏—Å—Ç—ã—Ö –∏–ª–∏ —Å–ª–æ–∏—Å—Ç–æ-–¥–æ–∂–¥...","–ü–µ—Ä–∏—Å—Ç—ã—Ö, –ø–µ—Ä–∏—Å—Ç–æ-–∫—É—á–µ–≤—ã—Ö –∏–ª–∏ –ø–µ—Ä–∏—Å—Ç–æ-—Å–ª–æ–∏—Å—Ç—ã—Ö...",20,-10.6,0.2,12.0,,,,
1,22.12.2025 18:00,-2.3,745.0,759.1,2.3,81,"–í–µ—Ç–µ—Ä, –¥—É—é—â–∏–π —Å —Å–µ–≤–µ—Ä–æ-–∑–∞–ø–∞–¥–∞",3,,10.0,...,"–í—ã—Å–æ–∫–æ–∫—É—á–µ–≤—ã—Ö, –≤—ã—Å–æ–∫–æ—Å–ª–æ–∏—Å—Ç—ã—Ö –∏–ª–∏ —Å–ª–æ–∏—Å—Ç–æ-–¥–æ–∂–¥...","–ü–µ—Ä–∏—Å—Ç—ã—Ö, –ø–µ—Ä–∏—Å—Ç–æ-–∫—É—á–µ–≤—ã—Ö –∏–ª–∏ –ø–µ—Ä–∏—Å—Ç–æ-—Å–ª–æ–∏—Å—Ç—ã—Ö...",20,-5.1,0.4,12.0,,,,
2,22.12.2025 15:00,0.0,742.7,756.7,1.6,87,"–í–µ—Ç–µ—Ä, –¥—É—é—â–∏–π —Å —Å–µ–≤–µ—Ä–æ-–∑–∞–ø–∞–¥–∞",3,,10.0,...,,,9,-1.9,,,,,,
3,22.12.2025 12:00,1.8,741.1,754.9,0.6,93,"–í–µ—Ç–µ—Ä, –¥—É—é—â–∏–π —Å —Å–µ–≤–µ—Ä–æ-–∑–∞–ø–∞–¥–∞",2,,,...,,,20,0.7,,,,,,
4,22.12.2025 09:00,1.5,740.5,754.4,0.3,96,"–í–µ—Ç–µ—Ä, –¥—É—é—â–∏–π —Å –∑–∞–ø–∞–¥–∞",1,,,...,,,18,0.9,1.0,12.0,,,,


In [None]:
def pick_col(df: pd.DataFrame, names: List[str]) -> str:
    """Return the first column of ``df`` that matches one of ``names``.

    Matching is attempted in three passes, each trying ``names`` in order:
    exact match, match after whitespace/case normalisation, and finally
    case-insensitive substring match.

    Raises:
        KeyError: if no column matches in any pass.
    """
    not_found = object()

    # Pass 1: exact match.
    exact = next((n for n in names if n in df.columns), not_found)
    if exact is not not_found:
        return exact

    # Pass 2: compare normalised names (collapsed whitespace, trimmed, lower case).
    normalised = {}
    for col in df.columns:
        normalised[re.sub(r"\s+", " ", str(col)).strip().lower()] = col
    for n in names:
        wanted = n.strip().lower()
        if wanted in normalised:
            return normalised[wanted]

    # Pass 3: substring match, names in the outer loop and columns in the inner.
    partial = next(
        (col for n in names for col in df.columns if n.lower() in str(col).lower()),
        not_found,
    )
    if partial is not not_found:
        return partial

    raise KeyError(f"–ù–µ –Ω–∞—à—ë–ª –∫–æ–ª–æ–Ω–∫—É —Å—Ä–µ–¥–∏ –≤–∞—Ä–∏–∞–Ω—Ç–æ–≤: {names}\n–î–æ—Å—Ç—É–ø–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏: {list(df.columns)[:40]} ...")


# Resolve the actual column names for timestamp, T, U and Ff.
dt_col = pick_col(df_raw, ["–ú–µ—Å—Ç–Ω–æ–µ –≤—Ä–µ–º—è", "Local time", "Time"])
T_col  = pick_col(df_raw, ["T"])
U_col  = pick_col(df_raw, ["U"])
Ff_col = pick_col(df_raw, ["Ff", "FF", "ff"])

print("Using columns:", {"dt": dt_col, "T": T_col, "U": U_col, "Ff": Ff_col})


def to_float(x):
    """Parse the first decimal number found in a cell, or return NaN.

    Missing values give NaN. Commas are treated as decimal points, and the
    first match of an optional minus sign, digits and an optional fractional
    part is converted to float. Cells without any digits give NaN.
    """
    if pd.isna(x):
        return np.nan
    text = _norm_cell(x).replace(",", ".")
    match = re.search(r"-?\d+(\.\d+)?", text)
    if match is None:
        return np.nan
    return float(match.group(0))


# Keep only the four columns of interest under short, fixed names.
df = df_raw[[dt_col, T_col, U_col, Ff_col]].copy()
df.columns = ["dt", "T", "U", "Ff"]

# Parse timestamps day-first; unparseable values become NaT.
df["dt"] = pd.to_datetime(df["dt"], errors="coerce", dayfirst=True)
for c in ["T", "U", "Ff"]:
    df[c] = df[c].apply(to_float)

# Drop rows without a valid timestamp and sort chronologically.
df = df.dropna(subset=["dt"]).sort_values("dt").reset_index(drop=True)

print("Clean df shape:", df.shape)
print("Date range:", df["dt"].min(), "‚Üí", df["dt"].max())
df.head(5)


Using columns: {'dt': '–ú–µ—Å—Ç–Ω–æ–µ –≤—Ä–µ–º—è –≤ –ú–æ—Å–∫–≤–µ (–í–î–ù–•)', 'T': 'T', 'U': 'U', 'Ff': 'Ff'}
Clean df shape: (14602, 4)
Date range: 2020-12-23 00:00:00 ‚Üí 2025-12-22 21:00:00


Unnamed: 0,dt,T,U,Ff
0,2020-12-23 00:00:00,-2.5,84.0,1.0
1,2020-12-23 03:00:00,-2.5,88.0,1.0
2,2020-12-23 06:00:00,-2.0,86.0,1.0
3,2020-12-23 09:00:00,-2.7,82.0,1.0
4,2020-12-23 12:00:00,-3.1,76.0,1.0


In [None]:
# –ë–µ—Ä—ë–º —Ç–æ–ª—å–∫–æ 15:00 (–æ–¥–Ω–∞ —Ç–æ—á–∫–∞ –Ω–∞ –¥–µ–Ω—å)
# Keep only the 15:00 observations (one point per day).
df15 = df[df["dt"].dt.hour == 15].copy()
df15["date"] = df15["dt"].dt.date
# If a date appears more than once, keep its last row; then restore time order.
df15 = df15.drop_duplicates(subset=["date"], keep="last").sort_values("dt").reset_index(drop=True)

print("Rows at 15:00:", len(df15))
print("15:00 range:", df15["dt"].min(), "‚Üí", df15["dt"].max())
df15.head()


Rows at 15:00: 1825
15:00 range: 2020-12-23 15:00:00 ‚Üí 2025-12-22 15:00:00


Unnamed: 0,dt,T,U,Ff,date
0,2020-12-23 15:00:00,-3.7,78.0,3.0,2020-12-23
1,2020-12-24 15:00:00,-3.9,84.0,1.0,2020-12-24
2,2020-12-25 15:00:00,-4.8,90.0,4.0,2020-12-25
3,2020-12-26 15:00:00,0.2,83.0,1.0,2020-12-26
4,2020-12-27 15:00:00,-5.6,70.0,1.0,2020-12-27


In [None]:
def build_dataset_lookback(df15: pd.DataFrame, lookback_days: int = 7):
    """Turn the daily 15:00 table into supervised (window -> next day) samples.

    For every row ``i`` the candidate sample is:
      * X: the ``lookback_days`` preceding rows of ``[T, Ff, U]``, flattened
        row by row into a vector of length ``lookback_days * 3``;
      * y: the ``[T, Ff, U]`` values of row ``i`` itself.

    A candidate is kept only when the window plus the target day form an
    unbroken run of consecutive calendar days (no gaps) and contain no NaN.

    Returns:
        ``(X, y, y_dates)`` where ``X`` and ``y`` are float32 arrays and
        ``y_dates`` holds the ``date`` value of each target row. When nothing
        qualifies, ``X`` and ``y`` are empty arrays with the right width.
    """
    features = df15[["T", "Ff", "U"]].to_numpy(np.float32)
    days = df15["date"].to_list()

    windows, targets, target_days = [], [], []
    for end in range(lookback_days, len(df15)):
        start = end - lookback_days

        # Every neighbouring pair from `start` up to the target day must be
        # exactly one calendar day apart.
        consecutive = all(
            (days[k] - days[k - 1]).days == 1
            for k in range(start + 1, end + 1)
        )
        if not consecutive:
            continue

        past = features[start:end]      # (lookback_days, 3)
        nxt = features[end]             # (3,)
        if np.isnan(past).any() or np.isnan(nxt).any():
            continue

        windows.append(past.reshape(-1))
        targets.append(nxt)
        target_days.append(days[end])

    if windows:
        X = np.stack(windows)
    else:
        X = np.zeros((0, lookback_days*3), np.float32)

    if targets:
        y = np.stack(targets)
    else:
        y = np.zeros((0, 3), np.float32)

    return X, y, np.array(target_days)

# Build the supervised dataset from 7-day windows and report its extent.
X, y, y_dates = build_dataset_lookback(df15, lookback_days=7)
print("Dataset shapes:", X.shape, y.shape)
# Guard the indexing so that an empty dataset prints None instead of raising.
print("First target date:", y_dates[0] if len(y_dates) else None)
print("Last target date:", y_dates[-1] if len(y_dates) else None)


Dataset shapes: (1811, 21) (1811, 3)
First target date: 2020-12-30
Last target date: 2025-12-22


In [None]:
# Sanity check: too few samples usually means the 15:00 filter or the parsing
# went wrong.
assert len(X) > 200, "–°–ª–∏—à–∫–æ–º –º–∞–ª–æ –ø—Ä–∏–º–µ—Ä–æ–≤. –ü—Ä–æ–≤–µ—Ä—å, —á—Ç–æ —Ñ–∏–ª—å—Ç—Ä 15:00 –∏ –ø–∞—Ä—Å–∏–Ω–≥ –ø—Ä–æ—à–ª–∏ –∫–æ—Ä—Ä–µ–∫—Ç–Ω–æ."

# 70 / 15 / 15 split; the test part takes whatever remains after rounding.
n = len(X)
n_train = int(n * 0.70)
n_val   = int(n * 0.15)
n_test  = n - n_train - n_val

# Contiguous slices without shuffling: samples keep the order in which
# build_dataset_lookback produced them, so val and test follow train.
X_train, y_train = X[:n_train], y[:n_train]
X_val,   y_val   = X[n_train:n_train+n_val], y[n_train:n_train+n_val]
X_test,  y_test  = X[n_train+n_val:], y[n_train+n_val:]
dates_test = y_dates[n_train+n_val:]

print("Split sizes:", {"train": len(X_train), "val": len(X_val), "test": len(X_test)})
print("Test date range:", dates_test[0], "‚Üí", dates_test[-1])

# Normalisation statistics come from the training part only; the small
# epsilon protects against division by zero for constant columns.
x_mean = X_train.mean(axis=0, keepdims=True)
x_std  = X_train.std(axis=0, keepdims=True) + 1e-8



y_mean = y_train.mean(axis=0, keepdims=True)
y_std  = y_train.std(axis=0, keepdims=True) + 1e-8

print(y_std )

# Standardisation helpers; they read the module-level statistics above.
def norm_X(a): return (a - x_mean) / x_std
def norm_y(a): return (a - y_mean) / y_std
def denorm_y(a): return a * y_std + y_mean

# Normalised copies of every split, cast to float32.
X_train_n = norm_X(X_train).astype(np.float32)
X_val_n   = norm_X(X_val).astype(np.float32)
X_test_n  = norm_X(X_test).astype(np.float32)

y_train_n = norm_y(y_train).astype(np.float32)
y_val_n   = norm_y(y_val).astype(np.float32)
y_test_n  = norm_y(y_test).astype(np.float32)

def make_loader(Xn, yn, bs=256, shuffle=False):
    """Wrap a feature array and a target array into a ``DataLoader``.

    Both arrays are copied into float32 tensors and paired sample by sample
    in a ``TensorDataset``; ``bs`` is the batch size and ``shuffle`` is
    forwarded to the loader unchanged.
    """
    tensors = [torch.tensor(arr, dtype=torch.float32) for arr in (Xn, yn)]
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset, batch_size=bs, shuffle=shuffle)

# Only the training loader shuffles; val/test keep their original order.
train_loader = make_loader(X_train_n, y_train_n, bs=128, shuffle=True)
val_loader   = make_loader(X_val_n,   y_val_n,   bs=256, shuffle=False)
test_loader  = make_loader(X_test_n,  y_test_n,  bs=256, shuffle=False)

# Network input/output sizes are taken from the data rather than hard-coded.
input_dim = X_train_n.shape[1]  # 21
output_dim = y_train_n.shape[1] # 3
print("input_dim:", input_dim, "output_dim:", output_dim, "(outputs: [T, Ff, U])")


Split sizes: {'train': 1267, 'val': 271, 'test': 273}
Test date range: 2025-03-25 ‚Üí 2025-12-22
[[11.792286   0.7996392 21.065529 ]]
input_dim: 21 output_dim: 3 (outputs: [T, Ff, U])


In [None]:
# Search settings for the architecture chain. MCMCConfig and
# metropolis_hastings_arch_search are defined earlier in the notebook.
cfg = MCMCConfig(
    K=3,
    min_units=8,
    max_units=128,
    step=8,
    iters=30,
    temperature=0.0005,

    train_epochs=50,
    lr=1e-3,
    weight_decay=1e-4,
    dropout=0.1,
    activation="relu",

    cache=True,
    seed_base=12345
)

# Starting point of the chain: three hidden layers of 32 units each.
init_arch = [32, 32, 32]

res = metropolis_hastings_arch_search(
    train_loader=train_loader,
    val_loader=val_loader,
    input_dim=input_dim,
    output_dim=output_dim,
    init_arch=init_arch,
    cfg=cfg,
    verbose=True
)

# Summary of the best state found and of how many entries the cache holds.
print("\nBEST ARCH:", res["best_arch"])
print("BEST VAL LOSS:", res["best_loss"])
print("cache size:", res["loss_cache_size"])


Start: arch=[32, 32, 32], val_loss=0.451857
[001] ACCEPT ‚úÖ | prop=[24, 32, 32] 0.451526 | cur=[24, 32, 32] 0.451526 | best=[24, 32, 32] 0.451526 | p=1.000
[002] ACCEPT ‚úÖ | prop=[24, 32, 24] 0.450167 | cur=[24, 32, 24] 0.450167 | best=[24, 32, 24] 0.450167 | p=1.000
[003] ACCEPT ‚úÖ | prop=[32, 32, 24] 0.448730 | cur=[32, 32, 24] 0.448730 | best=[32, 32, 24] 0.448730 | p=1.000
[004] reject ‚ùå | prop=[32, 32, 32] 0.451857 | cur=[32, 32, 24] 0.448730 | best=[32, 32, 24] 0.448730 | p=0.002
[005] ACCEPT ‚úÖ | prop=[32, 40, 24] 0.446259 | cur=[32, 40, 24] 0.446259 | best=[32, 40, 24] 0.446259 | p=1.000
[006] reject ‚ùå | prop=[40, 40, 24] 0.455089 | cur=[32, 40, 24] 0.446259 | best=[32, 40, 24] 0.446259 | p=0.000
[007] reject ‚ùå | prop=[32, 48, 24] 0.454101 | cur=[32, 40, 24] 0.446259 | best=[32, 40, 24] 0.446259 | p=0.000
[008] reject ‚ùå | prop=[32, 48, 24] 0.454101 | cur=[32, 40, 24] 0.446259 | best=[32, 40, 24] 0.446259 | p=0.000
[009] reject ‚ùå | prop=[40, 40, 24] 0.455089 | cur=

In [None]:
# Final model: train on train+val so that more data is used.
# NOTE(review): after this merge the validation samples are part of the
# training data, so any loss computed on val_loader is no longer held out.
X_trainval_n = np.vstack([X_train_n, X_val_n])
y_trainval_n = np.vstack([y_train_n, y_val_n])

trainval_loader = make_loader(X_trainval_n, y_trainval_n, bs=128, shuffle=True)

# Architecture selected by the search above.
best_arch = res["best_arch"]

def train_final(model, loader, epochs=40, lr=1e-3, weight_decay=1e-4,
                eval_loader=None, log_every=10):
    """Train ``model`` in place with Adam on an MSE loss and log progress.

    Args:
        model: network to optimise; it is modified in place.
        loader: iterable of ``(xb, yb)`` training batches.
        epochs: number of full passes over ``loader``.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        eval_loader: loader used for the periodic loss printout. ``None``
            (default) falls back to the module-level ``val_loader``, which is
            what this function always used before the parameter existed.
        log_every: print the monitoring loss every ``log_every`` epochs;
            a falsy value disables logging.

    Note:
        When ``loader`` already contains the samples of the monitoring loader
        (e.g. a train+val loader monitored on ``val_loader``), the printed
        ``val_loss`` is an in-sample number, not a held-out estimate. Pass a
        separate ``eval_loader`` if a held-out figure is needed.
    """
    loss_fn = nn.MSELoss()
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for ep in range(1, epochs + 1):
        # Re-enter train mode every epoch because the monitoring pass below
        # may have switched the model to eval mode.
        model.train()
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            opt.step()

        if log_every and ep % log_every == 0:
            # Resolve the default lazily so the global is only needed when
            # a log line is actually produced.
            monitor_loader = val_loader if eval_loader is None else eval_loader
            vl = evaluate_loss(model, monitor_loader, loss_fn)
            print(f"epoch {ep}/{epochs} val_loss={vl:.6f}")

# Build the selected architecture with the same activation/dropout as the
# search used, then fit it on the merged train+val loader.
final_model = MLP(input_dim, output_dim, best_arch, activation=cfg.activation, dropout=cfg.dropout).to(device)
train_final(final_model, trainval_loader, epochs=40, lr=cfg.lr, weight_decay=cfg.weight_decay)


epoch 10/40 val_loss=0.489552
epoch 20/40 val_loss=0.450756
epoch 30/40 val_loss=0.431904
epoch 40/40 val_loss=0.422949


In [None]:
@torch.no_grad()
def mc_dropout_predict(model: nn.Module, Xn: np.ndarray, n_samples: int = 300, q_lo=0.10, q_hi=0.90):
    """Monte-Carlo dropout prediction: mean and a quantile interval.

    The model is run ``n_samples`` times in train mode, so dropout layers
    stay active, and the stochastic outputs are summarised along the sample
    axis.

    Args:
        model: trained network; its train/eval mode is restored on exit.
        Xn: already-normalised inputs, one row per example.
        n_samples: number of stochastic forward passes (must be >= 1).
        q_lo: lower quantile of the interval.
        q_hi: upper quantile of the interval.

    Returns:
        ``(mean, lo, hi)`` NumPy arrays, each with the shape of one model
        output for ``Xn``.

    Raises:
        ValueError: if ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be >= 1")

    # Run on whatever device the model lives on, so the call does not depend
    # on a module-level `device` variable agreeing with the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    Xt = torch.tensor(Xn, dtype=torch.float32).to(target_device)

    # Remember the caller's mode; without this the model would silently stay
    # in train mode (dropout on) for every later deterministic prediction.
    was_training = model.training
    model.train()
    try:
        preds = np.stack(
            [model(Xt).cpu().numpy() for _ in range(n_samples)],
            axis=0,
        )
    finally:
        model.train(was_training)

    mean = preds.mean(axis=0)
    lo = np.quantile(preds, q_lo, axis=0)
    hi = np.quantile(preds, q_hi, axis=0)
    return mean, lo, hi

# MC-dropout prediction on the normalised test inputs (10%..90% interval).
mean_n, lo_n, hi_n = mc_dropout_predict(final_model, X_test_n, n_samples=300, q_lo=0.10, q_hi=0.90)

# Back to the original units of the targets.
mean = denorm_y(mean_n)
lo   = denorm_y(lo_n)
hi   = denorm_y(hi_n)

y_true = y_test

# Metrics for each target separately: [T, Ff, U]
mae = np.mean(np.abs(mean - y_true), axis=0)
rmse = np.sqrt(np.mean((mean - y_true) ** 2, axis=0))

# Share of test points whose true value falls inside [lo, hi], per target.
coverage = np.mean((y_true >= lo) & (y_true <= hi), axis=0)

print("\nTEST metrics (outputs = [T, Ff, U])")
print("MAE :", mae)
print("RMSE:", rmse)
print("Coverage in [p10, p90]:", coverage)

# Show two examples: the first and the last test day.
for k in [0, -1]:
    print("\nDate:", dates_test[k])
    print(f"T  pred={mean[k,0]:.2f}  interval=({lo[k,0]:.2f}, {hi[k,0]:.2f})  true={y_true[k,0]:.2f}")
    print(f"Ff pred={mean[k,1]:.2f}  interval=({lo[k,1]:.2f}, {hi[k,1]:.2f})  true={y_true[k,1]:.2f}")
    print(f"U  pred={mean[k,2]:.2f}  interval=({lo[k,2]:.2f}, {hi[k,2]:.2f})  true={y_true[k,2]:.2f}")



TEST metrics (outputs = [T, Ff, U])
MAE : [ 2.927533    0.65617114 12.8168    ]
RMSE: [ 3.7904253   0.79216594 16.159353  ]
Coverage in [p10, p90]: [0.55311355 0.003663   0.20512821]

Date: 2025-03-25
T  pred=9.31  interval=(6.83, 11.82)  true=5.90
Ff pred=1.82  interval=(1.70, 1.91)  true=3.00
U  pred=42.92  interval=(37.92, 49.02)  true=78.00

Date: 2025-12-22
T  pred=1.20  interval=(-1.17, 4.07)  true=0.00
Ff pred=1.29  interval=(1.19, 1.40)  true=3.00
U  pred=81.77  interval=(76.55, 86.71)  true=87.00


In [None]:
def predict_exam_day(df15: pd.DataFrame, exam_date: date, lookback_days: int = 7, n_samples: int = 500):
    """Forecast ``[T, Ff, U]`` for ``exam_date`` from the preceding days.

    The ``lookback_days`` calendar days immediately before ``exam_date`` are
    looked up in ``df15`` (oldest first), flattened into one feature row,
    normalised with ``norm_X`` and passed through MC dropout on the
    module-level ``final_model``.

    Returns:
        ``(mean, lo, hi)``: the de-normalised point forecast and the
        10%..90% quantile bounds, each a vector over ``[T, Ff, U]``.

    Raises:
        ValueError: if any of the required preceding days has no row.
    """
    frame = df15.copy()
    frame["date"] = frame["dt"].dt.date
    frame = frame.sort_values("dt")

    history = []
    for offset in range(lookback_days, 0, -1):
        d = exam_date - timedelta(days=offset)
        day_rows = frame[frame["date"] == d]
        if day_rows.empty:
            raise ValueError(f"–ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –Ω–∞ 15:00 –∑–∞ –¥–∞—Ç—É {d}. –ù—É–∂–Ω—ã 7 –ø—Ä–µ–¥—ã–¥—É—â–∏—Ö –¥–Ω–µ–π –ø–æ–¥—Ä—è–¥.")
        # Several rows for one day: use the latest one.
        latest = day_rows.iloc[-1]
        history.append(latest[["T","Ff","U"]].to_numpy(np.float32))

    # One sample whose features are the stacked days, flattened oldest first.
    X_exam = np.stack(history, axis=0).reshape(1, -1)
    X_exam_n = norm_X(X_exam).astype(np.float32)

    mean_n, lo_n, hi_n = mc_dropout_predict(
        final_model, X_exam_n, n_samples=n_samples, q_lo=0.10, q_hi=0.90
    )
    mean, lo, hi = (denorm_y(part)[0] for part in (mean_n, lo_n, hi_n))

    return mean, lo, hi

# Forecast the day right after the last day present in the 15:00 table.
last_date = df15["date"].iloc[-1]
exam_date = last_date + timedelta(days=1)
print("Using exam_date =", exam_date)

mean_e, lo_e, hi_e = predict_exam_day(df15, exam_date, lookback_days=7, n_samples=500)

# Point forecast plus the 10%..90% MC-dropout interval for each target.
print("\nForecast for 15:00 on exam day (outputs = [T, Ff, U])")
print(f"T  = {mean_e[0]:.2f}  (p10..p90: {lo_e[0]:.2f} .. {hi_e[0]:.2f})")
print(f"Ff = {mean_e[1]:.2f}  (p10..p90: {lo_e[1]:.2f} .. {hi_e[1]:.2f})")
print(f"U  = {mean_e[2]:.2f}  (p10..p90: {lo_e[2]:.2f} .. {hi_e[2]:.2f})")


Using exam_date = 2025-12-23

Forecast for 15:00 on exam day (outputs = [T, Ff, U])
T  = -1.33  (p10..p90: -4.08 .. 1.40)
Ff = 1.37  (p10..p90: 1.28 .. 1.47)
U  = 80.95  (p10..p90: 75.80 .. 85.81)


# **Пункт D**

In [None]:
import time
import math
import random
import numpy as np
import torch
import torch.nn as nn
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple, Optional

def _sync_if_cuda():
    """Call ``torch.cuda.synchronize()`` when CUDA is available; else no-op.

    Used around wall-clock measurements in this notebook.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()


@torch.no_grad()
def evaluate_loss(model: nn.Module, loader, loss_fn) -> float:
    """Return the sample-weighted average of ``loss_fn`` over ``loader``.

    The model is switched to eval mode (and left there). Each batch loss is
    weighted by its batch size, so a smaller final batch does not skew the
    average. An empty loader yields ``0.0``.
    """
    model.eval()

    weighted_sum = 0.0
    seen = 0
    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)
        batch_size = xb.size(0)
        batch_loss = loss_fn(model(xb), yb)
        weighted_sum += batch_loss.item() * batch_size
        seen += batch_size

    # max(..., 1) avoids a division by zero for an empty loader.
    return weighted_sum / max(seen, 1)


def train_model_epochs(model, train_loader, loss_fn, optimizer, epochs: int) -> Dict[str, Any]:
    """Train ``model`` for a fixed number of passes over ``train_loader``.

    Returns:
        A dict with ``steps`` (optimizer updates performed) and
        ``epochs_done`` (always equal to ``epochs``).
    """
    model.train()

    n_updates = 0
    for _epoch in range(epochs):
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            batch_loss = loss_fn(model(features), targets)
            batch_loss.backward()
            optimizer.step()

            n_updates += 1

    return dict(steps=n_updates, epochs_done=epochs)


def train_model_time(model, train_loader, loss_fn, optimizer, time_budget_s: float) -> Dict[str, Any]:
    """Train ``model`` until a wall-clock budget is used up.

    The deadline is checked before every batch, so training stops at the
    first batch boundary after ``time_budget_s`` seconds have elapsed.

    Args:
        model: network to optimise in place.
        train_loader: re-iterable source of ``(xb, yb)`` batches.
        loss_fn: callable ``loss_fn(pred, target)`` returning a scalar loss.
        optimizer: optimizer bound to ``model``'s parameters.
        time_budget_s: training budget in seconds.

    Returns:
        Dict with ``steps`` (optimizer updates), ``epochs_done`` (complete
        passes only), ``batches_in_last_epoch`` (batches processed in the
        interrupted pass) and ``time_budget_s``.
    """
    model.train()
    steps = 0
    epochs_done = 0

    def _summary(batches_in_last_epoch: int) -> Dict[str, Any]:
        # Single place that builds the result so both exits stay identical.
        return {
            "steps": steps,
            "epochs_done": epochs_done,                 # complete epochs only
            "batches_in_last_epoch": batches_in_last_epoch,
            "time_budget_s": time_budget_s
        }

    _sync_if_cuda()
    t_end = time.perf_counter() + float(time_budget_s)

    while True:
        batches_in_this_epoch = 0

        for xb, yb in train_loader:
            # Out of time: leave without counting the unfinished epoch.
            if time.perf_counter() >= t_end:
                _sync_if_cuda()
                return _summary(batches_in_this_epoch)

            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()

            steps += 1
            batches_in_this_epoch += 1

        if batches_in_this_epoch == 0:
            # The loader produced no batches, so the deadline check above
            # never ran. Without this exit the outer loop would spin forever.
            _sync_if_cuda()
            return _summary(0)

        # Reached the end of train_loader => one full epoch completed.
        epochs_done += 1



def train_and_get_val_loss_budget(
    arch: List[int],
    train_loader,
    val_loader,
    input_dim: int,
    output_dim: int,
    activation: str = "relu",
    dropout: float = 0.0,
    lr: float = 1e-3,
    weight_decay: float = 0.0,
    budget_mode: str = "epochs",         # "epochs" | "time"
    train_epochs: int = 10,
    train_time_s: float = 5.0,
    seed_for_arch: Optional[int] = None,
) -> Dict[str, Any]:
    """Train a fresh MLP under a training budget and score it on validation.

    ``budget_mode="epochs"`` trains for ``train_epochs`` passes;
    ``budget_mode="time"`` trains for ``train_time_s`` seconds. When
    ``seed_for_arch`` is given, the Python, NumPy and torch generators are
    all reseeded with it before the model is created.

    Returns:
        Dict with ``val_loss``, ``train_seconds`` (wall-clock time of the
        training call only), the statistics reported by the chosen training
        routine, and ``arch`` (a copy of the input list).

    Raises:
        ValueError: if ``budget_mode`` is neither ``"epochs"`` nor ``"time"``.
    """
    if seed_for_arch is not None:
        seeders = (
            random.seed,
            np.random.seed,
            torch.manual_seed,
            torch.cuda.manual_seed_all,
        )
        for seeder in seeders:
            seeder(seed_for_arch)

    net = MLP(input_dim, output_dim, arch, activation=activation, dropout=dropout).to(device)
    criterion = nn.MSELoss()
    adam = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    # Only the training call is timed; synchronise so pending GPU work does
    # not leak into or out of the measured interval.
    _sync_if_cuda()
    started = time.perf_counter()

    if budget_mode == "time":
        train_info = train_model_time(net, train_loader, criterion, adam, time_budget_s=train_time_s)
    elif budget_mode == "epochs":
        train_info = train_model_epochs(net, train_loader, criterion, adam, epochs=train_epochs)
    else:
        raise ValueError("budget_mode must be 'epochs' or 'time'")

    _sync_if_cuda()
    elapsed = time.perf_counter() - started

    result: Dict[str, Any] = {
        "val_loss": float(evaluate_loss(net, val_loader, criterion)),
        "train_seconds": float(elapsed),
    }
    result.update(train_info)
    result["arch"] = arch.copy()
    return result


In [None]:
# Settings for metropolis_hastings_arch_search_budget (item D): the same
# architecture chain, with a selectable training budget per evaluation.
@dataclass
class MCMCConfigD:
    K: int = 3                    # number of hidden layers; must equal len(init_arch)
    min_units: int = 8            # lower width bound handed to propose_architecture
    max_units: int = 128          # upper width bound handed to propose_architecture
    step: int = 8                 # width step handed to propose_architecture
    iters: int = 30               # number of chain iterations (proposals)
    temperature: float = 0.05     # T in the acceptance probability exp(-delta / T)

    # Model / optimizer settings forwarded to every evaluation.
    activation: str = "relu"
    dropout: float = 0.1
    lr: float = 1e-3
    weight_decay: float = 1e-4

    budget_mode: str = "epochs"  # "epochs" | "time"
    train_epochs: int = 10        # forwarded as train_epochs (epochs mode)
    train_time_s: float = 5.0     # forwarded as train_time_s (time mode)

    cache: bool = True            # reuse the evaluation of an already-seen architecture
    seed_base: int = 12345        # base of the per-architecture training seed


def metropolis_hastings_arch_search_budget(
    train_loader,
    val_loader,
    input_dim: int,
    output_dim: int,
    init_arch: List[int],
    cfg: MCMCConfigD,
    verbose: bool = True
) -> Dict[str, Any]:
    """Metropolis-Hastings search over layer widths with a training budget.

    Starting from ``init_arch``, each iteration proposes a neighbouring
    architecture with ``propose_architecture``, trains it under the budget
    from ``cfg`` and accepts it with probability
    ``min(1, exp(-(prop_loss - cur_loss) / cfg.temperature))``.

    Args:
        train_loader: training batches used for every evaluation.
        val_loader: validation batches; the validation loss is the energy.
        input_dim: number of input features of the MLP.
        output_dim: number of outputs of the MLP.
        init_arch: initial hidden-layer widths; its length must be ``cfg.K``.
        cfg: search and training settings.
        verbose: print one line per iteration when true.

    Returns:
        Dict with ``best_arch``, ``best_loss``, ``history`` (one record per
        iteration), ``acceptance_rate``, ``total_train_seconds``,
        ``cache_size`` and ``cfg``. ``total_train_seconds`` is the time spent
        in training runs that were really executed: the initial evaluation is
        included and cache hits add nothing.
    """
    assert len(init_arch) == cfg.K

    cache: Dict[Tuple[int, ...], Dict[str, Any]] = {}
    total_train_seconds = 0.0

    def get_eval(arch: List[int]) -> Dict[str, Any]:
        nonlocal total_train_seconds

        key = tuple(arch)
        if cfg.cache and key in cache:
            return cache[key]

        seed_for_arch = cfg.seed_base + (abs(hash(key)) % 100000)

        # The evaluation reseeds the global `random` generator for
        # reproducible training. Save and restore its state so the chain's
        # own proposal and acceptance draws are not reset to a value that
        # depends on the proposed architecture.
        chain_rng_state = random.getstate()
        try:
            out = train_and_get_val_loss_budget(
                arch=arch,
                train_loader=train_loader,
                val_loader=val_loader,
                input_dim=input_dim,
                output_dim=output_dim,
                activation=cfg.activation,
                dropout=cfg.dropout,
                lr=cfg.lr,
                weight_decay=cfg.weight_decay,
                budget_mode=cfg.budget_mode,
                train_epochs=cfg.train_epochs,
                train_time_s=cfg.train_time_s,
                seed_for_arch=seed_for_arch
            )
        finally:
            random.setstate(chain_rng_state)

        # Count compute only when a model was actually trained.
        total_train_seconds += float(out.get("train_seconds", 0.0))

        if cfg.cache:
            cache[key] = out
        return out

    cur_arch = init_arch.copy()
    cur_eval = get_eval(cur_arch)
    cur_loss = float(cur_eval["val_loss"])

    best_arch = cur_arch.copy()
    best_loss = cur_loss

    history: List[Dict[str, Any]] = []

    if verbose:
        print(f"Start ({cfg.budget_mode}): arch={cur_arch} val_loss={cur_loss:.6f}")

    for t in range(cfg.iters):
        prop_arch = propose_architecture(cur_arch, cfg.min_units, cfg.max_units, cfg.step)
        prop_eval = get_eval(prop_arch)
        prop_loss = float(prop_eval["val_loss"])

        # Metropolis rule: always accept an improvement, otherwise accept
        # with a probability that decays with the loss increase.
        delta = prop_loss - cur_loss
        if delta <= 0:
            acc_prob = 1.0
        else:
            acc_prob = math.exp(-delta / max(cfg.temperature, 1e-12))

        accepted = (random.random() < acc_prob)

        if accepted:
            cur_arch = prop_arch
            cur_loss = prop_loss

        if cur_loss < best_loss:
            best_loss = cur_loss
            best_arch = cur_arch.copy()

        history.append({
            "iter": t + 1,
            "cur_arch": cur_arch.copy(),
            "cur_loss": float(cur_loss),
            "prop_arch": prop_arch.copy(),
            "prop_loss": float(prop_loss),
            "accepted": bool(accepted),
            "acc_prob": float(acc_prob),
            "prop_train_seconds": float(prop_eval.get("train_seconds", 0.0)),
            "prop_steps": int(prop_eval.get("steps", -1)),
            "prop_epochs_done": int(prop_eval.get("epochs_done", -1)),
            "best_arch": best_arch.copy(),
            "best_loss": float(best_loss),
        })

        if verbose:
            tag = "ACCEPT ‚úÖ" if accepted else "reject ‚ùå"
            ep_done = int(prop_eval.get("epochs_done", -1))
            sec = float(prop_eval.get("train_seconds", 0.0))
            print(
                f"[{t+1:03d}] {tag} | "
                f"prop={prop_arch} loss={prop_loss:.6f} | "
                f"epochs_done={ep_done} time={sec:.2f}s | "
                f"cur={cur_arch} loss={cur_loss:.6f} | "
                f"best={best_arch} {best_loss:.6f} | "
                f"p={acc_prob:.3f}"
            )

    acc_rate = float(np.mean([h["accepted"] for h in history])) if history else 0.0

    return {
        "best_arch": best_arch,
        "best_loss": best_loss,
        "history": history,
        "acceptance_rate": acc_rate,
        "total_train_seconds": float(total_train_seconds),
        "cache_size": len(cache),
        "cfg": cfg,
    }


In [None]:
# Choose the base architecture and the number of epochs for mode A.
init_arch = [32, 32, 32]  # or whatever you started from in (c)
E = 10

# 1) Measure how long training E epochs on init_arch takes. This is a single
#    run, so the figure is one sample rather than an average.
probe = train_and_get_val_loss_budget(
    arch=init_arch,
    train_loader=train_loader,
    val_loader=val_loader,
    input_dim=input_dim,
    output_dim=output_dim,
    activation="relu",
    dropout=0.1,
    lr=1e-3,
    weight_decay=1e-4,
    budget_mode="epochs",
    train_epochs=E,
    seed_for_arch=777
)

# The measured duration becomes the per-evaluation budget of the time mode,
# so both modes get roughly the same training budget.
time_budget = probe["train_seconds"]
print(f"–ö–∞–ª–∏–±—Ä–æ–≤–∫–∞: {E} —ç–ø–æ—Ö –Ω–∞ arch={init_arch} –∑–∞–Ω—è–ª–∏ ~ {time_budget:.2f} —Å–µ–∫—É–Ω–¥.")
print(f"–ó–Ω–∞—á–∏—Ç –¥–ª—è —Ä–µ–∂–∏–º–∞ 'time' –±–µ—Ä—ë–º train_time_s = {time_budget:.2f} (–ø—Ä–∏–º–µ—Ä–Ω–æ –æ–¥–∏–Ω–∞–∫–æ–≤—ã–π –±—é–¥–∂–µ—Ç).")


–ö–∞–ª–∏–±—Ä–æ–≤–∫–∞: 10 —ç–ø–æ—Ö –Ω–∞ arch=[32, 32, 32] –∑–∞–Ω—è–ª–∏ ~ 0.48 —Å–µ–∫—É–Ω–¥.
–ó–Ω–∞—á–∏—Ç –¥–ª—è —Ä–µ–∂–∏–º–∞ 'time' –±–µ—Ä—ë–º train_time_s = 0.48 (–ø—Ä–∏–º–µ—Ä–Ω–æ –æ–¥–∏–Ω–∞–∫–æ–≤—ã–π –±—é–¥–∂–µ—Ç).


In [None]:
# Mode A: a fixed number of epochs per evaluation.
cfg_epochs = MCMCConfigD(
    K=3,
    min_units=8, max_units=128, step=8,
    iters=30,
    temperature=0.05,
    activation="relu",
    dropout=0.1,
    lr=1e-3,
    weight_decay=1e-4,
    budget_mode="epochs",
    train_epochs=E,
    train_time_s=0.0,
    cache=True
)

res_epochs = metropolis_hastings_arch_search_budget(
    train_loader=train_loader,
    val_loader=val_loader,
    input_dim=input_dim,
    output_dim=output_dim,
    init_arch=init_arch,
    cfg=cfg_epochs,
    verbose=True
)

# Summary of the epochs-mode chain.
print("\n=== –ò—Ç–æ–≥ —Ä–µ–∂–∏–º–∞ epochs ===")
print("best_arch:", res_epochs["best_arch"])
print("best_loss:", res_epochs["best_loss"])
print("acceptance_rate:", res_epochs["acceptance_rate"])
print("total_train_seconds (approx):", res_epochs["total_train_seconds"])

# Sanity check: in this mode every evaluation should report the same number
# of completed epochs, namely E.
epochs_list = [h["prop_epochs_done"] for h in res_epochs["history"]]
print("epochs_done values (–¥–æ–ª–∂–Ω—ã –±—ã—Ç—å –æ–¥–∏–Ω–∞–∫–æ–≤—ã–µ):", sorted(set(epochs_list)))
print("expected fixed epochs =", E)


# Mode B: a fixed wall-clock time per evaluation.
# NOTE(review): temperature here is 0.005 while mode A uses 0.05, so the two
# chains differ in more than the budget mode — confirm this is intended.
cfg_time = MCMCConfigD(
    K=3,
    min_units=8, max_units=128, step=8,
    iters=30,
    temperature=0.005,
    activation="relu",
    dropout=0.1,
    lr=1e-3,
    weight_decay=1e-4,
    budget_mode="time",
    train_epochs=0,
    train_time_s=float(time_budget),
    cache=True
)

import time

# --- Mode B: fixed time (time) ---
# Time the whole chain to compare it with the nominal budget below.
t_chain0 = time.perf_counter()

res_time = metropolis_hastings_arch_search_budget(
    train_loader=train_loader,
    val_loader=val_loader,
    input_dim=input_dim,
    output_dim=output_dim,
    init_arch=init_arch,
    cfg=cfg_time,
    verbose=True
)

t_chain1 = time.perf_counter()
wall = t_chain1 - t_chain0

# Nominal budget if every iteration trained a new model for the full time;
# cached architectures are not retrained, so the wall clock can be lower.
expected = cfg_time.iters * cfg_time.train_time_s
print("\n[Time-mode budget check]")
print("EXPECTED budget (iters * time_budget):", expected)
print("WALL-CLOCK time of whole run:", wall)


# Summary of the time-mode chain.
print("\n=== –ò—Ç–æ–≥ —Ä–µ–∂–∏–º–∞ time ===")
print("best_arch:", res_time["best_arch"])
print("best_loss:", res_time["best_loss"])
print("acceptance_rate:", res_time["acceptance_rate"])
print("total_train_seconds (approx):", res_time["total_train_seconds"])

# In time mode the number of completed epochs varies between proposals.
epochs_list_t = np.array([h["prop_epochs_done"] for h in res_time["history"]], dtype=float)
print("epochs_done stats (time-mode):")
print("  min :", int(np.min(epochs_list_t)))
print("  mean:", float(np.mean(epochs_list_t)))
print("  max :", int(np.max(epochs_list_t)))
print("time_budget_s per eval =", cfg_time.train_time_s)


Start (epochs): arch=[32, 32, 32] val_loss=0.471412
[001] ACCEPT ‚úÖ | prop=[24, 32, 32] loss=0.486095 | epochs_done=10 time=0.45s | cur=[24, 32, 32] loss=0.486095 | best=[32, 32, 32] 0.471412 | p=0.746
[002] ACCEPT ‚úÖ | prop=[24, 32, 24] loss=0.494538 | epochs_done=10 time=0.50s | cur=[24, 32, 24] loss=0.494538 | best=[32, 32, 32] 0.471412 | p=0.845
[003] ACCEPT ‚úÖ | prop=[32, 32, 24] loss=0.479516 | epochs_done=10 time=0.45s | cur=[32, 32, 24] loss=0.479516 | best=[32, 32, 32] 0.471412 | p=1.000
[004] ACCEPT ‚úÖ | prop=[32, 32, 32] loss=0.471412 | epochs_done=10 time=0.45s | cur=[32, 32, 32] loss=0.471412 | best=[32, 32, 32] 0.471412 | p=1.000
[005] reject ‚ùå | prop=[32, 40, 32] loss=0.489770 | epochs_done=10 time=0.33s | cur=[32, 32, 32] loss=0.471412 | best=[32, 32, 32] 0.471412 | p=0.693
[006] ACCEPT ‚úÖ | prop=[32, 32, 24] loss=0.479516 | epochs_done=10 time=0.45s | cur=[32, 32, 24] loss=0.479516 | best=[32, 32, 32] 0.471412 | p=0.850
[007] ACCEPT ‚úÖ | prop=[32, 32, 16] loss=



# **Пункт E**

In [None]:
class MLP(nn.Module):
    """Fully connected network with a configurable activation per layer.

    Each hidden layer is ``Linear -> activation -> (optional) Dropout``; a
    final ``Linear`` maps to ``output_dim``.

    Args:
        input_dim: number of input features.
        output_dim: number of outputs.
        arch: hidden-layer widths, one entry per hidden layer.
        activation: a single name applied to every hidden layer, or a
            sequence of names with exactly ``len(arch)`` entries. Names are
            case-insensitive; allowed: relu, tanh, gelu, sigmoid.
        dropout: dropout probability; no Dropout layer is added when it is
            falsy or not positive.

    Raises:
        ValueError: on a length mismatch between ``activation`` and ``arch``,
            or on an unknown activation name.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        arch: List[int],
        activation="relu",       # str or list of str with len(arch) entries
        dropout: float = 0.0
    ):
        super().__init__()

        act_map = {
            "relu": nn.ReLU,
            "tanh": nn.Tanh,
            "gelu": nn.GELU,
            "sigmoid": nn.Sigmoid
        }

        # Normalise `activation` to one lower-case name per hidden layer.
        if not isinstance(activation, str):
            activations = [str(a).lower() for a in activation]
            if len(activations) != len(arch):
                raise ValueError("activation list must have same length as arch")
        else:
            activations = [activation.lower()] * len(arch)

        use_dropout = bool(dropout and dropout > 0)

        modules = []
        fan_in = input_dim
        for idx, width in enumerate(arch):
            act_name = activations[idx]
            if act_name not in act_map:
                raise ValueError(f"Unknown activation: {act_name}. Allowed: {list(act_map.keys())}")

            modules.append(nn.Linear(fan_in, width))
            modules.append(act_map[act_name]())
            if use_dropout:
                modules.append(nn.Dropout(dropout))
            fan_in = width

        modules.append(nn.Linear(fan_in, output_dim))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        return self.net(x)


In [None]:
from dataclasses import dataclass
from typing import Tuple, List, Dict, Any
import random
import math
import numpy as np

# Settings for the item (e) chain, which moves over layer widths and
# per-layer activations.
@dataclass
class MCMCConfigE:
    K: int = 3                    # number of hidden layers; also the range of the layer index drawn by proposals
    min_units: int = 8            # smallest allowed layer width
    max_units: int = 128          # largest allowed layer width
    step: int = 8                 # width change applied by one width move
    iters: int = 30               # number of chain iterations
    temperature: float = 0.05     # T in the acceptance probability exp(-delta / T)

    budget_mode: str = "epochs"     # "epochs" | "time"
    train_epochs: int = 10        # forwarded as train_epochs (epochs mode)
    train_time_s: float = 0.5     # forwarded as train_time_s (time mode)

    lr: float = 1e-3
    weight_decay: float = 1e-4
    dropout: float = 0.1

    # Item E: which activations are allowed + probability of an activation move.
    allowed_activations: Tuple[str, ...] = ("relu", "tanh", "gelu", "sigmoid")
    p_change_activation: float = 0.30   # 30% of moves change an activation, 70% change a width

    cache: bool = True            # reuse the evaluation of an already-seen (arch, acts) state
    seed_base: int = 12345        # base of the per-state training seed


def propose_state_with_activation(
    arch: Tuple[int, ...],
    acts: Tuple[str, ...],
    cfg: MCMCConfigE
) -> Tuple[Tuple[int, ...], Tuple[str, ...]]:
    """Draw a neighbouring ``(arch, acts)`` state for the chain.

    With probability ``cfg.p_change_activation`` one randomly chosen layer
    gets a different activation, picked uniformly from the other allowed
    ones. Otherwise one randomly chosen layer has its width changed by
    ``+cfg.step`` or ``-cfg.step``. A move that is not possible (no
    alternative activation, or a width outside
    ``[cfg.min_units, cfg.max_units]``) returns the inputs unchanged
    (a null move).
    """
    # 1) activation move
    if random.random() < cfg.p_change_activation:
        layer = random.randrange(cfg.K)
        alternatives = [name for name in cfg.allowed_activations if name != acts[layer]]
        if not alternatives:
            return arch, acts  # null move
        replacement = random.choice(alternatives)
        swapped = tuple(
            replacement if idx == layer else name
            for idx, name in enumerate(acts)
        )
        return arch, swapped

    # 2) width move
    layer = random.randrange(cfg.K)
    sign = random.choice([-1, +1])
    candidate = arch[layer] + sign * cfg.step

    if not (cfg.min_units <= candidate <= cfg.max_units):
        return arch, acts  # null move

    resized = tuple(
        int(candidate) if idx == layer else width
        for idx, width in enumerate(arch)
    )
    return resized, acts


In [None]:
def metropolis_hastings_search_arch_and_activation(
    train_loader,
    val_loader,
    input_dim: int,
    output_dim: int,
    init_arch: List[int],
    init_acts: List[str],
    cfg: MCMCConfigE,
    verbose: bool = True
) -> Dict[str, Any]:
    """Metropolis-Hastings search over layer widths and layer activations.

    The chain state is the pair ``(arch, acts)``. Each iteration draws a
    neighbour with ``propose_state_with_activation``, trains it under the
    budget in ``cfg`` and accepts it with probability
    ``min(1, exp(-(prop_loss - cur_loss) / cfg.temperature))``.

    Args:
        train_loader: training batches used for every evaluation.
        val_loader: validation batches; the validation loss is the energy.
        input_dim: number of input features of the MLP.
        output_dim: number of outputs of the MLP.
        init_arch: initial hidden-layer widths (length ``cfg.K``).
        init_acts: initial activation names (length ``cfg.K``).
        cfg: search and training settings.
        verbose: print one line per iteration when true.

    Returns:
        Dict with ``best_arch``, ``best_acts``, ``best_loss``, ``history``
        (one record per iteration) and ``cache_size``.
    """
    # Local import keeps this cell self-contained.
    import zlib

    assert len(init_arch) == cfg.K
    assert len(init_acts) == cfg.K

    # Cache keyed by the full state (arch + acts).
    cache: Dict[Tuple[Tuple[int, ...], Tuple[str, ...]], Dict[str, Any]] = {}

    def get_eval(arch_t: Tuple[int, ...], acts_t: Tuple[str, ...]) -> Dict[str, Any]:
        key = (arch_t, acts_t)
        if cfg.cache and key in cache:
            out = cache[key].copy()
            out["_cached"] = True
            return out

        # The key contains strings, and the built-in hash() of str is salted
        # per interpreter process, so it would give a different training
        # seed in every session. CRC32 of the repr is stable across runs.
        state_digest = zlib.crc32(repr(key).encode("utf-8"))
        seed_for_state = cfg.seed_base + (state_digest % 100000)

        # The evaluation reseeds the global `random` generator. Save and
        # restore its state so the chain's own proposal and acceptance draws
        # are not reset to a value that depends on the proposed state.
        chain_rng_state = random.getstate()
        try:
            out = train_and_get_val_loss_budget(
                arch=list(arch_t),
                train_loader=train_loader,
                val_loader=val_loader,
                input_dim=input_dim,
                output_dim=output_dim,
                activation=list(acts_t),       # per-layer activation list
                dropout=cfg.dropout,
                lr=cfg.lr,
                weight_decay=cfg.weight_decay,
                budget_mode=cfg.budget_mode,
                train_epochs=cfg.train_epochs,
                train_time_s=cfg.train_time_s,
                seed_for_arch=seed_for_state
            )
        finally:
            random.setstate(chain_rng_state)
        out["_cached"] = False

        if cfg.cache:
            cache[key] = out
        return out

    cur_arch = tuple(init_arch)
    cur_acts = tuple(a.lower() for a in init_acts)

    cur_eval = get_eval(cur_arch, cur_acts)
    cur_loss = float(cur_eval["val_loss"])

    best_arch, best_acts = cur_arch, cur_acts
    best_loss = cur_loss

    history = []
    if verbose:
        print(f"Start: arch={list(cur_arch)} acts={list(cur_acts)} val_loss={cur_loss:.6f}")

    for t in range(cfg.iters):
        prop_arch, prop_acts = propose_state_with_activation(cur_arch, cur_acts, cfg)
        prop_eval = get_eval(prop_arch, prop_acts)
        prop_loss = float(prop_eval["val_loss"])

        # Metropolis rule: always accept an improvement, otherwise accept
        # with a probability that decays with the loss increase.
        delta = prop_loss - cur_loss
        if delta <= 0:
            acc_prob = 1.0
        else:
            acc_prob = math.exp(-delta / max(cfg.temperature, 1e-12))

        accepted = (random.random() < acc_prob)
        if accepted:
            cur_arch, cur_acts, cur_loss = prop_arch, prop_acts, prop_loss

        if cur_loss < best_loss:
            best_arch, best_acts, best_loss = cur_arch, cur_acts, cur_loss

        history.append({
            "iter": t + 1,
            "cur_arch": list(cur_arch),
            "cur_acts": list(cur_acts),
            "cur_loss": float(cur_loss),
            "prop_arch": list(prop_arch),
            "prop_acts": list(prop_acts),
            "prop_loss": float(prop_loss),
            "accepted": bool(accepted),
            "acc_prob": float(acc_prob),
            "epochs_done": int(prop_eval.get("epochs_done", -1)),
            "train_seconds": float(prop_eval.get("train_seconds", 0.0)),
            "_cached": bool(prop_eval.get("_cached", False)),
            "best_arch": list(best_arch),
            "best_acts": list(best_acts),
            "best_loss": float(best_loss),
        })

        if verbose:
            tag = "ACCEPT ‚úÖ" if accepted else "reject ‚ùå"
            print(
                f"[{t+1:03d}] {tag} | "
                f"prop_arch={list(prop_arch)} prop_acts={list(prop_acts)} loss={prop_loss:.6f} | "
                f"cur_arch={list(cur_arch)} cur_acts={list(cur_acts)} cur_loss={cur_loss:.6f} | "
                f"best_loss={best_loss:.6f} p={acc_prob:.3f}"
            )

    return {
        "best_arch": list(best_arch),
        "best_acts": list(best_acts),
        "best_loss": float(best_loss),
        "history": history,
        "cache_size": len(cache),
    }


In [None]:
# Settings for the item (e) run: 30 iterations under a time budget.
cfgE = MCMCConfigE(
    K=3,
    iters=30,
    temperature=0.05,
    budget_mode="time",       # or "epochs"
    train_time_s=0.3,         # used when budget_mode is "time"
    train_epochs=50,          # used when budget_mode is "epochs"
    p_change_activation=0.30,
    allowed_activations=("relu", "tanh", "gelu", "sigmoid"),
    dropout=0.1,
    lr=1e-3,
    weight_decay=1e-4,
    cache=True
)

# Initial state: three ReLU layers of 32 units.
init_arch = [32, 32, 32]
init_acts = ["relu", "relu", "relu"]

resE = metropolis_hastings_search_arch_and_activation(
    train_loader=train_loader,
    val_loader=val_loader,
    input_dim=input_dim,
    output_dim=output_dim,
    init_arch=init_arch,
    init_acts=init_acts,
    cfg=cfgE,
    verbose=True
)

# Best state found by the chain.
print("\n=== –ø—É–Ω–∫—Ç (e) –∏—Ç–æ–≥ ===")
print("best_arch:", resE["best_arch"])
print("best_acts:", resE["best_acts"])
print("best_loss:", resE["best_loss"])


Start: arch=[32, 32, 32] acts=['relu', 'relu', 'relu'] val_loss=0.599587
[001] ACCEPT ‚úÖ | prop_arch=[24, 32, 32] prop_acts=['relu', 'relu', 'relu'] loss=0.600671 | cur_arch=[24, 32, 32] cur_acts=['relu', 'relu', 'relu'] cur_loss=0.600671 | best_loss=0.599587 p=0.979
[002] ACCEPT ‚úÖ | prop_arch=[32, 32, 32] prop_acts=['relu', 'relu', 'relu'] loss=0.599587 | cur_arch=[32, 32, 32] cur_acts=['relu', 'relu', 'relu'] cur_loss=0.599587 | best_loss=0.599587 p=1.000
[003] reject ‚ùå | prop_arch=[32, 32, 32] prop_acts=['relu', 'relu', 'sigmoid'] loss=0.902975 | cur_arch=[32, 32, 32] cur_acts=['relu', 'relu', 'relu'] cur_loss=0.599587 | best_loss=0.599587 p=0.002
[004] reject ‚ùå | prop_arch=[32, 32, 24] prop_acts=['relu', 'relu', 'relu'] loss=0.934116 | cur_arch=[32, 32, 32] cur_acts=['relu', 'relu', 'relu'] cur_loss=0.599587 | best_loss=0.599587 p=0.001
[005] ACCEPT ‚úÖ | prop_arch=[32, 32, 40] prop_acts=['relu', 'relu', 'relu'] loss=0.478268 | cur_arch=[32, 32, 40] cur_acts=['relu', 'relu',

In [None]:
# Distinct activation tuples the chain occupied (current state, every step).
acts_seen = [tuple(h["cur_acts"]) for h in resE["history"]]
unique_acts = sorted(set(acts_seen))

print("–°–∫–æ–ª—å–∫–æ —Ä–∞–∑–Ω—ã—Ö –Ω–∞–±–æ—Ä–æ–≤ –∞–∫—Ç–∏–≤–∞—Ü–∏–π –≤—Å—Ç—Ä–µ—Ç–∏–ª–æ—Å—å:", len(unique_acts))
print("–ü—Ä–∏–º–µ—Ä—ã:", unique_acts[:10])

# How many times the activations actually changed along the chain (i.e. on
# accepted activation moves). The starting point is the real initial state
# of the run, not a hard-coded literal, so the count stays right when
# init_acts is edited.
# NOTE(review): only the activation tuple is compared, so width-only moves
# are not counted although the printed label also mentions the architecture.
changes = 0
prev = tuple(a.lower() for a in init_acts)
for h in resE["history"]:
    cur = tuple(h["cur_acts"])
    if cur != prev:
        changes += 1
    prev = cur

print("–°–∫–æ–ª—å–∫–æ —Ä–∞–∑ —Å–æ—Å—Ç–æ—è–Ω–∏–µ (–∞–∫—Ç–∏–≤–∞—Ü–∏–∏/–∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞) –º–µ–Ω—è–ª–æ—Å—å:", changes)


–°–∫–æ–ª—å–∫–æ —Ä–∞–∑–Ω—ã—Ö –Ω–∞–±–æ—Ä–æ–≤ –∞–∫—Ç–∏–≤–∞—Ü–∏–π –≤—Å—Ç—Ä–µ—Ç–∏–ª–æ—Å—å: 7
–ü—Ä–∏–º–µ—Ä—ã: [('gelu', 'tanh', 'relu'), ('relu', 'relu', 'relu'), ('relu', 'relu', 'sigmoid'), ('relu', 'tanh', 'gelu'), ('relu', 'tanh', 'relu'), ('relu', 'tanh', 'tanh'), ('tanh', 'tanh', 'relu')]
–°–∫–æ–ª—å–∫–æ —Ä–∞–∑ —Å–æ—Å—Ç–æ—è–Ω–∏–µ (–∞–∫—Ç–∏–≤–∞—Ü–∏–∏/–∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞) –º–µ–Ω—è–ª–æ—Å—å: 9


# **Пункт F**

In [None]:
from dataclasses import dataclass
from typing import Tuple, List, Dict, Any
import random, math
import numpy as np

@dataclass
class MCMCConfigF:
    """Settings for the variable-depth Metropolis-Hastings architecture search (part F).

    Groups the bounds of the search space (depth, layer widths, activations),
    the sampler settings (iterations, temperature, move-type probabilities)
    and the per-candidate training settings that are forwarded to the
    training helper.
    """

    # depth is NOT fixed any more
    min_layers: int = 1          # minimum number of hidden layers
    max_layers: int = 5          # maximum number of hidden layers

    # layer widths: bounds, and the increment used both for width moves and
    # for the discrete grid a newly inserted layer is drawn from
    min_units: int = 8
    max_units: int = 128
    step: int = 8

    # number of MH iterations and the temperature dividing the loss difference
    iters: int = 50
    temperature: float = 0.05

    # training (forwarded unchanged to the training helper)
    budget_mode: str = "time"    # "epochs" | "time"
    train_epochs: int = 10
    train_time_s: float = 0.3

    lr: float = 1e-3
    weight_decay: float = 1e-4
    dropout: float = 0.1

    # activations (as in part e)
    allowed_activations: Tuple[str, ...] = ("relu", "tanh", "gelu", "sigmoid")

    # probabilities of the move types (should sum to 1; the proposal picks a
    # depth move whenever neither a width nor an activation move is drawn)
    p_width_move: float = 0.55
    p_act_move: float = 0.25
    p_depth_move: float = 0.20

    # cache: reuse the evaluation of a state that was already trained
    # seed_base: offset added to the per-state training seed
    cache: bool = True
    seed_base: int = 12345


In [None]:
def _depth_add_remove_probs(k: int, cfg: "MCMCConfigF") -> Tuple[float, float]:
    """
    Probabilities of choosing ADD / REMOVE inside a depth move, given the bounds.

    Args:
        k: current number of hidden layers.
        cfg: configuration providing ``min_layers`` and ``max_layers``.

    Returns:
        ``(p_add, p_remove)``:
          * ``(0.5, 0.5)`` when both moves keep the depth inside the bounds,
          * ``(1.0, 0.0)`` when only adding is legal,
          * ``(0.0, 1.0)`` when only removing is legal,
          * ``(0.0, 0.0)`` when neither is legal (e.g. ``min_layers ==
            max_layers``), so that the caller can fall back to a null move
            instead of growing past ``max_layers``.
    """
    can_add = k < cfg.max_layers
    can_remove = k > cfg.min_layers

    if can_add and can_remove:
        return 0.5, 0.5      # both directions allowed: split evenly
    if can_add:
        return 1.0, 0.0      # at the lower bound: can only add
    if can_remove:
        return 0.0, 1.0      # at the upper bound: can only remove
    return 0.0, 0.0          # depth is pinned: no depth move is possible


def _width_choices(cfg: "MCMCConfigF") -> List[int]:
    """
    Discrete grid of layer widths a newly inserted layer is drawn from.

    The grid is ``min_units, min_units + step, ...`` up to and including
    ``max_units`` when it lies on the grid; keeping it discrete is what lets
    the reverse-move probabilities be computed exactly.

    Raises:
        ValueError: if the configuration yields no width at all (for example
            ``max_units < min_units``). Without this check the empty grid
            would only surface later as a division by zero or an
            ``IndexError`` inside the depth proposal.
    """
    widths = list(range(cfg.min_units, cfg.max_units + 1, cfg.step))
    if not widths:
        raise ValueError(
            "empty width grid: check min_units, max_units and step "
            f"(got {cfg.min_units}, {cfg.max_units}, {cfg.step})"
        )
    return widths


def propose_state_depth(
    arch: Tuple[int, ...],
    acts: Tuple[str, ...],
    cfg: "MCMCConfigF"
) -> Dict[str, Any]:
    """
    Draw one random proposal around the state ``(arch, acts)``.

    A move type is picked first: a width move with probability
    ``cfg.p_width_move``, an activation move with probability
    ``cfg.p_act_move``, and a depth move otherwise.

    Args:
        arch: hidden-layer widths of the current state.
        acts: activation name of each hidden layer; same length as ``arch``.
        cfg: search configuration (bounds, step, allowed activations,
            move-type probabilities).

    Returns:
        A dict with the keys
          - ``new_arch``, ``new_acts``: the proposed state (tuples),
          - ``move_type``: ``"width"``, ``"act"``, ``"depth_add"``,
            ``"depth_remove"`` or one of the ``"*_null"`` variants when the
            proposal leaves the state unchanged,
          - ``q_fwd``, ``q_rev``: forward / reverse proposal probabilities.
            Width and activation moves report ``1.0`` for both, so their
            ratio is neutral; depth moves report the actual probabilities.

    Note:
        The reverse probability of a removal assumes the removed layer could
        be re-created by an ADD move, i.e. that its width lies on the grid
        returned by ``_width_choices`` and its activation is one of
        ``cfg.allowed_activations`` -- TODO confirm for custom start states.
    """
    k = len(arch)
    assert k == len(acts)

    def null_move(label: str) -> Dict[str, Any]:
        # Proposal that keeps the current state as it is.
        return {"new_arch": arch, "new_acts": acts, "move_type": label, "q_fwd": 1.0, "q_rev": 1.0}

    # choose the move type
    r = random.random()
    if r < cfg.p_width_move:
        move_type = "width"
    elif r < cfg.p_width_move + cfg.p_act_move:
        move_type = "act"
    else:
        move_type = "depth"

    # ---------- 1) WIDTH MOVE (symmetric) ----------
    if move_type == "width":
        # with no hidden layer there is nothing to resize
        if k == 0:
            return null_move("width_null")

        i = random.randrange(k)
        direction = random.choice([-1, +1])
        cand = arch[i] + direction * cfg.step

        # null move when the candidate leaves the allowed range
        if cand < cfg.min_units or cand > cfg.max_units:
            return null_move("width_null")

        new_arch = list(arch)
        new_arch[i] = int(cand)

        # for real moves the forward and reverse probabilities are equal
        return {"new_arch": tuple(new_arch), "new_acts": acts, "move_type": "width", "q_fwd": 1.0, "q_rev": 1.0}

    # ---------- 2) ACT MOVE (symmetric) ----------
    if move_type == "act":
        # with no hidden layer there is no activation to swap
        if k == 0:
            return null_move("act_null")

        i = random.randrange(k)
        cur = acts[i]
        candidates = [a for a in cfg.allowed_activations if a != cur]
        if not candidates:
            return null_move("act_null")

        new_acts = list(acts)
        new_acts[i] = random.choice(candidates)

        return {"new_arch": arch, "new_acts": tuple(new_acts), "move_type": "act", "q_fwd": 1.0, "q_rev": 1.0}

    # ---------- 3) DEPTH MOVE (ADD/REMOVE, q_rev/q_fwd matter here) ----------
    add_p, rem_p = _depth_add_remove_probs(k, cfg)
    widths = _width_choices(cfg)
    W = len(widths)
    A = len(cfg.allowed_activations)

    # defensive: the helper reports that neither direction is possible
    if add_p == 0.0 and rem_p == 0.0:
        return null_move("depth_null")

    # decide add/remove
    do_add = random.random() < add_p

    new_arch = list(arch)
    new_acts = list(acts)

    # --- ADD LAYER ---
    if do_add:
        # insertion position (k+1 possibilities), then width and activation
        pos = random.randrange(k + 1)
        w_new = random.choice(widths)
        a_new = random.choice(cfg.allowed_activations)

        new_arch.insert(pos, w_new)
        new_acts.insert(pos, a_new)

        # q_forward = P(depth)*P(add|k)*P(pos)*P(width)*P(act)
        q_fwd = cfg.p_depth_move * add_p * (1.0 / (k + 1)) * (1.0 / W) * (1.0 / A)

        # reverse move: from k+1 layers remove exactly the inserted layer
        # (probability 1/(k+1))
        _, rem_p_back = _depth_add_remove_probs(k + 1, cfg)
        q_rev = cfg.p_depth_move * rem_p_back * (1.0 / (k + 1))

        return {
            "new_arch": tuple(new_arch),
            "new_acts": tuple(new_acts),
            "move_type": "depth_add",
            "q_fwd": q_fwd,
            "q_rev": q_rev
        }

    # --- REMOVE LAYER ---
    # reached only when rem_p > 0
    pos = random.randrange(k)
    new_arch.pop(pos)
    new_acts.pop(pos)

    # q_forward = P(depth)*P(remove|k)*P(pos)
    q_fwd = cfg.p_depth_move * rem_p * (1.0 / k)

    # reverse move: from k-1 layers insert the same width/activation at the
    # same position (k possible positions)
    add_p_back, _ = _depth_add_remove_probs(k - 1, cfg)
    q_rev = cfg.p_depth_move * add_p_back * (1.0 / k) * (1.0 / W) * (1.0 / A)

    return {
        "new_arch": tuple(new_arch),
        "new_acts": tuple(new_acts),
        "move_type": "depth_remove",
        "q_fwd": q_fwd,
        "q_rev": q_rev
    }


In [None]:
def metropolis_hastings_depth_search(
    train_loader,
    val_loader,
    input_dim: int,
    output_dim: int,
    init_arch: List[int],
    init_acts: List[str],
    cfg: MCMCConfigF,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Metropolis-Hastings search over (widths, activations, depth) of an MLP.

    Every iteration asks ``propose_state_depth`` for a candidate, evaluates it
    with ``train_and_get_val_loss_budget`` and accepts it with probability
    ``min(1, exp(-(prop_loss - cur_loss) / T) * q_rev / q_fwd)``.

    Args:
        train_loader, val_loader: forwarded unchanged to the training helper.
        input_dim, output_dim: forwarded unchanged to the training helper.
        init_arch: hidden-layer widths of the start state.
        init_acts: activation names of the start state, one per layer
            (lower-cased before use).
        cfg: search configuration.
        verbose: print the start state and one line per iteration.

    Returns:
        Dict with ``best_arch``, ``best_acts``, ``best_depth``, ``best_loss``
        (best state the chain has been in), ``history`` (one record per
        iteration) and ``cache_size`` (number of cached evaluations).

    Raises:
        AssertionError: if ``init_arch`` and ``init_acts`` differ in length or
            the start depth is outside ``[cfg.min_layers, cfg.max_layers]``.
    """
    import zlib  # local import: only needed for the per-state seed below

    assert len(init_arch) == len(init_acts)
    assert cfg.min_layers <= len(init_arch) <= cfg.max_layers

    cache: Dict[Tuple[Tuple[int, ...], Tuple[str, ...]], Dict[str, Any]] = {}

    def get_eval(arch_t: Tuple[int, ...], acts_t: Tuple[str, ...]) -> Dict[str, Any]:
        # Evaluate one state, reusing the cached result when caching is on.
        key = (arch_t, acts_t)
        if cfg.cache and key in cache:
            out = cache[key].copy()
            out["_cached"] = True
            return out

        # The seed must be a pure function of the state. hash() of a tuple
        # containing str is salted per interpreter process, so a CRC of the
        # repr is used instead to keep seeds identical across sessions.
        state_digest = zlib.crc32(repr(key).encode("utf-8"))
        seed_for_state = cfg.seed_base + (state_digest % 100000)

        out = train_and_get_val_loss_budget(
            arch=list(arch_t),
            train_loader=train_loader,
            val_loader=val_loader,
            input_dim=input_dim,
            output_dim=output_dim,
            activation=list(acts_t),      # list of activations, one per layer
            dropout=cfg.dropout,
            lr=cfg.lr,
            weight_decay=cfg.weight_decay,
            budget_mode=cfg.budget_mode,
            train_epochs=cfg.train_epochs,
            train_time_s=cfg.train_time_s,
            seed_for_arch=seed_for_state
        )
        out["_cached"] = False

        if cfg.cache:
            cache[key] = out
        return out

    cur_arch = tuple(int(x) for x in init_arch)
    cur_acts = tuple(str(a).lower() for a in init_acts)

    cur_eval = get_eval(cur_arch, cur_acts)
    cur_loss = float(cur_eval["val_loss"])

    best_arch, best_acts, best_loss = cur_arch, cur_acts, cur_loss

    history: List[Dict[str, Any]] = []

    if verbose:
        print(f"Start: depth={len(cur_arch)} arch={list(cur_arch)} acts={list(cur_acts)} loss={cur_loss:.6f}")

    # guard against a zero temperature
    T = max(cfg.temperature, 1e-12)

    for t in range(cfg.iters):
        prop = propose_state_depth(cur_arch, cur_acts, cfg)
        prop_arch = prop["new_arch"]
        prop_acts = prop["new_acts"]

        prop_eval = get_eval(prop_arch, prop_acts)
        prop_loss = float(prop_eval["val_loss"])

        # MH acceptance including q_rev/q_fwd (this matters for
        # depth_add/depth_remove); the floor protects log() from zeros
        q_fwd = max(float(prop.get("q_fwd", 1.0)), 1e-300)
        q_rev = max(float(prop.get("q_rev", 1.0)), 1e-300)

        if not math.isfinite(prop_loss):
            # a diverged candidate (NaN/inf loss) must never be accepted
            acc_prob = 0.0
        elif not math.isfinite(cur_loss):
            # leave a diverged current state as soon as a finite one shows up
            acc_prob = 1.0
        else:
            log_accept = -(prop_loss - cur_loss) / T
            log_accept += math.log(q_rev) - math.log(q_fwd)
            # exp() overflows for large positive arguments (big improvement
            # at low temperature), so the min(1, .) is taken in log space
            acc_prob = 1.0 if log_accept >= 0.0 else math.exp(log_accept)

        accepted = (random.random() < acc_prob)

        if accepted:
            cur_arch, cur_acts, cur_loss = prop_arch, prop_acts, prop_loss

        # a NaN best (diverged start state) compares false with everything,
        # so it is replaced explicitly by the first non-NaN current loss
        if cur_loss < best_loss or (math.isnan(best_loss) and not math.isnan(cur_loss)):
            best_arch, best_acts, best_loss = cur_arch, cur_acts, cur_loss

        history.append({
            "iter": t + 1,
            "move_type": prop["move_type"],
            "accepted": bool(accepted),
            "acc_prob": float(acc_prob),

            "cur_depth": len(cur_arch),
            "cur_arch": list(cur_arch),
            "cur_acts": list(cur_acts),
            "cur_loss": float(cur_loss),

            "prop_depth": len(prop_arch),
            "prop_arch": list(prop_arch),
            "prop_acts": list(prop_acts),
            "prop_loss": float(prop_loss),

            "epochs_done": int(prop_eval.get("epochs_done", -1)),
            "train_seconds": float(prop_eval.get("train_seconds", 0.0)),
            "_cached": bool(prop_eval.get("_cached", False)),

            "best_depth": len(best_arch),
            "best_arch": list(best_arch),
            "best_acts": list(best_acts),
            "best_loss": float(best_loss),
        })

        if verbose:
            tag = "ACCEPT ‚úÖ" if accepted else "reject ‚ùå"
            print(
                f"[{t+1:03d}] {tag} {prop['move_type']} | "
                f"prop_depth={len(prop_arch)} loss={prop_loss:.6f} -> "
                f"cur_depth={len(cur_arch)} loss={cur_loss:.6f} | "
                f"best={best_loss:.6f} p={acc_prob:.3f}"
            )

    return {
        "best_arch": list(best_arch),
        "best_acts": list(best_acts),
        "best_depth": len(best_arch),
        "best_loss": float(best_loss),
        "history": history,
        "cache_size": len(cache),
    }


In [None]:
# Configuration of the variable-depth run for part (f).
cfgF = MCMCConfigF(
    # search space
    min_layers=1,
    max_layers=5,
    allowed_activations=("relu", "tanh", "gelu", "sigmoid"),

    # sampler: chain length, temperature and move-type probabilities
    iters=60,
    temperature=0.005,
    p_width_move=0.50,
    p_act_move=0.20,
    p_depth_move=0.30,

    # training of every candidate
    budget_mode="time",
    train_time_s=0.3,
    lr=1e-3,
    weight_decay=1e-4,
    dropout=0.1,

    cache=True,
)

# Start, for example, from depth 3. Identical activations would work as
# well -- the depth is going to change anyway.
init_arch = [32, 32, 32]
init_acts = ["relu", "tanh", "gelu"]

resF = metropolis_hastings_depth_search(
    train_loader,
    val_loader,
    input_dim,
    output_dim,
    init_arch,
    init_acts,
    cfgF,
    verbose=True,
)

# Summary of the best state found by the chain.
print("\n=== –ø—É–Ω–∫—Ç (f) –∏—Ç–æ–≥ ===")
for label, field in (
    ("best_depth:", "best_depth"),
    ("best_arch :", "best_arch"),
    ("best_acts :", "best_acts"),
    ("best_loss :", "best_loss"),
):
    print(label, resF[field])


Start: depth=3 arch=[32, 32, 32] acts=['relu', 'tanh', 'gelu'] loss=0.495836
[001] ACCEPT ‚úÖ act | prop_depth=3 loss=0.464683 -> cur_depth=3 loss=0.464683 | best=0.464683 p=1.000
[002] reject ‚ùå act | prop_depth=3 loss=0.476014 -> cur_depth=3 loss=0.464683 | best=0.464683 p=0.868
[003] reject ‚ùå act | prop_depth=3 loss=0.489259 -> cur_depth=3 loss=0.464683 | best=0.464683 p=0.736
[004] reject ‚ùå depth_remove | prop_depth=2 loss=0.476880 -> cur_depth=3 loss=0.464683 | best=0.464683 p=0.013
[005] ACCEPT ‚úÖ width | prop_depth=3 loss=0.470320 -> cur_depth=3 loss=0.470320 | best=0.464683 p=0.932
[006] ACCEPT ‚úÖ act | prop_depth=3 loss=0.452347 -> cur_depth=3 loss=0.452347 | best=0.452347 p=1.000
[007] reject ‚ùå width | prop_depth=3 loss=0.465787 -> cur_depth=3 loss=0.452347 | best=0.452347 p=0.845
[008] ACCEPT ‚úÖ act | prop_depth=3 loss=0.470320 -> cur_depth=3 loss=0.470320 | best=0.452347 p=0.799
[009] ACCEPT ‚úÖ act | prop_depth=3 loss=0.487841 -> cur_depth=3 loss=0.487841 | best=

In [None]:
# Depth of the chain state after every iteration, and how many proposals were
# depth moves (any move type whose name contains "depth").
history_f = resF["history"]
depths = [step["cur_depth"] for step in history_f]
depth_move_count = sum(1 for step in history_f if "depth" in step["move_type"])

print("–£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≥–ª—É–±–∏–Ω—ã –≤ —Ü–µ–ø–∏:", sorted(set(depths)))
print("–°–∫–æ–ª—å–∫–æ —Ä–∞–∑ –±—ã–ª depth-move:", depth_move_count)


–£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≥–ª—É–±–∏–Ω—ã –≤ —Ü–µ–ø–∏: [3, 4, 5]
–°–∫–æ–ª—å–∫–æ —Ä–∞–∑ –±—ã–ª depth-move: 15
