<a href="https://colab.research.google.com/github/fishan/Veector/blob/base/Veector_split_DeepSeek_R1_Distill_Qwen_1_5b_int8_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Cell 0: Install Dependencies ===
!pip install numpy psutil torch transformers accelerate bitsandbytes ipfshttpclient qiskit qiskit-aer requests huggingface_hub -q
print("Dependencies installed/checked.")

Dependencies installed/checked.


In [None]:
# === Cell 1: Imports (Corrected and Simplified - FINAL) ===

# --- Standard Imports ---
import numpy as np
import queue
import threading
import time
import random
import psutil
import os
import gc
import pickle
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple, Union
from google.colab import drive, files, userdata # Keep Colab imports
from huggingface_hub import login             # Keep HF import
from transformers import AutoModelForCausalLM, AutoTokenizer # Keep Transformers imports

print("Standard/External imports loaded.")

# --- Optional Imports ---
# Each optional dependency is probed once; the matching *_AVAILABLE flag
# records whether the import succeeded.
# NOTE(review): the configuration cell uses `torch.float16` unconditionally,
# so a missing torch still fails later despite this guard.
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("Warning: PyTorch not found. GPU features may be limited.")

# IPFS client: a failed import is silent (the warning below is commented out).
try:
    import ipfshttpclient
    IPFS_AVAILABLE = True
except ImportError:
    IPFS_AVAILABLE = False
    # print("Warning: ipfshttpclient not found. IPFS features disabled.")

# Qiskit: all three names must import, otherwise the flag is set to False
# without any message.
# NOTE(review): `qiskit.providers.aer` and `qiskit.execute` are believed to be
# absent from recent Qiskit releases (Aer is shipped as the separate
# `qiskit_aer` package). With the unpinned install in Cell 0 this branch may
# therefore always end with QISKIT_AVAILABLE = False — confirm.
try:
    from qiskit import QuantumCircuit
    from qiskit.providers.aer import Aer
    from qiskit import execute
    QISKIT_AVAILABLE = True
except ImportError:
    QISKIT_AVAILABLE = False
    # print("Warning: Qiskit not found. Quantum operations disabled.")

print("Optional imports checked.")

# --- Veector Project Imports (Single Correct Block) ---
# Ensure core.py, tensors.py (v0.5.1+), veectordb.py (v0.7.1+),
# operations.py, memory.py are uploaded and accessible.
# PROJECT_IMPORTS_OK is flipped to True only when every import below succeeds.
# NOTE: on failure the error is printed but NOT re-raised, so later cells keep
# running and fail on the missing names instead; check PROJECT_IMPORTS_OK.
PROJECT_IMPORTS_OK = False
try:
    # Import core classes/functions needed by THIS script (converter/inference)
    from core import Veector
    from veectordb import VeectorDB # Needed if we re-initialize DB here? Usually not.
    from tensors import (
        TENSORS_VERSION, TensorCoordinate, create_tensor, # Core functions
        validate_tensor, validate_tensor_tuple, get_tensor_hash, # Validators and hash
        # Import ONLY the constants that are actually used
        # and that exist in tensors.py v0.7.6
        TAG_TYPE_PROCESSOR, TAG_TYPE_KNOWLEDGE, TAG_TYPE_CONVERTER, TAG_TYPE_STATE,
        TAG_MODEL_QWEN2, TAG_MODEL_LLAMA3, TAG_MODEL_DEEPSEEK,
        TAG_PREC_FLOAT32, TAG_PREC_FLOAT16, TAG_PREC_BFLOAT16,
        TAG_PREC_INT8, TAG_PREC_INT4,
        TAG_COMP_WEIGHTS, TAG_COMP_BIAS, TAG_COMP_EMBEDDING, TAG_COMP_ATTN_Q,
        TAG_COMP_ATTN_K, TAG_COMP_ATTN_V, TAG_COMP_ATTN_O, TAG_COMP_ATTN_QKV,
        TAG_COMP_FFN_GATE, TAG_COMP_FFN_UP, TAG_COMP_FFN_DOWN, TAG_COMP_LAYERNORM,
        TAG_COMP_LM_HEAD,
        TAG_FUNC_LINEAR, TAG_FUNC_ATTENTION, TAG_FUNC_FFN,
        TAG_FUNC_EMBED_LOOKUP, TAG_FUNC_CAST_DTYPE, TAG_FUNC_RESHAPE,
        TAG_SEMANTIC_HIDDEN_STATE, TAG_SEMANTIC_LOGITS, TAG_SEMANTIC_TOKEN_IDS,
        TAG_SEMANTIC_KV_CACHE,
        tag_layer, # Helper that builds a layer tag
        GROUP_IDX_QWEN_KNOWLEDGE, GROUP_IDX_QWEN_PROCESSOR, # Group IDs
        GROUP_IDX_DEEPSEEK_KNOWLEDGE
    )
    # Only import from operations/memory if DIRECTLY used in THIS script, otherwise core.py handles it
    # from operations import * # Generally not needed here
    # from memory import Memory # Generally not needed here

    print("Veector project components imported successfully for this script.")
    PROJECT_IMPORTS_OK = True

except ImportError as e:
    # A module or one of the requested names could not be imported.
    print(f"---!!! FATAL ERROR (ImportError) !!! ---")
    print(f"Specific error: {e}")
    print(f"Could not import required name from core.py or tensors.py.")
    print(f"Ensure files are UP-TO-DATE (tensors v0.5.1+, core v0.5.2+), CORRECT, and ACCESSIBLE.")
    print(f"-----------------------------------------")
    # Optionally define dummies if needed for notebook structure
except Exception as other_e:
    # Anything else raised while importing (e.g. an error inside a project file).
    print(f"---!!! FATAL ERROR (Other Exception during Import) !!! ---")
    print(f"Specific error: {other_e}")
    import traceback
    traceback.print_exc()
    print(f"Check imported files for syntax errors.")
    print(f"----------------------------------------------------------")

# Removed the redundant import check block ('Checking imports...')

Standard/External imports loaded.
Optional imports checked.
  [VeectorDB] Successfully imported tensors v0.7.6
  Imported VeectorDB (v0.9.8)
  Imported tensors (v0.7.6)
  Imported operations (v0.8.9)
  Imported Memory (v0.1.0)
Veector Qwen2 Ops Module Loaded. Found 3 operations.
  Found optional module: veector_models.qwen2.ops
Core components imported successfully.
Veector project components imported successfully for this script.


In [None]:
# Wipe the working directory so the experiment starts from a clean state.
# NOTE: this deletes everything under data/, including the Veector DB that the
# later cells create at ./data/db/ — re-running this cell after the DB has been
# initialised removes it from under the live objects.
!rm -rf data/
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)


In [None]:
# --- Configuration ---

# Authenticate with Hugging Face using the HF_TOKEN stored in Colab secrets
hf_token = userdata.get('HF_TOKEN')
if not hf_token:
    raise ValueError("Добавь HF_TOKEN в секреты Colab!")
login(hf_token)
print("Аутентификация прошла успешно")

# Mount Google Drive
# NOTE(review): the mounted drive is not referenced by the cells visible in
# this part of the notebook — confirm it is still needed.
drive.mount('/content/drive')
print("Google Drive подключён")

# Model repository name (without the "deepseek-ai/" organisation prefix,
# which the model-loading cell prepends).
model_NAME = "DeepSeek-R1-Distill-Qwen-1.5B"
# Define ONE main DB path (e.g. under data/db/)
DB_PATH = Path("./data/db/")
DB_PATH.mkdir(parents=True, exist_ok=True) # Create data/db if it does not exist
print(f"Using Main Veector DB Path: {DB_PATH.resolve()}")

# Set data type (bfloat16 might not be fully supported everywhere, float16 is safer)
TORCH_DTYPE = torch.float16 # Use float16 for wider compatibility

print(f"Model to convert: {model_NAME}")
print(f"Target Veector DB: {DB_PATH}")
print(f"Target dtype: {TORCH_DTYPE}")

Аутентификация прошла успешно
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive подключён
Using Main Veector DB Path: /content/data/db
Model to convert: DeepSeek-R1-Distill-Qwen-1.5B
Target Veector DB: data/db
Target dtype: torch.float16


In [None]:
# === Cell 2: Tag Ontology and Mappings Definition (Sync with tensors.py v0.7.0) ===

import torch # Ensure torch is imported for dtype checking if needed later
import numpy as np # Ensure numpy is imported
from typing import Dict, List, Any, Optional, Tuple, Union # Import typing for hints

# --- Version marker for this cell ---
CONVERTER_CELL2_VERSION = "Synced with tensors.py v0.7.0"
print(f"--- Running Converter Cell 2 v{CONVERTER_CELL2_VERSION} ---")

# --- Type hint for the 10-slot metadata tuple (mirrors tensors.py) ---
# Only needed when a notebook cell wants to annotate with this type.
MetadataTuple = Tuple[
    List[Union[float, int]],  # [0] data_description
    List[int],                # [1] coord
    List[int],                # [2] shape
    List[int],                # [3] tags
    Optional[Dict],           # [4] ops_sequences
    Optional[Dict],           # [5] interface
    Optional[List],           # [6] filters
    Optional[List],           # [7] exit_gates
    List[int],                # [8] lifecycle
    Optional[List[str]]       # [9] parents
]

# --- Flat integer tag ontology (copied from tensors.py v0.7.0) ---
# Every band is a run of consecutive integers, so each group is unpacked from
# a range; a mismatch between names and range length fails loudly.
# NOTE(review): these assignments shadow the same names imported from the
# installed tensors module in Cell 1 — confirm the two stay in sync.

# 1-9: tensor type
(TAG_TYPE_PROCESSOR, TAG_TYPE_KNOWLEDGE,
 TAG_TYPE_CONVERTER, TAG_TYPE_STATE) = range(1, 5)

# 10-19: model family
TAG_MODEL_QWEN2, TAG_MODEL_LLAMA3, TAG_MODEL_DEEPSEEK = range(10, 13)

# 20-29: precision
(TAG_PREC_FLOAT32, TAG_PREC_FLOAT16, TAG_PREC_BFLOAT16,
 TAG_PREC_INT8, TAG_PREC_INT4) = range(20, 25)

# 30-49: component type (30 .. 42 in use)
(TAG_COMP_WEIGHTS, TAG_COMP_BIAS, TAG_COMP_EMBEDDING,
 TAG_COMP_ATTN_Q, TAG_COMP_ATTN_K, TAG_COMP_ATTN_V,
 TAG_COMP_ATTN_O, TAG_COMP_ATTN_QKV,
 TAG_COMP_FFN_GATE, TAG_COMP_FFN_UP, TAG_COMP_FFN_DOWN,
 TAG_COMP_LAYERNORM, TAG_COMP_LM_HEAD) = range(30, 43)

# 50-59: function (50 .. 55 in use)
(TAG_FUNC_LINEAR, TAG_FUNC_ATTENTION, TAG_FUNC_FFN,
 TAG_FUNC_EMBED_LOOKUP, TAG_FUNC_CAST_DTYPE, TAG_FUNC_RESHAPE) = range(50, 56)

# 60-69: data semantic type (60 .. 63 in use)
(TAG_SEMANTIC_HIDDEN_STATE, TAG_SEMANTIC_LOGITS,
 TAG_SEMANTIC_TOKEN_IDS, TAG_SEMANTIC_KV_CACHE) = range(60, 64)
# 100-999: Layer Index
LAYER_IDX_TAG_OFFSET = 100
# 1000+: User Defined Tags
USER_TAG_OFFSET = 1000

def tag_layer(idx: int) -> int:
    """Return the integer tag that identifies transformer layer ``idx``.

    Layer tags occupy the band ``LAYER_IDX_TAG_OFFSET`` (100) up to, but not
    including, ``USER_TAG_OFFSET`` (1000), so valid indices are 0..899.

    Args:
        idx: Zero-based layer index.

    Returns:
        ``LAYER_IDX_TAG_OFFSET + idx``.

    Raises:
        TypeError: If ``idx`` is not an ``int``.
        ValueError: If ``idx`` is negative, or so large that the resulting tag
            would fall into the user-defined tag band (1000+).
    """
    if not isinstance(idx, int): raise TypeError(f"Layer index must be an integer, got {type(idx)}")
    if idx < 0: raise ValueError(f"Invalid layer index for tagging: {idx}. Must be non-negative.")
    layer_tag = LAYER_IDX_TAG_OFFSET + idx
    # Without this guard a large index silently produces a tag that collides
    # with the 1000+ band reserved for user-defined tags.
    if layer_tag >= USER_TAG_OFFSET:
        raise ValueError(
            f"Invalid layer index for tagging: {idx}. "
            f"Must be below {USER_TAG_OFFSET - LAYER_IDX_TAG_OFFSET}."
        )
    return layer_tag
# --- End of Tags ---
print("Simplified tag ontology (flat integers) defined.")

# --- Group ID constants (copied from tensors.py v0.7.0) ---
# Knowledge groups live in the 100s, processor groups in the 500s.
(GROUP_IDX_QWEN_KNOWLEDGE,
 GROUP_IDX_LLAMA_KNOWLEDGE,
 GROUP_IDX_DEEPSEEK_KNOWLEDGE) = 100, 101, 102  # DeepSeek entry was added later
GROUP_IDX_QWEN_PROCESSOR, GROUP_IDX_LLAMA_PROCESSOR = 500, 501
# GROUP_IDX_DEEPSEEK_PROCESSOR = 502 # Optional
GROUP_IDX_GENERIC_PROCESSOR = 50
print(f"Group Indices defined: QwenK={GROUP_IDX_QWEN_KNOWLEDGE}, QwenP={GROUP_IDX_QWEN_PROCESSOR}, DeepSeekK={GROUP_IDX_DEEPSEEK_KNOWLEDGE}")


# --- Mappings (copied from tensors.py v0.7.0) ---
# 1. DATA_TYPE_MAPPING: tensor kind name -> integer code.
# NOTE(review): these codes (knowledge=1, processor=2) differ from the
# TAG_TYPE_* numbering (processor=1, knowledge=2); presumably the two are
# separate namespaces — confirm.
DATA_TYPE_MAPPING = dict(knowledge=1, processor=2, converter=3, state=4)
# The inverse table is derived so the two can never drift apart.
REVERSE_DATA_TYPE_MAPPING = {
    code: kind for kind, code in DATA_TYPE_MAPPING.items()
}
print(f"DATA_TYPE_MAPPING defined: {DATA_TYPE_MAPPING}")

# 2. DTYPE_MAPPING
# Canonical table: integer code -> dtype name. The forward mapping below is
# built from it, then extended with NumPy and PyTorch spellings of the same
# dtypes.
REVERSE_DTYPE_MAPPING = dict(enumerate(
    ['float32', 'float16', 'bfloat16', 'int8', 'int4',
     'int32', 'int64', 'bool', 'complex64', 'complex128'],
    start=1,
))

# Standard names
DTYPE_MAPPING = {
    dtype_name: code for code, dtype_name in REVERSE_DTYPE_MAPPING.items()
}

# NumPy types (the table has no NumPy entry for bfloat16 or int4)
DTYPE_MAPPING.update({
    np.float32: 1, np.float16: 2, np.int8: 4, np.int32: 6, np.int64: 7,
    np.bool_: 8, np.complex64: 9, np.complex128: 10,
})

# PyTorch types as strings: every canonical name except int4
DTYPE_MAPPING.update({
    f"torch.{dtype_name}": code
    for code, dtype_name in REVERSE_DTYPE_MAPPING.items()
    if dtype_name != 'int4'
})

# PyTorch dtype objects, only when torch has been imported into this namespace
if 'torch' in globals():
    DTYPE_MAPPING.update({
        getattr(torch, dtype_name): code
        for code, dtype_name in REVERSE_DTYPE_MAPPING.items()
        if dtype_name != 'int4'
    })

print(f"DTYPE_MAPPING defined.")

# 3. STATUS_MAPPING: lifecycle status name -> integer code
STATUS_MAPPING = dict(active=1, archived=0)
# Inverse table, derived from the forward one.
REVERSE_STATUS_MAPPING = {
    code: status for status, code in STATUS_MAPPING.items()
}
print(f"STATUS_MAPPING defined: {STATUS_MAPPING}")

# --- Metadata encoding configuration (copied from tensors.py v0.7.0) ---
METADATA_STRUCTURE_VERSION = 1.1
print(f"Metadata Structure Version: {METADATA_STRUCTURE_VERSION}")

print("Tag ontology, Group IDs, Mappings, and Config defined for Cell 2.")

--- Running Converter Cell 2 vSynced with tensors.py v0.7.0 ---
Simplified tag ontology (flat integers) defined.
Group Indices defined: QwenK=100, QwenP=500, DeepSeekK=102
DATA_TYPE_MAPPING defined: {'knowledge': 1, 'processor': 2, 'converter': 3, 'state': 4}
DTYPE_MAPPING defined.
STATUS_MAPPING defined: {'active': 1, 'archived': 0}
Metadata Structure Version: 1.1
Tag ontology, Group IDs, Mappings, and Config defined for Cell 2.


In [None]:
# === Cell 3: Initialize Veector (SINGLE Instance) ===
# Creates the one `vec` object that the conversion cells below operate on.
from core import Veector # Import the Veector class from core.py
# Same path as in the configuration cell; redefined so this cell can be run
# without it.
DB_PATH = Path("./data/db/")
try:
    # Use this path for initialization (IPFS is switched off)
    vec = Veector(db_dir=DB_PATH, ipfs_enabled=False)
    print(f"Veector core initialized using DB at: {DB_PATH.resolve()}")
except Exception as e:
    # Report the cause, then stop the notebook with a chained RuntimeError.
    print(f"FATAL: Veector initialization failed: {e}")
    raise RuntimeError("Veector Core failed to initialize") from e

--- Initializing Veector Core v0.7.13 ---
    Requires: tensors v0.7.6+, veectordb v0.9.8+, operations v0.8.9+
    IPFS: False, Address: /ip4/127.0.0.1/tcp/5001
--- Initializing VeectorDB v0.9.8 ---
DEBUG INDEX LOAD: Index file 'tensor_index.pkl' not found. Starting with empty index.
VeectorDB initialized at /content/data/db. Index entries loaded: 0 from 'tensor_index.pkl'.
VeectorDB initialized by Veector Core.
Cache initialized: Size=1000, Strategy=LRU
Registered 82 standard operations.
  Successfully registered 3 operations for Qwen2.
Registered 3 model-specific operations.
Initialized 85 total core operations.
  [MEM_LOG] Veector Initialized: RSS=734.02 MB, RAM Used=11.4%
Veector core initialized using DB at: /content/data/db


In [None]:
# === Cell 4: Load Hugging Face Model ===
# Loads the checkpoint and its tokenizer from the "deepseek-ai" organisation
# in the dtype chosen in the configuration cell. Any failure is reported and
# then re-raised as RuntimeError so the notebook stops here.

model, tokenizer = None, None
try:
    model = AutoModelForCausalLM.from_pretrained(
        f"deepseek-ai/{model_NAME}",
        torch_dtype=TORCH_DTYPE,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        f"deepseek-ai/{model_NAME}",
        trust_remote_code=True,
    )
    # Evaluation mode
    model.eval()
    print(f"Successfully loaded HF model: {model_NAME}")
    print(f"Model config: {model.config}")
except Exception as load_error:
    print(f"FATAL: Failed to load HF model '{model_NAME}': {load_error}")
    # Stop execution
    raise RuntimeError(f"Hugging Face model loading failed") from load_error

# Best-effort memory cleanup after loading: drop cached CUDA blocks when a GPU
# is present, then run the Python garbage collector.
if TORCH_AVAILABLE:
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
gc.collect()
print("Model loaded and memory potentially cleaned.")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Successfully loaded HF model: DeepSeek-R1-Distill-Qwen-1.5B
Model config: Qwen2Config {
  "_attn_implementation_autoset": true,
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 131072,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.50.3",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

Model loaded and memory potentially cleaned.


In [None]:
# Run core.py as a standalone script in a separate process. In the recorded
# run it only prints the import banners and a placeholder example message.
!python core.py

  [VeectorDB] Successfully imported tensors v0.7.6
  Imported VeectorDB (v0.9.8)
  Imported tensors (v0.7.6)
  Imported operations (v0.8.9)
  Imported Memory (v0.1.0)
Veector Qwen2 Ops Module Loaded. Found 3 operations.
  Found optional module: veector_models.qwen2.ops
Core components imported successfully.
\n--- Veector Core Example ---
--- Example needs update/implementation ---
\n--- Example Finished ---


In [None]:
# === Скрипт для прохода HF модели в float32 и сохранения ВСЕХ промежуточных выходов ===
# Version: 2.3 (Убраны лишние print, исправлен nonlocal)

import time
import pickle
import numpy as np
import traceback
import os
import gc
from pathlib import Path
from functools import partial
from typing import Dict, List, Any, Optional, Tuple, Union

# --- Необходимые библиотеки ---
try:
    import torch
    from torch import nn
    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, PreTrainedTokenizer
    print("Torch and Transformers imported successfully.")
except ImportError as e:
    print(f"FATAL ERROR: Missing essential libraries: {e}")
    raise SystemExit(f"Missing essential libraries: {e}")

# --- Configuration ---
# Each setting keeps the value assigned by an earlier Colab cell when one
# exists; otherwise the default given here is used.
MODEL_SOURCE = locals().get('MODEL_SOURCE', "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
TOKENIZER_SOURCE = locals().get('TOKENIZER_SOURCE', MODEL_SOURCE)
DB_PATH_FOR_OUTPUT = locals().get('DB_PATH_FOR_OUTPUT', Path("./data/db"))
PROMPT = locals().get('PROMPT', "Hello, how are you?")

# The pickle is named after the last path component of the model source.
OUTPUT_FILENAME = MODEL_SOURCE.rsplit('/', 1)[-1] + "_hf_reference_outputs_fp32.pkl"
OUTPUT_FILEPATH = DB_PATH_FOR_OUTPUT / OUTPUT_FILENAME

# --- Create the output directory ---
# A failure here is only reported; the script carries on.
try:
    DB_PATH_FOR_OUTPUT.mkdir(parents=True, exist_ok=True)
    print(f"Output directory set to: {DB_PATH_FOR_OUTPUT.resolve()}")
except Exception as mkdir_error:
    print(f"Error creating output directory {DB_PATH_FOR_OUTPUT}: {mkdir_error}")

# --- Load the tokenizer ---
# Loads the tokenizer, registers the two chat-role markers, resolves the
# special-token IDs used to build the prompt, and makes sure a PAD id is set.
# Any failure is printed with a traceback and ends the script via SystemExit.
tokenizer: Optional[PreTrainedTokenizer] = None
bos_token_id: Optional[int] = None
eos_token_id: Optional[int] = None
user_token_id: Optional[int] = None
assistant_token_id: Optional[int] = None
try:
    print(f"Loading Tokenizer from: {TOKENIZER_SOURCE}")
    # NOTE(review): use_fast=False is requested, yet the recorded output
    # reports the class as LlamaTokenizerFast — confirm which one is intended.
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SOURCE, trust_remote_code=True, use_fast=False)
    print(f"Tokenizer class: {tokenizer.__class__.__name__}")

    # Explicitly add the special tokens BEFORE looking up their IDs
    # NOTE(review): the markers below are spelled with ASCII '|'. The recorded
    # run reports "Added 2 special tokens", i.e. neither string existed in the
    # vocabulary and both received fresh IDs (151665 / 151666), while the BOS
    # token decodes with full-width bars ('<｜begin▁of▁sentence｜>'). Presumably
    # the checkpoint's own chat markers use the same full-width spelling, in
    # which case these new IDs select embedding rows the model was never
    # trained on — confirm against the tokenizer's vocabulary.
    user_token = "<|User|>"
    assistant_token = "<|Assistant|>"
    num_added = tokenizer.add_special_tokens({
        'additional_special_tokens': [user_token, assistant_token]
    })
    print(f"Added {num_added} special tokens explicitly ('{user_token}', '{assistant_token}').")

    # Look up the IDs after the tokens have been added
    bos_token_id = tokenizer.bos_token_id
    eos_token_id = tokenizer.eos_token_id
    user_token_id = tokenizer.convert_tokens_to_ids(user_token)
    assistant_token_id = tokenizer.convert_tokens_to_ids(assistant_token)

    # Check that the IDs were found (a str result or the UNK id means "not found")
    if isinstance(user_token_id, str) or user_token_id == tokenizer.unk_token_id:
        raise ValueError(f"Could not find ID for token '{user_token}' even after adding.")
    if isinstance(assistant_token_id, str) or assistant_token_id == tokenizer.unk_token_id:
        raise ValueError(f"Could not find ID for token '{assistant_token}' even after adding.")

    # Set the PAD token: reuse EOS when available
    # NOTE(review): the fallback uses tokenizer.vocab_size as the PAD id —
    # verify that this is a valid id for the model.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = eos_token_id if eos_token_id is not None else tokenizer.vocab_size
        print(f"Set pad_token_id to {tokenizer.pad_token_id}")

    print(f"Tokens: BOS={bos_token_id}, EOS={eos_token_id}, PAD={tokenizer.pad_token_id}, User={user_token_id}, Assistant={assistant_token_id}")

except Exception as e:
    print(f"FATAL ERROR loading tokenizer or getting special tokens: {e}")
    traceback.print_exc()
    raise SystemExit(f"Failed to load tokenizer: {e}")

# --- Prepare the input data ---
# Prompt layout: [BOS] <user marker> <prompt tokens> <assistant marker>,
# stored as a (1, seq_len) int64 array and mirrored as a torch tensor.
input_ids_torch: Optional[torch.Tensor] = None
input_seq_len: int = 0
try:
    print("Constructing prompt tokens (GGUF-style)...")
    user_text_ids = tokenizer.encode(PROMPT, add_special_tokens=False)

    # BOS is optional; the role markers are mandatory.
    input_ids_list = [] if bos_token_id is None else [bos_token_id]
    if not (user_token_id is not None and assistant_token_id is not None):
        raise ValueError("User or Assistant token ID is None after attempting to load.")
    input_ids_list += [user_token_id, *user_text_ids, assistant_token_id]

    # Add the leading batch dimension of size 1.
    prompt_input_ids_np = np.array(input_ids_list, dtype=np.int64).reshape(1, -1)
    input_seq_len = prompt_input_ids_np.shape[1]
    input_ids_torch = torch.tensor(prompt_input_ids_np)

    print(f"Input IDs shape: {input_ids_torch.shape}")
    # print(f"Input IDs list: {input_ids_list}") # Uncomment when debugging
    print(f"Decoded Input: '{tokenizer.decode(input_ids_list)}'")
except Exception as prep_error:
    print(f"FATAL ERROR preparing input: {prep_error}")
    traceback.print_exc()
    raise SystemExit(f"Failed to prepare input: {prep_error}")

# --- Load and run the reference model in float32 ---
# Captured activations keyed by hook name; filled in by the forward hooks.
hf_outputs: Dict[str, np.ndarray] = {}
# Handles returned by register_forward_hook, kept so the hooks can be removed.
hook_handles: List[Any] = []
model_fp32 = None

# --- Hook factory used to capture module outputs ---
def get_hook(name: str):
    """Build a forward hook that stores a module's output in ``hf_outputs[name]``.

    Args:
        name: Key under which the captured array is stored.

    Returns:
        A function with the ``(module, inputs, output)`` forward-hook signature.
        When called it picks the tensor to record from ``output``:

        * a tensor is recorded as is;
        * a non-empty tuple whose first element is a tensor records that
          first element;
        * a dict holding a tensor under ``'last_hidden_state'`` records it.

        The tensor is stored as an independent float32 NumPy array on the CPU.
        Any other output only prints a warning and stores nothing.
    """
    def hook_fn(module: nn.Module, input_args: Tuple[Any, ...], output: Any):
        """Capture the module output and store it in ``hf_outputs``."""
        # hf_outputs is only mutated, never rebound, so no global/nonlocal
        # declaration is required here.
        actual_output: Optional[torch.Tensor] = None
        if isinstance(output, torch.Tensor):
            actual_output = output
        elif isinstance(output, tuple) and len(output) > 0 and isinstance(output[0], torch.Tensor):
            # Covers attention modules as well; the former name-based
            # "_attn_out" branch could never be reached after this one.
            actual_output = output[0]
        elif isinstance(output, dict) and 'last_hidden_state' in output and isinstance(output['last_hidden_state'], torch.Tensor):
            actual_output = output['last_hidden_state']

        if actual_output is not None:
            # Cast to float32 in torch before handing over to NumPy, because
            # NumPy has no bfloat16 dtype. copy=True keeps the stored array
            # independent of the live tensor even when it already is a
            # float32 CPU tensor.
            hf_outputs[name] = (
                actual_output.detach()
                .to(device="cpu", dtype=torch.float32, copy=True)
                .numpy()
            )
        else:
            print(f"  [HOOK] WARN: Could not capture tensor output for {name}. Output type: {type(output)}")
    return hook_fn

# Load the float32 reference model, attach a capture hook to every module of
# interest, run one forward pass over the prepared prompt, then always remove
# the hooks and release the model.
# NOTE: an exception in the try body is printed with its traceback but not
# re-raised, so the script continues to the save step with whatever
# activations were captured up to that point.
try:
    print(f"Loading HF Model {MODEL_SOURCE} with float32...")
    model_fp32 = AutoModelForCausalLM.from_pretrained(MODEL_SOURCE, torch_dtype=torch.float32, trust_remote_code=True)

    model_fp32.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model_fp32.to(device)

    if input_ids_torch is None:
         raise ValueError("FATAL ERROR: input_ids_torch is None before moving to device.")
    input_ids_torch = input_ids_torch.to(device)

    print(f"HF Model loaded to device: {model_fp32.device}")

    # --- Register the hooks ---
    # One hook on the embedding, five per decoder layer (input norm, attention,
    # post-attention norm, MLP, whole layer), one on the final norm and one on
    # the LM head.
    print("Registering detailed hooks...")
    model_config = model_fp32.config
    num_layers = model_config.num_hidden_layers

    hook_handles.append(model_fp32.model.embed_tokens.register_forward_hook(get_hook("embed_tokens")))
    for i in range(num_layers):
        layer = model_fp32.model.layers[i]
        hook_handles.append(layer.input_layernorm.register_forward_hook(get_hook(f"L{i}_input_norm_out")))
        hook_handles.append(layer.self_attn.register_forward_hook(get_hook(f"L{i}_attn_out")))
        hook_handles.append(layer.post_attention_layernorm.register_forward_hook(get_hook(f"L{i}_post_attn_norm_out")))
        hook_handles.append(layer.mlp.register_forward_hook(get_hook(f"L{i}_mlp_out")))
        hook_handles.append(layer.register_forward_hook(get_hook(f"L{i}_layer_output")))
    hook_handles.append(model_fp32.model.norm.register_forward_hook(get_hook("final_norm")))
    hook_handles.append(model_fp32.lm_head.register_forward_hook(get_hook("lm_head")))
    print(f"Registered {len(hook_handles)} hooks.")

    # --- Run the forward pass ---
    # Gradients are disabled; the hooks fill hf_outputs as a side effect. The
    # returned hf_model_output is not used further in this block.
    print("Running HF model forward pass (float32)...")
    with torch.no_grad():
        hf_model_output = model_fp32(input_ids=input_ids_torch, use_cache=False, output_attentions=False, output_hidden_states=False)
    print("HF forward pass complete.")

except Exception as e:
    print(f"FATAL ERROR during HF float32 execution: {e}")
    traceback.print_exc()
finally:
    # --- Remove the hooks and clean up ---
    # Runs on success and on failure alike.
    print(f"Removing {len(hook_handles)} hooks...")
    for handle in hook_handles:
        handle.remove()
    print(f"Removed hooks.")

    if 'model_fp32' in locals() and model_fp32 is not None:
        print("Cleaning up float32 model...")
        # Drop the name so the model can be garbage-collected.
        del model_fp32
        if 'torch' in locals() and hasattr(torch, 'cuda') and torch.cuda.is_available():
            torch.cuda.empty_cache()
            print("Cleared CUDA cache.")
        import gc
        gc.collect()
        print("Cleaned up float32 model.")

# --- Save the results ---
# Nothing is written when no activations were captured. Otherwise the whole
# hf_outputs dict is pickled to OUTPUT_FILEPATH; a failure is reported with a
# traceback but does not stop the script.
if not hf_outputs:
    print("No outputs captured from HF model, skipping save.")
else:
    print(f"Saving Captured Float32 Outputs to {OUTPUT_FILEPATH}...")
    try:
        OUTPUT_FILEPATH.parent.mkdir(parents=True, exist_ok=True)
        with OUTPUT_FILEPATH.open('wb') as f:
            pickle.dump(hf_outputs, f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Successfully saved {len(hf_outputs)} captured outputs.")
        # print(f"Saved keys: {list(hf_outputs.keys())}") # Uncomment when debugging
    except Exception as save_error:
        print(f"FATAL ERROR saving outputs: {save_error}")
        traceback.print_exc()

print("Reference Output Script Finished.")



Torch and Transformers imported successfully.
Output directory set to: /content/data/db
Loading Tokenizer from: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Tokenizer class: LlamaTokenizerFast
Added 2 special tokens explicitly ('<|User|>', '<|Assistant|>').
Tokens: BOS=151646, EOS=151643, PAD=151643, User=151665, Assistant=151666
Constructing prompt tokens (GGUF-style)...
Input IDs shape: torch.Size([1, 9])
Decoded Input: '<｜begin▁of▁sentence｜><|User|>Hello, how are you?<|Assistant|>'
Loading HF Model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B with float32...
Using device: cpu
HF Model loaded to device: cpu
Registering detailed hooks...
Registered 143 hooks.
Running HF model forward pass (float32)...
HF forward pass complete.
Removing 143 hooks...
Removed hooks.
Cleaning up float32 model...
Cleaned up float32 model.
Saving Captured Float32 Outputs to data/db/DeepSeek-R1-Distill-Qwen-1.5B_hf_reference_outputs_fp32.pkl...
Successfully saved 143 captured outputs.
Reference Output Script Finis

In [None]:
# === Cell 5: Convert Parameters to Knowledge Tensors (Transposed Weights) ===

import gc
import pickle
import time
import traceback
from pathlib import Path

import numpy as np
import torch

def _version_tuple(version) -> tuple:
    """Convert a dotted version string such as "0.7.13" into ``(0, 7, 13)``.

    Only the leading decimal digits of each dot-separated component are used;
    a component without leading digits counts as 0. Comparing the resulting
    tuples orders versions numerically, whereas comparing the raw strings is
    lexicographic and would rank "0.7.10" below "0.7.6".
    """
    numbers = []
    for component in str(version).split("."):
        digits = ""
        for char in component:
            if char not in "0123456789":
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    return tuple(numbers)


# Import everything this cell needs from the project modules and verify the
# minimum versions. Any ImportError (including a failed version check) is
# reported and re-raised so the cell stops.
try:
    from tensors import (
        TENSORS_VERSION, TensorCoordinate, create_tensor, MetadataTuple,
        validate_tensor_tuple, validate_tensor, DTYPE_MAPPING,
        TAG_TYPE_KNOWLEDGE, TAG_MODEL_DEEPSEEK, TAG_COMP_WEIGHTS, TAG_COMP_BIAS,
        TAG_COMP_EMBEDDING, TAG_COMP_LM_HEAD, TAG_COMP_LAYERNORM, TAG_COMP_ATTN_Q,
        TAG_COMP_ATTN_K, TAG_COMP_ATTN_V, TAG_COMP_ATTN_O, TAG_COMP_FFN_GATE,
        TAG_COMP_FFN_UP, TAG_COMP_FFN_DOWN, tag_layer, GROUP_IDX_QWEN_KNOWLEDGE,
        TAG_PREC_FLOAT32, TAG_PREC_FLOAT16, TAG_PREC_BFLOAT16, TAG_PREC_INT8
    )
    # Numeric comparison of versions, not string comparison.
    if _version_tuple(TENSORS_VERSION) < _version_tuple("0.7.6"):
        raise ImportError(f"Requires tensors v0.7.6+, found v{TENSORS_VERSION}")
    from core import Veector, CORE_VERSION
    if _version_tuple(CORE_VERSION) < _version_tuple("0.6.12"):
        raise ImportError(f"Requires core v0.6.12+, found v{CORE_VERSION}")
except ImportError as e:
    print(f"FATAL ERROR: Import failed: {e}")
    raise

# --- Cell version ---
CONVERTER_CELL5_VERSION = "Hybrid v0.7.6 + Quant + Transpose v2"
# --- End of version ---

print(f"--- Running Converter Cell 5 v{CONVERTER_CELL5_VERSION} ---")
start_cell5_time = time.time()

# --- Check the variables this cell expects from earlier cells ---
# A missing name and an unusable value are reported the same way.
if locals().get('vec') is None:
    raise NameError("'vec' object not defined.")
if not isinstance(locals().get('DB_PATH'), Path):
    raise NameError("DB_PATH not defined or invalid.")
if locals().get('model') is None:
    raise NameError("HF 'model' not loaded.")
if not locals().get('model_NAME'):
    raise NameError("model_NAME not defined.")

# --- Re-initialise the DB when `vec` has no usable connection ---
if getattr(vec, 'db', None) is None:
    try:
        print("Attempting DB re-init for Cell 5...")
        # Imported here so the module is only loaded when it is needed
        from veectordb import VeectorDB
        vec.db = VeectorDB(db_dir=DB_PATH)
        print("DB connection re-established.")
    except Exception as db_reinit_e:
        raise AttributeError(f"DB re-init failed: {db_reinit_e}")
else:
    print("'vec' object found and DB connection seems active.")

# --- Initialisation ---
# Two-way registry between original parameter names and small integer IDs.
ORIGINAL_NAME_TO_ID_MAP: Dict[str, int] = {}
ID_TO_ORIGINAL_NAME_MAP: Dict[int, str] = {}
NEXT_NAME_ID: int = 0
print("Initialized Name <-> ID mapping dictionaries.")

knowledge_map: Dict[str, str] = {} # Parameter name -> knowledge tensor ID
param_count: int = 0
conversion_errors: int = 0

# --- Helper for name IDs ---
def get_or_create_name_id(name: Optional[str]) -> int:
    """Return the integer ID registered for ``name``, registering it if new.

    IDs are handed out sequentially starting at 0 and recorded in both
    ``ORIGINAL_NAME_TO_ID_MAP`` and ``ID_TO_ORIGINAL_NAME_MAP``.

    Args:
        name: Parameter name; ``None`` or an empty string is not registered.

    Returns:
        The ID for ``name``, or -1 when ``name`` is empty or ``None``.
    """
    global NEXT_NAME_ID
    if not name:
        return -1

    known_id = ORIGINAL_NAME_TO_ID_MAP.get(name)
    if known_id is not None:
        return known_id

    # First sighting: claim the next free ID and record both directions.
    fresh_id = NEXT_NAME_ID
    NEXT_NAME_ID = fresh_id + 1
    ORIGINAL_NAME_TO_ID_MAP[name] = fresh_id
    ID_TO_ORIGINAL_NAME_MAP[fresh_id] = name
    return fresh_id

# --- Conversion parameters ---
# float16 is the baseline; TORCH_DTYPE, when an earlier cell defined it, overrides it.
default_precision_tag = TAG_PREC_FLOAT16
default_torch_dtype = torch.float16
if 'TORCH_DTYPE' in locals():
    default_torch_dtype = TORCH_DTYPE
    # Table lookup instead of an if/elif ladder; a dtype that is not listed
    # leaves the float16 precision tag in place.
    default_precision_tag = {
        torch.float16: TAG_PREC_FLOAT16,
        torch.bfloat16: TAG_PREC_BFLOAT16,
        torch.float32: TAG_PREC_FLOAT32,
        torch.int8: TAG_PREC_INT8,
    }.get(TORCH_DTYPE, default_precision_tag)

# Group index and model tag shared by every knowledge tensor written below.
knowledge_group_idx = GROUP_IDX_QWEN_KNOWLEDGE
model_tag = TAG_MODEL_DEEPSEEK

print(f"\n--- Creating Knowledge Tensors (Group: {knowledge_group_idx}) ---")
print(f"    Model Tag: {model_tag}")
print(f"    Default Precision Tag: {default_precision_tag}")
print(f"    Quantizing Embed/LMHead to INT8. Transposing Linear Weights.")

# --- Main conversion loop: parameter count for progress output ---
total_params = len(list(model.named_parameters()))
print(f"Found {total_params} parameters to process.")

# The error handler at the bottom of the loop calls traceback.print_exc();
# importing here keeps that handler from raising NameError itself when no
# earlier cell imported the module.
import traceback

# torch dtypes expose no `numpy_dtype` attribute, so probing for it with
# hasattr() always fell back to float16 and a float32 configuration stored
# float16 data under a FLOAT32 precision tag. Resolve the NumPy dtype through
# an explicit table instead (once, since it is loop-invariant). float16 stays
# the fallback for dtypes not listed here (e.g. bfloat16).
_NUMPY_DTYPE_BY_TORCH_DTYPE = {torch.float16: np.float16, torch.float32: np.float32}
target_np_dtype = _NUMPY_DTYPE_BY_TORCH_DTYPE.get(default_torch_dtype, np.float16)

# Converts every HF model parameter into a Veector "knowledge" tensor:
# derive tags + coordinate from the parameter name, quantize (embedding / LM head)
# or cast (everything else), transpose linear weights, then create and save.
for idx, (name, param) in enumerate(model.named_parameters()):
    loop_start_time = time.time()
    print(f"\nProcessing Param {idx+1}/{total_params}: {name}")
    print(f"  Original Shape: {param.shape} | Dtype: {param.dtype}")

    # Per-iteration state, reset so nothing leaks over from the previous parameter.
    param_data_fp32: Optional[np.ndarray] = None
    knowledge_data_to_pass: Optional[np.ndarray] = None
    tags: List[int] = []
    metadata_extra_to_pass: Optional[Dict] = None
    dtype_to_pass: Any = None
    final_tags: List[int] = []
    knowledge_coord: Optional[TensorCoordinate] = None
    name_id: int = -1
    create_result: Optional[List] = None
    knowledge_id: Optional[str] = None
    requires_transpose: bool = False

    try:
        # Steps 1-3: fetch the data, assign a name ID, build tags and the coordinate.
        param_data_fp32 = param.data.cpu().to(torch.float32).numpy()
        name_id = get_or_create_name_id(name)
        tags = [TAG_TYPE_KNOWLEDGE, model_tag]
        layer_idx = -1
        group_idx = knowledge_group_idx
        coord_x = 0
        current_nest = 1  # Knowledge tensors default to nest 1
        is_weight = name.endswith(".weight")
        is_bias = name.endswith(".bias")

        if is_weight:
            tags.append(TAG_COMP_WEIGHTS)
        elif is_bias:
            tags.append(TAG_COMP_BIAS)

        # Determine the component tag, the X coordinate and the transpose flag.
        if "model.embed_tokens.weight" in name:
            tags.append(TAG_COMP_EMBEDDING)
            coord_x = 0
        elif "lm_head.weight" in name:
            tags.append(TAG_COMP_LM_HEAD)
            coord_x = 1
            requires_transpose = True
        elif "model.norm.weight" in name:
            # The final norm is stored at layer index == number of hidden layers.
            layer_idx = model.config.num_hidden_layers
            tags.append(TAG_COMP_LAYERNORM)
            coord_x = 0
        elif ".layers." in name:
            try:
                layer_part = name.split('.layers.')[1]
                layer_idx = int(layer_part.split('.')[0])
                if layer_idx >= 0:
                    tags.append(tag_layer(layer_idx))
                else:
                    raise ValueError(f"Invalid L idx: {layer_idx}")

                component_tag_layer = None
                if "self_attn" in name:
                    if "q_proj.weight" in name:
                        component_tag_layer = TAG_COMP_ATTN_Q
                        coord_x = 10
                        requires_transpose = True
                    elif "q_proj.bias" in name:
                        component_tag_layer = TAG_COMP_ATTN_Q
                        coord_x = 11
                    elif "k_proj.weight" in name:
                        component_tag_layer = TAG_COMP_ATTN_K
                        coord_x = 20
                        requires_transpose = True
                    elif "k_proj.bias" in name:
                        component_tag_layer = TAG_COMP_ATTN_K
                        coord_x = 21
                    elif "v_proj.weight" in name:
                        component_tag_layer = TAG_COMP_ATTN_V
                        coord_x = 30
                        requires_transpose = True
                    elif "v_proj.bias" in name:
                        component_tag_layer = TAG_COMP_ATTN_V
                        coord_x = 31
                    elif "o_proj.weight" in name:
                        component_tag_layer = TAG_COMP_ATTN_O
                        coord_x = 40
                        requires_transpose = True
                elif "mlp" in name:
                    if "gate_proj.weight" in name:
                        component_tag_layer = TAG_COMP_FFN_GATE
                        coord_x = 50
                        requires_transpose = True
                    elif "up_proj.weight" in name:
                        component_tag_layer = TAG_COMP_FFN_UP
                        coord_x = 60
                        requires_transpose = True
                    elif "down_proj.weight" in name:
                        component_tag_layer = TAG_COMP_FFN_DOWN
                        coord_x = 70
                        requires_transpose = True
                elif "input_layernorm.weight" in name:
                    component_tag_layer = TAG_COMP_LAYERNORM
                    coord_x = 1
                elif "post_attention_layernorm.weight" in name:
                    component_tag_layer = TAG_COMP_LAYERNORM
                    coord_x = 2

                if component_tag_layer:
                    tags.append(component_tag_layer)
                elif not is_weight and not is_bias:
                    # NOTE(review): an unrecognized *.weight / *.bias inside a layer is
                    # not reported here and keeps coord_x == 0 — confirm this is intended.
                    print(f"  WARN: Unrecognized comp in L{layer_idx}: {name}")
                    coord_x = 99
            except Exception as parse_e:
                print(f"  Error parsing layer for {name}: {parse_e}")
                conversion_errors += 1
                continue
        else:
            print(f"  WARN: Param unmatched: {name}")
            layer_idx = -1
            coord_x = 999

        knowledge_coord = TensorCoordinate(layer=layer_idx, group=group_idx, nest=current_nest, x=coord_x)

        # Step 4: quantization / dtype cast / transposition.
        quantization_scale = None
        current_precision_tag = default_precision_tag
        data_before_save = None

        if name == "model.embed_tokens.weight" or name == "lm_head.weight":
            if np.issubdtype(param_data_fp32.dtype, np.floating):
                try:
                    # Symmetric per-tensor INT8 quantization: scale = max|w| / 127.
                    abs_max = np.max(np.abs(param_data_fp32))
                    scale = 1.0
                    if abs_max >= 1e-9:
                        scale = abs_max / 127.0
                    scale = max(scale, 1e-9)  # Prevent division by zero
                    quantized_data = np.round(param_data_fp32 / scale).astype(np.int8)
                    data_before_save = quantized_data
                    dtype_to_pass = np.int8
                    quantization_scale = float(scale)
                    current_precision_tag = TAG_PREC_INT8
                    metadata_extra_to_pass = {"quantization_scale": quantization_scale}
                    # Only the LM head is transposed, and only AFTER quantization.
                    if name == "lm_head.weight":  # requires_transpose is True here
                        print("  Transposing quantized LM Head weights...")
                        data_before_save = data_before_save.T
                except Exception as quant_e:
                    print(f"  ERROR quantizing {name}: {quant_e}")
                    conversion_errors += 1
                    continue
            else:
                # Not a float array: store as-is without quantization.
                data_before_save = param_data_fp32
                dtype_to_pass = data_before_save.dtype
                current_precision_tag = DTYPE_MAPPING.get(dtype_to_pass, default_precision_tag)
                metadata_extra_to_pass = None
                if requires_transpose:  # Still transpose when the component needs it
                    print(f"  Transposing non-quantized {name}...")
                    data_before_save = data_before_save.T
        else:
            # Neither the embedding nor the LM head: cast to the configured dtype.
            try:
                data_before_save = param_data_fp32.astype(target_np_dtype)
                dtype_to_pass = data_before_save.dtype
                current_precision_tag = default_precision_tag
                metadata_extra_to_pass = None
                if requires_transpose:
                    print(f"  Transposing {name} weights...")
                    data_before_save = data_before_save.T
            except Exception as cast_e:
                print(f"  ERROR casting/transposing {name}: {cast_e}")
                conversion_errors += 1
                continue

        # Final payload and the shape it is stored with (post-transpose).
        knowledge_data_to_pass = data_before_save
        final_shape_to_save = knowledge_data_to_pass.shape if knowledge_data_to_pass is not None else None

        # Step 5: finalize tags — exactly one precision tag, de-duplicated and sorted.
        final_tags = list(tags)
        if current_precision_tag != default_precision_tag and default_precision_tag in final_tags:
            final_tags.remove(default_precision_tag)
        if current_precision_tag:
            final_tags.append(current_precision_tag)
        final_tags = sorted(list(set(final_tags)))

        print(f"  Final Tags: {final_tags}")
        print(f"  Coordinate: {knowledge_coord}")
        print(f"  Data to save: dtype={dtype_to_pass}, shape={final_shape_to_save}")
        if metadata_extra_to_pass:
            print(f"  Extra Metadata: {metadata_extra_to_pass}")

        # Step 6: create the tensor structure.
        create_result = vec.create_tensor(
            coord=knowledge_coord,
            tensor_type="knowledge",
            knowledge_data=knowledge_data_to_pass,  # Possibly transposed data
            tags=final_tags,
            dtype=dtype_to_pass,
            shape=final_shape_to_save,  # Shape matching the data actually stored
            name_id=name_id,
            metadata_extra=metadata_extra_to_pass,
            status="active"
        )

        # Step 7: save the tensor (the structure returned above is passed through).
        knowledge_id = vec.save_tensor(create_result)

        if knowledge_id:
            knowledge_map[name] = knowledge_id
            param_count += 1
        else:
            conversion_errors += 1
            print(f"  ERROR saving tensor for {name}")

    except Exception as create_save_e:
        print(f"  ERROR during create/save for {name}: {create_save_e}")
        traceback.print_exc()
        conversion_errors += 1
    finally:
        # Runs on success, on error and on `continue`: drop the fp32 copy.
        if param_data_fp32 is not None:
            del param_data_fp32
        loop_end_time = time.time()
        # print(f"  Param {idx+1} time: {loop_end_time - loop_start_time:.2f}s")  # Kept out of the log

# --- End of loop ---

# Summary of the conversion loop: how many tensors were saved and how many failed.
print(f"\n--- Finished saving {param_count} knowledge tensors to {vec.db.db_root_path if vec.db else 'N/A'} ---")
if conversion_errors > 0:
    print(f"!!! WARNING: {conversion_errors} errors occurred during knowledge conversion !!!")

# --- Save the name <-> ID map ---
name_map_file = DB_PATH / f"{model_NAME}_name_id_map.pkl"
try:
    # Both directions plus the next free ID are stored in one pickle.
    map_data_to_save = {
        "name_to_id": ORIGINAL_NAME_TO_ID_MAP,
        "id_to_name": ID_TO_ORIGINAL_NAME_MAP,
        "next_id": NEXT_NAME_ID
    }
    with open(name_map_file, 'wb') as f:
        pickle.dump(map_data_to_save, f)
    print(f"\nName <-> ID map saved to {name_map_file}")
except Exception as e:
    # Best effort: a failure is reported but does not abort the cell.
    print(f"  Error saving name ID map: {e}")

# --- Save the knowledge map (for Cell 5.5) ---
# The file name is defined in Cell 4.5, but it is redefined here for robustness.
print(f"--- Saving Knowledge Map (for Cell 5.5) ---")
knowledge_map_filename = f"{model_NAME}_knowledge_map.pkl"
knowledge_map_filepath = DB_PATH / knowledge_map_filename
try:
    with open(knowledge_map_filepath, 'wb') as f:
        pickle.dump(knowledge_map, f) # knowledge_map must be up to date at this point
    print(f"  Knowledge map saved to {knowledge_map_filepath}")
except Exception as e:
    # Best effort: a failure is reported but does not abort the cell.
    print(f"  Error saving knowledge map: {e}")

# --- IMPORTANT: save the knowledge index ---
if 'vec' in locals() and vec.db:
    knowledge_index_filename = f"{model_NAME}_knowledge_index.pkl" # File name of the knowledge index
    knowledge_index_filepath = DB_PATH / knowledge_index_filename
    print(f"\nINFO: Attempting to save knowledge index ({len(vec.db.index)} entries) via save_index_as to {knowledge_index_filepath}...")
    try:
        # Write the index to its own file via save_index_as.
        vec.db.save_index_as(knowledge_index_filepath)
        print(f"INFO: Call to save_index_as completed for {knowledge_index_filepath.name}.")
    except Exception as sia_e:
        print(f"ERROR during save_index_as: {sia_e}")
        traceback.print_exc()

    # --- IMPORTANT: close the connection ---
    print("\nClosing DB connection for Cell 5...")
    vec.db.close() # Per the original author: saves the main index (if modified) and closes
    print("DB connection closed.")
else:
    print("Warning: vec or vec.db not found at the end of Cell 5.")

# --- Memory cleanup ---
# After this point `vec` no longer exists, so re-running this cell requires
# re-running the cell that creates it.
del vec # Drop the object to release its resources
if 'torch' in locals() and hasattr(torch, 'cuda'): torch.cuda.empty_cache()
import gc
gc.collect()
print("\nMemory cleanup attempted.")
print(f"--- Cell 5 Finished ---")

# --- End of Cell 5 ---
# NOTE(review): this prints a second "Cell 5 Finished" line, this time with the elapsed time.
end_cell5_time = time.time()
print(f"--- Cell 5 Finished in {end_cell5_time - start_cell5_time:.2f} seconds ---")

--- Running Converter Cell 5 vHybrid v0.7.6 + Quant + Transpose v2 ---
'vec' object found and DB connection seems active.
Initialized Name <-> ID mapping dictionaries.

--- Creating Knowledge Tensors (Group: 100) ---
    Model Tag: 12
    Default Precision Tag: 21
    Quantizing Embed/LMHead to INT8. Transposing Linear Weights.
Found 339 parameters to process.

Processing Param 1/339: model.embed_tokens.weight
  Original Shape: torch.Size([151936, 1536]) | Dtype: torch.float16
  Final Tags: [2, 12, 23, 30, 32]
  Coordinate: L-1_G100_N1_X0_Y0_Z0
  Data to save: dtype=<class 'numpy.int8'>, shape=(151936, 1536)
  Extra Metadata: {'quantization_scale': 0.0025990402791649103}
DEBUG INDEX UPDATE: Adding/Updating ID fb4ef375e208da470d6841a89c441ca29e016873a80c8519660c22cfa227be8d -> Type: knowledge, Status: active, Coords: L-1_G100_N1_X0_Y0_Z0

Processing Param 2/339: model.layers.0.self_attn.q_proj.weight
  Original Shape: torch.Size([1536, 1536]) | Dtype: torch.float16
  Transposing model.l

In [None]:
# === Cell 5.5: Save Intermediate Data for Cell 6 (Corrected) ===

import pickle
from pathlib import Path
import os
import traceback

# The banner previously used an escaped backslash ("\\n") and so printed a
# literal "\n" instead of a blank line; it now emits a real newline.
print("\n--- Running Cell 5.5: Saving Intermediate Data ---")

# --- Check that the variables required from earlier cells exist ---
# 'name_id_map' is deliberately not required: only the *file name* of the name
# map is stored below, not the map itself.
required_vars = ['model_NAME', 'DB_PATH', 'knowledge_map']
for var_name in required_vars:
    if var_name not in locals():
        raise NameError(f"Variable '{var_name}' not found. Ensure previous cells ran successfully.")
    # Extra type checks for robustness.
    if var_name == 'DB_PATH' and not isinstance(DB_PATH, Path):
        raise TypeError(f"'{var_name}' should be a Path object.")
    if var_name == 'knowledge_map' and not isinstance(locals()[var_name], dict):
        raise TypeError(f"'{var_name}' should be a dictionary.")

# --- Paths ---
# model_NAME is part of every file name so artifacts of different models do not collide.
intermediate_data_filename = f"{model_NAME}_cell6_input_data.pkl"
intermediate_data_filepath = DB_PATH / intermediate_data_filename

# Names of the map and index files that Cell 5 is expected to have written.
knowledge_map_filename = f"{model_NAME}_knowledge_map.pkl"
name_map_filename = f"{model_NAME}_name_id_map.pkl"  # Name <-> ID map file
knowledge_index_filename = f"{model_NAME}_knowledge_index.pkl"  # Knowledge index file

# --- Data to save ---
# Only the file names of the maps and the knowledge index are stored, not their
# contents. The model config is stored too when one is available.
model_config_to_save = None
if 'model' in locals() and hasattr(model, 'config'):
    model_config_to_save = model.config
    print("Found model config to save.")
elif 'model_config' in locals():  # Config was loaded separately
    model_config_to_save = model_config
    print("Found separately loaded model_config to save.")
else:
    print("Warning: Model config not found, Cell 6 might need it loaded separately.")

cell6_input_data = {
    'model_name': model_NAME,
    'db_path_str': str(DB_PATH.resolve()),  # DB path stored as a string
    'model_config': model_config_to_save,  # Config object, or None
    # File names telling Cell 6 what to load
    'knowledge_map_filename': knowledge_map_filename,
    'name_map_filename': name_map_filename,
    'knowledge_index_filename': knowledge_index_filename
}

# --- Save ---
try:
    # Make sure the DB directory exists before writing into it.
    DB_PATH.mkdir(parents=True, exist_ok=True)

    print(f"Saving intermediate data for Cell 6 to: {intermediate_data_filepath}")
    with open(intermediate_data_filepath, 'wb') as f:
        pickle.dump(cell6_input_data, f, pickle.HIGHEST_PROTOCOL)
    print("Intermediate data saved successfully.")
    print(f"  Included model name: {model_NAME}")
    print(f"  Included DB path: {cell6_input_data['db_path_str']}")
    print(f"  Included knowledge map filename: {knowledge_map_filename}")
    print(f"  Included name map filename: {name_map_filename}")
    print(f"  Included knowledge index filename: {knowledge_index_filename}")
    if model_config_to_save:
        print(f"  Included Model Config Type: {type(model_config_to_save)}")
    else:
        print("  Model Config was not included.")

except Exception as e:
    # Deliberately best-effort: the failure is reported with a traceback but not
    # re-raised. Add `raise` here if a failed save must stop the notebook.
    print(f"---!!! ERROR saving intermediate data: {e} !!!---")
    traceback.print_exc()
else:
    print("--- Cell 5.5 Finished ---")

# --- Очистка памяти (опционально) ---
# import gc
# if 'model' in locals(): del model
# if 'torch' in locals() and hasattr(torch, 'cuda') and torch.cuda.is_available(): torch.cuda.empty_cache()
# gc.collect()
# print("Cleaned up model from memory (optional).")



\n--- Running Cell 5.5: Saving Intermediate Data ---
Found model config to save.
Saving intermediate data for Cell 6 to: data/db/DeepSeek-R1-Distill-Qwen-1.5B_cell6_input_data.pkl
Intermediate data saved successfully.
  Included model name: DeepSeek-R1-Distill-Qwen-1.5B
  Included DB path: /content/data/db
  Included knowledge map filename: DeepSeek-R1-Distill-Qwen-1.5B_knowledge_map.pkl
  Included name map filename: DeepSeek-R1-Distill-Qwen-1.5B_name_id_map.pkl
  Included knowledge index filename: DeepSeek-R1-Distill-Qwen-1.5B_knowledge_index.pkl
  Included Model Config Type: <class 'transformers.models.qwen2.configuration_qwen2.Qwen2Config'>
--- Cell 5.5 Finished ---


In [None]:
# Destructive: recursively and forcibly deletes data/db/g500; this cannot be undone.
# NOTE(review): presumably clears previously generated tensors of group 500 before
# Cell 6 regenerates them — confirm before running against a DB you want to keep.
!rm -rf data/db/g500

In [None]:
# === Cell 6 (Updated v7 - Correct Path Definition Order & FFN Fix) ===
# Создает процессоры Veector, загружая индекс знаний из отдельного файла
# и передавая hidden_size в операцию Attention, hidden_act в MLP.

import time
import pickle
import numpy as np
import traceback
import os
import gc
from pathlib import Path
from functools import partial
from typing import Dict, List, Any, Optional, Tuple, Union

# --- Check that the GLOBAL variables exist ---
# These are normally set by earlier cells; when missing (or of the wrong type)
# the cell falls back to hard-coded defaults instead of failing.
if 'DB_PATH' not in globals() or not isinstance(DB_PATH, Path):
     print("WARN: Global variable DB_PATH not found, using default './data/db'")
     DB_PATH = Path("./data/db")
if 'model_NAME' not in globals() or not isinstance(model_NAME, str):
     print("WARN: Global variable model_NAME not found, using default 'DeepSeek-R1-Distill-Qwen-1.5B'")
     model_NAME = "DeepSeek-R1-Distill-Qwen-1.5B"

print(f"Using DB_PATH: {DB_PATH.resolve()}")
print(f"Using model_NAME: {model_NAME}")
# --- End of check ---

# --- Required libraries ---
# A missing torch/transformers install is reported and then re-raised, stopping the cell.
try:
    import torch
    from torch import nn
    from transformers import AutoTokenizer, AutoConfig
    print("Torch and Transformers imported successfully.")
except ImportError as e:
    print(f"FATAL ERROR: Missing essential libraries (torch, transformers): {e}")
    raise

# --- Veector project imports (v0.7.13+, v0.9.8+) ---
try:
    import re

    from core import Veector, CORE_VERSION
    from tensors import (
        TENSORS_VERSION, TensorCoordinate, create_tensor, MetadataTuple,
        validate_tensor_tuple, validate_tensor, DTYPE_MAPPING, get_tensor_hash,
        TAG_TYPE_PROCESSOR, TAG_FUNC_EMBED_LOOKUP, TAG_FUNC_ATTENTION,
        TAG_FUNC_FFN, TAG_FUNC_LINEAR, TAG_COMP_LAYERNORM, TAG_MODEL_DEEPSEEK,
        tag_layer, GROUP_IDX_QWEN_PROCESSOR, GROUP_IDX_QWEN_KNOWLEDGE,
        TAG_COMP_EMBEDDING, TAG_COMP_WEIGHTS, TAG_COMP_BIAS, TAG_COMP_ATTN_Q,
        TAG_COMP_ATTN_K, TAG_COMP_ATTN_V, TAG_COMP_ATTN_O, TAG_COMP_FFN_GATE,
        TAG_COMP_FFN_UP, TAG_COMP_FFN_DOWN, TAG_COMP_LM_HEAD,
        TAG_PREC_FLOAT32, TAG_PREC_FLOAT16, TAG_PREC_BFLOAT16, TAG_PREC_INT8
    )
    from veectordb import VeectorDB, VEECTORDB_VERSION
    from operations import OPERATIONS_VERSION

    def _version_tuple(version: Any) -> Tuple[int, ...]:
        """Parse the first dotted-number run in ``version`` into a tuple of ints.

        Returns an empty tuple when ``version`` contains no digits.
        """
        found = re.search(r"\d+(?:\.\d+)*", str(version))
        if found is None:
            return ()
        return tuple(int(part) for part in found.group(0).split("."))

    def _is_older_than(found_version: Any, required: str) -> bool:
        """Return True when ``found_version`` is numerically lower than ``required``.

        Plain string comparison orders versions character by character
        ("0.7.9" > "0.7.13", "0.10.0" < "0.7.13"), so components are compared
        as integers. An unparseable version cannot be verified; it is reported
        and treated as acceptable rather than blocking the notebook.
        """
        found_parts = _version_tuple(found_version)
        if not found_parts:
            print(f"WARN: Could not parse version '{found_version}'; skipping check against {required}")
            return False
        required_parts = _version_tuple(required)
        # Pad with zeros so that "0.7" and "0.7.0" compare equal.
        width = max(len(found_parts), len(required_parts))
        found_padded = found_parts + (0,) * (width - len(found_parts))
        required_padded = required_parts + (0,) * (width - len(required_parts))
        return found_padded < required_padded

    print(f"Using Core: {CORE_VERSION}, Tensors: {TENSORS_VERSION}, Ops: {OPERATIONS_VERSION}, DB: {VEECTORDB_VERSION}")
    # Version gates: hard failures for core/tensors/DB, a recommendation for operations.
    if _is_older_than(CORE_VERSION, "0.7.13"):
        raise ImportError("Core version (expected 0.7.13+) too old")
    if _is_older_than(OPERATIONS_VERSION, "0.8.9"):
        print("WARN: Expected operations v0.8.9+")
    if _is_older_than(TENSORS_VERSION, "0.7.6"):
        raise ImportError("Tensors version (expected 0.7.6+) too old")
    if _is_older_than(VEECTORDB_VERSION, "0.9.8"):
        raise ImportError("VeectorDB version (expected 0.9.8+) too old")
    print("Veector components imported successfully.")

    # Operation codes (defined here rather than imported).
    OP_ADD=[0,0,2]
    OP_MATRIX_MULTIPLY=[30,0,0]
    OP_LINEAR=OP_MATRIX_MULTIPLY  # Alias: same list object as OP_MATRIX_MULTIPLY
    OP_EMBEDDING_LOOKUP=[40,6,0]
    OP_LINEAR_HEAD=OP_LINEAR  # Alias: same list object as OP_LINEAR
    META_OP_CATEGORY=99
    OP_STORE=[99,0,0]
    OP_LOAD=[99,0,1]
    OP_QWEN2_RMSNORM = [300, 0, 0]
    OP_QWEN2_ATTENTION = [300, 1, 0]
    OP_QWEN2_MLP = [300, 2, 0]
    OP_GET_TUPLE_ELEM_0 = [99, 3, 0]
    OP_GET_TUPLE_ELEM_1 = [99, 3, 1]
    OP_GET_TUPLE_ELEM_2 = [99, 3, 2]
    print("Operation codes defined.")

except ImportError as e:
    print(f"FATAL ERROR: Failed to import Veector components: {e}")
    raise
except Exception as e_other:
    print(f"FATAL ERROR during Veector imports: {e_other}")
    raise

# --- Load the intermediate data written by Cell 5.5 ---
print("\n--- Loading Intermediate Data from Cell 5.5 ---")
# Pre-declare everything the try block fills in, so the names exist even on failure.
intermediate_data = None
model_config = None
knowledge_map_filename = None
name_map_filename = None
knowledge_index_filename = None
knowledge_map = None
name_id_map_data = None
intermediate_data_filepath = None # Initialize None

try:
    # Build the path of the intermediate data file.
    intermediate_data_filepath = DB_PATH / f"{model_NAME}_cell6_input_data.pkl"

    if not intermediate_data_filepath.is_file():
        raise FileNotFoundError(f"Intermediate data file not found at path: {intermediate_data_filepath}. Please ensure Cell 5.5 ran successfully and DB_PATH/model_NAME are correct.")

    # NOTE(review): pickle.load can execute arbitrary code contained in the file;
    # only load pickles that this notebook itself produced.
    print(f"Loading intermediate data from: {intermediate_data_filepath}")
    with open(intermediate_data_filepath, 'rb') as f:
        intermediate_data = pickle.load(f)

    # Pull the individual entries out of the loaded dictionary.
    loaded_model_name = intermediate_data.get('model_name')
    loaded_db_path_str = intermediate_data.get('db_path_str')
    model_config = intermediate_data.get('model_config') # Model config object
    knowledge_map_filename = intermediate_data.get('knowledge_map_filename')
    name_map_filename = intermediate_data.get('name_map_filename')
    knowledge_index_filename = intermediate_data.get('knowledge_index_filename')

    # Consistency checks: mismatches only warn, missing/falsy entries are fatal.
    # NOTE(review): Cell 5.5 may store model_config as None, which fails this check.
    if loaded_model_name != model_NAME: print(f"WARN: model_NAME mismatch ('{model_NAME}' vs loaded '{loaded_model_name}')")
    if loaded_db_path_str != str(DB_PATH.resolve()): print(f"WARN: DB_PATH mismatch ('{str(DB_PATH.resolve())}' vs loaded '{loaded_db_path_str}')")
    if not all([model_config, knowledge_map_filename, name_map_filename, knowledge_index_filename]):
        raise ValueError("Intermediate data file is missing required keys (model_config, map filenames, index filename).")

    # Load the knowledge map (required) and the name ID map (optional).
    knowledge_map_filepath = DB_PATH / knowledge_map_filename
    name_map_filepath = DB_PATH / name_map_filename
    print(f"Loading knowledge map from: {knowledge_map_filepath}")
    with open(knowledge_map_filepath, 'rb') as f: knowledge_map = pickle.load(f)
    if name_map_filepath.is_file():
         print(f"Loading name ID map from: {name_map_filepath}")
         with open(name_map_filepath, 'rb') as f: name_id_map_data = pickle.load(f)
         print(f"Loaded name ID map.")
    else: print(f"Warning: Name ID map file not found at {name_map_filepath}")

    print("Intermediate data loaded successfully.")
    print(f"  Knowledge Map Entries: {len(knowledge_map)}")
    print(f"  Knowledge Index File: {knowledge_index_filename}")

except FileNotFoundError as e:
    # Missing file: report without a traceback, then stop the cell.
    print(f"FATAL ERROR: {e}")
    raise
except Exception as e:
    # Anything else: report with a traceback, then stop the cell.
    print(f"FATAL ERROR loading intermediate data or maps: {e}")
    traceback.print_exc()
    raise

# --- Extract model parameters from the config ---
try:
    num_layers = model_config.num_hidden_layers
    num_attention_heads = model_config.num_attention_heads
    # Falls back to the attention head count when the config has no num_key_value_heads.
    num_key_value_heads = getattr(model_config, 'num_key_value_heads', num_attention_heads)
    hidden_size = model_config.hidden_size
    # Head dimension is derived by integer division, not read from the config.
    head_dim = hidden_size // num_attention_heads
    rms_norm_eps = model_config.rms_norm_eps
    # Name of the activation function, taken from the config.
    hidden_act_function_name = model_config.hidden_act
    print(f"Model Config Params: L={num_layers}, H={num_attention_heads}, KVH={num_key_value_heads}, HDim={head_dim}, Hidden={hidden_size}, Epsilon={rms_norm_eps}, HiddenAct='{hidden_act_function_name}'")
except AttributeError as e:
    # A config lacking any required attribute is fatal for this cell.
    print(f"FATAL ERROR: Could not get required parameters from loaded model_config: {e}")
    raise

# --- Initialise Veector, loading the knowledge index ---
print("\n--- Initializing Veector for Processor Tensors ---")
vec_processor: Optional[Veector] = None
knowledge_index_filepath = DB_PATH / knowledge_index_filename
main_index_filepath = DB_PATH / VeectorDB.INDEX_FILENAME # Standard file name of the main index

try:
    print(f"Loading initial index from: '{knowledge_index_filepath.name}'")
    # Construct Veector with the path of the KNOWLEDGE INDEX as its initial index.
    vec_processor = Veector(db_dir=DB_PATH, initial_index_path=knowledge_index_filepath)
    if len(vec_processor.db.index) == 0:
        print(f"WARNING: Loaded knowledge index from '{knowledge_index_filepath.name}' is empty. Ensure Cell 5 ran and saved its index correctly.")
    else:
        print(f"Successfully loaded {len(vec_processor.db.index)} entries from knowledge index.")
    # Point the DB's index path at the main index file for subsequent saves.
    vec_processor.db.index_path = main_index_filepath
    print(f"Default save path set to: '{vec_processor.db.index_path.name}'")
    # Mark the index as "dirty" so that it is saved on close
    # (processor tensors are about to be added).
    # NOTE(review): this writes a private attribute of the DB object directly.
    vec_processor.db._index_dirty = True
except Exception as e:
    print(f"FATAL: Veector initialization failed: {e}")
    traceback.print_exc()
    raise

# --- Helper: look up a knowledge tensor ID ---
def find_knowledge_id(hf_param_name: str) -> Optional[str]:
    """Look up the knowledge tensor ID recorded for a Hugging Face parameter name.

    Reads the module-level ``knowledge_map``. Returns ``None`` when the name is
    not in the map, or (after printing an error) when the map itself is ``None``.
    """
    if knowledge_map is not None:
        return knowledge_map.get(hf_param_name)
    print("ERROR: knowledge_map is None in find_knowledge_id!")
    return None

# --- Define and save the Veector processors ---
print("\n--- Defining and Saving Veector Processor Tensors (using High-Level OPs) ---")
processor_errors: int = 0  # Count of processors that failed to save
processor_map: Dict[str, str] = {} # Map: processor key -> processor tensor ID

# --- Вспомогательная функция для создания и сохранения процессоров ---
def _processor_map_key(name: str) -> str:
    """Derive the ``processor_map`` key from a processor display name.

    Returns "embedding", "final_norm", "lm_head", "attn_<idx>" or "ffn_<idx>",
    or an empty string when the name matches no known pattern or the layer
    index after the last "L" is not an integer.
    """
    if "Embedding" in name:
        return "embedding"
    if "Final Norm" in name:
        return "final_norm"
    if "LM Head" in name:
        return "lm_head"
    for marker, prefix in (("Attention Processor L", "attn"), ("FFN Processor L", "ffn")):
        if marker in name:
            layer_text = name.split("L")[-1]
            try:
                return f"{prefix}_{int(layer_text)}"
            except ValueError:
                # Only a malformed index is expected here; the caller reports
                # the missing key, so do not hide any other exception type.
                return ""
    return ""


def create_and_save_processor(name: str, coord: TensorCoordinate, tags: List[int], interface: Dict, ops_sequences: Dict) -> Optional[str]:
    """Create a processor tensor, validate it, save it and register its ID.

    Args:
        name: Display name; also used to derive the key in ``processor_map``.
        coord: Coordinate at which the processor tensor is created.
        tags: Tags attached to the processor tensor.
        interface: Interface description (inputs / outputs / knowledge_needed).
        ops_sequences: Operation sequences passed through to ``create_tensor``.

    Returns:
        The value returned by ``vec_processor.save_tensor`` (the processor ID),
        or None if an exception occurred before saving completed.

    Side effects:
        Adds an entry to the module-level ``processor_map`` on success and
        increments the module-level ``processor_errors`` on any failure.
        Exceptions are caught, printed with a traceback, and not re-raised.
    """
    global processor_errors  # rebound below; processor_map is only mutated in place
    proc_id: Optional[str] = None
    try:
        print(f"  Defining Processor: {name} at {coord}")
        # Build the processor's list structure
        tensor_structure = vec_processor.create_tensor(
            coord=coord,
            tensor_type="processor",
            tags=tags,
            interface=interface,
            ops_sequences=ops_sequences,
            status="active",
            name_id=-1  # The processor's name matters less than a knowledge tensor's name
        )
        # Validate the structure before it is persisted
        if not validate_tensor(tensor_structure):
            raise ValueError(f"Invalid list structure created for processor {name}")

        # Persist the processor (the list structure is passed as-is)
        proc_id = vec_processor.save_tensor(tensor_structure)

        if not proc_id:
            processor_errors += 1
            print(f"    ERROR saving processor {name}")
            return proc_id

        map_key = _processor_map_key(name)
        if map_key:
            processor_map[map_key] = proc_id
            print(f"    SUCCESS: Saved {name} with ID: {proc_id} (Key: {map_key})")
        else:
            # Should not happen for the standard processors
            print(f"    WARN: Saved {name} with ID: {proc_id}, but could not determine map key.")

    except Exception as e:
        print(f"    ERROR during definition/saving of processor {name}: {e}")
        traceback.print_exc()
        processor_errors += 1
    return proc_id  # ID on success, otherwise None

# --- Parameters shared by all processors ---
processor_group_idx = GROUP_IDX_QWEN_PROCESSOR # 500
model_tag = TAG_MODEL_DEEPSEEK # 12 (or TAG_MODEL_QWEN2, to be more precise)
# Precision tags, to be derived from the config or the knowledge tensors (can be refined)
prec_tag_weights = TAG_PREC_FLOAT16 # Default for weights
prec_tag_quant = TAG_PREC_INT8     # For the quantized Embed/LMHead

# --- 1. Embedding Processor ---
# Defines a processor whose only operation is an embedding lookup against the
# knowledge tensor mapped to "model.embed_tokens.weight". Any failure is
# printed and counted in processor_errors rather than aborting the cell.
print("\n--- Defining Embedding Processor ---")
try:
    coord = TensorCoordinate(layer=-1, group=processor_group_idx, nest=0, x=0)
    tags = [TAG_TYPE_PROCESSOR, TAG_FUNC_EMBED_LOOKUP, model_tag]
    param_name = "embedding_matrix"
    hf_name = "model.embed_tokens.weight"
    # Tags describing the matching knowledge tensor
    kn_tags = [TAG_COMP_EMBEDDING, model_tag, TAG_COMP_WEIGHTS, prec_tag_quant] # Looking for the quantized one
    kid = find_knowledge_id(hf_name)
    if not kid:
        raise ValueError(f"Embedding knowledge tensor ID not found in map for '{hf_name}'.")
    # Describe the processor interface
    interface = {
        "inputs": [{"name":"token_ids", "dtype":"int64"}],
        "outputs": [{"name":"hidden_states", "dtype":"float16"}], # float16 expected after dequantization
        "knowledge_needed": [{"param_name": param_name, "tags": kn_tags, "knowledge_id": kid}]
    }
    # Define the operation sequence
    ops_sequences = {
        'default': [
            [OP_EMBEDDING_LOOKUP, {"embedding_matrix": param_name}]
        ]
    }
    # Create and save
    create_and_save_processor("Embedding Processor", coord, tags, interface, ops_sequences)
except Exception as e:
    print(f"Error defining Embedding Processor: {e}")
    traceback.print_exc()
    processor_errors += 1

# --- 2. Transformer layers ---
# For every layer index, defines two processors: an Attention processor (x=0)
# and an FFN processor (x=1) at the same layer/group coordinate. Each half has
# its own try/except so that a failure in one does not stop the other.
print(f"\n--- Defining Transformer Layer Processors (0 to {num_layers-1}) ---")
for layer_idx in range(num_layers):
    layer_tag = tag_layer(layer_idx) # Tag for the current layer
    print(f"  Processing Layer {layer_idx}...")

    # --- 2.A Attention Processor ---
    try:
        coord_attn = TensorCoordinate(layer=layer_idx, group=processor_group_idx, nest=0, x=0)
        tags_attn = [TAG_TYPE_PROCESSOR, TAG_FUNC_ATTENTION, layer_tag, model_tag]
        # Knowledge tensors required by Attention
        kn_defs_attn = [
            # "p": parameter name used in ops, "t": lookup tags, "f": parameter name in the HF model, "opt": optional?
            {"p":f"L{layer_idx}_input_norm_w", "t":[TAG_COMP_LAYERNORM, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights], "f":f"model.layers.{layer_idx}.input_layernorm.weight"},
            {"p":f"L{layer_idx}_q_w",   "t":[TAG_COMP_ATTN_Q, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights], "f":f"model.layers.{layer_idx}.self_attn.q_proj.weight"},
            {"p":f"L{layer_idx}_q_b",   "t":[TAG_COMP_ATTN_Q, layer_tag, model_tag, TAG_COMP_BIAS, prec_tag_weights],    "f":f"model.layers.{layer_idx}.self_attn.q_proj.bias", "opt": True}, # Bias is optional
            {"p":f"L{layer_idx}_k_w",   "t":[TAG_COMP_ATTN_K, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights], "f":f"model.layers.{layer_idx}.self_attn.k_proj.weight"},
            {"p":f"L{layer_idx}_k_b",   "t":[TAG_COMP_ATTN_K, layer_tag, model_tag, TAG_COMP_BIAS, prec_tag_weights],    "f":f"model.layers.{layer_idx}.self_attn.k_proj.bias", "opt": True},
            {"p":f"L{layer_idx}_v_w",   "t":[TAG_COMP_ATTN_V, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights], "f":f"model.layers.{layer_idx}.self_attn.v_proj.weight"},
            {"p":f"L{layer_idx}_v_b",   "t":[TAG_COMP_ATTN_V, layer_tag, model_tag, TAG_COMP_BIAS, prec_tag_weights],    "f":f"model.layers.{layer_idx}.self_attn.v_proj.bias", "opt": True},
            {"p":f"L{layer_idx}_o_w",   "t":[TAG_COMP_ATTN_O, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights], "f":f"model.layers.{layer_idx}.self_attn.o_proj.weight"},
            # o_proj.bias is usually absent in Qwen2
        ]
        # Build the knowledge_needed list by looking each ID up in the map
        knowledge_needs_attn = []
        missing_essential = False
        for kdef in kn_defs_attn:
            kid = find_knowledge_id(kdef["f"])
            is_opt = kdef.get("opt", False)
            if kid:
                knowledge_needs_attn.append({"param_name": kdef["p"], "tags": kdef["t"], "knowledge_id": kid, "optional": is_opt})
            elif not is_opt:
                # A non-optional tensor is missing: remember it and keep scanning so all gaps are reported
                missing_essential = True
                print(f"ERROR: Missing essential knowledge for Attn L{layer_idx}: {kdef['p']} ({kdef['f']})")

        # Create the processor only if every ESSENTIAL (non-optional) knowledge tensor was found
        # Uses the code from artifact cell6_attn_processor_block
        # Check if all essential knowledge tensors were found
        if not missing_essential:
            # Define the interface for the Attention processor
            interface_attn = {
                "inputs": [
                    {"name": "hidden_state_in"},    # Input hidden state from previous layer/embedding
                    {"name": "residual_input"},     # Input for the first residual connection
                    {"name": "position_ids"},       # Positional IDs for RoPE
                    {"name": "past_key", "optional": True},       # Previous K cache state
                    {"name": "past_value", "optional": True},     # Previous V cache state
                    {"name": "start_pos", "dtype": "int", "optional": True}, # Start position for KV cache update
                    {"name": "total_seq_len", "dtype": "int", "optional": True} # Total sequence length for masking/RoPE
                ],
                "outputs": [
                    {"name": "attn_block_output"} # Output after attention and residual add
                ],
                "knowledge_needed": knowledge_needs_attn # List of required knowledge tensors
            }

            # Define the sequence of operations for the Attention processor
            ops_sequences_attn = {'default': [
                # 1. Store the initial input for the residual connection
                [OP_STORE, 'residual_attn'],
                # 2. Apply RMS Normalization to the input hidden state
                [OP_QWEN2_RMSNORM, {
                    "norm_weight": f"L{layer_idx}_input_norm_w", # Reference to the norm weight knowledge tensor
                    "eps": rms_norm_eps                          # Epsilon value for RMSNorm
                }],
                # 3. Execute the high-level Qwen2 Attention operation
                # NOTE(review): the q/k/v bias names below are referenced even when the
                # optional bias was not found in knowledge_map (and is therefore absent
                # from knowledge_needs_attn) — confirm the op tolerates unresolved names.
                [OP_QWEN2_ATTENTION, {
                    # --- Knowledge Tensor References ---
                    "q_weights": f"L{layer_idx}_q_w",
                    "k_weights": f"L{layer_idx}_k_w",
                    "v_weights": f"L{layer_idx}_v_w",
                    "o_weights": f"L{layer_idx}_o_w",
                    "q_bias": f"L{layer_idx}_q_b",    # Optional bias
                    "k_bias": f"L{layer_idx}_k_b",    # Optional bias
                    "v_bias": f"L{layer_idx}_v_b",    # Optional bias
                    # --- Input/Context References ---
                    "position_ids": "position_ids",   # From processor input
                    "past_key": "past_key",           # From processor input (optional)
                    "past_value": "past_value",       # From processor input (optional)
                    "start_pos": "start_pos",         # From processor input (optional)
                    "total_seq_len": "total_seq_len", # From processor input (optional)
                    # --- Model Configuration Parameters ---
                    "num_heads": num_attention_heads,
                    "num_kv_heads": num_key_value_heads,
                    "head_dim": head_dim,
                    "hidden_size": hidden_size,       # Needed for reshaping
                    "layer_idx": layer_idx,           # For potential layer-specific logic inside op
                    "rope_theta": model_config.rope_theta
                }],
                # 4. The OP_QWEN2_ATTENTION returns a tuple: (attn_output_proj, updated_k_cache, updated_v_cache)
                #    Store the tuple temporarily
                [OP_STORE, 'attn_tuple_output'],
                # 5. Extract and store the updated K cache
                [OP_LOAD, 'attn_tuple_output'],      # Load the tuple
                [OP_GET_TUPLE_ELEM_1],               # Get element at index 1 (K cache)
                [OP_STORE, 'k_cache_out'],           # Store it in context for potential use/return
                # 6. Extract and store the updated V cache
                [OP_LOAD, 'attn_tuple_output'],      # Load the tuple again
                [OP_GET_TUPLE_ELEM_2],               # Get element at index 2 (V cache)
                [OP_STORE, 'v_cache_out'],           # Store it in context
                # 7. Extract the actual attention output
                [OP_LOAD, 'attn_tuple_output'],      # Load the tuple one last time
                [OP_GET_TUPLE_ELEM_0],               # Get element at index 0 (attention output)
                # 8. Add the residual connection
                # The current data is the attention output (from OP_GET_TUPLE_ELEM_0)
                # We add the original input stored in 'residual_attn'
                [OP_ADD, {"input_a": "residual_attn", "input_b": "_"}] # "_" refers to the current data
            ]}

            # Create and save the processor using the helper function
            create_and_save_processor(
                name=f"Attention Processor L{layer_idx}",
                coord=coord_attn,
                tags=tags_attn,
                interface=interface_attn,
                ops_sequences=ops_sequences_attn
            )
        else:
            # If essential knowledge was missing, increment error count
            processor_errors += 1
            print(f"    Skipping Attention Processor L{layer_idx} due to missing essential knowledge.")

    except Exception as e:
        print(f"Error defining Attn L{layer_idx}: {e}")
        traceback.print_exc()
        processor_errors += 1

    # --- 2.B FFN Processor (FIXED: hidden_act added) ---
    try:
        coord_ffn = TensorCoordinate(layer=layer_idx, group=processor_group_idx, nest=0, x=1)
        tags_ffn = [TAG_TYPE_PROCESSOR, TAG_FUNC_FFN, layer_tag, model_tag]
        # Knowledge tensors required by the FFN
        kn_defs_ffn = [
            {"p": f"L{layer_idx}_post_attn_norm_w", "t": [TAG_COMP_LAYERNORM, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights], "f": f"model.layers.{layer_idx}.post_attention_layernorm.weight"},
            {"p": f"L{layer_idx}_gate_w", "t": [TAG_COMP_FFN_GATE, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights],  "f": f"model.layers.{layer_idx}.mlp.gate_proj.weight"},
            {"p": f"L{layer_idx}_up_w",   "t": [TAG_COMP_FFN_UP, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights],    "f": f"model.layers.{layer_idx}.mlp.up_proj.weight"},
            {"p": f"L{layer_idx}_down_w", "t": [TAG_COMP_FFN_DOWN, layer_tag, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights],  "f": f"model.layers.{layer_idx}.mlp.down_proj.weight"},
            # MLP bias is usually absent in Qwen2
        ]
        # Build the knowledge_needed list
        knowledge_needs_ffn = []
        missing_essential = False
        for kdef in kn_defs_ffn:
            kid = find_knowledge_id(kdef["f"])
            is_opt = kdef.get("opt", False)
            if kid:
                knowledge_needs_ffn.append({"param_name": kdef["p"], "tags": kdef["t"], "knowledge_id": kid, "optional": is_opt})
            elif not is_opt:
                missing_essential = True
                print(f"ERROR: Missing essential knowledge for FFN L{layer_idx}: {kdef['p']} ({kdef['f']})")

        if not missing_essential:
            # Define the FFN processor interface
            interface_ffn = {
                "inputs": [
                    {"name":"attn_block_output"}, # Output of the preceding Attention block
                    {"name":"residual_input"}     # Input for the second residual connection
                ],
                "outputs": [
                    {"name":"layer_output"}      # Final output of the layer
                ],
                "knowledge_needed": knowledge_needs_ffn
            }
            # --- Operation sequence for the FFN (FIXED) ---
            ops_sequences_ffn = {'default': [
                # 1. Store the input for the second residual connection
                [OP_STORE, 'residual_ffn'],
                # 2. Apply RMS Norm (post-attention layernorm)
                [OP_QWEN2_RMSNORM, {
                    "norm_weight": f"L{layer_idx}_post_attn_norm_w",
                    "eps": rms_norm_eps
                }],
                # 3. Run the MLP block
                [OP_QWEN2_MLP, {
                    "gate_weights": f"L{layer_idx}_gate_w",
                    "up_weights": f"L{layer_idx}_up_w",
                    "down_weights": f"L{layer_idx}_down_w",
                    # ---> FIXED: pass the activation function name <---
                    "hidden_act": hidden_act_function_name # Taken from the model config
                }],
                # 4. Add the second residual connection
                [OP_ADD, {"input_a": "residual_ffn", "input_b": "_"}]
            ]}
            # --- End of fix ---

            # Create and save the FFN processor
            create_and_save_processor(
                name=f"FFN Processor L{layer_idx}",
                coord=coord_ffn,
                tags=tags_ffn,
                interface=interface_ffn,
                ops_sequences=ops_sequences_ffn
            )
        else:
            processor_errors += 1
            print(f"    Skipping FFN Processor L{layer_idx} due to missing essential knowledge.")

    except Exception as e:
        print(f"Error defining FFN L{layer_idx}: {e}")
        traceback.print_exc()
        processor_errors += 1
# --- End of the per-layer loop ---

# --- 3. Final Norm Processor ---
# Defines a processor with a single RMSNorm operation that uses the knowledge
# tensor mapped to "model.norm.weight". Failures are printed and counted.
print("\n--- Defining Final Norm Processor ---")
try:
    coord = TensorCoordinate(layer=-1, group=processor_group_idx, nest=0, x=1)
    tags = [TAG_TYPE_PROCESSOR, TAG_COMP_LAYERNORM, model_tag]
    kn_tags = [TAG_COMP_LAYERNORM, model_tag, TAG_COMP_WEIGHTS, prec_tag_weights]
    hf_name = "model.norm.weight"
    kid = find_knowledge_id(hf_name)
    if not kid:
        raise ValueError(f"Final Norm knowledge tensor ID not found in map for '{hf_name}'.")
    knowledge_needs = [{"param_name": "norm_weight", "tags": kn_tags, "knowledge_id": kid}]
    interface = {
        "inputs": [{"name":"final_hidden_state"}],
        "outputs": [{"name":"final_normed_state"}],
        "knowledge_needed": knowledge_needs
    }
    ops_sequences = {
        'default': [
            [OP_QWEN2_RMSNORM, {"norm_weight": "norm_weight", "eps": rms_norm_eps}]
        ]
    }
    create_and_save_processor("Final Norm Processor", coord, tags, interface, ops_sequences)
except Exception as e:
    print(f"Error defining Final Norm Processor: {e}")
    traceback.print_exc()
    processor_errors += 1

# --- 4. LM Head Processor ---
# Defines a processor with a single OP_LINEAR_HEAD operation that uses the
# knowledge tensor mapped to "lm_head.weight". Failures are printed and counted.
print("\n--- Defining LM Head Processor ---")
try:
    coord = TensorCoordinate(layer=-1, group=processor_group_idx, nest=0, x=2)
    tags = [TAG_TYPE_PROCESSOR, TAG_FUNC_LINEAR, model_tag]
    # Look for the quantized knowledge tensor of the LM Head
    kn_tags = [TAG_COMP_LM_HEAD, model_tag, TAG_COMP_WEIGHTS, prec_tag_quant]
    hf_name = "lm_head.weight"
    kid = find_knowledge_id(hf_name)
    if not kid:
        raise ValueError(f"LM Head knowledge tensor ID not found in map for '{hf_name}'.")
    knowledge_needs = [{"param_name": "lm_head_weights", "tags": kn_tags, "knowledge_id": kid}]
    interface = {
        "inputs": [{"name":"final_normed_state"}],
        "outputs": [{"name":"logits"}], # Logits will be float16 or float32 depending on nest_level
        "knowledge_needed": knowledge_needs
    }
    ops_sequences = {
        'default': [
            # The op used here is OP_LINEAR_HEAD (the original note referred to it as the
            # standard OP_LINEAR); it is expected to work with the dequantized data
            # coming from the knowledge tensor — TODO confirm against operations.py
            [OP_LINEAR_HEAD, {"weights": "lm_head_weights"}]
        ]
    }
    create_and_save_processor("LM Head Processor", coord, tags, interface, ops_sequences)
except Exception as e:
    print(f"Error defining LM Head Processor: {e}")
    traceback.print_exc()
    processor_errors += 1


# --- Finalization ---
print(f"\n--- Finalizing Cell 6 ({processor_errors} errors during processor creation) ---")

# Save the processor map. It is only pickled when no errors were counted AND
# the map holds exactly the expected number of entries; otherwise it is left unsaved.
processor_map_filepath = DB_PATH / f"{model_NAME}_proc_map.pkl"
try:
    if processor_errors == 0:
        expected_proc_count = 3 + 2 * num_layers # Embed, Norm, Head + 2*Layers
        if len(processor_map) == expected_proc_count:
             print(f"Saving processor map ({len(processor_map)} entries) to {processor_map_filepath}...")
             with open(processor_map_filepath, 'wb') as f:
                 pickle.dump(processor_map, f)
             print(f"Processor map saved successfully.")
        else:
             print(f"WARN: Processor map has incorrect entry count ({len(processor_map)} vs {expected_proc_count}). NOT SAVED.")
    else:
        print(f"Processor map NOT saved due to {processor_errors} errors during creation.")
except Exception as e:
    # A failed save is reported but does not abort the rest of the cell
    print(f"Error saving processor map: {e}")

# Check that the reference-outputs file exists (not strictly needed here);
# this only prints a message and never fails the cell.
ref_output_filename = f"{model_NAME}_hf_reference_outputs_fp32.pkl"
ref_output_path = DB_PATH / ref_output_filename
if not ref_output_path.is_file():
     print(f"Warning: Reference output file {ref_output_path.name} not found. Comparison cell might fail.")
else:
     print(f"Reference HF outputs assumed to exist at {ref_output_path.name}")

# --- Close the DB connection ---
# Per the original note, closing saves the main index (tensor_index.pkl), which
# by now contains both the knowledge tensors and the processors.
# The guard skips the close when no usable Veector instance / db is present.
if 'vec_processor' in locals() and vec_processor and hasattr(vec_processor, 'db') and vec_processor.db:
    print("\nClosing Veector DB connection (this will save the main index)...")
    print(f"Index size before final save in Cell 6: {len(vec_processor.db.index)}")
    vec_processor.db.close()
    print("DB connection closed by Cell 6.")
else:
    print("\nWarning: Veector instance for processors not found or already closed.")

# --- Cleanup ---
gc.collect()
# Empty the CUDA cache only when torch is present in this namespace and CUDA is available
if 'torch' in locals() and hasattr(torch, 'cuda') and torch.cuda.is_available():
    torch.cuda.empty_cache()
print("\nMemory cleanup attempted.")

# Final status line for the cell, based on the accumulated error count
if processor_errors == 0:
    print(f"\n--- Cell 6 Finished Successfully ---")
else:
    print(f"\n--- Cell 6 Finished with {processor_errors} ERRORS ---")



Using DB_PATH: /content/data/db
Using model_NAME: DeepSeek-R1-Distill-Qwen-1.5B
Torch and Transformers imported successfully.
Using Core: 0.7.13, Tensors: 0.7.6, Ops: 0.8.9, DB: 0.9.8
Veector components imported successfully.
Operation codes defined.

--- Loading Intermediate Data from Cell 5.5 ---
Loading intermediate data from: data/db/DeepSeek-R1-Distill-Qwen-1.5B_cell6_input_data.pkl
Loading knowledge map from: data/db/DeepSeek-R1-Distill-Qwen-1.5B_knowledge_map.pkl
Loading name ID map from: data/db/DeepSeek-R1-Distill-Qwen-1.5B_name_id_map.pkl
Loaded name ID map.
Intermediate data loaded successfully.
  Knowledge Map Entries: 339
  Knowledge Index File: DeepSeek-R1-Distill-Qwen-1.5B_knowledge_index.pkl
Model Config Params: L=28, H=12, KVH=2, HDim=128, Hidden=1536, Epsilon=1e-06, HiddenAct='silu'

--- Initializing Veector for Processor Tensors ---
Loading initial index from: 'DeepSeek-R1-Distill-Qwen-1.5B_knowledge_index.pkl'
--- Initializing Veector Core v0.7.13 ---
    Requires: 

In [1]:
# === Inference Cell (Detailed Compare & Stop v7 - Add Special Tokens Fix) ===
# Цель: Запуск инференса Veector и сравнение ВСЕХ промежуточных выходов
#       с эталонными HF, остановка при первом расхождении.
# ИЗМЕНЕНО: Загрузка токенизатора с HF Hub и явное добавление <|User|>, <|Assistant|>

import time
import pickle
import numpy as np
import traceback
import os
import gc
import psutil
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Union

# --- Необходимые библиотеки ---
try:
    import torch
    from transformers import AutoTokenizer, AutoConfig, PreTrainedTokenizer
    print("Torch and Transformers imported successfully.")
except ImportError as e: print(f"FATAL ERROR: Missing essential libraries: {e}"); raise

# --- Импорты проекта Veector ---
PROJECT_IMPORTS_OK = False
# Minimum versions of the Veector components this cell was written against
CORE_VERSION_REQ = "0.7.13"
TENSORS_VERSION_REQ = "0.7.6"
VEECTORDB_VERSION_REQ = "0.9.8"
OPERATIONS_VERSION_REQ = "0.8.9"


def _version_tuple(version: str) -> Tuple[int, ...]:
    """Convert a dotted version string such as "0.7.13" into a tuple of ints.

    Plain string comparison orders versions lexicographically, so "0.7.9"
    compares as newer than "0.7.13" and "0.10.0" as older than "0.7.13".
    Comparing integer tuples gives the intended numeric ordering. Only the
    leading digits of each dot-separated piece are used; a piece without
    leading digits counts as 0.
    """
    numbers = []
    for piece in str(version).split("."):
        digits = ""
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        numbers.append(int(digits) if digits else 0)
    return tuple(numbers)


try:
    from core import Veector, CORE_VERSION
    from tensors import TensorCoordinate, TENSORS_VERSION, GROUP_IDX_QWEN_KNOWLEDGE
    from operations import OPERATIONS_VERSION, softmax
    from veectordb import VeectorDB, VEECTORDB_VERSION
    print(f"Using Core: {CORE_VERSION}, Tensors: {TENSORS_VERSION}, Ops: {OPERATIONS_VERSION}, DB: {VEECTORDB_VERSION}")
    # Core / Tensors / VeectorDB below the requirement are fatal
    if _version_tuple(CORE_VERSION) < _version_tuple(CORE_VERSION_REQ):
        raise ImportError(f"Core version too old (req: {CORE_VERSION_REQ})")
    if _version_tuple(TENSORS_VERSION) < _version_tuple(TENSORS_VERSION_REQ):
        raise ImportError(f"Tensors version too old (req: {TENSORS_VERSION_REQ})")
    if _version_tuple(VEECTORDB_VERSION) < _version_tuple(VEECTORDB_VERSION_REQ):
        raise ImportError(f"VeectorDB version too old (req: {VEECTORDB_VERSION_REQ})")
    # An older operations module only produces a warning
    if _version_tuple(OPERATIONS_VERSION) < _version_tuple(OPERATIONS_VERSION_REQ):
        print(f"WARN: operations version {OPERATIONS_VERSION} < {OPERATIONS_VERSION_REQ}")
    print("Veector components imported successfully.")
    PROJECT_IMPORTS_OK = True
except ImportError as e:
    print(f"FATAL ERROR (ImportError): {e}")
    raise
except Exception as import_e:
    print(f"FATAL ERROR (Other Import Error): {import_e}")
    traceback.print_exc()
    raise

# --- Вспомогательные функции ---
def log_memory_usage(stage: str):
    """Print the current process RSS (in MB) and system RAM usage (percent).

    ``stage`` is a free-form label included in the log line. Any exception
    raised while collecting the numbers is caught and printed instead.
    """
    try:
        this_process = psutil.Process(os.getpid())
        rss_bytes = this_process.memory_info().rss
        ram_percent = psutil.virtual_memory().percent
        print(f"  [MEM_LOG] {stage}: RSS={rss_bytes / (1024**2):.2f} MB, RAM Used={ram_percent:.1f}%")
    except Exception as e:
        print(f"  [MEM_LOG] Error getting memory usage: {e}")

def sample_top_p(logits: np.ndarray, temperature: float, top_p: float) -> int:
    """Pick a token id from a 1-D logits vector with temperature and top-p sampling.

    Args:
        logits: Logits for one position (indexed as a 1-D array).
        temperature: Divisor applied to the logits before softmax; values below
            1e-9 select the argmax directly (greedy decoding).
        top_p: Nucleus threshold. Filtering is applied only when
            0.0 < top_p < 1.0; otherwise the full distribution is sampled.

    Returns:
        The selected token id as a Python int. Whenever NaNs or a degenerate
        distribution are detected, a message is printed and an argmax-based
        fallback is returned instead of a random sample.
    """
    if np.any(np.isnan(logits)):
        print("ERROR: NaN detected in logits before sampling! Returning argmax.")
        # np.argmax treats NaN as the maximum and would return the index of the
        # first NaN, so mask NaNs to -inf and take the argmax of the valid values.
        return int(np.argmax(np.where(np.isnan(logits), -np.inf, logits)))
    if temperature < 1e-9:
        return int(np.argmax(logits))

    logits_f32 = logits.astype(np.float32)
    probabilities = softmax(logits_f32 / temperature)
    if np.any(np.isnan(probabilities)):
        print("ERROR: NaN detected in probabilities after softmax! Returning argmax.")
        return int(np.argmax(logits_f32))

    if 0.0 < top_p < 1.0:
        # Probability of the token at which the descending cumulative mass reaches top_p
        sorted_probabilities = np.sort(probabilities)[::-1]
        cumulative_probabilities = np.cumsum(sorted_probabilities)
        cutoff_index = np.searchsorted(cumulative_probabilities, top_p)
        cutoff_index = min(cutoff_index, len(sorted_probabilities) - 1)
        cutoff_prob = sorted_probabilities[cutoff_index]
        # Drop everything strictly below the cutoff; ties with the cutoff are kept
        probabilities[probabilities < cutoff_prob] = 0.0

    prob_sum = np.sum(probabilities)
    if prob_sum > 1e-9:
        final_probabilities = probabilities / prob_sum
    else:
        print("Warning: All probabilities became zero after top-p. Using argmax.")
        return int(np.argmax(logits_f32))
    if np.any(np.isnan(final_probabilities)):
        print("ERROR: NaN detected in final_probabilities before choice! Using argmax.")
        return int(np.argmax(logits_f32))

    token_ids = np.arange(len(final_probabilities))
    try:
        # Renormalize once more so the probabilities sum to 1 as np.random.choice requires
        final_probabilities /= final_probabilities.sum()
        predicted_token_id = np.random.choice(token_ids, p=final_probabilities)
    except ValueError as e:
        print(f"ERROR in np.random.choice (Top-P): {e}. Prob sum: {np.sum(final_probabilities)}. Using argmax.")
        predicted_token_id = np.argmax(logits_f32)
    return int(predicted_token_id)

def log_tensor_stats(name: str, tensor: Optional[np.ndarray], log_values: bool = False):
    """Print shape, dtype and a NaN flag for ``tensor`` under the label ``name``.

    A ``None`` tensor is reported as such. The first five flattened values are
    printed as well when the tensor contains NaNs or ``log_values`` is True,
    provided the tensor is not empty.
    """
    if tensor is None:
        print(f"  [STATS] {name}: None")
        return

    nan_found = np.any(np.isnan(tensor))
    shape_text = str(tensor.shape)
    dtype_text = str(tensor.dtype)
    print(f"  [STATS] {name}: shape={shape_text}, dtype={dtype_text}, NaN={nan_found}")

    wants_sample = nan_found or log_values
    if not wants_sample or not tensor.size > 0:
        return
    try:
        head_values = tensor.flatten()[:5].tolist()
        print(f"          Sample: {head_values}")
    except Exception as e:
        print(f"          Error getting sample: {e}")

# --- Основная функция инференса и сравнения ---

def run_inference_comparison_cell(
    text: str,
    db_path_str: str = "./data/db",
    model_name_hf: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    nest_level: int = 1, # float16
    temperature: float = 0.1,
    top_p: float = 0.9,
    max_new_tokens: int = 10,
    max_seq_len: Optional[int] = None,
    use_kv_cache: bool = True,
    compare_outputs: bool = True,
    atol: float = 5e-3,
    rtol: float = 1e-3
    ):
    """
    Run Veector inference and compare its intermediate outputs with HF references.

    The prompt is tokenized as ``[BOS] <|User|> text <|Assistant|>`` and tokens
    are generated autoregressively through the Veector processors listed in the
    processor map (embedding, per-layer attention/FFN, final norm, LM head).
    During the first generation step the output of every stage is compared with
    the pickled reference outputs via ``np.allclose``; the run stops at the
    first mismatch.

    Args:
        text: User prompt.
        db_path_str: Directory of the Veector DB. It must also contain
            ``<model>_proc_map.pkl`` and, for comparison,
            ``<model>_hf_reference_outputs_fp32.pkl``.
        model_name_hf: HF model id used to load the tokenizer and config; its
            last path component is the prefix of the pickle file names.
        nest_level: Value passed to the processors as ``required_nest``.
        temperature: Sampling temperature forwarded to ``sample_top_p``.
        top_p: Nucleus threshold forwarded to ``sample_top_p``.
        max_new_tokens: Maximum number of generation steps.
        max_seq_len: Sequence capacity of the KV cache. When None it is read
            from the model config (``max_position_embeddings``, fallback 2048).
        use_kv_cache: Whether to allocate and pass a per-layer KV cache.
        compare_outputs: Whether to compare step-0 outputs with the references.
            Disabled automatically if the reference file cannot be loaded.
        atol: Absolute tolerance for ``np.allclose``.
        rtol: Relative tolerance for ``np.allclose``.

    Returns:
        None. Progress, comparison results and errors are printed; a failure
        while loading prerequisites makes the function return early.
    """
    print(f"--- Running Inference & Detailed Comparison Cell ---")
    log_memory_usage("Start of function")

    db_path = Path(db_path_str)
    map_model_name = model_name_hf.split('/')[-1]

    # --- Parameters ---
    print(f"Prompt: '{text}'")
    print(f"DB Path: {db_path.resolve()}")
    print(f"Model Source: {model_name_hf}")
    print(f"Nest Level (Precision): {nest_level}")
    print(f"Sampling: Temp={temperature}, TopP={top_p}")
    print(f"Max New Tokens: {max_new_tokens}")
    print(f"Use KV Cache: {use_kv_cache}")
    print(f"Compare Outputs: {compare_outputs}")
    if compare_outputs:
        print(f"Comparison Tolerances: atol={atol}, rtol={rtol}")

    if not db_path.is_dir():
        print(f"ERROR: DB directory not found: {db_path.resolve()}")
        return

    # --- Load tokenizer, config and processor map ---
    tokenizer = None; model_config = None; processor_map = None
    num_layers = 0; num_kv_heads = 0; head_dim = 0; eos_token_id = None; bos_token_id = None
    user_token_id = None; assistant_token_id = None  # IDs of the User/Assistant markers
    fallback_max_seq_len = 2048
    try:
        # The tokenizer is loaded from the HF Hub
        print(f"\nLoading Tokenizer directly from HF Hub: {model_name_hf}")
        tokenizer = AutoTokenizer.from_pretrained(model_name_hf, trust_remote_code=True, use_fast=False)
        print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

        # Explicitly register the special tokens before looking up their IDs.
        user_token = "<|User|>"
        assistant_token = "<|Assistant|>"
        num_added = tokenizer.add_special_tokens({
            'additional_special_tokens': [user_token, assistant_token]
        })
        print(f"Added {num_added} special tokens explicitly ('{user_token}', '{assistant_token}').")
        # NOTE: num_added == 0 means the tokenizer already knew the tokens;
        # num_added > 0 means the tokenizer vocabulary was extended.

        # Now the IDs can be looked up (they are expected to exist)
        bos_token_id = tokenizer.bos_token_id
        eos_token_id = tokenizer.eos_token_id
        user_token_id = tokenizer.convert_tokens_to_ids(user_token)
        assistant_token_id = tokenizer.convert_tokens_to_ids(assistant_token)

        if isinstance(user_token_id, str) or user_token_id == tokenizer.unk_token_id:
            raise ValueError(f"Could not find ID for token '{user_token}' even after adding.")
        if isinstance(assistant_token_id, str) or assistant_token_id == tokenizer.unk_token_id:
            raise ValueError(f"Could not find ID for token '{assistant_token}' even after adding.")

        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = eos_token_id if eos_token_id is not None else tokenizer.vocab_size
        print(f"Tokens: BOS={bos_token_id}, EOS={eos_token_id}, PAD={tokenizer.pad_token_id}, User={user_token_id}, Assistant={assistant_token_id}")

        print(f"\nLoading Config from: {model_name_hf}")
        model_config = AutoConfig.from_pretrained(model_name_hf, trust_remote_code=True)
        num_layers = model_config.num_hidden_layers; num_attention_heads = model_config.num_attention_heads
        num_kv_heads = getattr(model_config, 'num_key_value_heads', num_attention_heads)
        hidden_size = model_config.hidden_size; head_dim = hidden_size // num_attention_heads
        rms_norm_eps = model_config.rms_norm_eps
        if max_seq_len is None:
            max_seq_len = getattr(model_config, 'max_position_embeddings', fallback_max_seq_len)
        print(f"Config: L={num_layers}, H={num_attention_heads}, KVH={num_kv_heads}, HDim={head_dim}, Hidden={hidden_size}, MaxSeqLen={max_seq_len}")

        proc_map_file = db_path / f"{map_model_name}_proc_map.pkl"
        if not proc_map_file.is_file():
            raise FileNotFoundError(f"Processor map file not found: {proc_map_file}")
        # SECURITY NOTE: pickle.load executes arbitrary code from the file; only
        # point db_path_str at a DB directory you created yourself.
        with open(proc_map_file, 'rb') as f:
            processor_map = pickle.load(f)
        print(f"\nLoaded processor map ({len(processor_map)} entries)")

    except Exception as e:
        print(f"ERROR loading prerequisites: {e}")
        traceback.print_exc()
        return

    # --- Load reference outputs ---
    hf_outputs = None
    if compare_outputs:
        print(f"\n--- Loading Reference HF Outputs ---")
        ref_output_filename = f"{map_model_name}_hf_reference_outputs_fp32.pkl"
        ref_output_path = db_path / ref_output_filename
        print(f"Attempting to load reference outputs from: {ref_output_path}")
        try:
            if not ref_output_path.is_file():
                print(f"Warning: Reference output file not found: {ref_output_path}. Comparison will be skipped.")
                compare_outputs = False
            else:
                # SECURITY NOTE: same pickle caveat as for the processor map.
                with open(ref_output_path, 'rb') as f:
                    hf_outputs = pickle.load(f)
                if not isinstance(hf_outputs, dict):
                    print("Warning: Loaded reference data is not a dictionary.")
                    hf_outputs = None
                    compare_outputs = False
                else:
                    print(f"Successfully loaded {len(hf_outputs)} reference outputs.")
        except Exception as e:
            print(f"Warning: Error loading reference outputs: {e}.")
            hf_outputs = None
            compare_outputs = False

    # --- Initialize Veector ---
    vec: Optional[Veector] = None
    try:
        vec = Veector(db_dir=db_path)
        print(f"\nVeector core v{CORE_VERSION} initialized using DB at: {vec.db.db_root_path.resolve()}")
        print(f"  Index size loaded: {len(vec.db.index)}")
        if len(vec.db.index) == 0:
            print("ERROR: Loaded main index (tensor_index.pkl) is empty!")
            # Close the DB here as well, like every other early return below.
            vec.db.close()
            return
    except Exception as e:
        print(f"FATAL: Veector init failed: {e}")
        traceback.print_exc()
        return

    # --- Check that all processors are present ---
    required_proc_keys = ["embedding", "final_norm", "lm_head"] + [f"attn_{i}" for i in range(num_layers)] + [f"ffn_{i}" for i in range(num_layers)]
    missing_procs = [key for key in required_proc_keys if key not in processor_map]
    if missing_procs:
        print(f"ERROR: Required processors missing from map: {missing_procs}")
        vec.db.close()
        return
    embedding_processor_id = processor_map["embedding"]; final_norm_id = processor_map["final_norm"]; lm_head_id = processor_map["lm_head"]
    print("All required processor IDs found in map.")

    # --- Prepare the input ---
    prompt_input_ids_np: Optional[np.ndarray] = None
    input_ids_list_for_log = []
    try:
        # Prompt layout: [BOS] <|User|> text <|Assistant|>
        print("\nConstructing prompt tokens (User/Assistant format)...")
        user_text_ids = tokenizer.encode(text, add_special_tokens=False)
        input_ids_list = []
        if bos_token_id is not None:
            input_ids_list.append(bos_token_id)
        # Use the IDs obtained after add_special_tokens
        input_ids_list.append(user_token_id)
        input_ids_list.extend(user_text_ids)
        input_ids_list.append(assistant_token_id)

        prompt_input_ids_np = np.array([input_ids_list], dtype=np.int64)
        input_ids_list_for_log = input_ids_list
        print(f"\n--- Prepared Input ---"); print(f"Input IDs shape: {prompt_input_ids_np.shape}")
        print(f"Input IDs list: {input_ids_list_for_log}")  # print the ID list
        print(f"Decoded String: '{tokenizer.decode(input_ids_list_for_log)}'")  # sanity-check decoding
    except Exception as e:
        print(f"Error constructing prompt tokens: {e}")
        traceback.print_exc()
        vec.db.close()
        return

    # --- Initialize the KV cache ---
    kv_cache_list: Optional[List[Tuple[np.ndarray, np.ndarray]]] = None
    if use_kv_cache:
        kv_cache_list = []
        cache_dtype = np.float16; batch_size = prompt_input_ids_np.shape[0]
        print(f"\nInitializing KV Cache for {num_layers} layers..."); cache_shape = (batch_size, num_kv_heads, max_seq_len, head_dim)
        print(f"  Shape per layer: K={cache_shape}, V={cache_shape}, dtype={cache_dtype}")
        for i in range(num_layers):
            kv_cache_list.append((np.zeros(cache_shape, dtype=cache_dtype), np.zeros(cache_shape, dtype=cache_dtype)))
        print("KV Cache initialized."); log_memory_usage("After KV Cache Init")
    else:
        print("\nKV Cache is disabled.")

    # --- Run generation and comparison ---
    start_inference_time = time.time(); knowledge_group_id = GROUP_IDX_QWEN_KNOWLEDGE
    print(f"\n--- Starting Autoregressive Generation & Comparison ---")
    generated_ids: List[int] = []; current_input_ids_for_step: np.ndarray = prompt_input_ids_np
    prompt_len = current_input_ids_for_step.shape[1]; total_seq_len = prompt_len
    full_response_ids = list(prompt_input_ids_np[0])
    error_occurred = False; difference_found = False

    def compare_and_log(key: str, vec_out: Optional[np.ndarray], hf_out_override: Optional[np.ndarray] = None) -> bool:
        """Compare one Veector output with its reference; return True on mismatch.

        The reference is ``hf_outputs[key]`` unless ``hf_out_override`` is given.
        When both arrays have more than one dimension they are cut to the
        shorter length along axis 1 before ``np.allclose`` is applied. Once a
        difference has been found, later calls return True without comparing.
        """
        nonlocal difference_found
        if difference_found or not compare_outputs or hf_outputs is None:
            return difference_found
        print(f"  Comparing: {key}")
        hf_out = hf_out_override if hf_out_override is not None else hf_outputs.get(key)
        if hf_out is None or vec_out is None:
            print(f"    ERROR: Output missing for comparison (HF: {'OK' if hf_out is not None else 'MISSING'}, Veector: {'OK' if vec_out is not None else 'MISSING'})")
            difference_found = True
            return True
        current_len_vec = vec_out.shape[1] if vec_out.ndim > 1 else 1
        current_len_hf = hf_out.shape[1] if hf_out.ndim > 1 else 1
        compare_len = min(current_len_vec, current_len_hf)
        if vec_out.ndim > 1 and hf_out.ndim > 1:
            hf_out_sliced = hf_out[:, :compare_len, ...]
            vec_out_sliced = vec_out[:, :compare_len, ...]
        elif vec_out.ndim == hf_out.ndim:
            hf_out_sliced = hf_out
            vec_out_sliced = vec_out
        else:
            print(f"    ERROR: Dimension mismatch for slicing {key} (HF: {hf_out.ndim}D, Vec: {vec_out.ndim}D)")
            difference_found = True
            return True
        print(f"    HF Shape (fp32): {hf_out_sliced.shape}, dtype: {hf_out_sliced.dtype}"); print(f"    Veector Shape (fp16): {vec_out_sliced.shape}, dtype: {vec_out_sliced.dtype}")
        if hf_out_sliced.shape != vec_out_sliced.shape:
            print(f"    ERROR: Shape mismatch for {key} after slicing!")
            difference_found = True
            return True
        try:
            hf_out_f32 = hf_out_sliced; vec_out_f32 = vec_out_sliced.astype(np.float32)
            are_close = np.allclose(hf_out_f32, vec_out_f32, atol=atol, rtol=rtol)
            print(f"    Result: {'CLOSE' if are_close else '!!! DIFFERENT !!!'}")
            if not are_close:
                diff = np.abs(hf_out_f32 - vec_out_f32); max_diff = np.max(diff); mean_diff = np.mean(diff)
                print(f"      Max Abs Difference:  {max_diff:.6f}")
                print(f"      Mean Abs Difference: {mean_diff:.6f}")
                print(f"      HF Sample (fp32):      {hf_out_sliced.flatten()[:5].tolist()}")
                print(f"      Veector Sample (fp16): {vec_out_sliced.flatten()[:5].tolist()}")
                difference_found = True
                return True
        except Exception as cmp_e:
            print(f"    ERROR during comparison for {key}: {cmp_e}")
            difference_found = True
            return True
        return False

    def _completed_data(result, label: str):
        """Return ``result["data"]`` or raise RuntimeError if the step did not complete."""
        if not (result and result.get("status") == "completed"):
            # `result` may be None here, so never call .get() on it directly.
            error_text = (result or {}).get('provenance', {}).get('error', 'Unknown error')
            raise RuntimeError(f"{label} failed: {error_text}")
        data = result.get("data")
        if data is None:
            raise RuntimeError(f"{label} returned None data.")
        return data

    # --- Main loop ---
    try:
        veector_step0_outputs = {}  # step-0 stage outputs, kept for inspection while debugging
        for step in range(max_new_tokens):
            step_start_time = time.time(); current_seq_length = current_input_ids_for_step.shape[1]
            start_pos = total_seq_len - current_seq_length; position_ids = np.arange(start_pos, total_seq_len, dtype=np.int64).reshape(1, current_seq_length)
            if total_seq_len > max_seq_len:
                print(f"\nERROR: total_seq_len ({total_seq_len}) exceeds max_seq_len ({max_seq_len}).")
                break
            print(f"\n--- Step {step + 1}/{max_new_tokens} (Pos: {start_pos}..{total_seq_len-1}) ---")

            # 1. Embedding
            print(f"  Running Embedding...")
            compute_context_embed = { "input_data": current_input_ids_for_step, "required_nest": nest_level, "target_knowledge_group": knowledge_group_id }
            embed_result = vec.compute(embedding_processor_id, context=compute_context_embed)
            current_hidden_states = _completed_data(embed_result, "Embedding")
            print("    Embedding OK.")
            if step == 0: veector_step0_outputs["embed_tokens"] = current_hidden_states
            if step == 0 and compare_outputs and compare_and_log("embed_tokens", current_hidden_states): break

            # 2. Transformer layers
            residual_input = current_hidden_states
            for layer_idx in range(num_layers):
                print(f"  Running Layer {layer_idx}...")
                if current_hidden_states is None: raise RuntimeError(f"Input for Layer {layer_idx} is None.")
                attn_proc_id = processor_map[f"attn_{layer_idx}"]; ffn_proc_id = processor_map[f"ffn_{layer_idx}"]
                attn_context = { "input_data": current_hidden_states, "residual_input": residual_input, "required_nest": nest_level, "target_knowledge_group": knowledge_group_id, "position_ids": position_ids, "total_seq_len": total_seq_len }
                if use_kv_cache and kv_cache_list:
                    attn_context["past_key"], attn_context["past_value"] = kv_cache_list[layer_idx]
                    attn_context["start_pos"] = start_pos
                attn_result = vec.compute(attn_proc_id, context=attn_context)
                attn_block_output = _completed_data(attn_result, f"Attn L{layer_idx}")
                result_step_context = attn_result.get("step_context", {})
                if use_kv_cache and kv_cache_list:
                    new_k, new_v = result_step_context.get('k_cache_out'), result_step_context.get('v_cache_out')
                    # Keep the previous cache if the processor did not return an updated one.
                    kv_cache_list[layer_idx] = (new_k, new_v) if new_k is not None and new_v is not None else kv_cache_list[layer_idx]
                print(f"    Attn L{layer_idx} OK.")
                ffn_input = attn_block_output; residual_input_ffn = ffn_input
                ffn_context = { "input_data": ffn_input, "residual_input": residual_input_ffn, "required_nest": nest_level, "target_knowledge_group": knowledge_group_id }
                ffn_result = vec.compute(ffn_proc_id, context=ffn_context)
                layer_output = _completed_data(ffn_result, f"FFN L{layer_idx}")
                print(f"    FFN L{layer_idx} OK.")
                current_hidden_states = layer_output; residual_input = layer_output
                if step == 0: veector_step0_outputs[f"L{layer_idx}_layer_output"] = layer_output
                if step == 0 and compare_outputs and compare_and_log(f"L{layer_idx}_layer_output", layer_output): break
            # The inner `break` only leaves the layer loop; leave the step loop too.
            if difference_found: break

            # 3. Final Norm
            print("  Running Final Norm...")
            norm_context = { "input_data": current_hidden_states, "required_nest": nest_level, "target_knowledge_group": knowledge_group_id }
            norm_result = vec.compute(final_norm_id, context=norm_context)
            final_normed_states = _completed_data(norm_result, "Final Norm")
            print("    Final Norm OK.")
            if step == 0: veector_step0_outputs["final_norm"] = final_normed_states
            if step == 0 and compare_outputs and compare_and_log("final_norm", final_normed_states): break

            # 4. LM Head (only the last position is projected to logits)
            print("  Running LM Head...")
            last_token_hidden_state = final_normed_states[:, -1:, :]
            lm_head_context = { "input_data": last_token_hidden_state, "required_nest": nest_level, "target_knowledge_group": knowledge_group_id }
            logits_result = vec.compute(lm_head_id, context=lm_head_context)
            final_logits = _completed_data(logits_result, "LM Head")
            print("    LM Head OK.")
            if step == 0: veector_step0_outputs["lm_head"] = final_logits
            if step == 0 and compare_outputs:
                hf_lm_head_out = hf_outputs.get("lm_head")
                if hf_lm_head_out is not None:
                    # Veector produced logits for the last position only, so the
                    # reference must be cut to its last position as well; the
                    # default prefix slicing would pick the first position.
                    hf_last_token_logits = hf_lm_head_out[:, -1:, :]
                    difference_found = compare_and_log("lm_head", final_logits, hf_out_override=hf_last_token_logits)
                else:
                    print("    WARN: Reference 'lm_head' output not found for comparison.")
                if difference_found: break

            # 5. Sampling
            print("  Sampling next token..."); last_token_logits = final_logits[0, -1, :]; predicted_token_id = sample_top_p(logits=last_token_logits, temperature=temperature, top_p=top_p)
            print(f"  --> Generated token ID = {predicted_token_id}, Decoded = '{tokenizer.decode([predicted_token_id])}'")

            # 6. Stop on EOS
            if eos_token_id is not None and predicted_token_id == eos_token_id:
                print(f"\nEOS token generated. Stopping.")
                break

            # 7. Prepare the next iteration
            generated_ids.append(predicted_token_id); full_response_ids.append(predicted_token_id)
            current_input_ids_for_step = np.array([[predicted_token_id]], dtype=np.int64); total_seq_len += 1
            current_token_str = tokenizer.decode([predicted_token_id]); print(current_token_str, end='', flush=True)
            if vec: vec.clear_cache(clear_knowledge=False, clear_compute=True)
            log_memory_usage(f"End of Step {step+1}"); print(f"  Step {step+1} time: {time.time() - step_start_time:.3f}s")
            if total_seq_len >= max_seq_len:
                print(f"\nMax sequence length reached.")
                break
        # --- End of the generation loop ---
        print()

        # --- Print the result ---
        print("\n--- Final Generated Sequence (Decoded) ---"); generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True); print(f"Generated Text Only: '{generated_text}'")
        full_response = tokenizer.decode(full_response_ids, skip_special_tokens=False); print(f"\nFull Response (incl. prompt): '{full_response}'")

    except Exception as e:
        print(f"\n--- ERROR during inference execution ---")
        print(f"{e}")
        traceback.print_exc()
        error_occurred = True
    finally:
        if vec and hasattr(vec, 'db') and vec.db:
            try:
                vec.db.close()
                print("\nDatabase connection closed.")
            except Exception as db_close_e:
                print(f"Error closing DB connection: {db_close_e}")

    end_inference_time = time.time(); print(f"\n--- Inference & Comparison Cell Finished in {end_inference_time - start_inference_time:.3f} seconds ---"); log_memory_usage("End of function")

    # --- Final comparison verdict ---
    if compare_outputs and not error_occurred:
        if difference_found:
            print("\n--- RESULT: Differences found during comparison. Stopped at first mismatch. ---")
        else:
            print("\n--- RESULT: All compared outputs are CLOSE! ---")
    elif error_occurred:
        print("\n--- RESULT: Comparison not completed due to runtime errors. ---")
    else:
        print("\n--- RESULT: Comparison was disabled or reference file not found. ---")


# --- Example invocation ---
# Runs the comparison once with the default model and DB location; everything
# the function reports is printed to the cell output.
print("\n--- Starting Example Inference & Comparison Run ---")
run_inference_comparison_cell(
    text="Hello, how are you?", # Use the same prompt that was used to produce the reference outputs
    db_path_str="./data/db",
    model_name_hf="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    nest_level=1, # float16
    temperature=0.1,
    top_p=0.9,
    max_new_tokens=10,
    use_kv_cache=True,
    compare_outputs=True
)
print("--- Example Inference & Comparison Run Finished ---")


Torch and Transformers imported successfully.
  [VeectorDB] Successfully imported tensors v0.7.6
  Imported VeectorDB (v0.9.8)
  Imported tensors (v0.7.6)
  Imported operations (v0.8.9)
  Imported Memory (v0.1.0)
Veector Qwen2 Ops Module Loaded. Found 3 operations.
  Found optional module: veector_models.qwen2.ops
Core components imported successfully.
Using Core: 0.7.13, Tensors: 0.7.6, Ops: 0.8.9, DB: 0.9.8
Veector components imported successfully.

--- Starting Example Inference & Comparison Run ---
--- Running Inference & Detailed Comparison Cell ---
  [MEM_LOG] Start of function: RSS=694.23 MB, RAM Used=13.2%
Prompt: 'Hello, how are you?'
DB Path: /content/data/db
Model Source: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Nest Level (Precision): 1
Sampling: Temp=0.1, TopP=0.9
Max New Tokens: 10
Use KV Cache: True
Compare Outputs: True
Comparison Tolerances: atol=0.005, rtol=0.001

Loading Tokenizer directly from HF Hub: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
Tokenizer loaded: Llam

In [None]:
# Drop references to large inference objects that earlier cells may have left
# in the notebook namespace. pop() is used instead of `del` so the cell still
# runs after a kernel restart, when some of these names were never created.
for _stale_name in ("vec_processor", "vec_out_f32", "vec"):
    globals().pop(_stale_name, None)
del _stale_name
if 'torch' in globals() and hasattr(torch, 'cuda'): torch.cuda.empty_cache()
import gc
gc.collect()
print("\nMemory cleanup attempted.")

In [None]:
# === Inference Cell (Adapted from qwen_inference.py v0.2.42) ===
# Purpose: run inference for the Qwen2 model using Veector Core
#          and the high-level processors created in the previous cells.

import time
import pickle
import numpy as np
import traceback
import os
import gc
import psutil
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Union

# --- Необходимые библиотеки ---
try:
    import torch
    from transformers import AutoTokenizer, AutoConfig, PreTrainedTokenizer
    print("Torch and Transformers imported successfully.")
except ImportError as e:
    print(f"FATAL ERROR: Missing essential libraries (torch, transformers): {e}")
    raise

# --- Veector project imports ---
# Make sure the project files are reachable from the Colab runtime (e.g. in /content/).
PROJECT_IMPORTS_OK = False
CORE_VERSION_REQ = "0.7.13" # Require the version with the fixed import
TENSORS_VERSION_REQ = "0.7.6"
VEECTORDB_VERSION_REQ = "0.9.8" # Require the version that supports initial_index_path
OPERATIONS_VERSION_REQ = "0.8.9"


def _version_tuple(version) -> tuple:
    """Convert a dotted version string such as "0.7.13" into a tuple of ints.

    Plain string comparison orders versions lexicographically ("0.7.9" would
    rank above "0.7.13"), so every version check below compares these tuples.
    A leading "v" is ignored and only the leading digits of each component are
    used; a component without leading digits counts as 0.
    """
    numbers = []
    for piece in str(version).strip().lstrip("vV").split("."):
        leading_digits = ""
        for char in piece:
            if char not in "0123456789":
                break
            leading_digits += char
        numbers.append(int(leading_digits) if leading_digits else 0)
    return tuple(numbers)


try:
    from core import Veector, CORE_VERSION
    print(f"  Imported Core (v{CORE_VERSION})")
    if _version_tuple(CORE_VERSION) < _version_tuple(CORE_VERSION_REQ):
         raise ImportError(f"Inference cell requires core v{CORE_VERSION_REQ}+, found v{CORE_VERSION}")

    from tensors import TensorCoordinate, TENSORS_VERSION, GROUP_IDX_QWEN_KNOWLEDGE
    print(f"  Imported Tensors (v{TENSORS_VERSION})")
    if _version_tuple(TENSORS_VERSION) < _version_tuple(TENSORS_VERSION_REQ):
         raise ImportError(f"Inference cell requires tensors v{TENSORS_VERSION_REQ}+, found v{TENSORS_VERSION}")

    from operations import OPERATIONS_VERSION, softmax
    print(f"  Imported operations (v{OPERATIONS_VERSION})")
    # An outdated operations module only triggers a warning, not a failure.
    if _version_tuple(OPERATIONS_VERSION) < _version_tuple(OPERATIONS_VERSION_REQ):
         print(f"WARN: operations.py version is {OPERATIONS_VERSION}, but v{OPERATIONS_VERSION_REQ}+ is recommended.")

    from veectordb import VeectorDB, VEECTORDB_VERSION
    print(f"  Imported VeectorDB (v{VEECTORDB_VERSION})")
    if _version_tuple(VEECTORDB_VERSION) < _version_tuple(VEECTORDB_VERSION_REQ):
         raise ImportError(f"Inference cell requires VeectorDB v{VEECTORDB_VERSION_REQ}+, found v{VEECTORDB_VERSION}")

    print("Veector components imported successfully.")
    PROJECT_IMPORTS_OK = True

except ImportError as e:
    print(f"---!!! FATAL ERROR (ImportError in Inference Cell) !!! ---")
    print(f"Specific error: {e}")
    print(f"Ensure files (core v{CORE_VERSION_REQ}+, tensors v{TENSORS_VERSION_REQ}+, operations v{OPERATIONS_VERSION_REQ}+, veectordb v{VEECTORDB_VERSION_REQ}+) are UP-TO-DATE and ACCESSIBLE.")
    print(f"-----------------------------------------")
    raise # Abort execution of the cell
except Exception as import_e:
    print(f"---!!! FATAL ERROR (Other Exception during Import) !!! ---")
    print(f"Specific error: {import_e}")
    traceback.print_exc()
    print(f"----------------------------------------------------------")
    raise

# --- Helper functions (copied from qwen_inference.py) ---

def log_memory_usage(stage: str):
    """Print the current process RSS (in MB) and the RAM usage percentage.

    Args:
        stage: Label identifying the point in the run being logged.

    Any exception raised while querying psutil is caught and reported as a
    ``[MEM_LOG]`` error line instead of propagating.
    """
    bytes_per_mb = 1024**2
    try:
        own_process = psutil.Process(os.getpid())
        rss_bytes = own_process.memory_info().rss
        ram_percent = psutil.virtual_memory().percent
        print(f"  [MEM_LOG] {stage}: RSS={rss_bytes / bytes_per_mb:.2f} MB, RAM Used={ram_percent:.1f}%")
    except Exception as mem_err:
        print(f"  [MEM_LOG] Error getting memory usage: {mem_err}")

def sample_top_p(logits: np.ndarray, temperature: float, top_p: float) -> int:
    """Sample a token id using temperature scaling followed by top-p filtering.

    Args:
        logits: Logits to sample from.
        temperature: Divisor applied to the logits; values below 1e-9 select
            the argmax directly.
        top_p: Cumulative-probability threshold. Filtering is applied only when
            0 < top_p < 1.

    Returns:
        The chosen index as a Python int. The argmax of the logits is returned
        whenever NaNs appear, all probabilities vanish, or
        ``np.random.choice`` rejects the distribution.
    """
    if np.isnan(logits).any():
        print("ERROR: NaN detected in logits before sampling! Returning argmax.")
        return int(np.argmax(logits))

    # A (near-)zero temperature means greedy decoding.
    if temperature < 1e-9:
        return int(np.argmax(logits))

    logits_f32 = logits.astype(np.float32)
    probs = softmax(logits_f32 / temperature)  # softmax comes from the operations module

    if np.isnan(probs).any():
        print("ERROR: NaN detected in probabilities after softmax! Returning argmax.")
        return int(np.argmax(logits_f32))

    if 0.0 < top_p < 1.0:
        # Probabilities in descending order and their running total.
        descending = probs[np.argsort(probs)[::-1]]
        running_total = np.cumsum(descending)
        last_kept = min(np.searchsorted(running_total, top_p), len(descending) - 1)
        threshold = descending[last_kept]
        # Zero out everything strictly below the threshold probability.
        probs[probs < threshold] = 0.0

    total = np.sum(probs)
    if not total > 1e-9:
        print("Warning: All probabilities became zero after top-p. Using argmax.")
        return int(np.argmax(logits_f32))
    final_probs = probs / total

    if np.isnan(final_probs).any():
        print("ERROR: NaN detected in final_probabilities before choice! Using argmax.")
        return int(np.argmax(logits_f32))

    candidate_ids = np.arange(len(final_probs))
    try:
        # Renormalize once more so the probabilities sum to 1 for np.random.choice.
        final_probs /= final_probs.sum()
        chosen = np.random.choice(candidate_ids, p=final_probs)
    except ValueError as e:
        print(f"ERROR in np.random.choice (Top-P): {e}. Prob sum: {np.sum(final_probs)}. Using argmax.")
        chosen = np.argmax(logits_f32)

    # np.random.choice / np.argmax return numpy integers; convert explicitly.
    return int(chosen)

def log_tensor_stats(name: str, tensor: Optional[np.ndarray], log_values: bool = False):
    """Log the shape, dtype and NaN presence of a tensor, plus sample values.

    Args:
        name: Label printed in front of the statistics.
        tensor: Array to inspect; ``None`` is logged as such.
        log_values: Print the first five flattened values even when the tensor
            has no NaN. With NaNs present the sample is always printed.
    """
    if tensor is None:
        print(f"  [STATS] {name}: None")
        return

    contains_nan = np.any(np.isnan(tensor))
    print(f"  [STATS] {name}: shape={tensor.shape}, dtype={tensor.dtype}, NaN={contains_nan}")

    # Nothing more to show unless a sample was requested (or NaNs were found)
    # and the tensor actually holds elements.
    if not (contains_nan or log_values) or tensor.size <= 0:
        return

    try:
        # Take a short slice and convert it to a list for printing.
        leading_values = tensor.flatten()[:5].tolist()
        print(f"          Sample: {leading_values}")
    except Exception as err:
        print(f"          Error getting sample: {err}")

# --- Main inference function for Colab ---

def _veector_result_ok(result: Any) -> bool:
    """Return True when a ``vec.compute`` result exists and reports status "completed"."""
    return bool(result) and result.get("status") == "completed"


def _veector_result_error(result: Any) -> str:
    """Extract the provenance error text from a ``vec.compute`` result without raising."""
    if not result:
        return "No result returned"
    provenance = result.get("provenance") or {}
    return provenance.get("error", "Unknown error")


def _first_token_id(tokenizer: Any, token_text: str) -> int:
    """Encode ``token_text`` and return its first id, warning if it is not a single token."""
    ids = tokenizer.encode(token_text, add_special_tokens=False)
    first_id = ids[0]  # IndexError on empty output is handled by the caller's fallback
    if len(ids) != 1:
        print(f"Warning: '{token_text}' was encoded into {len(ids)} tokens, not one special token; using the first id.")
    return first_id


def run_inference_cell(
    text: str,
    db_path_str: str = "./data/db", # Path to the Veector DB
    model_name_hf: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", # Name used for tokenizer/config loading and map files
    nest_level: int = 1, # Precision level (1=float16)
    temperature: float = 0.6,
    top_p: float = 0.95,
    max_new_tokens: int = 50,
    max_seq_len: Optional[int] = None, # Max sequence length (for the cache), None = auto-detect
    use_kv_cache: bool = True # Whether to use the KV cache
    ):
    """
    Run Qwen2 model inference through Veector in the Colab environment.

    The function loads the tokenizer and config, the pickled processor map and
    the Veector DB, builds the prompt manually, then generates tokens one at a
    time (embedding -> attention/FFN layers -> final norm -> LM head -> top-p
    sampling). Progress and results are printed; nothing is returned.

    Args:
        text: Input text (the user prompt).
        db_path_str: Path to the Veector database directory.
        model_name_hf: Hugging Face model id or local path (used to load the
                       tokenizer, the config and to derive the map file names).
        nest_level: Target precision level for Veector processors (0=int8, 1=fp16, 2=fp32).
        temperature: Sampling temperature.
        top_p: Top-P sampling parameter.
        max_new_tokens: Maximum number of new tokens to generate.
        max_seq_len: Maximum sequence length for the KV cache. If None, taken from the model config.
        use_kv_cache: Whether to use KV caching. When disabled, the whole
                      sequence is re-fed to the model on every step.

    Returns:
        None. All failures are reported via ``print`` and end the run early.
    """
    print(f"--- Running Inference Cell ---")
    log_memory_usage("Start of inference function")

    db_path = Path(db_path_str)
    map_model_name = model_name_hf.split('/')[-1] # Base name for the map files

    # --- Parameters ---
    print(f"Prompt: '{text}'")
    print(f"DB Path: {db_path.resolve()}")
    print(f"Model Source: {model_name_hf}")
    print(f"Nest Level (Precision): {nest_level}")
    print(f"Sampling: Temp={temperature}, TopP={top_p}")
    print(f"Max New Tokens: {max_new_tokens}")
    print(f"Use KV Cache: {use_kv_cache}")

    if not db_path.is_dir():
        print(f"ERROR: DB directory not found: {db_path.resolve()}")
        return

    # --- Load tokenizer and config ---
    tokenizer: Optional[PreTrainedTokenizer] = None
    model_config: Optional[AutoConfig] = None
    num_layers = 0
    num_kv_heads = 0
    head_dim = 0
    eos_token_id: Optional[int] = None
    bos_token_id: Optional[int] = None
    user_token_id: Optional[int] = None
    assistant_token_id: Optional[int] = None
    fallback_max_seq_len = 2048

    try:
        print(f"\nLoading Tokenizer from: {model_name_hf}")
        # use_fast=False in case the fast tokenizer causes problems
        tokenizer = AutoTokenizer.from_pretrained(model_name_hf, trust_remote_code=True, use_fast=False)
        print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

        eos_token_id = tokenizer.eos_token_id
        bos_token_id = tokenizer.bos_token_id
        # Resolve the chat-role special token ids manually.
        # NOTE(review): confirm these literals match the tokenizer's actual
        # special-token spelling; a mismatch makes encode() split them.
        try:
            user_token_id = _first_token_id(tokenizer, "<|User|>")
            assistant_token_id = _first_token_id(tokenizer, "<|Assistant|>")
        except Exception as tok_e:
            print(f"Warning: Could not encode special tokens '<|User|>' or '<|Assistant|>' directly: {tok_e}")
            # Fall back to a direct vocabulary lookup (token string -> id).
            # The previous lookup through added_tokens_decoder returned token
            # objects instead of integer ids.
            vocab = tokenizer.get_vocab()
            user_token_id = vocab.get("<|User|>")
            assistant_token_id = vocab.get("<|Assistant|>")

        if user_token_id is None or assistant_token_id is None:
            print("ERROR: Could not determine User/Assistant token IDs.")
            return

        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = eos_token_id if eos_token_id is not None else tokenizer.vocab_size
            print(f"Set pad_token_id to {tokenizer.pad_token_id}")

        print(f"BOS ID: {bos_token_id}, EOS ID: {eos_token_id}, PAD ID: {tokenizer.pad_token_id}")
        print(f"User ID: {user_token_id}, Assistant ID: {assistant_token_id}")

        print(f"\nLoading Config from: {model_name_hf}")
        model_config = AutoConfig.from_pretrained(model_name_hf, trust_remote_code=True)
        num_layers = model_config.num_hidden_layers
        num_kv_heads = getattr(model_config, 'num_key_value_heads', model_config.num_attention_heads)
        head_dim = model_config.hidden_size // model_config.num_attention_heads
        if max_seq_len is None:
            max_seq_len = getattr(model_config, 'max_position_embeddings', fallback_max_seq_len)
        print(f"Config loaded: L={num_layers}, KVH={num_kv_heads}, HDim={head_dim}, MaxSeqLen={max_seq_len}")

    except Exception as e:
        print(f"ERROR loading tokenizer or config: {e}")
        traceback.print_exc()
        return

    # --- Load the processor map ---
    processor_map: Optional[Dict[str, str]] = None
    proc_map_file = db_path / f"{map_model_name}_proc_map.pkl"
    if proc_map_file.is_file():
        try:
            # SECURITY: pickle.load can execute arbitrary code; only load map
            # files produced by this notebook / a trusted source.
            with open(proc_map_file, 'rb') as f:
                processor_map = pickle.load(f)
            print(f"\nLoaded processor map ({len(processor_map)} entries) from {proc_map_file}")
        except Exception as e:
            print(f"Warning: Failed to load processor map: {e}.")
            return # The processor map is mandatory
    else:
        print(f"ERROR: Processor map file not found: {proc_map_file}")
        return

    # --- Initialise Veector ---
    # The default main index file is used for inference
    vec: Optional[Veector] = None
    try:
        vec = Veector(db_dir=db_path) # Loads tensor_index.pkl
        print(f"\nVeector core v{CORE_VERSION} initialized using DB at: {vec.db.db_root_path.resolve()}")
        print(f"  Index size loaded: {len(vec.db.index)}")
        if len(vec.db.index) == 0:
            print("ERROR: Loaded main index (tensor_index.pkl) is empty. Ensure Cell 5 and Cell 6 ran correctly.")
            return
    except Exception as e:
        print(f"FATAL: Veector init failed: {e}")
        traceback.print_exc()
        return

    # --- Check that every required processor is present in the map ---
    required_proc_keys = ["embedding", "final_norm", "lm_head"]
    for i in range(num_layers):
        required_proc_keys.extend([f"attn_{i}", f"ffn_{i}"])
    missing_procs = [key for key in required_proc_keys if key not in processor_map]
    if missing_procs:
        print(f"ERROR: Required processors missing from map: {missing_procs}")
        vec.db.close()
        return
    embedding_processor_id = processor_map["embedding"]
    final_norm_id = processor_map["final_norm"]
    lm_head_id = processor_map["lm_head"]
    print("All required processor IDs found in map.")

    # --- Prepare the input (manual prompt construction) ---
    prompt_input_ids_np: Optional[np.ndarray] = None
    try:
        print("\nManually constructing prompt tokens (GGUF-style)...")
        user_text_ids = tokenizer.encode(text, add_special_tokens=False)

        # Layout: [BOS] <User> prompt tokens <Assistant>
        input_ids_list = []
        if bos_token_id is not None:
            input_ids_list.append(bos_token_id)

        input_ids_list.append(user_token_id)
        input_ids_list.extend(user_text_ids)
        input_ids_list.append(assistant_token_id)

        prompt_input_ids_np = np.array([input_ids_list], dtype=np.int64)

        print(f"\n--- Prepared Input ---")
        print(f"Input IDs shape: {prompt_input_ids_np.shape}")
        print(f"Decoded Tokens: {tokenizer.convert_ids_to_tokens(prompt_input_ids_np[0].tolist())}")
        print(f"Decoded String: '{tokenizer.decode(prompt_input_ids_np[0])}'")

    except Exception as e:
        print(f"Error constructing prompt tokens: {e}")
        traceback.print_exc()
        vec.db.close()
        return

    # --- Initialise the KV cache (if enabled) ---
    kv_cache_list: Optional[List[Tuple[np.ndarray, np.ndarray]]] = None
    if use_kv_cache:
        kv_cache_list = []
        cache_dtype = np.float16 # float16 is used for the cache
        batch_size = prompt_input_ids_np.shape[0] # Usually 1
        print(f"\nInitializing KV Cache for {num_layers} layers...")
        cache_shape = (batch_size, num_kv_heads, max_seq_len, head_dim)
        print(f"  Shape per layer: K={cache_shape}, V={cache_shape}, dtype={cache_dtype}")
        for i in range(num_layers):
            k_cache_layer = np.zeros(cache_shape, dtype=cache_dtype)
            v_cache_layer = np.zeros(cache_shape, dtype=cache_dtype)
            kv_cache_list.append((k_cache_layer, v_cache_layer))
        print("KV Cache initialized.")
        log_memory_usage("After KV Cache Init")
    else:
        print("\nKV Cache is disabled.")
    # Single flag used everywhere the cache is read, written or relied upon.
    kv_cache_active = bool(use_kv_cache and kv_cache_list)

    # --- Autoregressive generation ---
    start_inference_time = time.time()
    knowledge_group_id = GROUP_IDX_QWEN_KNOWLEDGE # Knowledge group id for Qwen

    print(f"\n--- Starting Autoregressive Generation ---")
    generated_ids: List[int] = []
    current_input_ids_for_step: np.ndarray = prompt_input_ids_np
    prompt_len = current_input_ids_for_step.shape[1]
    total_seq_len = prompt_len # Current total sequence length

    full_response_ids = list(prompt_input_ids_np[0]) # Starts with the prompt ids

    try:
        for step in range(max_new_tokens):
            step_start_time = time.time()
            # Length of this step's input: the prompt on step 0, then a single
            # token with the KV cache or the whole sequence without it.
            current_seq_length = current_input_ids_for_step.shape[1]
            start_pos = total_seq_len - current_seq_length # Position of the first token fed this step

            # Position ids for the tokens fed in this step
            position_ids = np.arange(start_pos, total_seq_len, dtype=np.int64).reshape(1, current_seq_length)

            if total_seq_len > max_seq_len:
                print(f"\nERROR: total_seq_len ({total_seq_len}) exceeds max_seq_len ({max_seq_len}). Cannot continue.")
                break

            print(f"\n--- Step {step + 1}/{max_new_tokens} (Pos: {start_pos}..{total_seq_len-1}) ---")
            log_tensor_stats("Input IDs", current_input_ids_for_step)

            # 1. Embedding
            print(f"  Running Embedding...")
            compute_context_embed = {
                "input_data": current_input_ids_for_step,
                "required_nest": nest_level,
                "target_knowledge_group": knowledge_group_id
            }
            embed_result = vec.compute(embedding_processor_id, context=compute_context_embed)
            if not _veector_result_ok(embed_result):
                raise RuntimeError(f"Embedding failed at step {step+1}: {_veector_result_error(embed_result)}")
            current_hidden_states = embed_result.get("data")
            log_tensor_stats("Embedding Output", current_hidden_states, log_values=(step < 1)) # Values only on the first step
            if current_hidden_states is None:
                raise RuntimeError(f"Embedding returned None data at step {step+1}.")

            # 2. Transformer layers
            residual_input = current_hidden_states # Input for the first residual

            for layer_idx in range(num_layers):
                # Thin out logging on later steps: only every 5th layer and the
                # last one are logged. Every layer is still computed - skipping
                # the layer body here would drop layers from the forward pass
                # and leave their KV cache stale.
                log_this_layer = (step == 0) or (layer_idx % 5 == 0) or (layer_idx == num_layers - 1)
                if log_this_layer:
                    print(f"\n  Layer {layer_idx}: Processing...")
                    log_tensor_stats(f"L{layer_idx} Input HS", current_hidden_states, log_values=(step < 1 and layer_idx < 2))

                attn_proc_id = processor_map[f"attn_{layer_idx}"]
                ffn_proc_id = processor_map[f"ffn_{layer_idx}"]

                # Context for the attention processor
                attn_context = {
                    "input_data": current_hidden_states,
                    "residual_input": residual_input, # Layer input for the first residual
                    "required_nest": nest_level,
                    "target_knowledge_group": knowledge_group_id,
                    "position_ids": position_ids,
                    "total_seq_len": total_seq_len # Total length for the mask and RoPE
                }
                # Pass the KV cache through the context when it is in use
                if kv_cache_active:
                    past_key, past_value = kv_cache_list[layer_idx]
                    attn_context["past_key"] = past_key
                    attn_context["past_value"] = past_value
                    attn_context["start_pos"] = start_pos # Where to write into the cache

                # Run the attention processor
                attn_result = vec.compute(attn_proc_id, context=attn_context)

                if not _veector_result_ok(attn_result):
                    error_msg = _veector_result_error(attn_result)
                    attn_status = attn_result.get('status') if attn_result else None
                    print(f"    ERROR: Attn L{layer_idx} failed at step {step+1}: Status={attn_status}, Error='{error_msg}'")
                    raise RuntimeError(f"Attn L{layer_idx} failed at step {step+1}")

                attn_block_output = attn_result.get("data") # Attention output + first residual add
                result_step_context = attn_result.get("step_context", {})
                if log_this_layer:
                    log_tensor_stats(f"L{layer_idx} Attn Block Output", attn_block_output, log_values=(step < 1 and layer_idx < 2))

                # Update the KV cache (when in use)
                if kv_cache_active:
                    new_key = result_step_context.get('k_cache_out')
                    new_value = result_step_context.get('v_cache_out')
                    if new_key is not None and new_value is not None:
                        if np.any(np.isnan(new_key)) or np.any(np.isnan(new_value)):
                            print(f"    ERROR: NaN detected in new K/V cache for L{layer_idx}! NOT updating cache.")
                            log_tensor_stats(f"L{layer_idx} NaN New Key", new_key, log_values=True)
                            log_tensor_stats(f"L{layer_idx} NaN New Value", new_value, log_values=True)
                        else:
                            kv_cache_list[layer_idx] = (new_key, new_value) # Store the updated cache
                    else:
                        print(f"    WARN: K/V cache values ('k_cache_out', 'v_cache_out') not found in attn_result step_context for L{layer_idx}. Cache NOT updated.")

                if attn_block_output is None:
                    raise RuntimeError(f"Attn L{layer_idx} returned None data at step {step+1}.")

                # The FFN input is the attention block output (residual already added)
                ffn_input = attn_block_output
                residual_input_ffn = ffn_input # Input for the second residual

                # Run the FFN processor
                ffn_context = {
                    "input_data": ffn_input,
                    "residual_input": residual_input_ffn, # Passed for the second residual
                    "required_nest": nest_level,
                    "target_knowledge_group": knowledge_group_id
                }
                ffn_result = vec.compute(ffn_proc_id, context=ffn_context)

                if not _veector_result_ok(ffn_result):
                    error_msg = _veector_result_error(ffn_result)
                    ffn_status = ffn_result.get('status') if ffn_result else None
                    print(f"    ERROR: FFN L{layer_idx} failed at step {step+1}: Status={ffn_status}, Error='{error_msg}'")
                    raise RuntimeError(f"FFN L{layer_idx} failed at step {step+1}")

                layer_output = ffn_result.get("data") # FFN output + second residual add
                if log_this_layer:
                    log_tensor_stats(f"L{layer_idx} FFN Block Output", layer_output, log_values=(step < 1 and layer_idx < 2))
                if layer_output is None:
                    raise RuntimeError(f"FFN L{layer_idx} returned None data at step {step+1}.")

                # This layer's output feeds the next layer
                current_hidden_states = layer_output
                residual_input = layer_output # Residual input for the next layer
            # --- End of the layer loop ---

            # 3. Final Norm
            print("  Running Final Norm...")
            log_tensor_stats("Input to Final Norm", current_hidden_states, log_values=(step < 1))
            norm_context = {
                "input_data": current_hidden_states,
                "required_nest": nest_level,
                "target_knowledge_group": knowledge_group_id
            }
            norm_result = vec.compute(final_norm_id, context=norm_context)
            if not _veector_result_ok(norm_result):
                raise RuntimeError(f"Final Norm failed at step {step+1}: {_veector_result_error(norm_result)}")
            final_normed_states = norm_result.get("data")
            log_tensor_stats("Final Norm Output", final_normed_states, log_values=(step < 1))
            if final_normed_states is None:
                raise RuntimeError(f"Final Norm returned None data at step {step+1}.")

            # 4. LM Head
            print("  Running LM Head...")
            # Only the hidden state of the LAST token is needed
            last_token_hidden_state = final_normed_states[:, -1:, :]
            log_tensor_stats("Input to LM Head", last_token_hidden_state, log_values=(step < 1))
            lm_head_context = {
                "input_data": last_token_hidden_state,
                "required_nest": nest_level,
                "target_knowledge_group": knowledge_group_id
            }
            logits_result = vec.compute(lm_head_id, context=lm_head_context)

            if not _veector_result_ok(logits_result):
                raise RuntimeError(f"LM Head failed at step {step+1}: {_veector_result_error(logits_result)}")
            final_logits = logits_result.get("data")
            log_tensor_stats("LM Head Output (Logits)", final_logits, log_values=(step < 1))
            if final_logits is None:
                raise RuntimeError(f"LM Head returned None data at step {step+1}.")

            # 5. Sample the next token
            print("  Sampling next token...")
            # Logits of the last token in the sequence (batch=0, seq=-1)
            last_token_logits = final_logits[0, -1, :]
            log_tensor_stats(f"Logits for Sampling (Step {step+1})", last_token_logits, log_values=True) # Always logged

            predicted_token_id = sample_top_p(
                logits=last_token_logits,
                temperature=temperature,
                top_p=top_p
            )

            print(f"  --> Generated token ID = {predicted_token_id}, Decoded = '{tokenizer.decode([predicted_token_id])}'")

            # 6. Stop condition (EOS)
            if eos_token_id is not None and predicted_token_id == eos_token_id:
                print(f"\nEOS token ({eos_token_id}) generated. Stopping generation.")
                break

            # 7. Prepare the next iteration
            generated_ids.append(predicted_token_id)
            full_response_ids.append(predicted_token_id) # Keep the full response up to date
            if kv_cache_active:
                # Past keys/values live in the cache: feed only the new token.
                current_input_ids_for_step = np.array([[predicted_token_id]], dtype=np.int64)
            else:
                # No cache: attention can only see what is fed in, so the whole
                # sequence has to be re-processed on every step.
                current_input_ids_for_step = np.array([full_response_ids], dtype=np.int64)
            total_seq_len += 1 # Total sequence length grows by one

            # Print the generated token
            current_token_str = tokenizer.decode([predicted_token_id])
            print(current_token_str, end='', flush=True)

            # Clear Veector's compute cache (not the knowledge cache)
            if vec:
                vec.clear_cache(clear_knowledge=False, clear_compute=True)

            log_memory_usage(f"End of Step {step+1}")
            print(f"  Step {step+1} time: {time.time() - step_start_time:.3f}s")

            # Maximum length check
            if total_seq_len >= max_seq_len:
                print(f"\nMaximum sequence length ({max_seq_len}) reached. Stopping generation.")
                break
        # --- End of the generation loop ---
        print() # Newline after the streamed tokens

        # --- Print the result ---
        print("\n--- Final Generated Sequence (Decoded) ---")
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
        print(f"Generated Text Only: '{generated_text}'")

        full_response = tokenizer.decode(full_response_ids, skip_special_tokens=False)
        print(f"\nFull Response (incl. prompt): '{full_response}'")

    except Exception as e:
        print(f"\n--- ERROR during inference execution ---")
        print(f"{e}")
        traceback.print_exc()
    finally:
        # Close the Veector DB connection
        if vec and hasattr(vec, 'db') and vec.db:
            try:
                vec.db.close()
                print("\nDatabase connection closed.")
            except Exception as db_close_e:
                print(f"Error closing DB connection: {db_close_e}")

    end_inference_time = time.time()
    print(f"\n--- Inference Cell Finished in {end_inference_time - start_inference_time:.3f} seconds ---")
    log_memory_usage("End of inference function")

# --- Example function call (can be placed in another cell) ---
# print("\n--- Starting Example Inference Run ---")
# run_inference_cell(
#     text="Explain the concept of superposition in quantum computing.",
#     db_path_str="./data/db",
#     model_name_hf="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
#     nest_level=1, # float16
#     temperature=0.1, # Low temperature for a more deterministic answer
#     top_p=0.9,
#     max_new_tokens=100,
#     use_kv_cache=True
# )
# print("--- Example Inference Run Finished ---")



In [None]:
# Archive the "data" directory so it can be downloaded / uploaded
import shutil
# make_archive appends ".zip" itself and returns the path of the file it
# actually wrote. Keeping that return value in zip_name guarantees later cells
# reference the archive that exists (previously the archive was written as
# "model_DeepSeek-r1-distill-1.5b.zip" while zip_name pointed at
# "model_DeepSeek-r1-distill-1.5b_knowledge.zip").
archive_base_name = "model_DeepSeek-r1-distill-1.5b_knowledge"
zip_name = shutil.make_archive(archive_base_name, "zip", "data")

In [None]:
# Upload the archive to Google Drive
# Relies on `zip_name` and `shutil` from the archive cell and on `drive`/`os`
# from the imports cell, so those cells must have been run first.
drive.mount('/content/drive', force_remount=True)
destination_path = "/content/drive/My Drive/models/"
# shutil.copy into a directory path fails when the directory is missing,
# so create the target folder on first use.
os.makedirs(destination_path, exist_ok=True)
shutil.copy(zip_name, destination_path)
print(f"🟢 [LOG] ✅ Архив загружен на Google Drive: {destination_path}")

Mounted at /content/drive
🟢 [LOG] ✅ Архив загружен на Google Drive: /content/drive/My Drive/models/


In [None]:
!python core.py

In [None]:
!python qwen_inference.py