In [1]:
# Ensure necessary libraries are installed:
# pip install optimum[openvino] openvino-dev transformers torch>=2.1.0 accelerate huggingface_hub sentencepiece protobuf<=3.20.3 numpy

import numpy as np
import openvino as ov  # Use openvino directly for Core if needed, but Optimum abstracts much of this
from optimum.intel import OVModelForCausalLM  # <<< Use Optimum Intel for HF integration
from transformers import AutoTokenizer, AutoConfig
import gc
import time
import os

# from huggingface_hub import login # Keep if needed for gated models during conversion/tokenizer download
# import os

# # Login logic (only needed once usually for conversion/download, not runtime if files are local)
# hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
# if hf_token:
#     print("Logging in to Hugging Face Hub (may be needed for tokenizer download)...")
#     try:
#         login(token=hf_token)
#     except Exception as e:
#         print(f"HF Login failed (might be okay if tokenizer is cached): {e}")
# else:
#     print("HF Token not found. Gated model download/tokenizer fetch might fail if not cached.")

# <<< Path to the *converted* OpenVINO model directory >>>
OPENVINO_MODEL_DIR = "openvino_model/"  # <<< Directory created by optimum-cli export
# <<< Get the original model name for metadata, but we load from the OV dir >>>
# It's good practice to store the original name with the converted model,
# optimum usually saves a config.json that might retain this.
try:
    config = AutoConfig.from_pretrained(OPENVINO_MODEL_DIR)
    ORIGINAL_MODEL_NAME = getattr(config, "_name_or_path", "Unknown (Check openvino_model/config.json)")
    print(f"Inferred original model name: {ORIGINAL_MODEL_NAME}")
except Exception:
    ORIGINAL_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Fallback
    print(f"Could not infer original model name, using fallback: {ORIGINAL_MODEL_NAME}")

# <<< OpenVINO device selection >>>
# Options: "CPU", "GPU", "GPU.0", "GPU.1", "NPU", "AUTO"
# "AUTO" tries to pick the best available device.
DEVICE = "AUTO"
print(f"Attempting to use OpenVINO device: {DEVICE}")
# Note: OpenVINO handles precision (FP32, FP16, INT8) based on the converted model
# or via compilation settings. We don't set torch_dtype here.

DEPTH_RANGE = 1
# Ensure INPUT_FILE path is correct for your environment
INPUT_FILE = 'feed.txt'  # Assuming it's in the same directory or provide full path
# Create the input file if it doesn't exist for testing
if not os.path.exists(INPUT_FILE):
    print(f"Warning: Input file '{INPUT_FILE}' not found. Creating a dummy file.")
    with open(INPUT_FILE, 'w', encoding='utf-8') as f:
        f.write("The quick brown fox jumps over the lazy dog")

OUTPUT_FILE = f"feed-kv-cache-openvino.lif"  # Changed output filename

# <<< Context window from model config (usually loaded automatically by Optimum) >>>
# MODEL_CONTEXT_WINDOW = 128_000 # Let Optimum/config handle this

print("Step 1 & 2: Loading OpenVINO model and tokenizer...")

try:
    # <<< Load the tokenizer from the directory containing the OV model >>>
    # Optimum saves the tokenizer files alongside the .xml/.bin during export
    tokenizer = AutoTokenizer.from_pretrained(OPENVINO_MODEL_DIR, trust_remote_code=True)
    print("  Tokenizer loaded successfully from OpenVINO model directory.")

    # <<< Load the OpenVINO model using Optimum Intel >>>
    # This handles compiling the model for the target device and setting up KV cache
    model = OVModelForCausalLM.from_pretrained(
        OPENVINO_MODEL_DIR,
        device=DEVICE,
        trust_remote_code=True,
        # ov_config={"PERFORMANCE_HINT": "LATENCY"} # Optional: Optimize for latency or throughput
    )
    print(f"  OpenVINO Model loaded and compiled for device {model.device}.")

except Exception as e:
    print(f"ERROR: Failed to load OpenVINO model or tokenizer from {OPENVINO_MODEL_DIR}")
    print(f"Ensure the directory exists and contains openvino_model.xml/bin and tokenizer files.")
    print(f"Error details: {e}")
    exit()

if tokenizer.pad_token is None:
    print("  Tokenizer missing pad token; setting pad_token = eos_token")
    tokenizer.pad_token = tokenizer.eos_token
    # Update model config if possible (Optimum might handle this internally)
    if hasattr(model, 'config'):
        model.config.pad_token_id = tokenizer.eos_token_id
print("  Model and Tokenizer ready for inference.\n")

# <<< No PyTorch Quantization needed - OV uses its own (often applied during conversion) >>>
# print("Step 2: Applying dynamic quantization for faster CPU inference...") >>> Removed

print("Step 4: Prompting user for inputs...")
promptID = input("  Enter Prompt ID [Default: VanityTestKVCacheOV]: ") or "VanityTestKVCacheOV"  # <<< Changed default
MultiPV_str = input("  Enter MultiPV (top logits to show) [Default: 5]: ") or "5"
MultiPV = int(MultiPV_str)
LegalNumberOfMove_str = input("  Enter Max Number of moves [Default: 10]: ") or "10"
LegalNumberOfMove = int(LegalNumberOfMove_str)
EngineID = f"{ORIGINAL_MODEL_NAME} OpenVINO ({model.device})"  # <<< Updated EngineID
Depth = 1
print("  User inputs captured.\n")

print("Step 5: Pre-processing input sequence...")
initial_prompt = "Complete successive parts of a sentence given one word at a time:"
initial_prompt_ids = tokenizer.encode(initial_prompt, add_special_tokens=False)

print(f"  Reading words from {INPUT_FILE}...")
try:
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        words_from_file = f.read().split()
    print(f"  Found {len(words_from_file)} words.")
except FileNotFoundError:
    print(f"Error: Input file '{INPUT_FILE}' not found. Exiting.")
    exit()

# Store tokenized words separately for easier access in the loop
tokenized_words = []
total_token_count = len(initial_prompt_ids)
processed_words = []  # Store the actual words processed to match tokenized_words

print("  Tokenizing words...")
for word in words_from_file:
    # Add space prefix expected by many tokenizers for words after the first
    word_tokens = tokenizer.encode(" " + word, add_special_tokens=False)
    if not word_tokens:  # Handle cases where a word might tokenize to nothing
        print(f"  Warning: Word '{word}' tokenized to empty sequence, skipping.")
        continue
    tokenized_words.append(word_tokens)
    processed_words.append(word)  # Keep track of words we actually tokenized
    total_token_count += len(word_tokens)

print(f"  Pre-processed {len(processed_words)} words.")
print(f"  Total estimated tokens (prompt + words): {total_token_count}\n")

# <<< Context window check - Optimum model config should have this >>>
model_context_window = getattr(model.config, "max_position_embeddings", None)
if model_context_window:
    print(f"  Model context window: {model_context_window}")
    if total_token_count > model_context_window:
        print(
            f"WARNING: Estimated total token count ({total_token_count}) exceeds model context window ({model_context_window}).")
        print("KV caching might fail or produce unexpected results if the limit is strictly enforced by the model.")
else:
    print("Warning: Could not determine model context window from config.")

num_words_to_process = min(len(processed_words), LegalNumberOfMove)
if num_words_to_process < len(processed_words):
    print(f"  Will process the first {num_words_to_process} words due to LegalNumberOfMove limit.\n")
elif num_words_to_process == 0:
    print("  Warning: No words to process based on input file or limits.\n")

print("Step 8: Preparing output file header...")  # Step numbers kept for consistency
header_lines = [
    f'[PromptID "{promptID}"]\n',
    f'[EngineID "{EngineID}"]\n',
    f'[MultiPV "{MultiPV}"]\n',
    f'[DepthRange "1:1"]\n\n',
    "1-0\n"
]
print("  Header prepared.\n")

print("Step 9: Entering main generation loop (using OpenVINO KV Caching)...\n")
PrevEval = "n.a."
start_time = time.time()
# <<< KV Caching Change: Optimum's OVModelForCausalLM handles the past_key_values internally >>>
# We need to pass `use_cache=True` and it returns the cache object
past_key_values = None

if num_words_to_process > 0:
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as writer:
        print("  Writing header to output file...")
        writer.write(''.join(header_lines))
        print("  Header written. Starting word-by-word prediction.\n")

        # <<< Process the initial prompt first to build the initial cache >>>
        print("  9.0: Processing initial prompt to build cache...")
        # <<< Input needs to be NumPy array or PyTorch tensor >>>
        initial_input_ids = np.array([initial_prompt_ids], dtype=np.int64)
        start_time_gen = time.time()

        # Use model's __call__ or generate method (call is lower level, similar to torch)
        # No need for torch.no_grad() with OpenVINO
        outputs = model(
            input_ids=initial_input_ids,
            past_key_values=None,  # No cache yet
            use_cache=True,  # Request cache output
            output_attentions=False,
            output_hidden_states=False,
        )

        gen_duration = time.time() - start_time_gen
        # <<< Optimum returns cache in outputs.past_key_values >>>
        past_key_values = outputs.past_key_values
        print(f"      Initial prompt processing took: {gen_duration:.4f} seconds")
        # Logits for prediction *after* the prompt
        # logits_after_prompt = outputs.logits[:, -1, :] # Shape: [batch_size, vocab_size]

        # Now loop through the words, using the cache
        for turnCount in range(1, num_words_to_process + 1):
            current_word = processed_words[turnCount - 1]
            current_word_tokens = tokenized_words[turnCount - 1]
            print(f"Turn {turnCount}: Processing word '{current_word}' ({len(current_word_tokens)} tokens)")

            # <<< Input is just the current word's tokens (as NumPy) >>>
            input_ids = np.array([current_word_tokens], dtype=np.int64)
            current_input_len = input_ids.shape[1]

            start_time_gen = time.time()
            print(f"  9.4: Running OpenVINO model.forward() with {current_input_len} new tokens (using KV cache)...")

            outputs = model(
                input_ids=input_ids,
                past_key_values=past_key_values,  # <<< Pass the cache received from previous step
                use_cache=True,  # <<< Request updated cache
                output_attentions=False,
                output_hidden_states=False,
            )
            end_time_gen = time.time()
            gen_duration = end_time_gen - start_time_gen
            print(f"      Model forward pass took: {gen_duration:.4f} seconds")

            # <<< Update the cache for the next iteration >>>
            past_key_values = outputs.past_key_values

            # <<< Get logits for the *next* token prediction >>>
            # Logits are returned as torch tensors by default in Optimum, even from OV
            # Access the raw logits tensor
            logits_for_step = outputs.logits[:, -1, :]  # Shape: [batch_size, vocab_size]
            logits_for_prediction = logits_for_step[0]  # Shape: [vocab_size]

            # <<< Use torch.topk on the logits tensor >>>
            # Need torch if Optimum returns torch tensors
            # import torch # Make sure torch is imported if not already
            try:
                import torch
            except ImportError:
                print("Error: PyTorch is required for logit processing even with OpenVINO via Optimum.")
                exit()

            # Move logits to CPU if they somehow ended up elsewhere (unlikely with OV CPU/AUTO)
            logits_for_prediction_cpu = logits_for_prediction.cpu()

            top_k_logits_values, top_k_logits_indices = torch.topk(
                logits_for_prediction_cpu, k=MultiPV, dim=-1
            )

            # Convert results to lists for processing
            top_k_logits_values = top_k_logits_values.tolist()
            top_k_logits_indices = top_k_logits_indices.tolist()

            # Decode the top K tokens based on logits
            top_k_tokens = [tokenizer.decode(tid) for tid in top_k_logits_indices]

            print(f"      Top {MultiPV} Logits (Token | Logit Value):")
            for i in range(MultiPV):
                token_str_cleaned = top_k_tokens[i].strip()
                print(f"        - '{token_str_cleaned}': {top_k_logits_values[i]:.4f} (ID: {top_k_logits_indices[i]})")

            # 9.5 Derive primary metrics USING THE TOP LOGITS
            modelToken = top_k_tokens[0].strip()  # Token with the highest logit
            modelEval = f"{top_k_logits_values[0]:.4f}"
            modelEval = round(float(modelEval) * 100)  # Original scaling
            NextEval = (f"{top_k_logits_values[1]:.4f}" if MultiPV > 1 else "n.a.")
            NextEval = round(float(NextEval) * 100) if MultiPV > 1 and isinstance(top_k_logits_values[1],
                                                                                  float) else "n.a."  # Original scaling
            print(f"  9.5: Top token prediction: '{modelToken}' (Eval: {modelEval}) | Next best Eval: {NextEval}")

            # 9.6 Build lines for this turn
            print("  9.6: Building output lines for this turn...")
            current_stem = initial_prompt + " " + " ".join(processed_words[:turnCount])

            lines = [
                f'[PID "{promptID}"]\n',
                f'[EID "{ORIGINAL_MODEL_NAME}"]\n',  # Use original name for consistency
                f'[Turn "{turnCount}-w"]\n',
                f'[TextToken "{current_word}:"]\n',
                f'[ModelToken "{modelToken}:"]\n',  # The model's top prediction
                f'[Eval "{modelEval}"]\n',
                f'[PrevEval "{PrevEval}"]\n',
                f'[NextEval "{NextEval}"]\n',
                f'[Depth "{Depth}"]\n',
                f'[STEM "{current_stem}"]\n',  # Stem includes the word just processed
                f'[NumLegalMoves "{LegalNumberOfMove}"]\n',
                "---------------\n",
                f"{DEPTH_RANGE}",  # Should likely be Depth? Kept as is from original.
                "---------------\n"
            ]
            # Append the list of top K tokens and their raw logits
            for token_str, logit_val in zip(top_k_tokens, top_k_logits_values):
                lines.append(f"{token_str.strip()}: {logit_val:.4f}\n")

            lines.append(
                "===========================================================================================================\n\n")
            lines.append(f"[Comments]\n")
            lines.append(f"[EndMove]\n\n")
            print("      Lines built.")

            # 9.7 Write to file
            print("  9.7: Writing lines to output file...")
            writer.write(''.join(lines))
            print("      Write complete.\n")

            # 9.8 Update state
            PrevEval = modelEval

            # 9.9 Status update
            status_interval = min(100, num_words_to_process // 2 if num_words_to_process >= 10 else 10)
            if turnCount % status_interval == 0 or turnCount == num_words_to_process:
                elapsed = time.time() - start_time
                rate = turnCount / elapsed if elapsed > 0 else 0
                # Report tokens/sec as well
                current_total_tokens = len(initial_prompt_ids) + sum(len(tk) for tk in tokenized_words[:turnCount])
                token_rate = current_total_tokens / elapsed if elapsed > 0 else 0
                print(
                    f"  Status: Processed {turnCount}/{num_words_to_process} words at {rate:.2f} w/s ({token_rate:.2f} tok/s) ({elapsed:.2f}s total)\n")

        print("  Finished processing requested words.\n")

else:
    print("Skipping main generation loop as there are no words to process.")

print("Step 10: Reporting final statistics...")
total_time = time.time() - start_time
avg_rate_words = (num_words_to_process / total_time) if total_time > 0 and num_words_to_process > 0 else 0
# Calculate final token rate
final_total_tokens = len(initial_prompt_ids) + sum(len(tk) for tk in tokenized_words[:num_words_to_process])
avg_rate_tokens = (final_total_tokens / total_time) if total_time > 0 and final_total_tokens > 0 else 0

print(f"  Total turns processed: {num_words_to_process}")
print(f"  Total tokens processed (including prompt): {final_total_tokens}")
print(f"  Total time: {total_time:.2f} seconds")
print(f"  Average speed: {avg_rate_words:.2f} words/second")
print(f"  Average speed: {avg_rate_tokens:.2f} tokens/second")  # Report token speed
print(f"  Output written to {OUTPUT_FILE}")

# Optional: Clean up memory
print("\nCleaning up resources...")
del model
del tokenizer
del tokenized_words
if 'initial_input_ids' in locals():
    del initial_input_ids
if 'input_ids' in locals():
    del input_ids
if 'outputs' in locals():
    del outputs
if 'past_key_values' in locals():
    del past_key_values  # Clear cache
# OpenVINO objects (like compiled model) are managed by the OVModelForCausalLM object
# No explicit CUDA cache to clear
gc.collect()

print("\nScript finished.")

  from .autonotebook import tqdm as notebook_tqdm



Inferred original model name: openvino_model/
Attempting to use OpenVINO device: AUTO
Step 1 & 2: Loading OpenVINO model and tokenizer...
  Tokenizer loaded successfully from OpenVINO model directory.
  OpenVINO Model loaded and compiled for device cpu.
  Model and Tokenizer ready for inference.

Step 4: Prompting user for inputs...


KeyboardInterrupt: Interrupted by user