In [5]:
import sys
import os
import numpy as np
from collections import Counter
import random # For random sampling

# Ensure src module can be imported
# ... (sys.path modification code if needed) ...

from transformers import AutoTokenizer
from src.data import get_dataset_processor

# Configuration
# Model identifier passed to AutoTokenizer.from_pretrained() in main().
MODEL_NAME = "unsloth/gemma-3-1b-it"
# Passed as `model_max_length` when the tokenizer is loaded in main().
MAX_SEQ_LENGTH = 1024
# Dataset key handed to get_dataset_processor() in main().
DATASET_NAME = "finqa"
# System prompt forwarded to processor.create_formatted_dataset() for every split.
DEFAULT_SYSTEM_PROMPT = "You are a helpful AI assistant that answers questions based on provided financial data."

def print_random_examples(dataset, tokenizer, num_examples_to_print=50, split_name="unknown"):
    """
    Print a random sample of examples from a formatted dataset.

    For every sampled example the chat-style ``prompt`` is rendered to a single
    string with the tokenizer's chat template and printed, followed by the
    ``answer`` field. Nothing is returned; all output goes to stdout.

    Args:
        dataset: The dataset object (output of processor.create_formatted_dataset).
            It must support ``len()``, ``.select(indices)`` and iteration that
            yields dict-like examples exposing ``.get("prompt")`` / ``.get("answer")``.
        tokenizer: The tokenizer instance; only ``apply_chat_template`` is used.
        num_examples_to_print: Upper bound on the number of examples to print.
            Values larger than the dataset are capped at ``len(dataset)``;
            zero or negative values print a notice and nothing else.
        split_name: Name of the dataset split, used only in printed messages.
    """
    if not dataset or len(dataset) == 0:
        print(f"No examples to print from {split_name} split as it's empty.")
        return

    # Clamp to [0, len(dataset)]: random.sample() raises ValueError when asked
    # for a negative number of items, so a negative request is treated as zero.
    actual_num_to_print = max(0, min(num_examples_to_print, len(dataset)))
    if actual_num_to_print == 0:
        print(f"No examples to print from {split_name} split.")
        return

    print(f"\n--- {actual_num_to_print} Random Examples from '{split_name}' Split ---")

    # Draw distinct indices up front so the original position of each example
    # can be reported; .select() then materialises exactly those rows.
    random_indices = random.sample(range(len(dataset)), actual_num_to_print)
    sampled_examples = dataset.select(random_indices)

    for i, example in enumerate(sampled_examples):
        print(f"\n--- Example {i+1}/{actual_num_to_print} (Original Index: {random_indices[i]}) ---")

        # Input Prompt (formatted for model)
        prompt_chat_format = example.get("prompt")
        if prompt_chat_format:
            try:
                # Convert the chat format to a single string;
                # add_generation_prompt=True appends the marker that opens the
                # model's turn, so the text matches what is fed at inference.
                input_prompt_string = tokenizer.apply_chat_template(
                    prompt_chat_format,
                    tokenize=False,
                    add_generation_prompt=True
                )
                print("Input Prompt (Formatted for Model):")
                print(input_prompt_string)
            except Exception as e:
                # Best-effort display: fall back to the raw structure rather
                # than aborting the whole listing on one malformed example.
                print(f"Could not format input prompt: {e}")
                print(f"Raw prompt structure: {prompt_chat_format}")
        else:
            print("Input Prompt: [Not Available or Empty]")

        # Output Answer
        output_answer = example.get("answer")
        if output_answer is not None:  # Check for None, as empty string is a valid answer
            print("\nOutput Answer (Ground Truth):")
            print(output_answer)
        else:
            print("\nOutput Answer: [Not Available]")
    print("--- End of Random Examples ---")


def _tokenize_split(dataset_split, tokenizer, split_name):
    """Tokenize every example of one split and collect length/vocabulary data.

    Args:
        dataset_split: Iterable, sized collection of examples indexable by
            ``"prompt"`` and ``"answer"``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
        split_name: Split label, used only in progress and error messages.

    Returns:
        A tuple ``(input_lengths, output_lengths, input_vocab, output_vocab)``:
        per-example token counts (lists) and the sets of distinct token ids
        seen on each side. Examples whose tokenization raised are reported on
        stdout and left out of the corresponding list, so they do not show up
        as bogus zero-length entries in min/mean statistics.
    """
    num_examples = len(dataset_split)
    input_lengths = []
    output_lengths = []
    # Only uniqueness is needed downstream, so keep sets rather than
    # concatenating every token id of the split into one huge list.
    input_vocab = set()
    output_vocab = set()

    for i, example in enumerate(dataset_split):
        if (i + 1) % 100 == 0 or i == num_examples - 1:
            # '\r' keeps the progress counter on a single console line.
            print(f"Tokenizing example {i+1}/{num_examples} in {split_name}...", end='\r')

        try:
            input_string = tokenizer.apply_chat_template(
                example["prompt"], tokenize=False, add_generation_prompt=True
            )
            # add_special_tokens=False: the rendered chat template string is
            # tokenized as-is, without the tokenizer adding tokens of its own.
            input_ids = tokenizer(input_string, add_special_tokens=False).input_ids
        except Exception as e_in:
            print(f"\nError tokenizing input for example {i} in {split_name}: {e_in}")
        else:
            input_lengths.append(len(input_ids))
            input_vocab.update(input_ids)

        try:
            # NOTE(review): add_special_tokens=True means any special tokens
            # the tokenizer adds are counted in the output lengths - confirm
            # that this is the intended measure.
            output_ids = tokenizer(example["answer"], add_special_tokens=True).input_ids
        except Exception as e_out:
            print(f"\nError tokenizing output for example {i} in {split_name}: {e_out}")
        else:
            output_lengths.append(len(output_ids))
            output_vocab.update(output_ids)

    # Blank out the progress line before regular output resumes.
    print(" " * 80, end='\r')
    return input_lengths, output_lengths, input_vocab, output_vocab


def _print_token_stats(kind, lengths, unique_tokens):
    """Print total/min/max/mean length and unique-token count for one side.

    Args:
        kind: Either ``"input"`` or ``"output"``; interpolated into the messages.
        lengths: Per-example token counts for the split.
        unique_tokens: Set of distinct token ids seen for the split.
    """
    if not lengths:
        print(f"No valid {kind} tokens processed for this split.")
        return

    print(f"\n{kind.capitalize()} Token Statistics:")
    print(f"  Total {kind} tokens in split: {np.sum(lengths)}")
    print(f"  Min {kind} tokens per example: {np.min(lengths)}")
    print(f"  Max {kind} tokens per example: {np.max(lengths)}")
    print(f"  Mean {kind} tokens per example: {np.mean(lengths)}")
    print(f"  Unique {kind} tokens (diversity in this split): {len(unique_tokens)}")


def main():
    """Report token statistics and sample examples for each dataset split.

    Loads the tokenizer named by MODEL_NAME and the dataset processor named by
    DATASET_NAME, then for each of the "train", "validation" and "test" splits
    prints per-example token-length statistics for prompts and answers followed
    by five random examples. Finishes with the number of distinct token ids
    seen across all analyzed splits next to the tokenizer's vocabulary size.

    Loader failures are reported on stdout and end the run early; a split that
    fails to load or is empty is reported and skipped.
    """
    print(f"Loading tokenizer for: {MODEL_NAME}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            model_max_length=MAX_SEQ_LENGTH
        )
        print("Tokenizer loaded successfully using AutoTokenizer.")
    except Exception as e:
        print(f"Error: Could not load tokenizer for '{MODEL_NAME}': {e}")
        return

    print(f"\nLoading dataset processor for: {DATASET_NAME}...")
    try:
        processor = get_dataset_processor(DATASET_NAME)
        print("Dataset processor loaded successfully.")
    except Exception as e:
        print(f"Error: Could not load dataset processor for '{DATASET_NAME}': {e}")
        return

    all_input_vocab = set()
    all_output_vocab = set()
    splits_to_analyze = ["train", "validation", "test"]

    for split_name in splits_to_analyze:
        print(f"\n--- Analyzing split: {split_name} ---")
        try:
            dataset_split = processor.create_formatted_dataset(
                system_prompt=DEFAULT_SYSTEM_PROMPT,
                split=split_name
            )
        except Exception as e:
            print(f"Could not load or process split '{split_name}': {e}")
            # Only relevant if "dev" is ever added to splits_to_analyze.
            if DATASET_NAME == "finqa" and split_name == "dev":
                print("Note: For 'ibm-research/finqa', the validation split is named 'validation', not 'dev'.")
            continue  # Skip to next split

        if not dataset_split or len(dataset_split) == 0:
            print(f"No data found for split: {split_name}")
            continue

        print(f"Number of examples: {len(dataset_split)}")

        input_lengths, output_lengths, input_vocab, output_vocab = _tokenize_split(
            dataset_split, tokenizer, split_name
        )
        all_input_vocab |= input_vocab
        all_output_vocab |= output_vocab

        # Statistics for the current split
        _print_token_stats("input", input_lengths, input_vocab)
        _print_token_stats("output", output_lengths, output_vocab)

        # Sample examples are shown after the split's statistics; the split is
        # known to be non-empty here because empty splits were skipped above.
        print_random_examples(dataset_split, tokenizer, num_examples_to_print=5, split_name=split_name)  # printing 5 per split for brevity

    print("\n--- Overall Dataset Statistics (aggregated across analyzed splits) ---")
    if all_input_vocab:
        print(f"Total unique input token IDs (overall vocabulary diversity): {len(all_input_vocab)}")
    if all_output_vocab:
        print(f"Total unique output token IDs (overall vocabulary diversity): {len(all_output_vocab)}")
    combined_tokens = all_input_vocab | all_output_vocab
    if combined_tokens:
        print(f"Total unique tokens (input + output) in analyzed data: {len(combined_tokens)}")
    print(f"Tokenizer's declared vocabulary size: {tokenizer.vocab_size}")

# Run the analysis only when this file is executed as the entry-point module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Loading tokenizer for: unsloth/gemma-3-1b-it...
Tokenizer loaded successfully using AutoTokenizer.

Loading dataset processor for: finqa...
Dataset processor loaded successfully.

--- Analyzing split: train ---
Number of examples: 6251
Tokenizing example 1500/6251 in train...

Token indices sequence length is longer than the specified maximum sequence length for this model (1060 > 1024). Running this sequence through the model will result in indexing errors


                                                                                
Input Token Statistics:
  Total input tokens in split: 2574022
  Min input tokens per example: 134
  Max input tokens per example: 1391
  Mean input tokens per example: 411.77763557830747
  Unique input tokens (diversity in this split): 10054

Output Token Statistics:
  Total output tokens in split: 33861
  Min output tokens per example: 2
  Max output tokens per example: 116
  Mean output tokens per example: 5.416893297072469
  Unique output tokens (diversity in this split): 172

--- 5 Random Examples from 'train' Split ---

--- Example 1/5 (Original Index: 2422) ---
Input Prompt (Formatted for Model):
<bos><start_of_turn>user
You are a helpful AI assistant that answers questions based on provided financial data.

net sales increased by what percent from 2005 to 2006?
results of operations year ended december 31 , 2006 compared to year ended december 31 , 2005 the historical results of operations of pca f