## Retrieve and Compare Tensors

In [None]:
# Import necessary libraries
import importlib
import os
import sys

import torch
from tqdm import tqdm
from transformers import AutoTokenizer

# Ensure utils and other dependencies are accessible
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Import required utilities and reload
from utils import parseutil, setuputil

importlib.reload(setuputil)
importlib.reload(parseutil)

from utils.parseutil import process_csv, process_xls, process_xlsx, test_xlsx
from utils.setuputil import get_fileList

# Dataset selection: the training directory is "../data/<dataset_name>_train"
dataset_name = "all"
train_dir = f"../data/{dataset_name}_train"

# Collect every spreadsheet path, then keep only the .xlsx workbooks
# (case-insensitive match on the extension). Note that, despite its name,
# `xls_files` holds .xlsx paths.
all_files, _ = get_fileList(train_dir)
xls_files = [path for path in all_files if path.lower().endswith(".xlsx")]

# Context window (rows x cols per sheet) and per-cell padding length
MAX_ROWS = 100
MAX_COLS = 100
PAD_LENGTH = 32
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# One bucket of result lists per method under comparison
tensor_lists = {
    variant: {"x_toks": [], "x_masks": [], "y_toks": []}
    for variant in ("old", "new")
}


def process_files(method, tensor_dict, desc):
    """
    Runs `method` over every path in `xls_files` and collects its outputs.

    Each successful call must return a 3-tuple `(x_tok, x_mask, y_tok)`; the
    three items are appended to the matching lists in `tensor_dict`. A file
    that raises is reported on stdout and skipped, so the collected lists can
    end up shorter than `xls_files`.

    Args:
        method (function): File processor to run (e.g. `process_xlsx` or `test_xlsx`).
        tensor_dict (dict): Holds the "x_toks", "x_masks" and "y_toks" lists to append to.
        desc (str): Label shown on the tqdm progress bar.
    """
    for path in tqdm(xls_files, desc=desc):
        try:
            options = dict(
                max_rows=MAX_ROWS,
                max_cols=MAX_COLS,
                pad_length=PAD_LENGTH,
                tokenizer=tokenizer,
                vocab=None,
            )
            x_tok, x_mask, y_tok = method(path, **options)

            # Append in a fixed order so the three lists stay index-aligned
            for key, tensor in (
                ("x_toks", x_tok),
                ("x_masks", x_mask),
                ("y_toks", y_tok),
            ):
                tensor_dict[key].append(tensor)
        except Exception as err:
            # Best-effort: report the failure and carry on with the next file
            print(f"Error processing {path} with {method.__name__}: {err}")


# Run both methods
process_files(process_xlsx, tensor_lists["old"], "Running Old Method")
process_files(test_xlsx, tensor_lists["new"], "Running New Method")


def compare_tensor_lists(list1, list2, name, filenames):
    """
    Compares two lists of tensors element-wise and stops at the first mismatch.

    Tensors of any rank are supported: the first (up to) two coordinates of
    the first differing element identify the cell that is reported, and the
    remainder of the tensor at that cell is printed for both inputs.

    Args:
        list1 (list): First list of tensors.
        list2 (list): Second list of tensors.
        name (str): Name of the tensor type being compared. When it is
            "x_toks" and the reported cell is a 1-D vector of ids, the ids are
            additionally decoded with the module-level `tokenizer`.
        filenames (list): List of filenames corresponding to the sheets.
            Entry `i` is reported for tensor `i`; if the list is too short,
            "<unknown>" is reported instead.

    Returns:
        bool: True if tensors match, False otherwise.
    """
    if len(list1) != len(list2):
        print(f"❌ Mismatch in {name} length: {len(list1)} vs {len(list2)}")
        return False

    for i, (tensor1, tensor2) in enumerate(zip(list1, list2)):
        if tensor1.shape != tensor2.shape:
            print(
                f"❌ Shape mismatch in {name}[{i}]: {tensor1.shape} vs {tensor2.shape}"
            )
            return False

        # Each row of `mismatches` is the full coordinate of one differing
        # element, in row-major order, so row 0 is the first mismatch.
        mismatches = (tensor1 != tensor2).nonzero()
        if mismatches.shape[0] == 0:
            continue

        coords = mismatches[0].tolist()
        # Only the leading two axes locate the cell; lower-rank tensors simply
        # contribute fewer coordinates instead of raising IndexError.
        cell = tuple(coords[:2])
        filename = filenames[i] if i < len(filenames) else "<unknown>"

        print(f"\n❌ First mismatch in {name}[{i}] at:")
        print(f"   📂 Sheet index: {i} (File: {filename})")
        if len(cell) == 2:
            print(f"   📌 Row: {cell[0]}, Col: {cell[1]}")
        else:
            print(f"   📌 Index: {coords}")

        old_cell, new_cell = tensor1[cell], tensor2[cell]

        # Decoding only makes sense for a flat vector of token ids
        if name == "x_toks" and old_cell.dim() == 1:
            old_tokens = tokenizer.convert_ids_to_tokens(old_cell.tolist())
            new_tokens = tokenizer.convert_ids_to_tokens(new_cell.tolist())

            print(f"\n🔤 Decoded Tokens from Old Method: {old_tokens}")
            print(f"🔤 Decoded Tokens from New Method: {new_tokens}")

        print(f"\n📊 Old method tensor: {old_cell.tolist()}")
        print(f"\n📊 New method tensor: {new_cell.tolist()}")

        # Stop further processing after first mismatch
        return False

    print(f"✅ {name} lists are identical!")
    return True


# Check each tensor family in turn, ticking the progress bar once per family
all_match = True
tensor_names = ["x_toks", "x_masks", "y_toks"]

with tqdm(total=len(tensor_names), desc="Comparing Tensors") as pbar:
    for tensor_name in tensor_names:
        match = compare_tensor_lists(
            tensor_lists["old"][tensor_name],
            tensor_lists["new"][tensor_name],
            tensor_name,
            xls_files,
        )
        # A single failing family flips the overall verdict for good
        all_match = all_match and match
        pbar.update(1)

# Final verdict across all three families
print(
    "\n🎉 All tensors from both methods match exactly!"
    if all_match
    else "\n⚠️ Differences detected in tensor values!"
)

Running Old Method: 100%|██████████| 1295/1295 [12:22<00:00,  1.74it/s]
Running New Method: 100%|██████████| 1295/1295 [14:01<00:00,  1.54it/s]
Comparing Tensors:  33%|███▎      | 1/3 [00:01<00:03,  1.83s/it]

✅ x_toks lists are identical!


Comparing Tensors:  67%|██████▋   | 2/3 [00:04<00:02,  2.06s/it]

✅ x_masks lists are identical!


Comparing Tensors: 100%|██████████| 3/3 [00:05<00:00,  1.72s/it]

✅ y_toks lists are identical!

🎉 All tensors from both methods match exactly!



