In [38]:
import os
import json
import re

# Define the folder path containing the JSON files
folder_path = "data_symbolic_regression/train"

# Function to tokenize a formula
def tokenize_formula(formula):
    # Define a regex pattern to extract tokens (identifiers, operators, parentheses, etc.)
    token_pattern = r"[a-zA-Z_][a-zA-Z0-9_]*|[()+\-*/]|\d+\.?\d*"
    tokens = re.findall(token_pattern, formula)
    return tokens

# Iterate over all files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".json") and not file_name.startswith('properties'):
        file_path = os.path.join(folder_path, file_name)
        
        # Open and read the JSON file
        with open(file_path, "r") as file:
            data = json.load(file)
            
            # Extract the human-readable formula
            formula_human_readable = data.get("formula_human_readable", "")
            
            if formula_human_readable:
                # Tokenize the formula
                tokens = tokenize_formula(formula_human_readable)
                
                # Print or store the tokens for further analysis
                print(f"File: {file_name}")
                print(f"Formula: {formula_human_readable}")
                print(f"Tokens: {tokens}\n")

File: f_473.json
Formula: (gaussian(cos(var_0))+tanh((var_2+var_1)))
Tokens: ['(', 'gaussian', '(', 'cos', '(', 'var_0', ')', ')', '+', 'tanh', '(', '(', 'var_2', '+', 'var_1', ')', ')', ')']

File: f_189.json
Formula: (gaussian(reverse(var_1))*gaussian((var_0*var_2)))
Tokens: ['(', 'gaussian', '(', 'reverse', '(', 'var_1', ')', ')', '*', 'gaussian', '(', '(', 'var_0', '*', 'var_2', ')', ')', ')']

File: f_536.json
Formula: (neg(cosh(var_0))+pow_2((var_2*var_1)))
Tokens: ['(', 'neg', '(', 'cosh', '(', 'var_0', ')', ')', '+', 'pow_2', '(', '(', 'var_2', '*', 'var_1', ')', ')', ')']

File: f_166.json
Formula: ((pow_2(var_1)*log(var_0))*neg(var_2))
Tokens: ['(', '(', 'pow_2', '(', 'var_1', ')', '*', 'log', '(', 'var_0', ')', ')', '*', 'neg', '(', 'var_2', ')', ')']

File: f_49.json
Formula: ((C_0*(var_2*var_1))*sinh(sin(var_0)))
Tokens: ['(', '(', 'C_0', '*', '(', 'var_2', '*', 'var_1', ')', ')', '*', 'sinh', '(', 'sin', '(', 'var_0', ')', ')', ')']

File: f_424.json
Formula: ((sin(var_1)

In [39]:
import torch
import torch.nn.functional as F
import numpy as np
import random

class TextDiffusionModel:
    def __init__(self, vocab_size, seq_len, device="cpu"):
        """
        Initialize the text diffusion model.

        Parameters:
        - vocab_size: Size of the vocabulary (number of unique tokens).
        - seq_len: Length of the token sequence.
        - device: Device to use ("cpu" or "cuda").
        """
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.device = device
        self.noise_schedule = torch.linspace(0.01, 0.1, steps=1000).to(device)  # Noise variance per timestep

    def add_noise(self, tokens, t):
        """
        Add noise to a sequence of tokens based on timestep t.

        Parameters:
        - tokens: A tensor of token indices with shape (batch_size, seq_len).
        - t: A tensor of timesteps with shape (batch_size,).

        Returns:
        - noisy_tokens: The tokens with added noise.
        - noise: The noise added to the tokens.
        """
        batch_size = tokens.size(0)
        noise_std = self.noise_schedule[t].view(-1, 1, 1)  # Shape: (batch_size, 1, 1)

        # Convert tokens to one-hot vectors
        one_hot = F.one_hot(tokens, num_classes=self.vocab_size).float()
        
        # Add Gaussian noise to the one-hot vectors
        noise = torch.randn_like(one_hot) * noise_std
        noisy_one_hot = one_hot + noise

        # Compute softmax to normalize the noisy one-hot vectors
        noisy_tokens = F.softmax(noisy_one_hot, dim=-1)
        return noisy_tokens, noise

    def sample_from_noisy_tokens(self, noisy_tokens):
        """
        Sample discrete tokens from the noisy token distribution.

        Parameters:
        - noisy_tokens: A tensor of noisy token distributions with shape (batch_size, seq_len, vocab_size).

        Returns:
        - sampled_tokens: A tensor of sampled token indices with shape (batch_size, seq_len).
        """
        sampled_tokens = torch.argmax(noisy_tokens, dim=-1)
        return sampled_tokens

# Example usage
if __name__ == "__main__":
    # Hyperparameters
    vocab_size = 100  # Example vocabulary size
    seq_len = 10  # Example sequence length
    batch_size = 4  # Example batch size
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Initialize the model
    model = TextDiffusionModel(vocab_size, seq_len, device=device)

    # Generate a batch of random tokens
    tokens = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)

    # Choose random timesteps for each batch
    t = torch.randint(0, 1000, (batch_size,), device=device)

    # Add noise to the tokens
    noisy_tokens, noise = model.add_noise(tokens, t)

    # Sample from noisy tokens
    sampled_tokens = model.sample_from_noisy_tokens(noisy_tokens)

    # Print results
    print("Original Tokens:", tokens)
    print("Noisy Tokens (probabilities):", noisy_tokens)
    print("Sampled Tokens:", sampled_tokens)

Original Tokens: tensor([[61, 96, 45, 35, 29, 64, 97, 51, 51, 92],
        [86, 62, 91, 11,  1, 10, 59, 97, 16, 25],
        [11, 37,  4, 92, 44, 64, 38, 49, 82, 78],
        [28, 10, 94, 65, 95, 79, 58, 90, 86, 34]])
Noisy Tokens (probabilities): tensor([[[0.0091, 0.0106, 0.0095,  ..., 0.0102, 0.0099, 0.0092],
         [0.0099, 0.0096, 0.0099,  ..., 0.0094, 0.0096, 0.0102],
         [0.0096, 0.0100, 0.0103,  ..., 0.0098, 0.0096, 0.0102],
         ...,
         [0.0095, 0.0104, 0.0095,  ..., 0.0099, 0.0097, 0.0100],
         [0.0102, 0.0096, 0.0093,  ..., 0.0097, 0.0102, 0.0096],
         [0.0089, 0.0100, 0.0107,  ..., 0.0094, 0.0100, 0.0099]],

        [[0.0103, 0.0102, 0.0096,  ..., 0.0103, 0.0088, 0.0121],
         [0.0102, 0.0090, 0.0105,  ..., 0.0102, 0.0096, 0.0112],
         [0.0094, 0.0101, 0.0099,  ..., 0.0099, 0.0090, 0.0093],
         ...,
         [0.0103, 0.0101, 0.0093,  ..., 0.0231, 0.0092, 0.0101],
         [0.0100, 0.0097, 0.0108,  ..., 0.0092, 0.0097, 0.0098],
       

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import json
import re
from dataclasses import dataclass

# Define a function to tokenize a formula
def tokenize_formula(formula):
    token_pattern = r"[a-zA-Z_][a-zA-Z0-9_]*|[()+\-*/]|\d+\.?\d*"
    tokens = re.findall(token_pattern, formula)
    return tokens

@dataclass
class tNetConfig:
    num_vars: int
    embedding_size: int

class tNet(nn.Module):
    def __init__(self, config: tNetConfig):
        super(tNet, self).__init__()

        self.config = config
        self.num_vars = config.num_vars
        self.n_embd = config.embedding_size

        self.activation_func = F.relu

        self.conv1 = nn.Conv1d(self.num_vars + 1, self.n_embd, 1)
        self.conv2 = nn.Conv1d(self.n_embd, 2 * self.n_embd, 1)
        self.conv3 = nn.Conv1d(2 * self.n_embd, 4 * self.n_embd, 1)

        self.fc1 = nn.Linear(4 * self.n_embd, 2 * self.n_embd)
        self.fc2 = nn.Linear(2 * self.n_embd, self.n_embd)

        self.input_batch_norm = nn.GroupNorm(1, self.num_vars + 1)

        self.bn1 = nn.GroupNorm(1, self.n_embd)
        self.bn2 = nn.GroupNorm(1, 2 * self.n_embd)
        self.bn3 = nn.GroupNorm(1, 4 * self.n_embd)
        self.bn4 = nn.GroupNorm(1, 2 * self.n_embd)
        self.bn5 = nn.GroupNorm(1, self.n_embd)

    def forward(self, x):
        """
        :param x: [batch, #features + 1, #points]
        :return: logit: [batch, embedding_size]
        """
        x = self.input_batch_norm(x)
        x = self.activation_func(self.bn1(self.conv1(x)))
        x = self.activation_func(self.bn2(self.conv2(x)))
        x = self.activation_func(self.bn3(self.conv3(x)))
        x, _ = torch.max(x, dim=2)  # global max pooling
        assert x.size(1) == 4 * self.n_embd

        x = self.activation_func(self.bn4(self.fc1(x)))
        x = self.activation_func(self.bn5(self.fc2(x)))

        return x

class TextDiffusionModel:
    def __init__(self, vocab_size, seq_len, device="cpu"):
        """
        Initialize the text diffusion model.

        Parameters:
        - vocab_size: Size of the vocabulary (number of unique tokens).
        - seq_len: Length of the token sequence.
        - device: Device to use ("cpu" or "cuda").
        """
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.device = device
        self.noise_schedule = torch.linspace(0.01, 0.1, steps=1000).to(device)  # Noise variance per timestep

    def add_noise(self, tokens, t):
        """
        Add noise to a sequence of tokens based on timestep t.

        Parameters:
        - tokens: A tensor of token indices with shape (batch_size, seq_len).
        - t: A tensor of timesteps with shape (batch_size,).

        Returns:
        - noisy_tokens: The tokens with added noise.
        - noise: The noise added to the tokens.
        """
        batch_size = tokens.size(0)
        noise_std = self.noise_schedule[t].view(-1, 1, 1)  # Shape: (batch_size, 1, 1)

        # Convert tokens to one-hot vectors
        one_hot = F.one_hot(tokens, num_classes=self.vocab_size).float()
        
        # Add Gaussian noise to the one-hot vectors
        noise = torch.randn_like(one_hot) * noise_std
        noisy_one_hot = one_hot + noise

        # Compute softmax to normalize the noisy one-hot vectors
        noisy_tokens = F.softmax(noisy_one_hot, dim=-1)
        return noisy_tokens, noise

    def sample_from_noisy_tokens(self, noisy_tokens):
        """
        Sample discrete tokens from the noisy token distribution.

        Parameters:
        - noisy_tokens: A tensor of noisy token distributions with shape (batch_size, seq_len, vocab_size).

        Returns:
        - sampled_tokens: A tensor of sampled token indices with shape (batch_size, seq_len).
        """
        sampled_tokens = torch.argmax(noisy_tokens, dim=-1)
        return sampled_tokens

class ReverseProcessModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, num_vars, seq_len):
        super(ReverseProcessModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_vars = num_vars
        self.seq_len = seq_len

        # Calculate the correct input size for fc1
        input_size = embedding_size + (seq_len * vocab_size) + 1  # embeddings + noisy_tokens + timestep

        # Define layers for the reverse process model
        self.fc1 = nn.Linear(input_size, 512)  # Adjusted input size
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, seq_len * vocab_size)  # Output for all tokens in the sequence

    def forward(self, noisy_tokens, embeddings, t):
        """
        Forward pass for the reverse process model.

        :param noisy_tokens: Tensor of noisy tokens with shape [batch_size, seq_len, vocab_size].
        :param embeddings: Tensor of embeddings with shape [batch_size, embedding_size].
        :param t: Tensor of timesteps with shape [batch_size].
        :return: Predicted noise.
        """
        # Flatten noisy tokens to [batch_size, seq_len * vocab_size]
        noisy_tokens_flat = noisy_tokens.view(noisy_tokens.size(0), -1)

        # Concatenate embeddings, flattened noisy tokens, and timestep information
        timestep_embedding = torch.cat([embeddings, noisy_tokens_flat, t.unsqueeze(1).float()], dim=-1)
        
        # Pass through the fully connected layers
        x = F.relu(self.fc1(timestep_embedding))
        x = F.relu(self.fc2(x))
        predicted_noise = self.fc3(x)

        # Reshape to [batch_size, seq_len, vocab_size]
        predicted_noise = predicted_noise.view(-1, self.seq_len, self.vocab_size)
        
        return predicted_noise

# Main function
if __name__ == "__main__":

    # Define the device 
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    folder_path = "data_symbolic_regression/train"

    # Load and tokenize formulas from the training set; Convert the data points to a Pytorch tensor
    tokenized_formulas = []
    points_list = []

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json") and not file_name.startswith('properties'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r") as file:
                data = json.load(file)

                formula_human_readable = data.get("formula_human_readable", "")
                if formula_human_readable:
                    tokens = tokenize_formula(formula_human_readable)
                    tokenized_formulas.append(tokens)
                
                points = data.get("points")
                if points:
                    points_array = np.array([points["var_0"], points["var_1"], points["var_2"], points["target"]])
                    points_tensor = torch.tensor(points_array, dtype=torch.float32, device=device).unsqueeze(0)  # Add batch dimension
                    points_list.append(points_tensor)
                    # Need below line if points_array is transposed
                    # points_tensor = torch.tensor(points_array, dtype=torch.float32, device=device).unsqueeze(0).permute(0, 2, 1)  # Add batch dimension and transpose

    # Create the vocabulary from the tokens
    vocab_mapping = {token: idx for idx, token in enumerate(set(t for tokens in tokenized_formulas for t in tokens))}
    vocab_size = len(vocab_mapping)

    token_sequences = [[vocab_mapping[token] for token in tokens] for tokens in tokenized_formulas]

    formula_lengths = [len(tokens) for tokens in tokenized_formulas]
    seq_len = int(np.percentile(formula_lengths, 95))  # Use 95th percentile

    batch_size = 4  # Example batch size

    # Initialize the model
    diffusion_model = TextDiffusionModel(vocab_size, seq_len, device=device)
    
    # Pad or truncate sequences to seq_len
    token_sequences = [seq[:seq_len] + [0] * max(0, seq_len - len(seq)) for seq in token_sequences]
    token_tensor = torch.tensor(token_sequences, device=device)

    # Choose random timesteps for each sequence
    t = torch.randint(0, 1000, (len(token_tensor),), device=device)

    # Add noise to the tokens
    noisy_tokens, noise = diffusion_model.add_noise(token_tensor, t)

    # Sample from noisy tokens
    sampled_tokens = diffusion_model.sample_from_noisy_tokens(noisy_tokens)

    # Configuration for tNet
    num_vars = 3
    embedding_size = 32  # Example embedding size
    config = tNetConfig(num_vars=num_vars, embedding_size=embedding_size)

    # Instantiate the model
    tnet_model = tNet(config)

    # Input: batch_size x (num_vars + 1) x num_points
    batch_size = 1

    # Generate embeddings
    # input_tensor = torch.rand(batch_size, num_vars, 100)

    output_embeddings = []
    for pt in points_list:
        output_embedding = tnet_model(pt)
        output_embeddings.append(output_embedding)
    
    points_tensors = torch.cat(points_list, dim=0)
    
    output_embeddings_tensor = torch.cat(output_embeddings, dim=0)
    # Print the output
    print("Input shape:", points_tensors.shape)
    print("Output shape:", output_embeddings_tensor.shape)

    # Print results
    print("Original Tokens shape:", token_tensor.shape)
    print("Noisy Tokens (probabilities) shape:", noisy_tokens.shape)
    print("Sampled Tokens shape:", sampled_tokens.shape)

    # Initialize reverse model (denoiser)
    reverse_model = ReverseProcessModel(vocab_size, embedding_size, num_vars, seq_len).to(device)

    # Optimizer for the reverse process model
    optimizer = torch.optim.Adam(reverse_model.parameters(), lr=1e-4)

    # Loss function: MSE between predicted noise and actual noise
    loss_fn = nn.MSELoss()

    # Train the reverse process
    epochs = 100  # Define the number of epochs for training

    for epoch in range(epochs):
        reverse_model.train()
        
        total_loss = 0
        for batch_idx in range(0, len(points_list), batch_size):
            # Select batch of noisy tokens and corresponding points
            batch_points = points_list[batch_idx:batch_idx + batch_size]
            batch_token_tensor = token_tensor[batch_idx:batch_idx + batch_size]
            
            # Choose random timesteps for the batch
            t_batch = torch.randint(0, 1000, (len(batch_points),), device=device)
            
            # Add noise to the tokens (forward diffusion)
            noisy_tokens, noise = diffusion_model.add_noise(batch_token_tensor, t_batch)
            
            # Get embeddings from tNet model
            batch_embeddings = []
            for pt in batch_points:
                embedding = tnet_model(pt)
                batch_embeddings.append(embedding)
            
            embeddings_tensor = torch.cat(batch_embeddings, dim=0)

            # Pass noisy tokens, embeddings, and timestep through reverse model to predict noise
            predicted_noise = reverse_model(noisy_tokens, embeddings_tensor, t_batch)
            
            # Compute loss (MSE between predicted noise and actual noise)
            loss = loss_fn(predicted_noise, noise.view(-1, vocab_size))
            
            # Backpropagate and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Print progress every epoch
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(points_list)}")

Input shape: torch.Size([747, 4, 100])
Output shape: torch.Size([747, 32])
Original Tokens shape: torch.Size([747, 24])
Noisy Tokens (probabilities) shape: torch.Size([747, 24, 23])
Sampled Tokens shape: torch.Size([747, 24])


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/100], Loss: 0.06799435428612699
Epoch [2/100], Loss: 0.004043321036086156
Epoch [3/100], Loss: 0.00383359700420867
Epoch [4/100], Loss: 0.0037539558822513213
Epoch [5/100], Loss: 0.003462837992753108
Epoch [6/100], Loss: 0.003737248511449777
Epoch [7/100], Loss: 0.0036684969613712576
Epoch [8/100], Loss: 0.0035921411640197007
Epoch [9/100], Loss: 0.0036347295148494506
Epoch [10/100], Loss: 0.003659414603903053
Epoch [11/100], Loss: 0.003726245268288385
Epoch [12/100], Loss: 0.0037649068357059833
Epoch [13/100], Loss: 0.0038504724810628495
Epoch [14/100], Loss: 0.003385213521776183
Epoch [15/100], Loss: 0.0039530367483110625
Epoch [16/100], Loss: 0.003736022280092311
Epoch [17/100], Loss: 0.0035848710112755407
Epoch [18/100], Loss: 0.003751328449655082
Epoch [19/100], Loss: 0.0038148433815616517
Epoch [20/100], Loss: 0.00387214769142832
Epoch [21/100], Loss: 0.003646162267032293
Epoch [22/100], Loss: 0.003716097489814274
Epoch [23/100], Loss: 0.0036884073442110133
Epoch [24/100

In [2]:
import torch
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score

def evaluate_diffusion_model(test_folder, diffusion_model, reverse_model, tnet_model, vocab_mapping, seq_len, device):
    """
    Evaluate the diffusion model on the test set.

    Parameters:
    - test_folder: Path to the folder containing the test JSON files.
    - diffusion_model: Instance of the TextDiffusionModel.
    - reverse_model: Instance of the ReverseProcessModel.
    - tnet_model: Instance of the tNet model for generating embeddings.
    - vocab_mapping: Dictionary mapping tokens to indices.
    - seq_len: Length of the token sequence.
    - device: Device to use ("cpu" or "cuda").

    Returns:
    - results: List of tuples (actual_formula, reconstructed_formula).
    """
    reverse_vocab_mapping = {idx: token for token, idx in vocab_mapping.items()}

    results = []

    for file_name in os.listdir(test_folder):
        if file_name.endswith(".json") and not file_name.startswith('properties'):
            file_path = os.path.join(test_folder, file_name)

            with open(file_path, "r") as file:
                data = json.load(file)

                formula_human_readable = data.get("formula_human_readable", "")
                tokens = tokenize_formula(formula_human_readable)

                # Convert tokens to indices
                token_indices = [vocab_mapping.get(token, 0) for token in tokens]

                # Pad or truncate to seq_len
                token_indices = token_indices[:seq_len] + [0] * max(0, seq_len - len(token_indices))
                token_tensor = torch.tensor(token_indices, device=device).unsqueeze(0)  # Add batch dimension

                points = data.get("points")
                if points:
                    points_array = np.array([points["var_0"], points["var_1"], points["var_2"], points["target"]])
                    points_tensor = torch.tensor(points_array, dtype=torch.float32, device=device).unsqueeze(0)

                    # Generate embeddings using tNet model
                    embedding = tnet_model(points_tensor)

                    # Choose random timestep
                    t = torch.randint(0, 1000, (1,), device=device)

                    # Add noise to the tokens
                    noisy_tokens, _ = diffusion_model.add_noise(token_tensor, t)

                    # Use reverse model to reconstruct the clean tokens
                    reconstructed_noise = reverse_model(noisy_tokens, embedding, t)
                    # print(f"Reconstructed Noise Shape: {reconstructed_noise.shape}")
                    # Convert reconstructed noise to token indices
                    # reconstructed_tokens = torch.argmax(reconstructed_noise, dim=-1).squeeze(0)

                    # Ensure reconstructed_tokens is a list
                    reconstructed_tokens = torch.argmax(reconstructed_noise, dim=-1)
                    # print(reconstructed_tokens)
                    if reconstructed_tokens.dim() == 2:  # Case: (batch_size, seq_len)
                        reconstructed_tokens = reconstructed_tokens.squeeze(0)  # Remove batch dimension
                    elif reconstructed_tokens.dim() == 1:  # Case: (seq_len,)
                        pass  # Already correct
                    else:
                        raise ValueError(f"Unexpected shape for reconstructed_tokens: {reconstructed_tokens.shape}")

                    print(reconstructed_tokens)
                    # Map token indices back to tokens
                    reconstructed_formula = " ".join(
                        reverse_vocab_mapping[idx] if idx in reverse_vocab_mapping else "<UNK>" for idx in reconstructed_tokens.tolist()
                    )
                    
                    actual_formula = " ".join(tokens)

                    results.append((actual_formula, reconstructed_formula))

    return results

In [4]:
# Define the device 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

folder_path_test = "data_symbolic_regression/test"

# Load and tokenize formulas from the training set; Convert the data points to a Pytorch tensor
tokenized_formulas_test = []
points_list_test = []

for file_name in os.listdir(folder_path_test):
    if file_name.endswith(".json") and not file_name.startswith('properties'):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r") as file:
            data = json.load(file)

            formula_human_readable_test = data.get("formula_human_readable", "")
            if formula_human_readable_test:
                tokens_test = tokenize_formula(formula_human_readable_test)
                tokenized_formulas_test.append(tokens_test)
                
            points_test = data.get("points")
            if points_test:
                points_array_test = np.array([points_test["var_0"], points_test["var_1"], points_test["var_2"], points_test["target"]])
                points_tensor_test = torch.tensor(points_array_test, dtype=torch.float32, device=device).unsqueeze(0)  # Add batch dimension
                points_list_test.append(points_tensor_test)
                # Need below line if points_array is transposed
                # points_tensor = torch.tensor(points_array, dtype=torch.float32, device=device).unsqueeze(0).permute(0, 2, 1)  # Add batch dimension and transpose

# Create the vocabulary from the tokens
vocab_mapping_test = {token: idx for idx, token in enumerate(set(t for tokens in tokenized_formulas_test for t in tokens))}
vocab_size_test = len(vocab_mapping_test)

token_sequences_test = [[vocab_mapping_test[token] for token in tokens] for tokens in tokenized_formulas_test]

formula_lengths_test = [len(tokens) for tokens in tokenized_formulas_test]
seq_len_test = int(np.percentile(formula_lengths_test, 95))  # Use 95th percentile

# Initialize the model
diffusion_model_test = TextDiffusionModel(vocab_size_test, seq_len_test, device=device)
    
# Pad or truncate sequences to seq_len
token_sequences_test = [seq[:seq_len] + [0] * max(0, seq_len - len(seq)) for seq in token_sequences_test]
token_tensor_test = torch.tensor(token_sequences_test, device=device)

# Choose random timesteps for each sequence
t = torch.randint(0, 1000, (len(token_tensor_test),), device=device)

# Configuration for tNet
num_vars_test = 3
embedding_size_test = 32  # Example embedding size
config_test = tNetConfig(num_vars=num_vars_test, embedding_size=embedding_size_test)

# Instantiate the model
tnet_model_test = tNet(config_test)

# reverse_model = ReverseProcessModel(vocab_size_test, embedding_size_test, num_vars_test, seq_len_test).to(device)

# Evaluate the model
results = evaluate_diffusion_model(folder_path_test, diffusion_model, reverse_model, tnet_model, vocab_mapping, seq_len, device)

# Display example results
example_idx = 0  # Index of the example to display

if results:
    actual, reconstructed = results[example_idx]
    print("Example Formula:")
    print(f"Actual Formula: {actual}")
    print(f"Reconstructed Formula: {reconstructed}")

# Calculate accuracy or similarity score (optional)
# accuracies = [accuracy_score(list(actual), list(reconstructed)) for actual, reconstructed in results]
# print(f"Average Reconstruction Accuracy: {np.mean(accuracies):.2f}")

tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 10,  1, 18,  6])
tensor([21, 21, 21, 20,  4, 21, 22,  5, 22, 20,  1, 16, 14, 19, 22, 19,  3, 14,
        14,  0, 