In [33]:
import os
import json

# Define the paths to the folders
folders = ["data_symbolic_regression/test", "data_symbolic_regression/train", "data_symbolic_regression/val"]

# Dictionary to hold the data from each folder
data = {folder: [] for folder in folders}

# Iterate through each folder and read JSON files
for folder in folders:
    if os.path.exists(folder):
        for file_name in os.listdir(folder):
            if file_name.endswith(".json"):
                file_path = os.path.join(folder, file_name)
                try:
                    with open(file_path, "r") as file:
                        content = json.load(file)
                        data[folder].append(content)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
    else:
        print(f"Folder {folder} does not exist.")

# Example: Accessing data
for folder, files in data.items():
    print(f"Folder: {folder}, Number of files: {len(files)}")




Folder: data_symbolic_regression/test, Number of files: 161
Folder: data_symbolic_regression/train, Number of files: 747
Folder: data_symbolic_regression/val, Number of files: 160


In [34]:
if data["data_symbolic_regression/test"]:
    # Access the first item in the test folder
    test = data["data_symbolic_regression/test"]

if data["data_symbolic_regression/train"]:
    # Access the first item in the test folder
    train = data["data_symbolic_regression/train"]

if data["data_symbolic_regression/val"]:
    # Access the first item in the test folder
    val = data["data_symbolic_regression/val"]

In [35]:
len(train)
for i in range(len(train)):
    print(i)
    print(train[i]['formula'])

0
mult(add(cos(var_1, N), mult(var_2, C_0)), mult(mult(var_0, var_0), cos(var_2, N)))
1
add(add(add(var_1, C_0), tan(var_0, N)), tanh(exp(var_2, N), N(N, N)))
2
add(pow_2(cosh(var_2, N), N(N, N)), reverse(mult(var_1, var_0), N(N, N)))
3
add(add(gaussian(var_1, N), add(var_0, C_0)), tan(mult(var_1, var_2), N(N, N)))
4
add(sqrt(mult(var_2, var_0), N(N, N)), neg(cosh(var_1, N), N(N, N)))
5
mult(mult(add(var_2, var_0), sqrt(var_1, N)), tanh(add(var_1, var_2), N(N, N)))
6
log(mult(add(var_0, C_0), add(var_1, var_2)), N(N(N, N), N(N, N)))
7
mult(add(sin(var_1, N), tanh(var_0, N)), sinh(sqrt(var_2, N), N(N, N)))
8
add(tanh(log(var_0, N), N(N, N)), tanh(add(var_2, var_1), N(N, N)))
9
add(sinh(tanh(var_0, N), N(N, N)), cos(mult(var_1, var_2), N(N, N)))
10
add(log(sinh(var_0, N), N(N, N)), neg(mult(var_2, var_1), N(N, N)))
11
add(mult(log(var_0, N), sqrt(var_1, N)), sin(pow_2(var_2, N), N(N, N)))
12
mult(add(mult(var_0, C_0), sinh(var_1, N)), reverse(gaussian(var_2, N), N(N, N)))
13
add(tan(reve

In [39]:
import torch
import torch.nn as nn
from collections import defaultdict

class FormulaTokenizer:
    def __init__(self):
        self.token_to_id = defaultdict(lambda: self.token_to_id["[UNK]"])  # Default to [UNK] for unknown tokens
        self.token_to_id["[PAD]"] = 0
        self.token_to_id["[START]"] = 1
        self.token_to_id["[END]"] = 2
        self.token_to_id["[UNK]"] = 3
        self.id_to_token = {v: k for k, v in self.token_to_id.items()}

    def tokenize(self, formula):
        tokens = formula.replace("(", " ( ").replace(")", " ) ").replace(",", " , ").split()
        return tokens

    def encode(self, formula):
        tokens = self.tokenize(formula)
        token_ids = [self.token_to_id[token] for token in tokens]
        return [self.token_to_id["[START]"]] + token_ids + [self.token_to_id["[END]"]]

    def decode(self, token_ids):
        tokens = [self.id_to_token[token_id] for token_id in token_ids if token_id in self.id_to_token]
        return " ".join(tokens)

    def build_vocab(self, formulas):
        for formula in formulas:
            tokens = self.tokenize(formula)
            for token in tokens:
                self.token_to_id[token]  # Populate vocabulary
        self.id_to_token = {v: k for k, v in self.token_to_id.items()}


# Embedding model
class FormulaEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, token_ids):
        return self.embedding(token_ids)

# Generate embeddings for formulas in the train dataset
def generate_embeddings(train_data, embedding_dim=128, max_length=50):
    # Initialize tokenizer and build vocabulary
    tokenizer = FormulaTokenizer()
    all_formulas = [item["formula_human_readable"] for item in train_data]
    tokenizer.build_vocab(all_formulas)

    # Initialize the embedding model
    vocab_size = len(tokenizer.token_to_id)
    embedding_model = FormulaEmbeddingModel(vocab_size, embedding_dim)

    # Tokenize and pad formulas
    tokenized_data = []
    for item in train_data:
        formula = item["formula_human_readable"]
        encoded_formula = tokenizer.encode(formula)

        # Pad or truncate to max_length
        if len(encoded_formula) < max_length:
            encoded_formula += [tokenizer.token_to_id["[PAD]"]] * (max_length - len(encoded_formula))
        else:
            encoded_formula = encoded_formula[:max_length]

        tokenized_data.append(torch.tensor(encoded_formula, dtype=torch.long))

    # Stack tokenized data
    tokenized_tensor = torch.stack(tokenized_data)

    # Generate embeddings
    with torch.no_grad():  # Disable gradients for inference
        embeddings = embedding_model(tokenized_tensor)

    return embeddings, tokenizer, embedding_model

# Call the function with train data
embedding_dim = 128
max_length = 50
embeddings, tokenizer, embedding_model = generate_embeddings(train, embedding_dim, max_length)

# Display embeddings
print(f"Generated embeddings shape: {embeddings.shape}")


Generated embeddings shape: torch.Size([747, 50, 128])


In [41]:
# Precompute embeddings for train and val
def precompute_embeddings(data, tokenizer, embedding_dim=128, max_length=50):
    vocab_size = len(tokenizer.token_to_id)
    embedding_model = FormulaEmbeddingModel(vocab_size, embedding_dim)

    precomputed = []
    for idx, item in enumerate(data):
        if "formula_human_readable" in item:
            formula = item["formula_human_readable"]
            encoded_formula = tokenizer.encode(formula)

            # Debug: Check if any index exceeds vocab_size
            if any(token_id >= vocab_size for token_id in encoded_formula):
                print(f"Error: Token ID out of range in formula: {formula}")
                continue  # Skip problematic formula

            # Pad or truncate to max_length
            if len(encoded_formula) < max_length:
                encoded_formula += [tokenizer.token_to_id["[PAD]"]] * (max_length - len(encoded_formula))
            else:
                encoded_formula = encoded_formula[:max_length]

            # Convert to tensor and pass through embedding model
            with torch.no_grad():
                token_ids = torch.tensor(encoded_formula, dtype=torch.long).unsqueeze(0)  # Batch size 1
                embedding = embedding_model(token_ids).squeeze(0)  # Remove batch dimension

            precomputed.append((torch.tensor(item["points"]["var_0"], dtype=torch.float32), embedding))

    return precomputed


# Precompute embeddings for train and val datasets
train_precomputed = precompute_embeddings(train, tokenizer, embedding_dim=128, max_length=50)
val_precomputed = precompute_embeddings(val, tokenizer, embedding_dim=128, max_length=50)


In [53]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import random
import math

# Diffusion Model
class DiffusionModel(nn.Module):
    def __init__(self, embedding_dim, vocab_size):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.model = nn.Sequential(
            nn.Linear(embedding_dim + 1, 256),  # Time embedding added
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, embedding_dim)
        )

    def forward(self, x, t):
        """
        Forward pass for the diffusion model.
        Args:
            - x: Noisy embeddings [batch_size, seq_length, embedding_dim]
            - t: Timestep tensor [batch_size]
        Returns:
            - Predicted noise [batch_size, seq_length, embedding_dim]
        """
        # Expand time embeddings to match [batch_size, seq_length, 1]
        t_emb = t.unsqueeze(-1).unsqueeze(-1).repeat(1, x.size(1), 1)  # [batch_size, seq_length, 1]

        # Concatenate time embeddings with input embeddings along the last dimension
        x_t = torch.cat((x, t_emb), dim=-1)  # [batch_size, seq_length, embedding_dim + 1]

        return self.model(x_t)

# Noise Scheduler
def noise_schedule(timesteps, beta_start=1e-4, beta_end=2e-2):
    """
    Creates the noise schedule for the diffusion process.
    Returns:
        - betas: Tensor of size (timesteps,)
        - alpha_cumprod: Cumulative product of (1 - beta_t)
    """
    betas = torch.linspace(beta_start, beta_end, timesteps)
    alphas = 1.0 - betas
    alpha_cumprod = torch.cumprod(alphas, dim=0)
    return betas, alpha_cumprod

# Diffusion Process Utilities
class Diffusion:
    def __init__(self, model, betas, alpha_cumprod, timesteps):
        self.model = model
        self.betas = betas
        self.alpha_cumprod = alpha_cumprod
        self.timesteps = timesteps

    def add_noise(self, x, t):
        """
        Adds Gaussian noise to the input embeddings for the given timestep t.
        Args:
            - x: Original embeddings [batch_size, embedding_dim]
            - t: Timestep [batch_size]
        Returns:
            - x_t: Noisy embeddings [batch_size, embedding_dim]
            - noise: Noise added [batch_size, embedding_dim]
        """
        noise = torch.randn_like(x)  # Shape: [batch_size, seq_length, embedding_dim]
    
        # Expand alpha values to match [batch_size, 1, 1] for broadcasting
        sqrt_alpha_cumprod = self.alpha_cumprod[t].sqrt().view(-1, 1, 1)  # [batch_size, 1, 1]
        sqrt_one_minus_alpha_cumprod = (1 - self.alpha_cumprod[t]).sqrt().view(-1, 1, 1)  # [batch_size, 1, 1]
        
        # Apply noise
        x_t = sqrt_alpha_cumprod * x + sqrt_one_minus_alpha_cumprod * noise  # Broadcasting ensures correct shape
        return x_t, noise
    
    def loss(self, x, t, noise):
        """
        Calculates the loss by predicting the noise added to the embeddings.
        Args:
            - x: Original embeddings
            - t: Timestep
            - noise: Ground truth noise
        Returns:
            - MSE loss
        """
        noisy_x, _ = self.add_noise(x, t)
        pred_noise = self.model(noisy_x, t)
        return nn.MSELoss()(pred_noise, noise)

# Dataset Class for Precomputed Embeddings
class DiffusionDatasetWithEmbeddings(Dataset):
    def __init__(self, precomputed_data):
        self.data = precomputed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        points, embedding = self.data[idx]
        return points, embedding
def train_diffusion_with_embeddings(train_data, val_data, embedding_dim=128, timesteps=1000, epochs=10, batch_size=32):
    # Prepare datasets and dataloaders
    train_dataset = DiffusionDatasetWithEmbeddings(train_data)
    val_dataset = DiffusionDatasetWithEmbeddings(val_data)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Model and optimizer
    model = DiffusionModel(embedding_dim, embedding_dim)
    print(f"Number of parameters in model: {sum(p.numel() for p in model.parameters())}")
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Diffusion process
    betas, alpha_cumprod = noise_schedule(timesteps)
    diffusion = Diffusion(model, betas, alpha_cumprod, timesteps)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_idx, (points, embeddings) in enumerate(train_loader):
            print(f"Batch {batch_idx + 1}/{len(train_loader)}")
            print(f"Points Shape: {points.shape}, Embeddings Shape: {embeddings.shape}")

            # Add random noise to embeddings
            t = torch.randint(0, timesteps, (points.size(0),), dtype=torch.long)
            print(f"Random Timesteps: {t}")

            noisy_embeddings, noise = diffusion.add_noise(embeddings, t)

            # Predict the noise
            pred_noise = model(noisy_embeddings, t)
            loss = nn.MSELoss()(pred_noise, noise)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

        # Validation
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for points, embeddings in val_loader:
                t = torch.randint(0, timesteps, (points.size(0),), dtype=torch.long)
                noisy_embeddings, noise = diffusion.add_noise(embeddings, t)
                pred_noise = model(noisy_embeddings, t)
                loss = nn.MSELoss()(pred_noise, noise)
                val_loss += loss.item()

            val_loss = val_loss / len(val_loader)
            print(f"Validation Loss: {val_loss}")

        # Save model checkpoint
        torch.save(model.state_dict(), f"diffusion_model_epoch_{epoch + 1}.pth")

    return model


# Train the diffusion model using precomputed embeddings
trained_model = train_diffusion_with_embeddings(train_precomputed, val_precomputed, embedding_dim=128, timesteps=1000, epochs=10, batch_size=32)


Number of parameters in model: 131968
Batch 1/24
Points Shape: torch.Size([32, 100]), Embeddings Shape: torch.Size([32, 50, 128])
Random Timesteps: tensor([984, 504,  32, 979, 880, 750, 232, 321, 825, 819, 822, 647,  60, 249,
        840, 176, 494, 640, 449, 105, 548, 557, 142, 516, 804, 296, 513,  25,
        672, 264, 147,  26])
Batch 2/24
Points Shape: torch.Size([32, 100]), Embeddings Shape: torch.Size([32, 50, 128])
Random Timesteps: tensor([388, 864, 865, 935, 186, 792, 572, 225, 182, 717, 757, 315, 334, 239,
         43, 586, 278, 300, 469, 333, 316, 981, 512, 388, 940, 179, 249,  34,
         51, 311, 956,  23])
Batch 3/24
Points Shape: torch.Size([32, 100]), Embeddings Shape: torch.Size([32, 50, 128])
Random Timesteps: tensor([574, 466, 394, 587, 793, 482, 174, 344, 277, 864, 907, 694, 454, 204,
        889, 512, 695, 335, 886, 443, 267, 646, 996, 449, 209, 611, 451, 516,
        989, 803, 945, 317])
Batch 4/24
Points Shape: torch.Size([32, 100]), Embeddings Shape: torch.Size(