# 2. Preprocess and Tokenize Data

In [42]:

import json
import numpy as np
import random
import re
import gensim
from gensim.models import Word2Vec
from sklearn.preprocessing import normalize

import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass
import pdb


# Set the random seed (for replicability)
seed = 20777980
random.seed(seed)
np.random.seed(seed)

# Constants for tokenization
PAD_TOKEN = "<PAD>"
NOISE_LEVEL = 0.1
EMBEDDING_DIM = 100  # Set the dimension of the embeddings
EPOCHS = 50  # Number of epochs for Word2Vec training
PATIENCE = 3  # Patience for early stopping (number of epochs to wait for improvement)
MIN_DELTA = 0.0001  # Minimum change in loss to count as an improvement

@dataclass
class tNetConfig:
    num_vars: int
    embedding_size: int

class tNet(nn.Module):
    def __init__(self, config: tNetConfig):
        super(tNet, self).__init__()

        self.config = config
        self.num_vars = config.num_vars
        self.n_embd = config.embedding_size

        self.activation_func = F.relu

        # Define the convolutional layers
        self.conv1 = nn.Conv1d(self.num_vars, self.n_embd, 1)
        self.conv2 = nn.Conv1d(self.n_embd, 2*self.n_embd, 1)
        self.conv3 = nn.Conv1d(2*self.n_embd, 4*self.n_embd, 1)

        # Define fully connected layers
        self.fc1 = nn.Linear(4*self.n_embd, 2*self.n_embd)
        self.fc2 = nn.Linear(2*self.n_embd, self.n_embd)

        # Corrected GroupNorm initialization
        self.input_batch_norm = nn.GroupNorm(1, self.num_vars)  # Corrected to match input channels
        
        # Define other GroupNorm layers
        self.bn1 = nn.GroupNorm(1, self.n_embd)
        self.bn2 = nn.GroupNorm(1, 2*self.n_embd)
        self.bn3 = nn.GroupNorm(1, 4*self.n_embd)
        self.bn4 = nn.GroupNorm(1, 2*self.n_embd)
        self.bn5 = nn.GroupNorm(1, self.n_embd)

    def forward(self, x):
        # Apply normalization and convolutions
        x = self.input_batch_norm(x)
        x = self.activation_func(self.bn1(self.conv1(x)))
        x = self.activation_func(self.bn2(self.conv2(x)))
        x = self.activation_func(self.bn3(self.conv3(x)))

        # Global max pooling
        x, _ = torch.max(x, dim=2)  # Reducing along the sequence dimension (index 2)
        assert x.size(1) == 4 * self.n_embd  # Ensure correct output size

        # Apply fully connected layers
        x = self.activation_func(self.bn4(self.fc1(x)))
        x = self.activation_func(self.bn5(self.fc2(x)))
        return x

def tokenize_skeleton(skeleton_str):
    """Tokenize the skeleton string into a sequence of symbols."""
    skeleton_str = skeleton_str.replace("**", "^")  # Replace '**' with '^'
    pattern = r'[a-zA-Z_][a-zA-Z0-9_]*|[+\-*/^(),.]|C|sin|cos|log|exp|sqrt'
    tokens = re.findall(pattern, skeleton_str)
    return tokens

def train_word2vec_embeddings(dataset, embedding_dim=EMBEDDING_DIM, epochs=EPOCHS, patience_num_epochs=PATIENCE):
    """Train Word2Vec embeddings on the dataset over multiple epochs with early stopping."""
    sentences = []
    for entry in dataset:
        skeleton_str = entry["skeleton"]
        tokens = tokenize_skeleton(skeleton_str)
        sentences.append(tokens)
    
    # Initialize the Word2Vec model
    model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4, epochs=1)

    best_loss = np.inf
    num_epochs_without_improvement = 0
    early_stopping = False

    performance_metrics_DICT = {
        "epoch_list": [],
        "train_loss_list": [],
    }

    # Train Word2Vec model with early stopping
    for epoch in range(epochs-1):
        # Train for one epoch
        model.train(sentences, total_examples=model.corpus_count, epochs=1)

        # Get the current loss (Word2Vec model loss is stored in the 'trainables' attribute)
        current_loss = model.get_latest_training_loss()

        # Record loss
        performance_metrics_DICT['epoch_list'].append(epoch+1)
        performance_metrics_DICT['train_loss_list'].append(current_loss)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {current_loss}")

        # Early stopping check
        if current_loss < best_loss:
            best_loss = current_loss
            model.save("Data/embeddings_model.model")
            num_epochs_without_improvement = 0
            save_JSON(performance_metrics_DICT,'Data/embeddings_performance_metrics_DICT.json')
        else:
            num_epochs_without_improvement += 1

        if num_epochs_without_improvement >= patience_num_epochs:
            print(f'Early stopping occurred at epoch {epoch + 1}')
            early_stopping = True
            break
        
    if early_stopping == False:
        model.save("Data/embeddings_model.model")
        save_JSON(performance_metrics_DICT,'Data/embeddings_performance_metrics_DICT.json')
    return model, performance_metrics_DICT

def build_vocab_and_embeddings(dataset, model=None):
    """Build vocabulary and convert tokens to embeddings using Word2Vec model."""
    token_set = set()
    for entry in dataset:
        skeleton_str = entry["skeleton"]
        tokens = tokenize_skeleton(skeleton_str)
        token_set.update(tokens)
    
    # Convert tokens to embeddings using the Word2Vec model
    vocab_embeddings = {}
    if model:
        for token in token_set:
            if token in model.wv:
                embedding = model.wv[token]
                # Normalize the embedding to have a unit norm (length = 1)
                embedding = normalize([embedding])[0]
                vocab_embeddings[token] = embedding
            else:
                # If the token isn't in the model, use a random vector
                vocab_embeddings[token] = np.random.uniform(-0.1, 0.1, size=EMBEDDING_DIM)
    else:
        # If no pre-trained model is provided, use random embeddings for all tokens
        for token in token_set:
            vocab_embeddings[token] = np.random.uniform(-0.1, 0.1, size=EMBEDDING_DIM)
    
    # Add PAD_TOKEN to the vocabulary and set its embedding to zero
    vocab_embeddings[PAD_TOKEN] = np.zeros(EMBEDDING_DIM)
    
    return vocab_embeddings

def standardize_data(data):
    data_mean = np.mean(data, axis=0)
    data_std = np.std(data, axis=0)    
    data_standardized = (data - data_mean)/data_std
    return data_standardized

def tokenize_dataset_with_embeddings(dataset, vocab_embeddings):
    """Tokenize the dataset and convert tokens to embeddings."""
    tokenized_data = []
        
    max_num_features  = max(len(entry['data']['x']) for entry in dataset)
    
    for entry in dataset:
        skeleton_str = entry["skeleton"]
        tokenized_skeleton = [vocab_embeddings[token] for token in tokenize_skeleton(skeleton_str)]
        
        data = entry["data"]
        x_DICT = data['x'] 
                    
        num_data_points = len(x_DICT[list(x_DICT.keys())[0]])
        num_features = len(list(x_DICT.keys()))
                
        x_with_values_MAT = np.array(list(x_DICT.values())).T
        x_with_values_standardized_MAT = standardize_data(x_with_values_MAT)
        nan_MAT = np.full((num_data_points, max_num_features-num_features), np.nan)
        x_standardized_MAT = np.concatenate((x_with_values_standardized_MAT, nan_MAT), axis=1)        
        
        y = np.array(data['y'])
        y_standardized = standardize_data(y)
        
        mask = np.array([1]*num_features + [0]*(max_num_features-num_features))        
        
        data_DICT = {'x': x_standardized_MAT, 'y': y_standardized, 'mask': mask}
        
        tokenized_data.append({
            "tokens": tokenized_skeleton,
            "data": data_DICT,
            "skeleton": skeleton_str
        })
    
    return tokenized_data

def pad_sequences(tokenized_data, max_length, pad_embedding, pad_token=PAD_TOKEN):
    """Pad tokenized sequences to a fixed length.""" 
    for dp in tokenized_data:
        token_length = len(dp["tokens"])
        if token_length < max_length:
            dp["tokens"] = dp["tokens"] + [pad_embedding]*(max_length - token_length)
        elif token_length > max_length:
            dp["tokens"] = dp["tokens"][:max_length]
    return tokenized_data

def load_dataset(file_path):
    """Load the dataset from a JSON file."""
    with open(file_path, 'r') as file:
        dataset = [json.loads(line) for line in file]
    return dataset

def add_TNet_embeddings(preprocessed_data):
    tokenized_formulas = torch.tensor([datapoint['tokens'] for datapoint in preprocessed_data]).float()
    datasets = torch.tensor([np.concatenate((datapoint['data']['x'], datapoint['data']['y'].reshape(len(datapoint['data']['y']),1)), axis=1) for datapoint in preprocessed_data])
    datasets = torch.nan_to_num(datasets,nan=0.0).float()

    batch_size, seq_len, embedding_dim = tokenized_formulas.shape
    batch_size, num_points, num_features = datasets.shape

    config_formula = tNetConfig(num_vars=seq_len, embedding_size=128) 
    config_data = tNetConfig(num_vars=num_points, embedding_size=128)

    TNet_model_formula = tNet(config_formula) 
    TNet_model_data = tNet(config_data)

    formula_embeddings = TNet_model_formula(tokenized_formulas)  # Shape: [num_formulas, embedding_size]
    dataset_embeddings = TNet_model_data(datasets)  # Shape: [batch_size, embedding_size]

    formula_embeddings = formula_embeddings.detach().cpu().numpy()
    dataset_embeddings = dataset_embeddings.detach().cpu().numpy()

    # Add embeddings to each datapoint in preprocessed_data
    for i, datapoint in enumerate(preprocessed_data):
        datapoint['formula_embedding'] = formula_embeddings[i]
        datapoint['dataset_embedding'] = dataset_embeddings[i]
    return preprocessed_data

def preprocess_and_tokenize_dataset(file_path, model=None, noise_type="gaussian", noise_level=NOISE_LEVEL, max_length='max_length'):
    # Step 1: Load dataset
    dataset = load_dataset(file_path)

    # Step 2: Train Word2Vec model if not provided
    if model is None:
        model, performance_metrics_DICT = train_word2vec_embeddings(dataset)

    # Step 3: Build vocabulary with embeddings
    vocab_embeddings = build_vocab_and_embeddings(dataset, model)

    # Step 4: Tokenize dataset and convert tokens to embeddings
    tokenized_data = tokenize_dataset_with_embeddings(dataset, vocab_embeddings)

    if max_length == 'max_length':
        MAX_LENGTH = max(len(dp["tokens"]) for dp in tokenized_data)
    else:
        MAX_LENGTH = max_length
    
    # Step 7: Pad sequences to a fixed length
    padded_data = pad_sequences(tokenized_data,MAX_LENGTH,vocab_embeddings['<PAD>'])
    
    preprocessed_data = add_TNet_embeddings(padded_data)
    
    return padded_data, vocab_embeddings, model, performance_metrics_DICT

# Save preprocessed data
def save_preprocessed_data(data, output_path):
    """Save the preprocessed data to a file with JSON serialization."""
    def convert_to_serializable(obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert ndarray to list
        if isinstance(obj, dict):
            return {key: convert_to_serializable(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [convert_to_serializable(item) for item in obj]
        return obj  # Return the object as-is if it's already serializable

    with open(output_path, 'w') as file:
        for dp in data:
            json.dump(convert_to_serializable(dp), file)
            file.write("\n")

def save_JSON(data, filename):
    """Save data to a JSON file with support for NumPy arrays."""
    def convert_to_serializable(obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert ndarray to list
        if isinstance(obj, dict):
            return {key: convert_to_serializable(value) for key, value in obj.items()}
        return obj  # Return the object as-is if it's already serializable

    with open(filename, 'w') as f:
        json.dump(convert_to_serializable(data), f)
        
def load_JSON(filename):
    """Load a JSON file."""
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

# Main script execution
file_path = "Data/combined_dataset_5_variables_dynamic_seed20777980.json"

# Process the data and train Word2Vec embeddings
preprocessed_data, vocab_embeddings, model, performance_metrics_DICT = preprocess_and_tokenize_dataset(file_path)

# Save the preprocessed data, vocab embeddings, and trained Word2Vec model
save_preprocessed_data(preprocessed_data, "Data/preprocessed_data_with_embeddings.json")
save_JSON(vocab_embeddings, "Data/vocab_embeddings.json")

print("Preprocessing complete with embeddings. Data saved.")

Epoch 1/50, Train Loss: 0.0
Epoch 2/50, Train Loss: 0.0
Epoch 3/50, Train Loss: 0.0
Epoch 4/50, Train Loss: 0.0
Early stopping occurred at epoch 4
Preprocessing complete with embeddings. Data saved.


In [43]:
performance_metrics_DICT = load_JSON('Data/embeddings_performance_metrics_DICT.json')
model = Word2Vec.load("Data/embeddings_model.model")

print("Embeddings Performance Metrics:")
print(performance_metrics_DICT)

Embeddings Performance Metrics:
{'epoch_list': [1], 'train_loss_list': [0.0]}
