# 2. Preprocess and Tokenize Data

In [None]:
import json
import numpy as np
import random
import re

# Set the random seed (for replicability)
seed = 20777980
random.seed(seed)
np.random.seed(seed)

# Constants for tokenization
PAD_TOKEN = "<PAD>"
NOISE_LEVEL = 0.1

def tokenize_skeleton(skeleton_str):
    """Tokenize the skeleton string into a sequence of symbols."""
    # Replace common symbols for consistent tokenization
    skeleton_str = skeleton_str.replace("**", "^")  # Replace '**' with '^'
    # Define a pattern to match variables, operators, and specific tokens
    pattern = r'[a-zA-Z_][a-zA-Z0-9_]*|[+\-*/^(),.]|C|sin|cos|log|exp|sqrt'
    tokens = re.findall(pattern, skeleton_str)
    return tokens

def build_vocab(dataset):
    """Build vocabulary for the dataset."""
    token_set = set()
    for entry in dataset:
        skeleton_str = entry["skeleton"]  # Replace "function" with "skeleton"
        tokens = tokenize_skeleton(skeleton_str)
        token_set.update(tokens)
    
    # Add PAD_TOKEN
    token_set.add(PAD_TOKEN)
    
    # Create a mapping from token to index (integer)
    vocab = {token: idx for idx, token in enumerate(sorted(token_set))}
    return vocab

# Function to convert dataset to tokenized form
def tokenize_dataset(dataset, vocab):
    """Tokenize the dataset of skeletons into token IDs."""
    tokenized_data = []
    for entry in dataset:
        skeleton_str = entry["skeleton"]  # Replace "function" with "skeleton"
        tokenized_skeleton = [vocab[token] for token in tokenize_skeleton(skeleton_str)]
        
        # For each data point, get the tokenized skeleton and its inputs/outputs
        for data_point in entry["data"]:
            inputs = data_point["inputs"]
            output = data_point["output"]
            tokenized_data.append({
                "tokens": tokenized_skeleton,
                "inputs": inputs,
                "output": output
            })
    
    return tokenized_data
# Normalize the inputs and outputs
def normalize_data(data, range_vals=(-1, 1)):
    """Normalize the inputs and outputs to the specified range."""
    # Collect all unique input keys dynamically from the dataset
    
    input_keys = []
    for dp in data:
        input_keys.extend(dp["inputs"].keys())
    input_keys = sorted(set(input_keys), key=lambda x: (int(x[1:]), x))
    
    # Calculate the min and max for each input variable and output
    all_inputs = [dp["inputs"] for dp in data]
    all_outputs = [dp["output"] for dp in data]
    
    # Find the min and max for each input variable
    input_mins = {key: min([inputs.get(key,float('inf')) for inputs in all_inputs]) for key in input_keys}
    input_maxs = {key: max([inputs.get(key,float('-inf')) for inputs in all_inputs]) for key in input_keys}
    output_min = min(all_outputs)
    output_max = max(all_outputs)
    
    # Normalize inputs and outputs
    for dp in data:
        # Normalize inputs
        normalized_inputs = {}
        for key in input_keys:
            if key in dp["inputs"]:  # If the key exists, normalize it
                normalized_inputs[key] = 2*(dp["inputs"][key] - input_mins[key])/(input_maxs[key] - input_mins[key]) - 1
            else:
                # If the key is missing, assign a default value (e.g., 0 or skip normalization)
                normalized_inputs[key] = np.nan  # Default value, adjust as needed
        
        # Normalize output
        normalized_output = 2*(dp["output"] - output_min)/(output_max - output_min) - 1
        
        dp["inputs"] = normalized_inputs
        dp["output"] = normalized_output
    
    return data

def add_masking_to_data(data):
    """Adds a mask to the data where 1 indicates valid data and 0 indicates NaN."""
    for dp in data:
        dp["mask"] = {key: 1 if not np.isnan(value) else 0 for key, value in dp["inputs"].items()}
    return data

def add_noise_to_data(data, noise_type="gaussian", noise_level=NOISE_LEVEL):
    """Add noise to the dataset while ensuring NaN values are ignored."""
    for dp in data:
        for key, value in dp["inputs"].items():
            if not np.isnan(value):  # Only add noise if the value is not NaN
                if noise_type == "gaussian":
                    dp["inputs"][key] += np.random.normal(0, noise_level)
                elif noise_type == "uniform":
                    dp["inputs"][key] += np.random.uniform(-noise_level, noise_level)
        # Add noise to the output as well, but only if it's not NaN
        if not np.isnan(dp["output"]):
            if noise_type == "gaussian":
                dp["output"] += np.random.normal(0, noise_level)
            elif noise_type == "uniform":
                dp["output"] += np.random.uniform(-noise_level, noise_level)

    return data

# Pad sequences to fixed length
def pad_sequences(tokenized_data, max_length, pad_token=PAD_TOKEN):    
    """Pad tokenized sequences to a fixed length."""
    for dp in tokenized_data:
        token_length = len(dp["tokens"])
        
        if token_length < max_length:
            dp["tokens"] = dp["tokens"] + [pad_token]*(max_length - token_length)
        elif token_length > max_length:
            # Truncate the sequence if it's too long
            dp["tokens"] = dp["tokens"][:max_length]
    
    return tokenized_data

# Load dataset from JSON file
def load_dataset(file_path):
    """Load the dataset from a JSON file."""
    with open(file_path, 'r') as file:
        dataset = [json.loads(line) for line in file]
    return dataset

# Preprocess and tokenize the dataset
def preprocess_and_tokenize_dataset(file_path, noise_type="gaussian", noise_level=NOISE_LEVEL, max_length='max_length'):
    # Step 1: Load dataset
    dataset = load_dataset(file_path)

    # Step 2: Build vocabulary
    vocab = build_vocab(dataset)

    # Step 3: Tokenize dataset
    tokenized_data = tokenize_dataset(dataset, vocab)

    # Step 4: Normalize the data and determine the maximum input length
    normalized_data = normalize_data(tokenized_data)

    masked_data = add_masking_to_data(normalized_data)
    
    # Step 5: Add noise to the data (pass input_keys)
    noisy_data = add_noise_to_data(normalized_data, noise_type, noise_level)

    # Calculate the max length dynamically from the tokenized data (or use a fixed value)
    if max_length == 'max_length':
        MAX_LENGTH = max(len(dp["tokens"]) for dp in noisy_data)
    else:
        MAX_LENGTH = max_length
    
    # Step 6: Pad sequences to a fixed length
    padded_data = pad_sequences(noisy_data, max_length=MAX_LENGTH)

    return padded_data,vocab

# Save preprocessed data to a file
def save_preprocessed_data(data, output_path):
    """Save the preprocessed data to a file."""
    with open(output_path, 'w') as file:
        for dp in data:
            json.dump(dp,file)
            file.write("\n")
            
def save_JSON(data,filename):
    with open(filename, 'w') as f:
        json.dump(data, f)
    return

def load_JSON(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

# Main script execution
# Path to the original dataset
file_path = "Data/combined_dataset_5_variables_dynamic_seed20777980.json"

# Preprocess and tokenize the dataset
preprocessed_data,vocab = preprocess_and_tokenize_dataset(file_path, noise_type="gaussian",noise_level=0.1)

# Save the preprocessed data to a new file
save_preprocessed_data(preprocessed_data, "Data/preprocessed_data.json")
save_JSON(vocab, "Data/vocab.json")

print("Preprocessing complete. Data saved to Data/preprocessed_data.json")
print(f"Vocabulary size: {len(vocab)}")


Preprocessing complete. Data saved to Data/preprocessed_data.json
Vocabulary size: 15


: 

# Using continuous vector embeddings

In [23]:
import json
import numpy as np
import random
import re
import gensim
from gensim.models import Word2Vec
from sklearn.preprocessing import normalize

# Set the random seed (for replicability)
seed = 20777980
random.seed(seed)
np.random.seed(seed)

# Constants for tokenization
PAD_TOKEN = "<PAD>"
NOISE_LEVEL = 0.1
EMBEDDING_DIM = 100  # Set the dimension of the embeddings
EPOCHS = 50  # Number of epochs for Word2Vec training
PATIENCE = 3  # Patience for early stopping (number of epochs to wait for improvement)
MIN_DELTA = 0.0001  # Minimum change in loss to count as an improvement

def tokenize_skeleton(skeleton_str):
    """Tokenize the skeleton string into a sequence of symbols."""
    skeleton_str = skeleton_str.replace("**", "^")  # Replace '**' with '^'
    pattern = r'[a-zA-Z_][a-zA-Z0-9_]*|[+\-*/^(),.]|C|sin|cos|log|exp|sqrt'
    tokens = re.findall(pattern, skeleton_str)
    return tokens

def train_word2vec_embeddings(dataset, embedding_dim=EMBEDDING_DIM, epochs=EPOCHS, patience_num_epochs=PATIENCE):
    """Train Word2Vec embeddings on the dataset over multiple epochs with early stopping."""
    sentences = []
    for entry in dataset:
        skeleton_str = entry["skeleton"]
        tokens = tokenize_skeleton(skeleton_str)
        sentences.append(tokens)
    
    # Initialize the Word2Vec model
    model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4, epochs=1)

    best_loss = np.inf
    num_epochs_without_improvement = 0
    early_stopping = False

    performance_metrics_DICT = {
        "epoch_list": [],
        "train_loss_list": [],
    }

    # Train Word2Vec model with early stopping
    for epoch in range(epochs-1):
        # Train for one epoch
        model.train(sentences, total_examples=model.corpus_count, epochs=1)

        # Get the current loss (Word2Vec model loss is stored in the 'trainables' attribute)
        current_loss = model.get_latest_training_loss()

        # Record loss
        performance_metrics_DICT['epoch_list'].append(epoch+1)
        performance_metrics_DICT['train_loss_list'].append(current_loss)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {current_loss}")

        # Early stopping check
        if current_loss < best_loss:
            best_loss = current_loss
            model.save("Data/embeddings_model.model")
            num_epochs_without_improvement = 0
            save_JSON(performance_metrics_DICT,'Data/embeddings_performance_metrics_DICT.json')
        else:
            num_epochs_without_improvement += 1

        if num_epochs_without_improvement >= patience_num_epochs:
            print(f'Early stopping occurred at epoch {epoch + 1}')
            early_stopping = True
            break
        
    if early_stopping == False:
        model.save("Data/embeddings_model.model")
        save_JSON(performance_metrics_DICT,'Data/embeddings_performance_metrics_DICT.json')
    return model, performance_metrics_DICT

def build_vocab_and_embeddings(dataset, model=None):
    """Build vocabulary and convert tokens to embeddings using Word2Vec model."""
    token_set = set()
    for entry in dataset:
        skeleton_str = entry["skeleton"]
        tokens = tokenize_skeleton(skeleton_str)
        token_set.update(tokens)
    
    # Convert tokens to embeddings using the Word2Vec model
    vocab_embeddings = {}
    if model:
        for token in token_set:
            if token in model.wv:
                embedding = model.wv[token]
                # Normalize the embedding to have a unit norm (length = 1)
                embedding = normalize([embedding])[0]
                vocab_embeddings[token] = embedding
            else:
                # If the token isn't in the model, use a random vector
                vocab_embeddings[token] = np.random.uniform(-0.1, 0.1, size=EMBEDDING_DIM)
    else:
        # If no pre-trained model is provided, use random embeddings for all tokens
        for token in token_set:
            vocab_embeddings[token] = np.random.uniform(-0.1, 0.1, size=EMBEDDING_DIM)
    
    # Add PAD_TOKEN to the vocabulary and set its embedding to zero
    vocab_embeddings[PAD_TOKEN] = np.zeros(EMBEDDING_DIM)
    
    return vocab_embeddings

def tokenize_dataset_with_embeddings(dataset, vocab_embeddings):
    """Tokenize the dataset and convert tokens to embeddings."""
    tokenized_data = []
    for entry in dataset:
        skeleton_str = entry["skeleton"]
        tokenized_skeleton = [vocab_embeddings[token] for token in tokenize_skeleton(skeleton_str)]
        
        for data_point in entry["data"]:
            inputs = data_point["inputs"]
            output = data_point["output"]
            tokenized_data.append({
                "tokens": tokenized_skeleton,
                "inputs": inputs,
                "output": output
            })
    
    return tokenized_data

def normalize_data(data, range_vals=(-1, 1)):
    """Normalize the inputs and outputs to the specified range."""
    input_keys = []
    for dp in data:
        input_keys.extend(dp["inputs"].keys())
    input_keys = sorted(set(input_keys), key=lambda x: (int(x[1:]), x))
    
    all_inputs = [dp["inputs"] for dp in data]
    all_outputs = [dp["output"] for dp in data]
    
    input_mins = {key: min([inputs.get(key, float('inf')) for inputs in all_inputs]) for key in input_keys}
    input_maxs = {key: max([inputs.get(key, float('-inf')) for inputs in all_inputs]) for key in input_keys}
    output_min = min(all_outputs)
    output_max = max(all_outputs)
    
    for dp in data:
        normalized_inputs = {}
        for key in input_keys:
            if key in dp["inputs"]:
                normalized_inputs[key] = 2 * (dp["inputs"][key] - input_mins[key]) / (input_maxs[key] - input_mins[key]) - 1
            else:
                normalized_inputs[key] = np.nan  
        
        normalized_output = 2 * (dp["output"] - output_min) / (output_max - output_min) - 1
        
        dp["inputs"] = normalized_inputs
        dp["output"] = normalized_output
    
    return data

def add_masking_to_data(data):
    """Adds a mask to the data where 1 indicates valid data and 0 indicates NaN."""
    for dp in data:
        dp["mask"] = {key: 1 if not np.isnan(value) else 0 for key, value in dp["inputs"].items()}
    return data

def add_noise_to_data(data, noise_type="gaussian", noise_level=NOISE_LEVEL):
    """Add noise to the dataset while ensuring NaN values are ignored."""
    for dp in data:
        for key, value in dp["inputs"].items():
            if not np.isnan(value):
                if noise_type == "gaussian":
                    dp["inputs"][key] += np.random.normal(0, noise_level)
                elif noise_type == "uniform":
                    dp["inputs"][key] += np.random.uniform(-noise_level, noise_level)
        if not np.isnan(dp["output"]):
            if noise_type == "gaussian":
                dp["output"] += np.random.normal(0, noise_level)
            elif noise_type == "uniform":
                dp["output"] += np.random.uniform(-noise_level, noise_level)
    return data

def pad_sequences(tokenized_data, max_length, pad_token=PAD_TOKEN):
    """Pad tokenized sequences to a fixed length.""" 
    for dp in tokenized_data:
        token_length = len(dp["tokens"])
        if token_length < max_length:
            dp["tokens"] = dp["tokens"] + [pad_token] * (max_length - token_length)
        elif token_length > max_length:
            dp["tokens"] = dp["tokens"][:max_length]
    return tokenized_data

def load_dataset(file_path):
    """Load the dataset from a JSON file."""
    with open(file_path, 'r') as file:
        dataset = [json.loads(line) for line in file]
    return dataset

def preprocess_and_tokenize_dataset(file_path, model=None, noise_type="gaussian", noise_level=NOISE_LEVEL, max_length='max_length'):
    # Step 1: Load dataset
    dataset = load_dataset(file_path)

    # Step 2: Train Word2Vec model if not provided
    if model is None:
        model, performance_metrics_DICT = train_word2vec_embeddings(dataset)

    # Step 3: Build vocabulary with embeddings
    vocab_embeddings = build_vocab_and_embeddings(dataset, model)

    # Step 4: Tokenize dataset and convert tokens to embeddings
    tokenized_data = tokenize_dataset_with_embeddings(dataset, vocab_embeddings)

    # Step 5: Normalize the data and determine the maximum input length
    normalized_data = normalize_data(tokenized_data)

    masked_data = add_masking_to_data(normalized_data)
    
    # Step 6: Add noise to the data
    noisy_data = add_noise_to_data(normalized_data, noise_type, noise_level)

    if max_length == 'max_length':
        MAX_LENGTH = max(len(dp["tokens"]) for dp in noisy_data)
    else:
        MAX_LENGTH = max_length
    
    # Step 7: Pad sequences to a fixed length
    padded_data = pad_sequences(noisy_data, max_length=MAX_LENGTH)

    return padded_data, vocab_embeddings, model, performance_metrics_DICT

# Save preprocessed data
def save_preprocessed_data(data, output_path):
    """Save the preprocessed data to a file with JSON serialization."""
    def convert_to_serializable(obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert ndarray to list
        if isinstance(obj, dict):
            return {key: convert_to_serializable(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [convert_to_serializable(item) for item in obj]
        return obj  # Return the object as-is if it's already serializable

    with open(output_path, 'w') as file:
        for dp in data:
            json.dump(convert_to_serializable(dp), file)
            file.write("\n")

def save_JSON(data, filename):
    """Save data to a JSON file with support for NumPy arrays."""
    def convert_to_serializable(obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert ndarray to list
        if isinstance(obj, dict):
            return {key: convert_to_serializable(value) for key, value in obj.items()}
        return obj  # Return the object as-is if it's already serializable

    with open(filename, 'w') as f:
        json.dump(convert_to_serializable(data), f)
        
def load_JSON(filename):
    """Load a JSON file."""
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

# Main script execution
file_path = "Data/combined_dataset_5_variables_dynamic_seed20777980.json"

# Process the data and train Word2Vec embeddings
preprocessed_data, vocab_embeddings, model, performance_metrics_DICT = preprocess_and_tokenize_dataset(file_path)

# Save the preprocessed data, vocab embeddings, and trained Word2Vec model
save_preprocessed_data(preprocessed_data, "Data/preprocessed_data_with_embeddings.json")
save_JSON(vocab_embeddings, "Data/vocab_embeddings.json")

print("Preprocessing complete with embeddings. Data saved.")

Epoch 1/50, Train Loss: 0.0
Epoch 2/50, Train Loss: 0.0
Epoch 3/50, Train Loss: 0.0
Epoch 4/50, Train Loss: 0.0
Early stopping occurred at epoch 4
Preprocessing complete with embeddings. Data saved.


In [27]:
performance_metrics_DICT = load_JSON('Data/embeddings_performance_metrics_DICT.json')

model = Word2Vec.load("Data/embeddings_model.model")

print("Embeddings Performance Metrics:")
print(performance_metrics_DICT)

Embeddings Performance Metrics:
{'epoch_list': [1], 'train_loss_list': [0.0]}
