# 2. Preprocess and Tokenize Data

In [34]:
import json
import numpy as np
import random
import re

# Set the random seed (for replicability)
seed = 20777980
random.seed(seed)
np.random.seed(seed)

# Constants for tokenization
PAD_TOKEN = "<PAD>"
NOISE_LEVEL = 0.1

def tokenize_function(function_str):
    """Tokenize the function string into a sequence of symbols."""
    function_str = function_str.replace('**','^')  # Replace '**' with '^' for easier tokenization
    # Regular expression to capture all valid tokens (numbers, variables, functions, operators)
    pattern = r'[a-zA-Z_][a-zA-Z0-9_]*|[+\-*/^(),.]|sin|cos|log|exp|sqrt'
    tokens = re.findall(pattern, function_str)
    return tokens

def build_vocab(dataset):
    """Build vocabulary for the dataset."""
    token_set = set()
    for entry in dataset:
        function_str = entry["function"]
        tokens = tokenize_function(function_str)
        token_set.update(tokens)
    
    # Add PAD_TOKEN
    token_set.add(PAD_TOKEN)
    
    # Create a mapping from token to index (integer)
    vocab = {token: idx for idx, token in enumerate(sorted(token_set))}
    return vocab

# Function to convert dataset to tokenized form
def tokenize_dataset(dataset, vocab):
    """Tokenize the dataset of functions into token IDs."""
    tokenized_data = []
    for entry in dataset:
        function_str = entry["function"]
        tokenized_function = [vocab[token] for token in tokenize_function(function_str)]
        
        # For each data point, get the tokenized function and its inputs/outputs
        for data_point in entry["data"]:
            inputs = data_point["inputs"]
            output = data_point["output"]
            tokenized_data.append({
                "tokens": tokenized_function,
                "inputs": inputs,
                "output": output
            })
    
    return tokenized_data

# Normalize the inputs and outputs
def normalize_data(data, range_vals=(-1, 1)):
    """Normalize the inputs and outputs to the specified range."""
    # Collect all unique input keys dynamically from the dataset
    
    input_keys = []
    for dp in data:
        input_keys.extend(dp["inputs"].keys())
    input_keys = sorted(set(input_keys), key=lambda x: (int(x[1:]), x))
    
    # Calculate the min and max for each input variable and output
    all_inputs = [dp["inputs"] for dp in data]
    all_outputs = [dp["output"] for dp in data]
    
    # Find the min and max for each input variable
    input_mins = {key: min([inputs.get(key,float('inf')) for inputs in all_inputs]) for key in input_keys}
    input_maxs = {key: max([inputs.get(key,float('-inf')) for inputs in all_inputs]) for key in input_keys}
    output_min = min(all_outputs)
    output_max = max(all_outputs)
    
    # Normalize inputs and outputs
    for dp in data:
        # Normalize inputs
        normalized_inputs = {}
        for key in input_keys:
            if key in dp["inputs"]:  # If the key exists, normalize it
                normalized_inputs[key] = 2*(dp["inputs"][key] - input_mins[key])/(input_maxs[key] - input_mins[key]) - 1
            else:
                # If the key is missing, assign a default value (e.g., 0 or skip normalization)
                normalized_inputs[key] = np.nan  # Default value, adjust as needed
        
        # Normalize output
        normalized_output = 2*(dp["output"] - output_min)/(output_max - output_min) - 1
        
        dp["inputs"] = normalized_inputs
        dp["output"] = normalized_output
    
    return data

def add_masking_to_data(data):
    """Adds a mask to the data where 1 indicates valid data and 0 indicates NaN."""
    for dp in data:
        dp["mask"] = {key: 1 if not np.isnan(value) else 0 for key, value in dp["inputs"].items()}
    return data

def add_noise_to_data(data, noise_type="gaussian", noise_level=NOISE_LEVEL):
    """Add noise to the dataset while ensuring NaN values are ignored."""
    for dp in data:
        for key, value in dp["inputs"].items():
            if not np.isnan(value):  # Only add noise if the value is not NaN
                if noise_type == "gaussian":
                    dp["inputs"][key] += np.random.normal(0, noise_level)
                elif noise_type == "uniform":
                    dp["inputs"][key] += np.random.uniform(-noise_level, noise_level)
        # Add noise to the output as well, but only if it's not NaN
        if not np.isnan(dp["output"]):
            if noise_type == "gaussian":
                dp["output"] += np.random.normal(0, noise_level)
            elif noise_type == "uniform":
                dp["output"] += np.random.uniform(-noise_level, noise_level)

    return data

# Pad sequences to fixed length
def pad_sequences(tokenized_data, max_length, pad_token=PAD_TOKEN):    
    """Pad tokenized sequences to a fixed length."""
    for dp in tokenized_data:
        token_length = len(dp["tokens"])
        
        if token_length < max_length:
            dp["tokens"] = dp["tokens"] + [pad_token]*(max_length - token_length)
        elif token_length > max_length:
            # Truncate the sequence if it's too long
            dp["tokens"] = dp["tokens"][:max_length]
    
    return tokenized_data

# Load dataset from JSON file
def load_dataset(file_path):
    """Load the dataset from a JSON file."""
    with open(file_path, 'r') as file:
        dataset = [json.loads(line) for line in file]
    return dataset

# Preprocess and tokenize the dataset
def preprocess_and_tokenize_dataset(file_path, noise_type="gaussian", noise_level=NOISE_LEVEL, max_length='max_length'):
    # Step 1: Load dataset
    dataset = load_dataset(file_path)

    # Step 2: Build vocabulary
    vocab = build_vocab(dataset)

    # Step 3: Tokenize dataset
    tokenized_data = tokenize_dataset(dataset, vocab)

    # Step 4: Normalize the data and determine the maximum input length
    normalized_data = normalize_data(tokenized_data)

    masked_data = add_masking_to_data(normalized_data)
    
    # Step 5: Add noise to the data (pass input_keys)
    noisy_data = add_noise_to_data(normalized_data, noise_type, noise_level)

    # Calculate the max length dynamically from the tokenized data (or use a fixed value)
    if max_length == 'max_length':
        MAX_LENGTH = max(len(dp["tokens"]) for dp in noisy_data)
    else:
        MAX_LENGTH = max_length
    
    # Step 6: Pad sequences to a fixed length
    padded_data = pad_sequences(noisy_data, max_length=MAX_LENGTH)

    return padded_data,vocab

# Save preprocessed data to a file
def save_preprocessed_data(data, output_path):
    """Save the preprocessed data to a file."""
    with open(output_path, 'w') as file:
        for dp in data:
            json.dump(dp,file)
            file.write("\n")
            
def save_JSON(data,filename):
    with open(filename, 'w') as f:
        json.dump(data, f)
    return

def load_JSON(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

# Main script execution
if __name__ == "__main__":
    # Path to the original dataset
    file_path = "Dataset/combined_dataset_5_variables_dynamic_seed20777980.json"
    
    # Preprocess and tokenize the dataset
    preprocessed_data,vocab = preprocess_and_tokenize_dataset(file_path, noise_type="gaussian",noise_level=0.1)

    # Save the preprocessed data to a new file
    save_preprocessed_data(preprocessed_data, "Dataset/preprocessed_data.json")
    save_JSON(vocab, "Dataset/vocab.json")
    
    print("Preprocessing complete. Data saved to Dataset/preprocessed_data.json")
    print(f"Vocabulary size: {len(vocab)}")


Preprocessing complete. Data saved to Dataset/preprocessed_data.json
Vocabulary size: 16
