# Using openpyxl for Direct Parsing of Formulas

Using `pd.read_csv()` and `pd.read_excel()` excludes some formulas. Specifically, 30 formulas are returned as NaN for CSV files and 3 formulas are returned as NaN for Excel files. Therefore, I have used `openpyxl` for direct parsing of the formulas.

CSV files are plain text and don't preserve cell metadata such as the actual formula text; they only store the computed values. When Excel exports to CSV, it writes out the result of the formula, not the formula itself. This is why we use libraries like `openpyxl` with the Excel format (XLSX) and set `data_only=False` to retrieve the underlying formula strings.

In [None]:
from openpyxl import load_workbook
wb = load_workbook('FeynmanEquations.xlsx', data_only=False)
ws = wb.active

# Iterate through rows (we start from row 2 because we skip header)
formula_col_index = 4 
formula_list=[]
# Iterate through rows (we start from row 2 because we skip header)
for row in ws.iter_rows(min_row=2):
    # get cell in formula column
    cell = row[formula_col_index - 1]  # zero-indexed
    formula_list.append(cell.value) 
    

##Save the formula_list in a text file
filename = "formulas.txt"

with open(filename, "w") as f:
    for formula in formula_list:
        f.write(formula + "\n")

print(f"Formulas saved to {filename}")


Formulas saved to formulas.txt


## Plan for Tokenization

### Extract All Unique Variable Names
1. Read the `FeynmanEquations.xlsx` file.
2. Collect all unique variable names from the columns `v1_name`, `v2_name`, `v3_name`, etc.

### Extract All Unique Operators and Functions
1. Parse the formula column.
2. Extract mathematical operators (`+`, `-`, `*`, `/`, `**`) and functions (`sin`, `cos`, `exp`, etc.).

### Merge into a Single Set (Vocabulary)
1. Combine the extracted variable names and operators/functions into a single unique set.
2. This set forms our tokenization vocabulary.

### Tokenize Each Formula Using This Vocabulary
1. Convert each formula into a sequence of tokens using the extracted vocabulary.


In [12]:
## Extract all Uique variable names
import pandas as pd
import sympy
import re

df=pd.read_excel('FeynmanEquations.xlsx')
variable_columns=[col for col in df.columns if re.match(r'v\d+_name',col)]


# print(variable_columns)

unique_variables=set()
for col in variable_columns:
    unique_variables.update(df[col].dropna().astype(str).unique())

print(unique_variables)



{'y2', 'I', 'rho_c_0', 'Nn', 'v', 'C', 'B', 'k', 'I2', 'mom', 'omega', 'theta1', 'y3', 'A_vec', 'mu_drift', 'F', 'kappa', 'y1', 'n', 'c', 'g_', 'q1', 'm2', 'z1', 'Volt', 'gamma', 'Jz', 'r', 'n_0', 'H', 'd1', 'Bx', 'Pwr', 'G', 'r1', 'm1', 'x2', 'k_spring', 't', 'mu', 'w', 'alpha', 'p_d', 'lambd', 'T2', 'Ef', 'y', 'h', 'I_0', 'chi', 'M', 'r2', 'V1', 'z', 'u', 'U', 'theta', 'epsilon', 'delta', 'm', 'n_rho', 'theta2', 'q2', 'Y', 'd', 'E_n', 'A', 'g', 'd2', 'rho', 'x', 'T1', 'V2', 'I1', 'mob', 'sigma_den', 'Int_0', 'm_0', 'p', 'beta', 'sigma', 'Bz', 'x3', 'x1', 'q', 'z2', 'a', 'V', 'pr', 'omega_0', 'By', 'T', 'kb'}


In [13]:
import re

mathematical_symbols = ['+', '-', '*', '/', '**', 'exp', 'sqrt', 'pi', 
                        'sin', 'cos', 'ln', 'Int', 'tanh', 'log', 
                        'arcsin', 'arctan', 'arccos']

mantissa_tokens = [f"{i:03d}" for i in range(0, 1000)]
exponent_tokens = [f"E-{i}" for i in range(11)] + [f"E+{i}" for i in range(11)]

TOKEN_DICT = {
    "<PAD>": 0,
    "<UNK>": 1,
    "[COL_SEP]": 2,
    "[ROW_SEP]": 3,
    "[DATA_END]":4
}

#1/2-1/9
for i,letter in enumerate(["1/2","1/3","1/4","1/5","1/6","1/7","1/8","1/9"],start=len(TOKEN_DICT)):
    TOKEN_DICT[letter]=i


# a-z
for i, letter in enumerate("abcdefghijklmnopqrstuvwxyz", start=len(TOKEN_DICT)):
    TOKEN_DICT[letter] = i

# A-Z
for i, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ", start=len(TOKEN_DICT)):
    TOKEN_DICT[letter] = i

# digits 0-9
for i in range(10):
    TOKEN_DICT[str(i)] = len(TOKEN_DICT)

# A_0..Z_10 and A0..Z10
for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
    for num in range(11):
        TOKEN_DICT[f"{letter}_{num}"] = len(TOKEN_DICT)
        TOKEN_DICT[f"{letter}{num}"]  = len(TOKEN_DICT)

# Add all mathematical symbols
base_index = max(TOKEN_DICT.values()) + 1
for i, symbol in enumerate(mathematical_symbols):
    TOKEN_DICT[symbol] = base_index + i


current_index = max(TOKEN_DICT.values()) + 1
for var in unique_variables:
    TOKEN_DICT[var] = current_index
    current_index += 1

# Add mantissa tokens
current_index = max(TOKEN_DICT.values()) + 1
for i, token in enumerate(mantissa_tokens, start=current_index):
    TOKEN_DICT[token] = i

# Add exponent tokens
current_index = max(TOKEN_DICT.values()) + 1
for i, token in enumerate(exponent_tokens, start=current_index):
    TOKEN_DICT[token] = i


def extract_equation_tokens(equation: str):
    pattern = r"(?:\*\*|[+\-*/=()^]|[A-Za-z_]+|\d+(?:\.\d+)?)"
    return re.findall(pattern, equation)

new_tokens = set()
for eq in formula_list:
    tokens_in_eq = extract_equation_tokens(eq)
    new_tokens.update(tokens_in_eq)

current_index = max(TOKEN_DICT.values()) + 1
for token in new_tokens:
    if token not in TOKEN_DICT:
        TOKEN_DICT[token] = current_index
        current_index += 1


In [14]:
##  Use TOKEN_DICT as a “Seed” for a Subword Tokenize
from tokenizers import Tokenizer, trainers, models, pre_tokenizers


# Step 1: Convert your existing TOKEN_DICT to special tokens
special_tokens = list(TOKEN_DICT.keys())  # e.g. ["<PAD>", "<UNK>", "+", "-", "*", ...]


# Step 2: Initialize a subword tokenizer (e.g. BPE)
tokenizer = Tokenizer(models.BPE(unk_token="<UNK>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Step 3: Prepare trainer
trainer = trainers.BpeTrainer(
    vocab_size=2000,  # or however large
    special_tokens=special_tokens
)



# Step 4: Provide your corpus in lines, ideally pre-tokenized or raw text
files = ["formulas.txt"]


tokenizer.train(files, trainer)

# Step 6: Test the resulting tokenizer
encoded = tokenizer.encode("E = sin(pi*x) + theta_12")
print("Tokens:", encoded.tokens)




Tokens: ['E', '=', 'sin', '(', 'pi', '*', 'x', ')', '+', 'theta', '_', '1', '2']


In [15]:
tokenizer.save("my_tokenizer.json")

### Tokenizing the formulaes using the tokenizer


In [16]:
from tokenizers import Tokenizer

tokenizer=Tokenizer.from_file("my_tokenizer.json")

encoded_lines=[]
with open("formulas.txt",'r') as f:
    for line in f:
        formula=line.strip()
        encoded=tokenizer.encode(formula)
        encoded_lines.append(encoded)

for enc in encoded_lines:
    print("Tokens:", enc.tokens)
    print("Token IDs:", enc.ids)
    print()



Tokens: ['exp', '(', '-', 'theta', '**', '2', '/', '2', ')', '/', 'sqrt', '(', '2', '*', 'pi', ')']
Token IDs: [652, 1750, 648, 697, 651, 67, 650, 67, 1751, 650, 653, 1750, 67, 649, 654, 1751]

Tokens: ['exp', '(', '-', '(', 'theta', '/', 'sigma', ')', '**', '2', '/', '2', ')', '/', '(', 'sqrt', '(', '2', '*', 'pi', ')', '*', 'sigma', ')']
Token IDs: [652, 1750, 648, 1750, 697, 650, 711, 1751, 651, 67, 650, 67, 1751, 650, 1750, 653, 1750, 67, 649, 654, 1751, 649, 711, 1751]

Tokens: ['exp', '(', '-', '(', '(', 'theta', '-', 'theta1', ')', '/', 'sigma', ')', '**', '2', '/', '2', ')', '/', '(', 'sqrt', '(', '2', '*', 'pi', ')', '*', 'sigma', ')']
Token IDs: [652, 1750, 648, 1750, 1750, 697, 648, 669, 1751, 650, 711, 1751, 651, 67, 650, 67, 1751, 650, 1750, 653, 1750, 67, 649, 654, 1751, 649, 711, 1751]

Tokens: ['sqrt', '(', '(', 'x2', '-', 'x1', ')', '**', '2', '+', '(', 'y2', '-', 'y1', ')', '**', '2', ')']
Token IDs: [653, 1750, 1750, 688, 648, 714, 1751, 651, 67, 647, 1750, 664, 648,

In [17]:
# Now we will decode some tokens and compare them with the original formulas.
# If the decoded tokens and the original formulas are semantically similar, we can conclude that the tokenization process is effective.

In [18]:
df.shape

(100, 35)

In [19]:
import sympy
from sympy import sympify, symbols, sin, cos, tan, log, sqrt
import re
import json

def preprocess_expression(expr_str):
    """
    Preprocess the expression string:
      - Strips leading '=' signs.
      - Converts common uppercase math function names to lowercase.
    """
    expr_str = expr_str.strip()
    # Remove any leading "=" signs
    while expr_str.startswith("="):
        expr_str = expr_str[1:]
    # Replace common uppercase function names with lowercase ones
    expr_str = re.sub(r'\bCOS\b', 'cos', expr_str)
    expr_str = re.sub(r'\bSIN\b', 'sin', expr_str)
    expr_str = re.sub(r'\bTAN\b', 'tan', expr_str)
    expr_str = re.sub(r'\bLOG\b', 'log', expr_str)
    expr_str = re.sub(r'\bSQRT\b', 'sqrt', expr_str)
    return expr_str

def expr_to_rpn(expr):
    """
    Recursively converts a sympy expression into a list of tokens in Reverse Polish Notation (RPN).
    """
    # Atom (numbers, symbols) -> return its string representation
    if expr.is_Atom:
        return [str(expr)]
    
    # Function call (e.g., exp, sqrt, sin, cos, etc.)
    if expr.is_Function:
        tokens = []
        for arg in expr.args:
            tokens.extend(expr_to_rpn(arg))
        return tokens + [expr.func.__name__]
    
    # Handle exponentiation (power)
    if isinstance(expr, sympy.Pow):
        base, exponent = expr.as_base_exp()
        tokens = expr_to_rpn(base) + expr_to_rpn(exponent) + ["**"]
        return tokens
    
    # Handle addition (left-associative)
    if isinstance(expr, sympy.Add):
        args = expr.args
        tokens = expr_to_rpn(args[0])
        for arg in args[1:]:
            tokens.extend(expr_to_rpn(arg))
            tokens.append("+")
        return tokens
    
    # Handle multiplication (left-associative)
    if isinstance(expr, sympy.Mul):
        args = expr.args
        tokens = expr_to_rpn(args[0])
        for arg in args[1:]:
            tokens.extend(expr_to_rpn(arg))
            tokens.append("*")
        return tokens

    # Fallback: if the expression type is not explicitly handled, return its string.
    return [str(expr)]

def convert_expression_to_rpn(expression_str):
    """
    Converts an expression string to its Reverse Polish Notation (RPN) representation.
    Preprocesses the string and uses a local dictionary to override specific names.
    """
    # Preprocess expression to remove unwanted characters and adjust function names
    expression_str = preprocess_expression(expression_str)
    
    # Define a local dictionary to treat 'gamma' and 'beta' as symbols,
    # and ensure common math functions are in lowercase.
    local_dict = {
        'gamma': symbols('gamma'),
        'beta': symbols('beta'),
        'cos': cos,
        'sin': sin,
        'tan': tan,
        'log': log,
        'sqrt': sqrt,
    }
    
    try:
        # Parse the expression using sympy and the local dictionary
        expr = sympify(expression_str, locals=local_dict)
    except Exception as e:
        print("Error parsing expression:", e)
        return None
    # Convert the parsed expression to RPN
    return expr_to_rpn(expr)



if __name__ == "__main__":
    # List of formulas to convert.
    with open("formulas.txt", 'r') as file:
        formulas = [line.strip() for line in file.readlines()]
    
    for f in formulas:
        rpn = convert_expression_to_rpn(f)
        # Create a list to hold the JSON objects
        json_list = []

        # Iterate over the formulas and their corresponding filenames
        for formula, filename in zip(formulas, df['Filename']):
            rpn = convert_expression_to_rpn(formula)
            if rpn is not None:
                json_list.append({"id": filename, "rpn": rpn})

        # Write the JSON objects to a file
        with open("formulas_rpn.json", "w") as json_file:
            encoded_rpn_list = []
            for item in json_list:
                encoded_rpn = tokenizer.encode(" ".join(item["rpn"]))
                encoded_rpn_list.append({"id": item["id"], "rpn": encoded_rpn.ids})
            json.dump(encoded_rpn_list, json_file, indent=4)


In [20]:
encoded_rpn_list

[{'id': 'I.6.2a',
  'rpn': [5,
   67,
   5,
   651,
   649,
   654,
   648,
   5,
   651,
   649,
   648,
   5,
   697,
   67,
   651,
   649,
   652,
   649]},
 {'id': 'I.6.2',
  'rpn': [5,
   67,
   5,
   651,
   649,
   654,
   648,
   5,
   651,
   649,
   711,
   648,
   66,
   651,
   649,
   648,
   5,
   711,
   648,
   67,
   651,
   649,
   697,
   67,
   651,
   649,
   652,
   649]},
 {'id': 'I.6.2b',
  'rpn': [5,
   67,
   5,
   651,
   649,
   654,
   648,
   5,
   651,
   649,
   711,
   648,
   66,
   651,
   649,
   648,
   5,
   711,
   648,
   67,
   651,
   649,
   697,
   648,
   66,
   669,
   649,
   647,
   67,
   651,
   649,
   652,
   649]},
 {'id': 'I.8.14',
  'rpn': [688,
   648,
   66,
   714,
   649,
   647,
   67,
   651,
   664,
   648,
   66,
   674,
   649,
   647,
   67,
   651,
   647,
   5,
   651]},
 {'id': 'I.9.18',
  'rpn': [45,
   687,
   649,
   677,
   649,
   688,
   648,
   66,
   714,
   649,
   647,
   67,
   651,
   664,
   648,
   66,
 