# Using openpyxl for Direct Parsing of Formulas

Using `pd.read_csv()` and `pd.read_excel()` excludes some formulas. Specifically, 30 formulas are returned as NaN for CSV files and 3 formulas are returned as NaN for Excel files. Therefore, I have used `openpyxl` for direct parsing of the formulas.

CSV files are plain text and don't preserve cell metadata such as the actual formula text; they only store the computed values. When Excel exports to CSV, it writes out the result of the formula, not the formula itself. This is why we use libraries like `openpyxl` with the Excel format (XLSX) and set `data_only=False` to retrieve the underlying formula strings.

In [24]:

from openpyxl import load_workbook
wb = load_workbook('FeynmanEquations.xlsx', data_only=False)
ws = wb.active


In [25]:
# Iterate through rows (we start from row 2 because we skip header)
formula_col_index = 4 
formula_list=[]
# Iterate through rows (we start from row 2 because we skip header)
for row in ws.iter_rows(min_row=2):
    # get cell in formula column
    cell = row[formula_col_index - 1]  # zero-indexed
    formula_list.append(cell.value)  

## Plan for Tokenization

### Extract All Unique Variable Names
1. Read the `FeynmanEquations.xlsx` file.
2. Collect all unique variable names from the columns `v1_name`, `v2_name`, `v3_name`, etc.

### Extract All Unique Operators and Functions
1. Parse the formula column.
2. Extract mathematical operators (`+`, `-`, `*`, `/`, `**`) and functions (`sin`, `cos`, `exp`, etc.).

### Merge into a Single Set (Vocabulary)
1. Combine the extracted variable names and operators/functions into a single unique set.
2. This set forms our tokenization vocabulary.

### Tokenize Each Formula Using This Vocabulary
1. Convert each formula into a sequence of tokens using the extracted vocabulary.


In [26]:
## Extract all Uique variable names
import pandas as pd
import sympy
import re

df=pd.read_excel('FeynmanEquations.xlsx')
variable_columns=[col for col in df.columns if re.match(r'v\d+_name',col)]


# print(variable_columns)

unique_variables=set()
for col in variable_columns:
    unique_variables.update(df[col].dropna().astype(str).unique())

print(unique_variables)



{'omega_0', 'B', 'z', 'm_0', 'I1', 'y', 'q2', 'omega', 'I', 'By', 'lambd', 'Bx', 'n_rho', 'c', 'H', 'theta2', 'kb', 'w', 'd1', 'mu_drift', 'Y', 'G', 'r', 'n_0', 'g_', 'u', 'd', 'x1', 'p_d', 'y3', 'm2', 'z1', 'Bz', 'x3', 'rho', 'T2', 'n', 'T1', 'chi', 'mom', 'z2', 'beta', 'Ef', 'Nn', 'F', 'alpha', 'g', 'Pwr', 'E_n', 'A_vec', 'q', 'V1', 'C', 'A', 'theta1', 'I2', 'delta', 'I_0', 'pr', 'p', 'm1', 'rho_c_0', 'gamma', 'U', 't', 'a', 'Jz', 'y2', 'sigma_den', 'M', 'kappa', 'Volt', 'h', 'k', 'T', 'sigma', 'r1', 'k_spring', 'y1', 'v', 'q1', 'mob', 'r2', 'theta', 'm', 'x', 'Int_0', 'V2', 'V', 'epsilon', 'mu', 'x2', 'd2'}


In [27]:
###Checker
# print("sigma" in unique_variables )
# print("pi" in unique_variables )
# print("sigma" in unique_variables )
# print("omega" in unique_variables )
# print("h" in unique_variables )
print("ln" in unique_variables)
print("Int" in unique_variables)
print("delta" in unique_variables)




False
False
True


In [36]:
formula_list[:10]


['exp(-theta**2/2)/sqrt(2*pi)',
 'exp(-(theta/sigma)**2/2)/(sqrt(2*pi)*sigma)',
 'exp(-((theta-theta1)/sigma)**2/2)/(sqrt(2*pi)*sigma)',
 'sqrt((x2-x1)**2+(y2-y1)**2)',
 'G*m1*m2/((x2-x1)**2+(y2-y1)**2+(z2-z1)**2)',
 'm_0/sqrt(1-v**2/c**2)',
 'x1*y1+x2*y2+x3*y3',
 'mu*Nn',
 'q1*q2*r/(4*pi*epsilon*r**3)',
 'q1*r/(4*pi*epsilon*r**3)']

In [37]:
filename = "formulas.txt"

with open(filename, "w") as f:
    for formula in formula_list:
        f.write(formula + "\n")

print(f"Formulas saved to {filename}")

Formulas saved to formulas.txt


In [28]:
### All mathematical operators and synbols expected 
formula_list

mathematical_symbols=['+','-','*','/','**','exp','sqrt','pi','sin','cos','ln','Int','tanh','log','arcsin','arctan','arccos']



In [29]:
mathematical_symbols = ['+', '-', '*', '/', '**', 'exp', 'sqrt', 'pi', 'sin', 'cos', 'ln', 'Int', 'tanh', 'log', 'arcsin', 'arctan', 'arccos']

TOKEN_DICT = {
    "<PAD>": 0,
    "<UNK>": 1,
}

for index, symbol in enumerate(mathematical_symbols):
    TOKEN_DICT[symbol] = index + 2  # Start index from 2

print(TOKEN_DICT)
current_index=max(TOKEN_DICT.values())+1



for var in unique_variables:
    TOKEN_DICT[var]=current_index
    current_index+=1
    

    

{'<PAD>': 0, '<UNK>': 1, '+': 2, '-': 3, '*': 4, '/': 5, '**': 6, 'exp': 7, 'sqrt': 8, 'pi': 9, 'sin': 10, 'cos': 11, 'ln': 12, 'Int': 13, 'tanh': 14, 'log': 15, 'arcsin': 16, 'arctan': 17, 'arccos': 18}


In [30]:
## Additionally we will need a-z, A-Z and 0-9 as tokens too
## Index Variables A_10 to Z_10, A1 to Z10 

#a-z 
for i, letter in enumerate("abcdefghijklmnopqrstuvwxyz", start=len(TOKEN_DICT)):
    TOKEN_DICT[letter] = i
    
#A-Z
for i, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ", start=len(TOKEN_DICT)):
    TOKEN_DICT[letter] = i
    
# 3. Add numbers 0-9
for i in range(10):
    TOKEN_DICT[str(i)] = len(TOKEN_DICT)
    
## Indexed variables (A_0 to Z_10 and A1 to Z10)
for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
    for num in range(11):  # 0-10
        TOKEN_DICT[f"{letter}_{num}"] = len(TOKEN_DICT)
        TOKEN_DICT[f"{letter}{num}"] = len(TOKEN_DICT)

In [31]:
TOKEN_DICT


{'<PAD>': 0,
 '<UNK>': 1,
 '+': 2,
 '-': 3,
 '*': 4,
 '/': 5,
 '**': 6,
 'exp': 7,
 'sqrt': 8,
 'pi': 9,
 'sin': 10,
 'cos': 11,
 'ln': 12,
 'Int': 13,
 'tanh': 14,
 'log': 15,
 'arcsin': 16,
 'arctan': 17,
 'arccos': 18,
 'omega_0': 19,
 'B': 121,
 'z': 137,
 'm_0': 22,
 'I1': 322,
 'y': 136,
 'q2': 25,
 'omega': 26,
 'I': 128,
 'By': 28,
 'lambd': 29,
 'Bx': 30,
 'n_rho': 31,
 'c': 114,
 'H': 127,
 'theta2': 34,
 'kb': 35,
 'w': 134,
 'd1': 37,
 'mu_drift': 38,
 'Y': 144,
 'G': 126,
 'r': 129,
 'n_0': 42,
 'g_': 43,
 'u': 132,
 'd': 115,
 'x1': 46,
 'p_d': 47,
 'y3': 48,
 'm2': 49,
 'z1': 50,
 'Bz': 51,
 'x3': 52,
 'rho': 53,
 'T2': 563,
 'n': 125,
 'T1': 562,
 'chi': 57,
 'mom': 58,
 'z2': 59,
 'beta': 60,
 'Ef': 61,
 'Nn': 62,
 'F': 125,
 'alpha': 64,
 'g': 118,
 'Pwr': 66,
 'E_n': 67,
 'A_vec': 68,
 'q': 128,
 'V1': 604,
 'C': 122,
 'A': 120,
 'theta1': 73,
 'I2': 323,
 'delta': 75,
 'I_0': 320,
 'pr': 77,
 'p': 127,
 'm1': 79,
 'rho_c_0': 80,
 'gamma': 81,
 'U': 140,
 't': 131,
 

In [32]:
## Now at the last, just go through the equations to add any left tokens
def extract_equation_tokens(equation: str):
    pattern = r"(?:\*\*|[+\-*/=()^]|[A-Za-z_]+|\d+(?:\.\d+)?)"
    tokens = re.findall(pattern, equation)
    return tokens

new_tokens=set()
for eq in formula_list:
    tokens_in_eq=extract_equation_tokens(eq)
    new_tokens.update(tokens_in_eq)
    
current_index=max(TOKEN_DICT.values()) + 1

for token in new_tokens:
    if token not in TOKEN_DICT:
        TOKEN_DICT[token]=current_index
        current_index+=1



In [35]:
with open("my_data.json", "w") as f:
    json.dump(TOKEN_DICT, f, indent=4)

In [39]:
##  Use TOKEN_DICT as a “Seed” for a Subword Tokenize
from tokenizers import Tokenizer, trainers, models, pre_tokenizers


# Step 1: Convert your existing TOKEN_DICT to special tokens
special_tokens = list(TOKEN_DICT.keys())  # e.g. ["<PAD>", "<UNK>", "+", "-", "*", ...]


# Step 2: Initialize a subword tokenizer (e.g. BPE)
tokenizer = Tokenizer(models.BPE(unk_token="<UNK>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Step 3: Prepare trainer
trainer = trainers.BpeTrainer(
    vocab_size=2000,  # or however large
    special_tokens=special_tokens
)



# Step 4: Provide your corpus in lines, ideally pre-tokenized or raw text
files = ["formulas.txt"]


tokenizer.train(files, trainer)

# Step 6: Test the resulting tokenizer
encoded = tokenizer.encode("E = sin(pi*x) + theta_12")
print("Tokens:", encoded.tokens)

Tokens: ['E', '=', 'sin', '(', 'pi', '*', 'x', ')', '+', 'theta', '_', '1', '2']


In [40]:
tokenizer.save("my_tokenizer.json")
