In [None]:
<a target="_blank" href="https://colab.research.google.com/github/felixp8/text-to-nn/blob/main/experiments/mlp/data_generation/notebooks/symbmat_embedding.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import os
from getpass import getpass

# GitHub personal access token consumed by the `git clone` command in the next cell.
# Take it from the GIT_PAT environment variable when that is set; otherwise prompt
# for it without echoing. Either way the secret is never typed into a cell, so it
# cannot be saved or committed together with the notebook.
GIT_PAT = os.environ.get("GIT_PAT") or getpass("GitHub personal access token: ")

In [None]:
# Clone the text-to-nn repository over HTTPS as user felixp8, with the value of
# GIT_PAT embedded in the URL ($GIT_PAT is presumably expanded by IPython from the
# notebook namespace before the shell runs the command — confirm).
# NOTE(review): a token embedded in the clone URL is likely kept in the clone's
# remote configuration; verify before sharing the checkout.
!git clone https://felixp8:$GIT_PAT@github.com/felixp8/text-to-nn.git
# Clone facebookresearch/SymbolicMathematics; a later cell changes into this
# checkout before importing from its `src` package.
!git clone https://github.com/facebookresearch/SymbolicMathematics.git

In [None]:
import os

# Switch into the data-generation folder of the cloned text-to-nn repository;
# the relative path defined below is resolved against this directory.
data_generation_dir = "./text-to-nn/experiments/mlp/data_generation"
os.chdir(data_generation_dir)

# CSV file holding the expressions to embed (read in the next cell).
expression_file = "./data/normal/expressions.csv"

In [None]:
import numpy as np
import sympy as sp
import pandas as pd
import h5py

from expr_utils import *

# Load the expression table; the following cell reads its 'expr' column.
expr_csv = pd.read_csv(filepath_or_buffer=expression_file)

In [None]:
# Entries of the 'expr' column of the loaded CSV.
raw_exprs = expr_csv['expr']

# Stage 1: map_inputs, called with 3 and the names x, y, z for every entry.
mapped_exprs = [map_inputs(item, 3, ['x', 'y', 'z']) for item in raw_exprs]

# Stage 2: clean_expr on every mapped entry.
cleaned_exprs = [clean_expr(item) for item in mapped_exprs]

# Stage 3: run validate_expr over every cleaned entry; its return values are
# not kept, so it only matters for whatever it raises or reports.
for item in cleaned_exprs:
    validate_expr(item)

# Stage 4: turn the cleaned entries into sympy objects.
expr_list = [sp.core.sympify(item) for item in cleaned_exprs]

In [None]:
import torch

# Leave text-to-nn/experiments/mlp/data_generation (entered in an earlier cell):
# go up four levels and into the sibling SymbolicMathematics checkout.
# NOTE(review): presumably this is what lets the `src` imports below resolve —
# confirm how sys.path is set up in the target runtime.
# From here on, every relative path in the notebook is resolved against
# SymbolicMathematics/, not against the data_generation folder.
os.chdir('../../../../SymbolicMathematics/')

# Imported only after the chdir above; keep this order.
from src.utils import AttrDict
from src.envs import build_env
from src.model import build_modules

from src.utils import to_cuda
from src.envs.sympy_utils import simplify

In [None]:
!wget https://dl.fbaipublicfiles.com/SymbolicMathematics/models/fwd_bwd.pth

In [None]:
# Location of the checkpoint fetched by the download cell above.
model_path = './fwd_bwd.pth'

# Stop right here if the file is missing, before the path is handed to the
# model parameters as 'reload_model'.
checkpoint_found = os.path.isfile(model_path)
assert checkpoint_found

In [None]:
# Settings passed to build_env: environment name, number formatting, expression
# size limits, the task name and the operator list.
environment_settings = {
    'env_name': 'char_sp',
    'int_base': 10,
    'balanced': False,
    'positive': True,
    'precision': 10,
    'n_variables': 1,
    'n_coefficients': 0,
    'leaf_probs': '0.75,0,0.25,0',
    'max_len': 512,
    'max_int': 5,
    'max_ops': 15,
    'max_ops_G': 15,
    'clean_prefix_expr': True,
    'rewrite_functions': '',
    'tasks': 'prim_fwd',
    'operators': 'add:10,sub:3,mul:10,div:5,sqrt:4,pow2:4,pow3:2,pow4:1,pow5:1,ln:4,exp:4,sin:4,cos:4,tan:4,asin:1,acos:1,atan:1,sinh:1,cosh:1,tanh:1,asinh:1,acosh:1,atanh:1',
}

# Model settings; 'reload_model' points at the checkpoint file checked above.
model_settings = {
    'cpu': False,
    'emb_dim': 1024,
    'n_enc_layers': 6,
    'n_dec_layers': 6,
    'n_heads': 8,
    'dropout': 0,
    'attention_dropout': 0,
    'sinusoidal_embeddings': False,
    'share_inout_emb': True,
    'reload_model': model_path,
}

# Single parameter object, environment keys first, then model keys.
params = AttrDict({**environment_settings, **model_settings})

In [None]:
# Build the environment described by `params`.
env = build_env(params)

# Keep the entry stored under 'x' in the environment's local dictionary.
local_symbols = env.local_dict
x = local_symbols['x']

In [None]:
# Build the model components for this environment and parameter set, then pull
# the encoder and decoder out of the returned mapping.
modules = build_modules(env, params)
encoder, decoder = modules['encoder'], modules['decoder']

In [None]:
# Convert every sympy expression with env.sympy_to_prefix; each result is later
# iterated word by word and looked up in env.word2id.
prefix_list = list(map(env.sympy_to_prefix, expr_list))

In [None]:
# One encoder output per expression, in the order of prefix_list.
embeddings = []

for prefix in prefix_list:
    # Vocabulary ids of the prefix words, with the end-of-sequence id placed
    # both before and after them.
    token_ids = [env.eos_index, *(env.word2id[w] for w in prefix), env.eos_index]

    # Column tensor of shape (n_tokens, 1) plus a one-element tensor holding
    # the number of tokens.
    x1 = torch.LongTensor(token_ids).view(-1, 1)
    len1 = torch.LongTensor([x1.size(0)])
    x1, len1 = to_cuda(x1, len1)

    # Inference only: no gradient tracking for the forward pass.
    with torch.no_grad():
        encoded = encoder('fwd', x=x1, lengths=len1, causal=False)

    # Swap the first two axes of the encoder output before storing it.
    encoded = encoded.transpose(0, 1)
    embeddings.append(encoded)

In [None]:
# How each per-expression encoder output is collapsed to a single vector:
#   "mean"  - average over the first two axes
#   "last"  - vector at the last position of the first item
#   "first" - vector at the first position of the first item
pooling_mode = "mean"
# When True, every pooled vector is rescaled to unit L2 norm.
normalize_embeddings = False

# NOTE: this cell replaces the list in `embeddings` with one stacked tensor, so
# the encoding cell must be re-run before this cell is run a second time.
if pooling_mode == "mean":
    pooled = [emb.mean(dim=(0, 1)) for emb in embeddings]
elif pooling_mode == "last":
    pooled = [emb[0, -1, :] for emb in embeddings]
elif pooling_mode == "first":
    pooled = [emb[0, 0, :] for emb in embeddings]
else:
    # Fail loudly instead of stacking un-pooled tensors of differing lengths.
    raise ValueError(
        "Unknown pooling_mode {!r}; expected 'mean', 'last' or 'first'".format(pooling_mode)
    )

# One row per expression.
embeddings = torch.stack(pooled, dim=0)

if normalize_embeddings:
    # L2-normalise each row. (Dividing by the normalised tensor, as the old
    # commented-out line did, would not produce unit-norm rows.)
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

In [None]:
# The working directory is the SymbolicMathematics checkout at this point (it was
# changed there before importing `src`), so a bare './data/normal/...' would not
# refer to the text-to-nn data folder the expressions were read from. Address that
# folder explicitly, relative to the current directory.
output_path = '../text-to-nn/experiments/mlp/data_generation/data/normal/symbmat_embeddings.h5'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Store the embedding matrix as a NumPy array in a dataset named 'embeddings'.
with h5py.File(output_path, 'w') as h5f:
    h5f.create_dataset('embeddings', data=embeddings.detach().cpu().numpy())