In [4]:
from dataflow.core.lispress import lispress_to_program, program_to_lispress, parse_lispress, render_compact
from dataflow.core.linearize import seq_to_lispress, lispress_to_seq
from dataflow.core.program import ValueOp, Expression, BuildStructOp
import pdb 
import json 
from tqdm import tqdm

# Keyed by the op schema of a *parent* BuildStructOp; the value is the
# placeholder name given to that parent's BuildStructOp children in anonymize().
EXCEPTIONS = dict(
    NextDOW="DAY_OF_WEEK",
    FullMonthofMonth="MONTH",
)

MONTHS = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Keyed by the op schema of the BuildStructOp itself: every month name is
# collapsed onto the single "MONTH" placeholder.
SECONDARY_EXCEPTIONS = dict.fromkeys(MONTHS, "MONTH")


# Placeholder written into the "underlying" field of a literal, keyed by the
# literal's "schema" field.
_VALUE_PLACEHOLDERS = {
    "String": "String",
    "Long": "Long",
    "Boolean": "Bool",
    "Number": "Number",
}


def anonymize(lispress_str: str) -> str:
    """Return ``lispress_str`` with literals and selected struct names anonymized.

    The string is parsed into a program, rewritten expression by expression,
    and rendered back to compact Lispress:

    * ``ValueOp``: the JSON payload's ``"underlying"`` field is overwritten
      with a placeholder chosen by its ``"schema"`` (String, Long, Boolean,
      Number).
    * ``BuildStructOp``: the op schema is replaced by ``Value("<NAME>")`` when
      one of the rules below matches. Later rules override earlier ones:

      1. the parent expression is a ``BuildStructOp`` whose schema is a key of
         ``EXCEPTIONS`` -> the mapped name;
      2. the op's own schema is a key of ``SECONDARY_EXCEPTIONS`` -> the
         mapped name;
      3. the op's own schema starts with ``"Holiday."`` -> ``HOLIDAY``.

    Args:
        lispress_str: A single Lispress program as text.

    Returns:
        The anonymized program rendered with ``render_compact``.

    Raises:
        ValueError: If a ``ValueOp`` carries a schema that has no placeholder.
            (This used to drop into ``pdb``, which stalls unattended runs and
            lets the literal through un-anonymized once resumed.)
    """
    lispress = parse_lispress(lispress_str)
    program, __ = lispress_to_program(lispress, 0)

    # Both lookups are snapshots taken before any rewriting, so parent checks
    # below always see the parent's original op.
    parent_lookup = {arg: expr.id for expr in program.expressions for arg in expr.arg_ids}
    expression_lookup = {expr.id: expr for expr in program.expressions}

    for i, expr in enumerate(program.expressions):
        op = expr.op

        if isinstance(op, ValueOp):
            value = json.loads(op.value)
            schema = value["schema"]
            try:
                placeholder = _VALUE_PLACEHOLDERS[schema]
            except (KeyError, TypeError):
                raise ValueError(
                    f"Cannot anonymize value with unsupported schema {schema!r}: {op.value}"
                ) from None
            value["underlying"] = placeholder
            new_op = ValueOp(json.dumps(value))
            program.expressions[i] = Expression(
                expr.id, new_op, expr.type_args, expr.type, expr.arg_ids
            )
            continue

        # Only struct-building ops are candidates for renaming.
        if not isinstance(op, BuildStructOp):
            continue

        # An expression that is nobody's argument has no parent.
        parent_expression = expression_lookup.get(parent_lookup.get(expr.id))

        new_name = None
        # Days / some months: decided by the parent's schema.
        if (
            parent_expression is not None
            and isinstance(parent_expression.op, BuildStructOp)
            and parent_expression.op.op_schema in EXCEPTIONS
        ):
            new_name = EXCEPTIONS[parent_expression.op.op_schema]
        # Months: decided by the op's own schema.
        if op.op_schema in SECONDARY_EXCEPTIONS:
            new_name = SECONDARY_EXCEPTIONS[op.op_schema]
        # Holidays.
        if op.op_schema.startswith("Holiday."):
            new_name = "HOLIDAY"

        if new_name is not None:
            new_op = BuildStructOp(
                f'Value("{new_name}")', op.op_fields, op.empty_base, push_go=op.push_go
            )
            program.expressions[i] = Expression(
                expr.id, new_op, expr.type_args, expr.type, expr.arg_ids
            )

    lispress = program_to_lispress(program)
    lispress_str = render_compact(lispress)
    return lispress_str

# Load the raw target files; each line is later stripped and passed to anonymize().
print("reading...")
with open("/srv/local1/estengel/resources/data/smcalflow.agent.data/train.tgt") as train_f, \
    open("/srv/local1/estengel/resources/data/smcalflow.agent.data/valid.tgt") as dev_f:
    train_lines = list(train_f)
    dev_lines = list(dev_f)

# Anonymize every program, showing a progress bar per split.
print("converting train...")
new_train_lines = [anonymize(raw.strip()) for raw in tqdm(train_lines)]
print("converting dev...")
new_dev_lines = [anonymize(raw.strip()) for raw in tqdm(dev_lines)]

# Write the anonymized programs next to the originals, one per line.
print("writing...")
with open("/srv/local1/estengel/resources/data/smcalflow.agent.data/train_anon.tgt", "w") as train_f, \
    open("/srv/local1/estengel/resources/data/smcalflow.agent.data/valid_anon.tgt", "w") as dev_f:
    for tl in new_train_lines:
        print(tl, file=train_f)
    for dl in new_dev_lines:
        print(dl, file=dev_f)




reading...
converting train...


100%|██████████| 121024/121024 [00:56<00:00, 2145.11it/s]


converting dev...


100%|██████████| 13496/13496 [00:06<00:00, 2159.38it/s]


writing...


In [None]:
# Sanity check: display the first anonymized training program as the cell result.
new_train_lines[0]