In [None]:
from utils.second_looks_utils import *
from utils.train_utils import *
import spacy
import json
seed = 42

## Our reformatting functions
- Redundant token removal
- Example concatenation for longer training sequences
- Preposing
- Preposing + Interjection
- ReCOGS
- Variable-free

### Redundant Token Removal

In [None]:
for removing_set in [
    ['x', '_'],
    ['x', '_', '(', ')'],
    ['x', '_', '(', ',', ')']
]:
    def token_removal(text, phi):
        """Return `phi` without the whitespace-separated tokens listed in `removing_set`.

        `text` is accepted only so the function can be applied to
        (sentence, LF) rows; it is not used.
        """
        kept = [tok for tok in phi.split() if tok not in removing_set]
        return " ".join(kept).strip()

    # Reload the untouched COGS splits for every removal set.
    cogs_columns = ['sentence', 'LF', 'type']
    train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=cogs_columns)
    dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=cogs_columns)
    test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=cogs_columns)
    gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=cogs_columns)

    # Rewrite the LF column of every split in place.
    for split_df in (train_df, dev_df, test_df, gen_df):
        split_df['LF'] = split_df[['sentence', 'LF']].apply(lambda row: token_removal(*row), axis=1)

    # The file suffix spells out the removed tokens, e.g. "remove_x_()".
    dataset_postfix = "remove_" + "".join(removing_set)
    for split_name, split_df in (("train", train_df), ("dev", dev_df), ("test", test_df), ("gen", gen_df)):
        split_df.to_csv(f'./cogs_token_removal/{split_name}_{dataset_postfix}.tsv', sep='\t', index=False, header=False)

### Example Concatenations

In [None]:
def reindex(LFs, initial_indexes):
    """Merge several logical forms into one, shifting their token indexes.

    Every numeric token of ``LFs[i]`` is increased by ``initial_indexes[i]``
    (an offset of 0 leaves the LF untouched). Each LF is then split on
    `` ; ``: pieces containing ``*`` are collected into the prefix, all other
    pieces into the body, and the bodies of consecutive LFs are connected
    with ``AND``.

    Args:
        LFs: list of logical-form strings with space-separated tokens.
        initial_indexes: one integer offset per entry of ``LFs``.

    Returns:
        ``"<prefix terms joined by ' ; '> ; <body>"``. When there are no
        prefix terms (or no body) the separator is omitted, so the result
        never starts or ends with a dangling `` ; ``.
    """
    prefix_terms = []
    body_terms = []
    for i, lf in enumerate(LFs):
        offset = initial_indexes[i]
        if offset != 0:
            # Shift every numeric token; all other tokens pass through as-is.
            lf = " ".join(
                str(int(tok) + offset) if tok.isnumeric() else tok
                for tok in lf.split()
            )

        for piece in lf.split(" ; "):
            if "*" in piece:
                prefix_terms.append(piece)
            else:
                body_terms.append(piece)
        # Connector between this LF's body and the next one.
        body_terms.append("AND")

    # Drop the connector that trails the last LF.
    body_terms = body_terms[:-1]

    prefix = " ; ".join(prefix_terms)
    body = " ".join(body_terms)
    # Only insert the separator when both halves exist.
    return " ; ".join(part for part in (prefix, body) if part)

In [None]:
append_ks = [256, 512, 1024, 2048, 3072]
for append_k in append_ks:
    cogs_columns = ['sentence', 'LF', 'type']
    train_df_org = pd.read_csv("./cogs/train.tsv", sep="\t", names=cogs_columns)
    # Primitive rows are excluded from the pool of sentences to concatenate,
    # but stay in the written training set via train_df_org.
    train_df = train_df_org[train_df_org["type"] != "primitive"]
    dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=cogs_columns)
    test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=cogs_columns)
    gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=cogs_columns)
    dataset_postfix = f"k_{append_k}"

    # Sorted by sentence length in characters; triples are taken from the end.
    sorted_train_df = train_df.sort_values(by="sentence", key=lambda x: x.str.len())
    start_indexes = [i*6 for i in range(append_k)]
    append_data = []
    for start_index in start_indexes:
        first = sorted_train_df.iloc[-1-start_index]
        second = sorted_train_df.iloc[-2-start_index]
        third = sorted_train_df.iloc[-3-start_index]

        # A leading "The"/"A" of the 2nd and 3rd sentence is lower-cased
        # because it no longer starts the concatenated sentence.
        second_initial = second.sentence[0]
        if second.sentence.split()[0] in {'The', 'A'}:
            second_initial = second_initial.lower()
        third_initial = third.sentence[0]
        if third.sentence.split()[0] in {'The', 'A'}:
            third_initial = third_initial.lower()

        # The final character is cut from the first two sentences before joining.
        joined_sentence = (
            first.sentence[:-1]
            + second_initial + second.sentence[1:-1]
            + third_initial + third.sentence[1:]
        )

        # Index offsets: number of words preceding each sentence in the joined text.
        first_len = len(first.sentence[:-1].strip().split())
        second_len = len(second.sentence[:-1].strip().split())
        joined_lf = reindex(
            [first.LF, second.LF, third.LF],
            [0, first_len, first_len + second_len],
        )
        append_data.append([joined_sentence, joined_lf, 'concat'])

    append_df = pd.DataFrame(append_data, columns=cogs_columns)
    train_df = pd.concat([train_df_org, append_df])
    for split_name, split_df in (("train", train_df), ("dev", dev_df), ("test", test_df), ("gen", gen_df)):
        split_df.to_csv(f'./cogs_concat/{split_name}_{dataset_postfix}.tsv', sep='\t', index=False, header=False)

    # Report the longest sentence / LF (in whitespace tokens) of the new training set.
    max_s = max(train_df['sentence'].str.split().apply(len))
    max_lf = max(train_df['LF'].str.split().apply(len))
    print(max_s, max_lf)

### Preposing

In [None]:
# Re-seed before the random preposing decisions made by translate() below.
# set_seed comes from the star-imported utils; presumably it seeds `random` — TODO confirm.
set_seed(seed)
def translate_regular(text, phi, _type):
    """Normalize a logical form without altering the sentence or its type.

    Single-token LFs and LFs containing ``LAMBDA`` are returned unchanged.
    Otherwise the LF is split on ``AND``/``;``, each conjunct is parsed with
    the first matching pattern (np, then pred, then mod), and the terms are
    re-emitted as ``<definite terms> ; <remaining terms joined by AND>``.
    Definite terms are de-duplicated and ordered by their index token.

    Returns:
        ``(text, normalized_lf, _type)``.

    Raises:
        Exception: if a conjunct matches none of the three patterns.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # Parse each conjunct into a dict describing it.
    parsed = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            parsed.append(parse_np(conj))
        elif pred_re.search(conj):
            parsed.append(parse_pred(conj))
        elif mod_re.search(conj):
            parsed.append(parse_mod(conj))
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")

    # Definite NPs go to the ';'-separated prefix; everything else keeps its
    # original relative order in the AND-joined body.
    definite = []
    others = []
    for d in parsed:
        kind = d['type']
        if kind == 'np':
            if d['definiteness'] == '*':
                definite.append(f"* {d['pred']} ( {d['entvar']} )")
            else:
                others.append(f"{d['pred']} ( {d['entvar']} )")
        elif kind == 'role':
            others.append(f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )")
        elif kind == 'mod':
            assert "x _" in d['e1']
            others.append(f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )")

    # De-duplicate, then order by the index token (second-to-last token).
    definite = sorted(set(definite), key=lambda term: int(term.split()[-2]))

    # Join the two halves, skipping whichever one is empty.
    halves = (" ; ".join(definite), " AND ".join(others))
    terms = " ; ".join(half for half in halves if half != "")

    return text, terms, _type
        
def translate(text, phi, _type):
    """Normalize a logical form and, with probability ``proposing_prob``, prepose a phrase.

    The first half performs the same normalization as ``translate_regular``:
    the LF is split into conjuncts, each conjunct is parsed, and the terms are
    re-emitted as ``<definite terms> ; <other terms joined by AND>``.

    If the normalized LF contains an ``nmod`` token and ``random.random()`` is
    below the module-level ``proposing_prob``, the span of the sentence around
    the first occurrence of the preposition is moved to the front of the
    sentence and every numeric index in the LF is remapped to the new token
    positions.

    Args:
        text: Input sentence; split on whitespace to locate tokens.
        phi: Logical form string for ``text``.
        _type: Example type label, returned unchanged unless preposing happens.

    Returns:
        ``(sentence, logical_form, type)``. Single-token and ``LAMBDA`` LFs are
        returned untouched. When preposing is applied the type becomes
        ``"preposing_1"`` or ``"preposing_2"`` (one or two ``nmod`` tokens).

    Raises:
        Exception: if a conjunct matches none of ``np_re``/``pred_re``/``mod_re``.
        AssertionError: if the LF has more than two ``nmod`` tokens, or a
            ``mod`` conjunct's ``e1`` does not contain ``"x _"``.
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse: one dict per conjunct, using the first pattern that matches
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect: definite NPs go to def_terms, everything else to role_terms
    def_terms = []
    role_terms = []
    nmod_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms: de-duplicate, then order by the second-to-last token (the index)
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    rest_terms = role_terms
    
    # combine: "<def terms> ; <rest terms>", omitting whichever half is empty
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # Only LFs with an nmod term are candidates for preposing.
    if "nmod" not in terms.split():
        return text, terms, _type
    
    # Prepose only a `proposing_prob` fraction of the candidates
    # (proposing_prob is a module-level variable set before this is applied).
    if random.random() >= proposing_prob:
        return text, terms, _type
    
    # upper_bound = how many tokens after the preposition's position belong to the moved span.
    if terms.split().count("nmod") == 2:
        upper_bound = 6
        return_type = "preposing_2"
    elif terms.split().count("nmod") == 1:
        upper_bound = 3
        return_type = "preposing_1"
    else:
        assert False
        
    # Token two positions after the first "nmod": in the "<nppred> . nmod . <pred>" terms built
    # above this is the mod conjunct's pred (presumably the preposition word — confirm against parse_mod).
    nmod = terms.split()[terms.split().index("nmod")+2]
    # Moved span: from two tokens before the first occurrence of that word in the sentence
    # up to (not including) upper_bound tokens after it.
    pre_phrase = text.split()[text.split().index(nmod)-2 : text.split().index(nmod)+upper_bound]
    pre_phrase[0] = pre_phrase[0].capitalize()
    # Words that preceded the moved span; a leading "The"/"A" is lower-cased since it no longer starts the sentence.
    # NOTE(review): pre_text[0] raises IndexError if the moved span starts at token 0.
    pre_text = text.split()[:text.split().index(nmod)-2]
    if pre_text[0] in ["The", "A"]:
        pre_text[0] = pre_text[0].lower()
    post_text = text.split()[text.split().index(nmod)+upper_bound:]
    pre_text = pre_phrase + pre_text + post_text
    pre_text = " ".join(pre_text)

    # index_map: old token position (as string) -> new token position (as string).
    index_map = {}
    idx = 0
    # The moved span now occupies positions 0 .. upper_bound+1.
    for i in range(text.split().index(nmod)-2, text.split().index(nmod)+upper_bound):
        index_map[f"{i}"] = f"{idx}"
        idx += 1
    # Words that used to precede the span are shifted right by the span length (2+upper_bound).
    for i in range(text.split().index(nmod)-2):
        ii = 2+upper_bound+i
        index_map[f"{i}"] = f"{ii}"
        idx += 1
    # Words after the span: idx now equals index(nmod)+upper_bound, so ii works out to i (unchanged position).
    for i in range(text.split().index(nmod)+upper_bound, len(text.split())):
        ii = idx+(i-(text.split().index(nmod)+upper_bound))
        index_map[f"{i}"] = f"{ii}"  
            
    # now handle LF: rewrite every numeric token through index_map
    pre_terms = []
    for t in terms.split():
        if t.isnumeric():
            pre_terms += [str(index_map[str(int(t))])]
        else:
            pre_terms += [t]
    pre_terms = " ".join(pre_terms)

    # Everything before the last " ; " is a definite term; re-sort those by their new index.
    pre_terms_def = pre_terms.split(" ; ")[:-1]
    pre_terms_def.sort(key = lambda x: int(x.split()[-2]))  
    pre_terms_role = pre_terms.split(" ; ")[-1].split(" AND ")
    # Sort key for the AND-joined terms, read off token positions counted from the end:
    #   - token [-5] is "(" (a one-argument term, presumably "noun ( x _ i )"): key (i, -1), i = token [-2];
    #   - else if token [-2] is numeric (two indexed arguments): key (token [-6], token [-2]);
    #   - else (second argument is not an index, e.g. a name): key (token [-4], position of that word in pre_text).
    pre_terms_role.sort(key = lambda x: (int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else pre_text.split().index(x.split()[-2])))  
    pre_terms_role = " AND ".join(pre_terms_role)
    pre_terms = " ; ".join(pre_terms_def + [pre_terms_role])

    return pre_text, pre_terms, return_type

# Load the four original COGS splits (tab-separated, no header row).
cogs_columns = ['sentence', 'LF', 'type']
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=cogs_columns)
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=cogs_columns)
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=cogs_columns)
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=cogs_columns)

# Read by translate(): an eligible example is preposed when random.random() < proposing_prob.
proposing_prob = 0.05
# Training split: normalization plus random preposing.
train_df[cogs_columns] = train_df[cogs_columns].apply(lambda row: translate(*row), axis=1, result_type='expand')
# Evaluation splits: normalization only.
for eval_df in (dev_df, test_df, gen_df):
    eval_df[cogs_columns] = eval_df[cogs_columns].apply(lambda row: translate_regular(*row), axis=1, result_type='expand')

In [None]:
# Write every split as a header-less TSV under ./cogs_preposing/.
dataset_postfix = "preposing"
for split_name, split_df in (("train", train_df), ("dev", dev_df), ("test", test_df), ("gen", gen_df)):
    split_df.to_csv(f'./cogs_{dataset_postfix}/{split_name}.tsv', sep='\t', index=False, header=False)

### Preposing + Sprinkles (Interjection)

In [None]:
# Re-seed before the random preposing / interjection decisions made below.
# set_seed comes from the star-imported utils; presumably it seeds `random` — TODO confirm.
set_seed(seed)
def translate_regular(text, phi, _type):
    """Normalize a logical form without altering the sentence or its type.

    Single-token LFs and LFs containing ``LAMBDA`` are returned unchanged.
    Otherwise the LF is split on ``AND``/``;``, each conjunct is parsed with
    the first matching pattern (np, then pred, then mod), and the terms are
    re-emitted as ``<definite terms> ; <remaining terms joined by AND>``.
    Definite terms are de-duplicated and ordered by their index token.

    Returns:
        ``(text, normalized_lf, _type)``.

    Raises:
        Exception: if a conjunct matches none of the three patterns.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # Parse each conjunct into a dict describing it.
    parsed = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            parsed.append(parse_np(conj))
        elif pred_re.search(conj):
            parsed.append(parse_pred(conj))
        elif mod_re.search(conj):
            parsed.append(parse_mod(conj))
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")

    # Definite NPs go to the ';'-separated prefix; everything else keeps its
    # original relative order in the AND-joined body.
    definite = []
    others = []
    for d in parsed:
        kind = d['type']
        if kind == 'np':
            if d['definiteness'] == '*':
                definite.append(f"* {d['pred']} ( {d['entvar']} )")
            else:
                others.append(f"{d['pred']} ( {d['entvar']} )")
        elif kind == 'role':
            others.append(f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )")
        elif kind == 'mod':
            assert "x _" in d['e1']
            others.append(f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )")

    # De-duplicate, then order by the index token (second-to-last token).
    definite = sorted(set(definite), key=lambda term: int(term.split()[-2]))

    # Join the two halves, skipping whichever one is empty.
    halves = (" ; ".join(definite), " AND ".join(others))
    terms = " ; ".join(half for half in halves if half != "")

    return text, terms, _type
        
def translate(text, phi, _type):
    """Normalize a logical form and, with probability ``proposing_prob``, prepose a phrase.

    The first half performs the same normalization as ``translate_regular``:
    the LF is split into conjuncts, each conjunct is parsed, and the terms are
    re-emitted as ``<definite terms> ; <other terms joined by AND>``.

    If the normalized LF contains an ``nmod`` token and ``random.random()`` is
    below the module-level ``proposing_prob``, the span of the sentence around
    the first occurrence of the preposition is moved to the front of the
    sentence and every numeric index in the LF is remapped to the new token
    positions.

    Args:
        text: Input sentence; split on whitespace to locate tokens.
        phi: Logical form string for ``text``.
        _type: Example type label, returned unchanged unless preposing happens.

    Returns:
        ``(sentence, logical_form, type)``. Single-token and ``LAMBDA`` LFs are
        returned untouched. When preposing is applied the type becomes
        ``"preposing_1"`` or ``"preposing_2"`` (one or two ``nmod`` tokens).

    Raises:
        Exception: if a conjunct matches none of ``np_re``/``pred_re``/``mod_re``.
        AssertionError: if the LF has more than two ``nmod`` tokens, or a
            ``mod`` conjunct's ``e1`` does not contain ``"x _"``.
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse: one dict per conjunct, using the first pattern that matches
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect: definite NPs go to def_terms, everything else to role_terms
    def_terms = []
    role_terms = []
    nmod_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms: de-duplicate, then order by the second-to-last token (the index)
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    rest_terms = role_terms
    
    # combine: "<def terms> ; <rest terms>", omitting whichever half is empty
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # Only LFs with an nmod term are candidates for preposing.
    if "nmod" not in terms.split():
        return text, terms, _type
    
    # Prepose only a `proposing_prob` fraction of the candidates
    # (proposing_prob is a module-level variable set before this is applied).
    if random.random() >= proposing_prob:
        return text, terms, _type
    
    # upper_bound = how many tokens after the preposition's position belong to the moved span.
    if terms.split().count("nmod") == 2:
        upper_bound = 6
        return_type = "preposing_2"
    elif terms.split().count("nmod") == 1:
        upper_bound = 3
        return_type = "preposing_1"
    else:
        assert False
        
    # Token two positions after the first "nmod": in the "<nppred> . nmod . <pred>" terms built
    # above this is the mod conjunct's pred (presumably the preposition word — confirm against parse_mod).
    nmod = terms.split()[terms.split().index("nmod")+2]
    # Moved span: from two tokens before the first occurrence of that word in the sentence
    # up to (not including) upper_bound tokens after it.
    pre_phrase = text.split()[text.split().index(nmod)-2 : text.split().index(nmod)+upper_bound]
    pre_phrase[0] = pre_phrase[0].capitalize()
    # Words that preceded the moved span; a leading "The"/"A" is lower-cased since it no longer starts the sentence.
    # NOTE(review): pre_text[0] raises IndexError if the moved span starts at token 0.
    pre_text = text.split()[:text.split().index(nmod)-2]
    if pre_text[0] in ["The", "A"]:
        pre_text[0] = pre_text[0].lower()
    post_text = text.split()[text.split().index(nmod)+upper_bound:]
    pre_text = pre_phrase + pre_text + post_text
    pre_text = " ".join(pre_text)

    # index_map: old token position (as string) -> new token position (as string).
    index_map = {}
    idx = 0
    # The moved span now occupies positions 0 .. upper_bound+1.
    for i in range(text.split().index(nmod)-2, text.split().index(nmod)+upper_bound):
        index_map[f"{i}"] = f"{idx}"
        idx += 1
    # Words that used to precede the span are shifted right by the span length (2+upper_bound).
    for i in range(text.split().index(nmod)-2):
        ii = 2+upper_bound+i
        index_map[f"{i}"] = f"{ii}"
        idx += 1
    # Words after the span: idx now equals index(nmod)+upper_bound, so ii works out to i (unchanged position).
    for i in range(text.split().index(nmod)+upper_bound, len(text.split())):
        ii = idx+(i-(text.split().index(nmod)+upper_bound))
        index_map[f"{i}"] = f"{ii}"  
            
    # now handle LF: rewrite every numeric token through index_map
    pre_terms = []
    for t in terms.split():
        if t.isnumeric():
            pre_terms += [str(index_map[str(int(t))])]
        else:
            pre_terms += [t]
    pre_terms = " ".join(pre_terms)

    # Everything before the last " ; " is a definite term; re-sort those by their new index.
    pre_terms_def = pre_terms.split(" ; ")[:-1]
    pre_terms_def.sort(key = lambda x: int(x.split()[-2]))  
    pre_terms_role = pre_terms.split(" ; ")[-1].split(" AND ")
    # Sort key for the AND-joined terms, read off token positions counted from the end:
    #   - token [-5] is "(" (a one-argument term, presumably "noun ( x _ i )"): key (i, -1), i = token [-2];
    #   - else if token [-2] is numeric (two indexed arguments): key (token [-6], token [-2]);
    #   - else (second argument is not an index, e.g. a name): key (token [-4], position of that word in pre_text).
    pre_terms_role.sort(key = lambda x: (int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else pre_text.split().index(x.split()[-2])))  
    pre_terms_role = " AND ".join(pre_terms_role)
    pre_terms = " ; ".join(pre_terms_def + [pre_terms_role])

    return pre_text, pre_terms, return_type

def add_um(sentence):
    """Randomly insert runs of "um" into a sentence.

    After every word except the first and the last two, a run of one to three
    "um" tokens is inserted with probability 0.5.

    Returns:
        ``(new_sentence, mapping)`` where ``mapping[i]`` is the position of the
        i-th original word inside ``new_sentence``.
    """
    words = sentence.split()
    last_eligible = len(words) - 2
    out = []
    mapping = {}
    for i, word in enumerate(words):
        mapping[i] = len(out)
        out.append(word)
        # The position checks come first so the RNG is only consulted for eligible words.
        if 0 < i < last_eligible and random.random() > 0.5:
            out.extend(["um"] * random.choice([1, 2, 3]))
    return " ".join(out), mapping

def sprinkle(text, phi, _type):
    """With probability ``sprinkle_prob``, insert "um" interjections and re-index the LF.

    Rows whose type contains "preposition", and rows not selected by the
    random draw against the module-level ``sprinkle_prob``, are returned
    unchanged. Otherwise the sentence is passed through ``add_um`` and every
    numeric LF token is remapped to the word's new position.

    Returns:
        ``(sentence, logical_form, type)``; the type is ``"sprinkle"`` when
        interjections were added.
    """
    # NOTE(review): translate() in this cell labels preposed rows "preposing_1"/"preposing_2",
    # which do not contain "preposition", so this guard does not skip them —
    # confirm whether "preposing" was intended.
    if "preposition" in _type or random.random() >= sprinkle_prob:
        return text, phi, _type

    um_text, token_mapping = add_um(text)
    um_phi = " ".join(
        str(token_mapping[int(tok)]) if tok.isnumeric() else tok
        for tok in phi.split()
    )
    return um_text, um_phi, "sprinkle"
            
# Load the four original COGS splits (tab-separated, no header row).
cogs_columns = ['sentence', 'LF', 'type']
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=cogs_columns)
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=cogs_columns)
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=cogs_columns)
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=cogs_columns)

# Thresholds read by translate() and sprinkle() respectively.
proposing_prob = 0.05
sprinkle_prob = 0.05
# Training split: normalization + random preposing, then random "um" interjections.
for augment in (translate, sprinkle):
    train_df[cogs_columns] = train_df[cogs_columns].apply(lambda row: augment(*row), axis=1, result_type='expand')
# Evaluation splits: normalization only.
for eval_df in (dev_df, test_df, gen_df):
    eval_df[cogs_columns] = eval_df[cogs_columns].apply(lambda row: translate_regular(*row), axis=1, result_type='expand')

In [None]:
# Write every split as a header-less TSV under ./cogs_preposing+sprinkles/.
dataset_postfix = "preposing+sprinkles"
for split_name, split_df in (("train", train_df), ("dev", dev_df), ("test", test_df), ("gen", gen_df)):
    split_df.to_csv(f'./cogs_{dataset_postfix}/{split_name}.tsv', sep='\t', index=False, header=False)

### ReCOGS (Number of resampling iterations = 5)

The performance gain from increasing the number of resampling iterations seems to diminish quickly once the number exceeds 10. We use 5 here.

**Since the first release of ReCOGS, we made some changes to the dataset:**

- NP and PP terms in the output logical forms now mirror the word order of the sentence, instead of being regrouped together.
- Examples with preposing and interjection are added to the training dataset.

In [None]:
# Re-seed before the random preposing / interjection decisions made below.
# set_seed comes from the star-imported utils; presumably it seeds `random` — TODO confirm.
set_seed(seed)
def translate_regular(text, phi, _type):
    """Normalize a logical form without altering the sentence or its type.

    Single-token LFs and LFs containing ``LAMBDA`` are returned unchanged.
    Otherwise the LF is split on ``AND``/``;``, each conjunct is parsed with
    the first matching pattern (np, then pred, then mod), and the terms are
    re-emitted as ``<definite terms> ; <remaining terms joined by AND>``.
    Definite terms are de-duplicated and ordered by their index token.

    Returns:
        ``(text, normalized_lf, _type)``.

    Raises:
        Exception: if a conjunct matches none of the three patterns.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # Parse each conjunct into a dict describing it.
    parsed = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            parsed.append(parse_np(conj))
        elif pred_re.search(conj):
            parsed.append(parse_pred(conj))
        elif mod_re.search(conj):
            parsed.append(parse_mod(conj))
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")

    # Definite NPs go to the ';'-separated prefix; everything else keeps its
    # original relative order in the AND-joined body.
    definite = []
    others = []
    for d in parsed:
        kind = d['type']
        if kind == 'np':
            if d['definiteness'] == '*':
                definite.append(f"* {d['pred']} ( {d['entvar']} )")
            else:
                others.append(f"{d['pred']} ( {d['entvar']} )")
        elif kind == 'role':
            others.append(f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )")
        elif kind == 'mod':
            assert "x _" in d['e1']
            others.append(f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )")

    # De-duplicate, then order by the index token (second-to-last token).
    definite = sorted(set(definite), key=lambda term: int(term.split()[-2]))

    # Join the two halves, skipping whichever one is empty.
    halves = (" ; ".join(definite), " AND ".join(others))
    terms = " ; ".join(half for half in halves if half != "")

    return text, terms, _type
        
def translate(text, phi, _type):
    """Normalize a logical form and, with probability ``proposing_prob``, prepose a phrase.

    The first half performs the same normalization as ``translate_regular``:
    the LF is split into conjuncts, each conjunct is parsed, and the terms are
    re-emitted as ``<definite terms> ; <other terms joined by AND>``.

    If the normalized LF contains an ``nmod`` token and ``random.random()`` is
    below the module-level ``proposing_prob``, the span of the sentence around
    the first occurrence of the preposition is moved to the front of the
    sentence and every numeric index in the LF is remapped to the new token
    positions.

    Args:
        text: Input sentence; split on whitespace to locate tokens.
        phi: Logical form string for ``text``.
        _type: Example type label, returned unchanged unless preposing happens.

    Returns:
        ``(sentence, logical_form, type)``. Single-token and ``LAMBDA`` LFs are
        returned untouched. When preposing is applied the type becomes
        ``"preposing_1"`` or ``"preposing_2"`` (one or two ``nmod`` tokens).

    Raises:
        Exception: if a conjunct matches none of ``np_re``/``pred_re``/``mod_re``.
        AssertionError: if the LF has more than two ``nmod`` tokens, or a
            ``mod`` conjunct's ``e1`` does not contain ``"x _"``.
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse: one dict per conjunct, using the first pattern that matches
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect: definite NPs go to def_terms, everything else to role_terms
    def_terms = []
    role_terms = []
    nmod_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms: de-duplicate, then order by the second-to-last token (the index)
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    rest_terms = role_terms
    
    # combine: "<def terms> ; <rest terms>", omitting whichever half is empty
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # Only LFs with an nmod term are candidates for preposing.
    if "nmod" not in terms.split():
        return text, terms, _type
    
    # Prepose only a `proposing_prob` fraction of the candidates
    # (proposing_prob is a module-level variable set before this is applied).
    if random.random() >= proposing_prob:
        return text, terms, _type
    
    # upper_bound = how many tokens after the preposition's position belong to the moved span.
    if terms.split().count("nmod") == 2:
        upper_bound = 6
        return_type = "preposing_2"
    elif terms.split().count("nmod") == 1:
        upper_bound = 3
        return_type = "preposing_1"
    else:
        assert False
        
    # Token two positions after the first "nmod": in the "<nppred> . nmod . <pred>" terms built
    # above this is the mod conjunct's pred (presumably the preposition word — confirm against parse_mod).
    nmod = terms.split()[terms.split().index("nmod")+2]
    # Moved span: from two tokens before the first occurrence of that word in the sentence
    # up to (not including) upper_bound tokens after it.
    pre_phrase = text.split()[text.split().index(nmod)-2 : text.split().index(nmod)+upper_bound]
    pre_phrase[0] = pre_phrase[0].capitalize()
    # Words that preceded the moved span; a leading "The"/"A" is lower-cased since it no longer starts the sentence.
    # NOTE(review): pre_text[0] raises IndexError if the moved span starts at token 0.
    pre_text = text.split()[:text.split().index(nmod)-2]
    if pre_text[0] in ["The", "A"]:
        pre_text[0] = pre_text[0].lower()
    post_text = text.split()[text.split().index(nmod)+upper_bound:]
    pre_text = pre_phrase + pre_text + post_text
    pre_text = " ".join(pre_text)

    # index_map: old token position (as string) -> new token position (as string).
    index_map = {}
    idx = 0
    # The moved span now occupies positions 0 .. upper_bound+1.
    for i in range(text.split().index(nmod)-2, text.split().index(nmod)+upper_bound):
        index_map[f"{i}"] = f"{idx}"
        idx += 1
    # Words that used to precede the span are shifted right by the span length (2+upper_bound).
    for i in range(text.split().index(nmod)-2):
        ii = 2+upper_bound+i
        index_map[f"{i}"] = f"{ii}"
        idx += 1
    # Words after the span: idx now equals index(nmod)+upper_bound, so ii works out to i (unchanged position).
    for i in range(text.split().index(nmod)+upper_bound, len(text.split())):
        ii = idx+(i-(text.split().index(nmod)+upper_bound))
        index_map[f"{i}"] = f"{ii}"  
            
    # now handle LF: rewrite every numeric token through index_map
    pre_terms = []
    for t in terms.split():
        if t.isnumeric():
            pre_terms += [str(index_map[str(int(t))])]
        else:
            pre_terms += [t]
    pre_terms = " ".join(pre_terms)

    # Everything before the last " ; " is a definite term; re-sort those by their new index.
    pre_terms_def = pre_terms.split(" ; ")[:-1]
    pre_terms_def.sort(key = lambda x: int(x.split()[-2]))  
    pre_terms_role = pre_terms.split(" ; ")[-1].split(" AND ")
    # Sort key for the AND-joined terms, read off token positions counted from the end:
    #   - token [-5] is "(" (a one-argument term, presumably "noun ( x _ i )"): key (i, -1), i = token [-2];
    #   - else if token [-2] is numeric (two indexed arguments): key (token [-6], token [-2]);
    #   - else (second argument is not an index, e.g. a name): key (token [-4], position of that word in pre_text).
    pre_terms_role.sort(key = lambda x: (int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else pre_text.split().index(x.split()[-2])))  
    pre_terms_role = " AND ".join(pre_terms_role)
    pre_terms = " ; ".join(pre_terms_def + [pre_terms_role])

    return pre_text, pre_terms, return_type

def add_um(sentence):
    """Randomly insert runs of "um" into a sentence.

    After every word except the first and the last two, a run of one to three
    "um" tokens is inserted with probability 0.5.

    Returns:
        ``(new_sentence, mapping)`` where ``mapping[i]`` is the position of the
        i-th original word inside ``new_sentence``.
    """
    words = sentence.split()
    last_eligible = len(words) - 2
    out = []
    mapping = {}
    for i, word in enumerate(words):
        mapping[i] = len(out)
        out.append(word)
        # The position checks come first so the RNG is only consulted for eligible words.
        if 0 < i < last_eligible and random.random() > 0.5:
            out.extend(["um"] * random.choice([1, 2, 3]))
    return " ".join(out), mapping

def sprinkle(text, phi, _type):
    """With probability ``sprinkle_prob``, insert "um" interjections and re-index the LF.

    Rows whose type contains "preposition", and rows not selected by the
    random draw against the module-level ``sprinkle_prob``, are returned
    unchanged. Otherwise the sentence is passed through ``add_um`` and every
    numeric LF token is remapped to the word's new position.

    Returns:
        ``(sentence, logical_form, type)``; when interjections were added the
        type is the incoming type with ``"+sprinkle"`` appended.
    """
    # NOTE(review): translate() in this cell labels preposed rows "preposing_1"/"preposing_2",
    # which do not contain "preposition", so this guard does not skip them —
    # confirm whether "preposing" was intended.
    if "preposition" in _type or random.random() >= sprinkle_prob:
        return text, phi, _type

    um_text, token_mapping = add_um(text)
    um_phi = " ".join(
        str(token_mapping[int(tok)]) if tok.isnumeric() else tok
        for tok in phi.split()
    )
    return um_text, um_phi, f"{_type}+sprinkle"
            
# Load the four original COGS splits (tab-separated, no header row).
cogs_columns = ['sentence', 'LF', 'type']
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=cogs_columns)
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=cogs_columns)
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=cogs_columns)
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=cogs_columns)

# Thresholds read by translate() and sprinkle() respectively.
proposing_prob = 0.05
sprinkle_prob = 0.05
# Training split: normalization + random preposing, then random "um" interjections.
for augment in (translate, sprinkle):
    train_df[cogs_columns] = train_df[cogs_columns].apply(lambda row: augment(*row), axis=1, result_type='expand')
# Evaluation splits: normalization only.
for eval_df in (dev_df, test_df, gen_df):
    eval_df[cogs_columns] = eval_df[cogs_columns].apply(lambda row: translate_regular(*row), axis=1, result_type='expand')

In [None]:
# Pool of numeric tokens from the target vocabulary, used to randomly sample
# variable names. It is sorted so the pool has a fixed order: iterating a set
# of strings follows per-process hash randomization, which would make the
# seeded random.sample() draws differ from run to run.
existing_digit_pool = sorted(
    {k for k, v in load_vocab("./model/tgt_vocab.txt").items() if k.isnumeric()}
)

def translate(text, phi):
    """Rewrite a logical form into the variable-renamed ReCOGS-style format.

    * A single-token LF becomes ``LAMBDA a . <token> ( a )``.
    * A ``LAMBDA`` LF of exactly seven tokens is returned unchanged; a longer
      one has the ``<text> .`` prefix of each role factored out into a
      separate ``<text> ( <arg> ) AND`` term.
    * Any other LF is parsed conjunct by conjunct. Role conjuncts are split
      into ``pred ( eventvar )`` (emitted once per distinct string) and
      ``role ( eventvar , entvar )``. An entity argument that does not contain
      ``"x _"`` (presumably a proper name) is replaced by ``x _ <position of
      that word in text>`` and gets its own ``name ( x _ i )`` term. Finally
      the ``x`` and ``_`` tokens are dropped and every distinct index is
      replaced by a digit sampled from the module-level ``existing_digit_pool``.

    The index pools are sorted before the random draws. Iterating a set of
    strings follows per-process hash randomization, so without the sort the
    result would not be reproducible even after seeding ``random``.

    Args:
        text: Input sentence (or the primitive word for LAMBDA/primitive rows).
        phi: Logical form string.

    Returns:
        ``(text, new_logical_form)``.

    Raises:
        Exception: if a conjunct matches none of ``np_re``/``pred_re``/``mod_re``.
        AssertionError: if a name argument does not occur exactly once in ``text``.
        ValueError: from ``random.sample`` if the LF has more distinct indexes
            than ``existing_digit_pool`` has entries.
    """
    if len(phi.split()) == 1:
        # Bare primitive: wrap it as a one-place lambda term.
        return text, f"LAMBDA a . {phi} ( a )"
    elif "LAMBDA" in phi:
        if len(phi.split()) == 7:
            # Seven tokens, e.g. "LAMBDA a . noun ( a )": kept as is.
            return text, phi
        # Split around every occurrence of the primitive word itself; the
        # non-LAMBDA chunks are then the bare "role ( e , v )" pieces.
        phi_split = phi.split(text)
        cleaned_phi = []
        for chunk in phi_split:
            if "LAMBDA" in chunk:
                cleaned_phi += [chunk.strip()]
            else:
                # Third token of "role ( e , v )" is the event variable.
                verb_args = chunk.strip(" .").split()[2]
                cleaned_phi += [chunk.strip(" .")]
        return text, " ".join(cleaned_phi[:1] + [f"{text} ( {verb_args} ) AND"] + cleaned_phi[1:])

    # parse
    text_split = text.split()
    data = []
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs:
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
            if "x _" not in d['entvar']:
                # Argument is a word rather than a variable: point it at the
                # word's (unique) position in the sentence instead.
                d['entvar_name'] = d['entvar']
                assert text_split.count(d['entvar']) == 1
                name_idx = text_split.index(d['entvar'])
                d['entvar'] = f"x _ {name_idx}"
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)

    # collect
    def_terms = []
    role_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                def_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            # The event predicate itself is emitted only once.
            if f"{d['pred']} ( {d['eventvar']} )" not in role_terms:
                role_terms += [f"{d['pred']} ( {d['eventvar']} )"]
            role_terms += [f"{d['role']} ( {d['eventvar']} , {d['entvar']} )"]
            if "entvar_name" in d:
                def_terms += [f"{d['entvar_name']} ( {d['entvar']} )"]
        elif d['type'] == 'mod':
            role_terms += [f"nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]

    # sort def_terms: de-duplicate, then order by the index token
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))

    rest_terms = role_terms

    # combine
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)

    # final step, remove biases: map every distinct index to a random digit.
    # Both pools are sorted first so the draws depend only on the RNG state,
    # not on set iteration order.
    current_digit_pool = sorted({t for t in terms.split() if t.isnumeric()})
    random.shuffle(current_digit_pool)
    sample_random_digit = random.sample(sorted(existing_digit_pool), k=len(current_digit_pool))
    digit_mapping = dict(zip(current_digit_pool, sample_random_digit))

    new_terms = []
    for t in terms.split():
        if t == "_" or t == "x":
            continue
        if t.isnumeric():
            new_terms += [digit_mapping[t]]
        else:
            new_terms += [t]

    terms = " ".join(new_terms)
    return text, terms

# Number of independently re-sampled copies of the training set, and the
# number of concatenated examples to build later.
sampled_n = 5
append_k = 3072

# Each copy runs translate() again, so its variable names are drawn afresh.
train_dfs = []
for _ in range(sampled_n):
    resampled_df = train_df.copy()
    resampled_df[['sentence', 'LF']] = resampled_df[['sentence', 'LF']].apply(lambda row: translate(*row), axis=1, result_type='expand')
    train_dfs.append(resampled_df)

# Evaluation splits are translated once, in place.
for eval_df in (dev_df, test_df, gen_df):
    eval_df[['sentence', 'LF']] = eval_df[['sentence', 'LF']].apply(lambda row: translate(*row), axis=1, result_type='expand')

def reindex(LFs, existing_digit_pool):
    """Merge several logical forms into one, giving every variable a fresh index.

    Each numeric token is treated as a variable index. Indices are scoped per
    logical form: the same number appearing in two different LFs is mapped to
    two different new indices, while repeated occurrences inside one LF keep
    sharing a single new index.

    Each LF is expected to look like ``"<prefix> ; <prefix> ; <t1> AND <t2>"``:
    everything before the last ``" ; "`` is a prefix term, and the last segment
    is a body of ``" AND "``-joined terms. Prefix terms of all LFs are joined
    with ``" ; "``, followed by all body terms joined with ``" AND "``.

    Args:
        LFs: list of logical-form strings to merge, in output order.
        existing_digit_pool: sequence of candidate index tokens to sample from
            without replacement. Items are converted with ``str`` before being
            written into the output.

    Returns:
        The merged logical form. When no LF contributes a prefix term (or no
        body term) the corresponding part and its ``" ; "`` separator are
        omitted, so the result never starts or ends with a dangling separator.

    Raises:
        ValueError: propagated from ``random.sample`` when the pool holds fewer
            items than there are distinct (LF, index) pairs.
    """
    # Distinct (position of the LF, original index) pairs that need a new index.
    curr_digit = set()
    for lf_pos, lf in enumerate(LFs):
        for item in lf.split():
            if item.isnumeric():
                curr_digit.add((lf_pos, int(item)))

    sampled_digits = random.sample(existing_digit_pool, k=len(curr_digit))
    # Pair each (LF, index) with one sampled replacement, in set-iteration order.
    digit_map = dict(zip(curr_digit, (str(d) for d in sampled_digits)))

    new_LF_prefix = []
    new_LF_body = []
    for lf_pos, lf in enumerate(LFs):
        renamed = " ".join(
            digit_map[(lf_pos, int(item))] if item.isnumeric() else item
            for item in lf.split()
        )
        *prefix_terms, body = renamed.split(" ; ")
        new_LF_prefix.extend(prefix_terms)
        # Skip empty pieces so an empty LF cannot inject a dangling " AND ".
        new_LF_body.extend(term for term in body.split(" AND ") if term)

    # Join only the non-empty parts; the original always emitted " ; " even when
    # there were no prefix terms, which produced a malformed leading separator.
    parts = [" ; ".join(new_LF_prefix), " AND ".join(new_LF_body)]
    return " ; ".join(part for part in parts if part)

# Offsets (from the end of the length-sorted frame) at which a triple of
# sentences is taken; a stride of 6 means only rows -1..-3 of each group of six
# are used.
start_indexes = [i*6 for i in range(append_k)]
append_data = []

for i in range(sampled_n):
    # Ascending sort by sentence character length, so negative iloc positions
    # address the longest sentences of this training copy.
    train_df_sorted = train_dfs[i].sort_values(by="sentence", key=lambda x: x.str.len())
    for start_index in start_indexes:
        # Second sentence of the triple: lower-case its first character when it
        # starts with the article 'The' or 'A' (other first words are kept as-is).
        conj_1 = train_df_sorted.iloc[-2-start_index].sentence
        if conj_1.split()[0] in {'The', 'A'}:
            conj_1_first = conj_1[0].lower()
        else:
            conj_1_first = conj_1[0]
            
        # Third sentence of the triple, same treatment.
        conj_2 = train_df_sorted.iloc[-3-start_index].sentence
        if conj_2.split()[0] in {'The', 'A'}:
            conj_2_first = conj_2[0].lower()
        else:
            conj_2_first = conj_2[0]
            
        # New row = [concatenated sentence, merged LF, type label].
        # The first two sentences lose their last character ([:-1], presumably
        # the final "." - confirm) and the three LFs are merged by reindex().
        append_data += [
            [train_df_sorted.iloc[-1-start_index].sentence[:-1]+\
            conj_1_first+\
            train_df_sorted.iloc[-2-start_index].sentence[1:-1]+\
            conj_2_first+\
            train_df_sorted.iloc[-3-start_index].sentence[1:],
            reindex(
                [
                    train_df_sorted.iloc[-1-start_index].LF,
                    train_df_sorted.iloc[-2-start_index].LF,
                    train_df_sorted.iloc[-3-start_index].LF
                ], existing_digit_pool
            ),
            'length_ood']
        ]
# Note: append_data changes type here, from a list of rows to a DataFrame.
append_data = pd.DataFrame(append_data, columns =['sentence', 'LF', 'type'])

In [None]:
# Final training set = all translated copies + the concatenated examples,
# with exact duplicate rows removed.
train_df = pd.concat(train_dfs)
train_df = pd.concat([train_df, append_data])
train_df = train_df.drop_duplicates()

# Write headerless TSVs; the ./recogs_v2/ directory is assumed to exist already.
dataset_postfix = "recogs_v2"
train_df.to_csv(f'./{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)

A ReCOGS test helper that translates ReCOGS logical forms back into COGS logical forms

In [None]:
# Reload the original COGS splits (headerless TSV: sentence, logical form, type).
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])
# Maps a verb predicate to the set of sentence tokens found at its event-variable
# position; filled in by collect_verb_forms.
verb_to_forms_map = {}
def collect_verb_forms(text, phi):
    """Record which sentence token realises each verb predicate of a COGS LF.

    Every conjunct of ``phi`` that ``pred_re`` matches is parsed with
    ``parse_pred``. When its event variable has the form ``x _ <k>``, token ``k``
    of ``text`` is added to ``verb_to_forms_map[<predicate>]``.

    Args:
        text: the whitespace-tokenised sentence.
        phi: the logical form, conjuncts separated by ``AND`` or ``;``.

    Returns:
        None. The module-level ``verb_to_forms_map`` is updated in place.
    """
    for term in re.split(r"\s*(?:AND|;)\s*", phi):
        if not pred_re.search(term):
            continue
        parsed = parse_pred(term)
        verb = parsed['pred']
        event_var = parsed['eventvar']
        if "x _" not in event_var:
            continue
        # The number after "x _" is the token position of the verb in the sentence.
        token_pos = int(event_var.split()[-1])
        surface_form = text.split()[token_pos]
        verb_to_forms_map.setdefault(verb, set()).add(surface_form)

# Run collect_verb_forms over every split purely for its side effect on
# verb_to_forms_map; the apply() result is discarded via `_`.
_ = train_df[['sentence', 'LF']].apply(lambda x: collect_verb_forms(*x), axis=1, result_type='expand')
_ = dev_df[['sentence', 'LF']].apply(lambda x: collect_verb_forms(*x), axis=1, result_type='expand')
_ = test_df[['sentence', 'LF']].apply(lambda x: collect_verb_forms(*x), axis=1, result_type='expand')
_ = gen_df[['sentence', 'LF']].apply(lambda x: collect_verb_forms(*x), axis=1, result_type='expand')

In [None]:
# Unary term with a numeric argument, e.g. "smile ( 4 )".
# Groups: (predicate, index). Whitespace in the pattern is ignored (re.VERBOSE).
recogs_neoD_verb_re = re.compile(r"""
    ^
    \s*(\w+?)\s*
    \(
    \s*([0-9]+?)\s*
    \)
    \s*$""", re.VERBOSE)

# Binary term "name ( arg1 , arg2 )", e.g. "agent ( 4 , 1 )".
# Groups: (name, first argument, second argument).
recogs_neoD_pred_re = re.compile(r"""
    ^
    \s*(\w+?)\s*
    \(
    \s*(.+?)\s*
    ,
    \s*(.+?)\s*
    \)
    \s*$""", re.VERBOSE)

# Dotted binary term "name . sub ( arg1 , arg2 )", e.g. "nmod . on ( 1 , 5 )".
# Groups: (name, sub-name, first argument, second argument).
recogs_neoD_mod_re = re.compile(r"""
    ^
    \s*(\w+?)\s*
    \.
    \s*(\w+?)\s*
    \(
    \s*(.+?)\s*
    ,
    \s*(.+?)\s*
    \)
    \s*$""", re.VERBOSE)

def reverse_translate(text, phi):
    """Rebuild a COGS-style logical form from a ReCOGS-style one.

    ``phi`` is split on " ; ": the last segment holds the " AND "-joined
    conjuncts and every earlier segment is a noun term. Variable indices in
    ``phi`` are replaced by token positions looked up in ``text``.

    Args:
        text: the sentence, tokenised on whitespace.
        phi: the ReCOGS-style logical form.

    Returns:
        A ``(phi, lf)`` tuple: the unchanged input LF and the reconstructed LF.

    NOTE(review): positions come from ``list.index``, which returns the first
    occurrence only, so a noun or verb form repeated in the sentence resolves to
    the same position each time. Relies on the module-level
    ``verb_to_forms_map``, ``np_re`` and ``parse_np``.
    """
    conjs = phi.split(" ; ")[-1]
    nouns = phi.split(" ; ")[:-1]
    # index token -> (proper name,) for names, or (token position as str, noun)
    # for common nouns; the tuple length is later used to tell the two apart.
    nouns_map = {}
    text_split = text.split()
    filtered_nouns = []
    noun_terms = []
    for noun in nouns:
        if noun.split()[0][0].isupper():
            # Capitalised first token: treated as a proper name.
            nouns_map[noun.split()[-2]] = (noun.split()[0], )
        else:
            filtered_nouns += [noun]
            # A leading "*" shifts the noun to the second token.
            if "*" in noun:
                noun_str = noun.split()[1]
            else:
                noun_str = noun.split()[0]
            # Special case: the LF predicate 'tv' is looked up as 'TV' in the sentence.
            if noun_str in {'tv'}:
                nouns_map[noun.split()[-2]] = (str(text_split.index('TV')), noun_str)
            else:
                nouns_map[noun.split()[-2]] = (str(text_split.index(noun_str)), noun_str)
    
    filtered_conjs = []
    # Re-emit common nouns with "x _ <position>" variables: "*"-marked nouns go
    # to the " ; "-joined prefix, the others join the conjunct list.
    for noun in filtered_nouns:
        if np_re.search(noun):
            d = parse_np(noun)
            if "*" in noun:
                noun_terms += [f"* {d['pred']} ( x _ {nouns_map[d['entvar']][0]} )"]
            else:
                filtered_conjs += [f"{d['pred']} ( x _ {nouns_map[d['entvar']][0]} )"]

    # First pass: unary "pred ( index )" conjuncts name the verb of each event index.
    verb_map = {}
    for conj in conjs.split(" AND "):
        if recogs_neoD_verb_re.search(conj):
            # candidate for mapping verb.
            pred, arg = recogs_neoD_verb_re.search(conj).groups()
            verb_map[arg] = pred
                
    # Second pass: rebuild modifier and role conjuncts.
    for conj in conjs.split(" AND "):
        if "nmod" in conj:
            role, pred, first_arg, second_arg = recogs_neoD_mod_re.search(conj).groups()
            filtered_conjs += [f"{nouns_map[first_arg][1]} . nmod . {pred} ( x _ {nouns_map[first_arg][0]} , x _ {nouns_map[second_arg][0]} )"]
        else:
            if recogs_neoD_verb_re.search(conj):
                # Unary verb conjuncts were consumed in the first pass.
                pass
            else:
                role, first_arg, second_arg = recogs_neoD_pred_re.search(conj).groups()
                # Find the verb's position: first known surface form present in the sentence.
                # NOTE(review): verb_idx stays the int -1 when no form is found.
                forms_set = verb_to_forms_map[verb_map[first_arg]]
                verb_idx = -1
                for form in list(forms_set):
                    if form in text_split:
                        verb_idx = str(text_split.index(form))
                        break
                if second_arg in nouns_map:
                    if len(nouns_map[second_arg]) == 1:
                        # 1-tuple: proper name, written directly as the argument.
                        filtered_conjs += [f"{verb_map[first_arg]} . {role} ( x _ {verb_idx} , {nouns_map[second_arg][0]} )"]
                    else:
                        filtered_conjs += [f"{verb_map[first_arg]} . {role} ( x _ {verb_idx} , x _ {nouns_map[second_arg][0]} )"]
                else:
                    # The second argument is itself an event index (another verb).
                    forms_set = verb_to_forms_map[verb_map[second_arg]]
                    comp_verb_idx = -1
                    for form in list(forms_set):
                        if form in text_split:
                            comp_verb_idx = str(text_split.index(form))
                            break
                    filtered_conjs += [f"{verb_map[first_arg]} . {role} ( x _ {verb_idx} , x _ {comp_verb_idx} )"]
    # Sort key = (first-argument position, second-argument position). Unary terms
    # ("(" at token -5) use -1 as the second component; a non-numeric second
    # argument (a name) is ordered by its position in the sentence.
    filtered_conjs.sort(key = lambda x: (int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else text_split.index(x.split()[-2])))
    
    if len(noun_terms) > 0:
        lf = " ; ".join(noun_terms) + " ; " + " AND ".join(filtered_conjs)
    else:
        lf = " AND ".join(filtered_conjs)
    return phi, lf
    
# Load the ReCOGS splits and add two columns to each: the ReCOGS LF as read
# ('recogs_LF') and the COGS-style LF reconstructed from it ('backward_LF').
recogs_dev_df = pd.read_csv("./recogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
recogs_test_df = pd.read_csv("./recogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
recogs_gen_df = pd.read_csv("./recogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])
recogs_dev_df[['recogs_LF', 'backward_LF']] = recogs_dev_df[['sentence', 'LF']].apply(
    lambda x: reverse_translate(*x), axis=1, result_type='expand'
)
recogs_test_df[['recogs_LF', 'backward_LF']] = recogs_test_df[['sentence', 'LF']].apply(
    lambda x: reverse_translate(*x), axis=1, result_type='expand'
)
recogs_gen_df[['recogs_LF', 'backward_LF']] = recogs_gen_df[['sentence', 'LF']].apply(
    lambda x: reverse_translate(*x), axis=1, result_type='expand'
)
# Put the original COGS rows side by side with the reconstructed LFs.
# NOTE(review): the column-wise concat aligns on the row index, so this assumes
# ./recogs/*.tsv lists examples in the same order as ./cogs/*.tsv - confirm.
combined_dev_df = pd.concat([dev_df, recogs_dev_df[['recogs_LF', 'backward_LF']]], axis=1)
combined_test_df = pd.concat([test_df, recogs_test_df[['recogs_LF', 'backward_LF']]], axis=1)
combined_gen_df = pd.concat([gen_df, recogs_gen_df[['recogs_LF', 'backward_LF']]], axis=1)

In [None]:
# TODO: manual inspection only - this could be replaced by a script that reports
# all mismatches at once.
# Pick the combined frame to inspect, count rows whose reconstructed LF differs
# from the original COGS LF, and print one mismatching row (chosen by inspect_id).
df_inspecting = combined_gen_df
print("WARNING: Be aware these errors are expected as ReCOGS focuses on semantics reprs not recoversing token positions!")
print()
print("NUM OF ERRORS: ", len(df_inspecting[df_inspecting['LF'] != df_inspecting['backward_LF']]))
print()
# Position of the mismatch to display; .iloc raises IndexError when there are
# no mismatches (or fewer than inspect_id + 1).
inspect_id = 0
row = df_inspecting[df_inspecting['LF'] != df_inspecting['backward_LF']].iloc[inspect_id]
print("sentence: ", row['sentence'])
print()
print("LF: ", row['LF'])
print()
print("backward_LF: ", row['backward_LF'])
print()
print("recogs_LF: ", row['recogs_LF'])

### Variable-free format
This format is from the Qiu et al. (2022) paper.

In [None]:
def cogs_lf_to_funcall(lf):
    """Rewrite a COGS logical form as a variable-free, function-call style string.

    Conversion rules:
      * Entities and unary predicates turn into plain values:
          Jack --> Jack
          cat ( x _ 1 ) --> cat
          * cat ( x _ 1 ) --> * cat
      * A verb turns into a call whose keyword arguments are its roles:
          eat . agent ( x _ 2 , Jack ) --> eat ( agent = Jack )
      * A variable that names a noun is replaced by that noun:
          cat ( x _ 1 ) AND eat . agent ( x _ 2 , x _ 1 ) --> eat ( agent = cat )

    Internally the terms are viewed as a graph (variables are nodes, two-argument
    terms are labelled edges). The single node that is never an edge target is
    the root, and the output is produced by walking the graph depth-first from it.

    Args:
        lf: the logical form as a space-tokenised string.

    Returns:
        The converted string. Inputs containing "LAMBDA" or no "(" are returned
        unchanged.

    Raises:
        ValueError: for a malformed term, a term without one or two arguments,
            conflicting names for one variable, a root count other than one, an
            unbound variable, or a named entity that has outgoing edges.
    """
    if "LAMBDA" in lf or "(" not in lf:
        return lf

    # Step 1: split into terms, e.g.
    #   "toss . agent ( x _ 1 , John )" -> (["toss", "agent"], ["x _ 1", "John"])
    term_pattern = re.compile(r"(.*) \( (.*) \)")
    parsed_terms = []
    for chunk in re.split(" ; | AND ", lf):
        found = term_pattern.match(chunk)
        if found is None:
            raise ValueError(f"Malformed term: {chunk}")
        label_part, arg_part = found.groups()
        arguments = arg_part.split(" , ")
        if not 1 <= len(arguments) <= 2:
            raise ValueError(f"Invalid number of args: {arguments}")
        parsed_terms.append((label_part.split(" . "), arguments))

    # Step 2: name every variable after the first label seen for it
    # (e.g. "x _ 3" -> "* cat"); later sightings must agree with that name.
    var_names = {}
    for labels, arguments in parsed_terms:
        head = labels[0]
        recorded = var_names.setdefault(arguments[0], head)
        if recorded not in (head, "* " + head):
            raise ValueError(
                f"Conflicting node name: {recorded} vs. {head}")

    # Step 3: collect outgoing edges per variable and eliminate every edge
    # target from the list of possible roots.
    edges = {}
    roots = list(var_names)
    for labels, arguments in parsed_terms:
        if len(arguments) != 2:
            continue
        source, target = arguments
        edges.setdefault(source, []).append((" . ".join(labels[1:]), target))
        if target in roots:
            roots.remove(target)
    if len(roots) != 1:
        raise ValueError(f"Multiple roots: {roots}")

    # Step 4: depth-first rendering into a flat token list.
    def render(variable):
        if variable not in var_names:
            # Not a declared variable: must be a bare named entity such as "John".
            if variable.startswith("x _"):
                raise ValueError(f"Unbound variable {variable}")
            if variable in edges:
                raise ValueError(f"Named entity {variable} has children {edges[variable]}")
            return [variable]
        if variable not in edges:
            # Leaf noun or argument-less verb.
            return [var_names[variable]]
        tokens = [var_names[variable], "("]
        for position, (edge_label, edge_target) in enumerate(edges[variable]):
            if position:
                tokens.append(",")
            tokens += [edge_label, "="]
            tokens += render(edge_target)
        tokens.append(")")
        return tokens

    return " ".join(render(roots[0]))

In [None]:
# Reload the untouched COGS splits (headerless TSV: sentence, logical form, type).
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

# Convert every logical form to the variable-free format, element by element.
train_df['LF'] = train_df['LF'].map(cogs_lf_to_funcall)
dev_df['LF'] = dev_df['LF'].map(cogs_lf_to_funcall)
test_df['LF'] = test_df['LF'].map(cogs_lf_to_funcall)
gen_df['LF'] = gen_df['LF'].map(cogs_lf_to_funcall)

In [None]:
# Write the converted splits as headerless TSVs; the ./variable_free/ directory
# is assumed to exist already.
dataset_postfix = "variable_free"
train_df.to_csv(f'./{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)

### Participial Verb Phrases

In [None]:
# Seed before any random draw in this section (set_seed comes from the utils
# star-imports; presumably it seeds the RNGs used below - confirm).
set_seed(seed)

# Predicates seen in the data, filled by collect_nv: nouns from 'np' terms and
# verbs from 'role' terms.
noun_pool = set()
v_pool = set()
def collect_nv(text, phi, _type):
    """Collect the noun and verb predicates of one example into the global pools.

    Single-token logical forms and logical forms containing "LAMBDA" are
    ignored. Otherwise every conjunct of ``phi`` (split on ``AND`` / ``;``) is
    parsed; predicates of 'role' terms are added to ``v_pool`` and predicates of
    'np' terms to ``noun_pool``.

    Args:
        text: the sentence (not inspected; passed through).
        phi: the COGS logical form.
        _type: the example's type label (passed through).

    Returns:
        ``(text, phi, _type)`` unchanged, so the function can be used with
        ``DataFrame.apply(..., result_type='expand')``.

    Raises:
        Exception: if a conjunct matches none of ``np_re``, ``pred_re``, ``mod_re``.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # Parse everything first so a malformed conjunct raises before any pool is
    # touched.
    data = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)

    for d in data:
        if d['type'] == 'role':
            v_pool.add(d['pred'])
        elif d['type'] == 'np':
            noun_pool.add(d['pred'])

    return text, phi, _type

def translate_regular(text, phi, _type):
    """Re-serialise a COGS logical form without changing the sentence.

    Single-token logical forms and logical forms containing "LAMBDA" are
    returned untouched. Otherwise the conjuncts are parsed and written back as
    ``"<definite terms joined by ' ; '> ; <remaining terms joined by ' AND '>"``.
    Definite ("*") noun terms are de-duplicated and sorted by variable index;
    the remaining terms keep their original order.

    Args:
        text: the sentence (passed through).
        phi: the COGS logical form.
        _type: the example's type label (passed through).

    Returns:
        ``(text, terms, _type)`` where ``terms`` is the re-serialised logical form.

    Raises:
        Exception: if a conjunct matches none of ``np_re``, ``pred_re``, ``mod_re``.
        AssertionError: if a modifier's first argument is not an ``x _`` variable.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # parse
    data = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)

    # collect
    def_terms = []
    role_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        elif d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]

    # De-duplicate definite terms, then order them by variable index
    # (the second-to-last token of "* noun ( x _ k )").
    def_terms = [*set(def_terms)]
    def_terms.sort(key=lambda x: int(x.split()[-2]))

    # combine, leaving out whichever side is empty
    sections = [" ; ".join(def_terms), " AND ".join(role_terms)]
    terms = " ; ".join(section for section in sections if section)

    return text, terms, _type

def translate(text, phi, _type):
    """Optionally augment an example with one or two participial verb phrases.

    Single-token or "LAMBDA" logical forms are returned untouched. The LF is
    otherwise parsed and re-serialised; it is returned without augmentation when
    the sentence does not start with 'A'/'The', when the theme argument is not an
    ``x _`` variable, or when ``random.random() >= participial_prob``.

    When augmenting, a phrase ``<verb_pp_map[verb]> <a|the> <noun>`` (verb drawn
    from ``verb_pp_map``, noun from ``noun_pool``) is inserted right after the
    subject noun (token 1) or after the theme noun; with probability 0.5 a second
    phrase is appended, attached to the first inserted noun. Variable indices
    beyond the insertion point are shifted by 3 (one phrase) or 6 (two), and
    matching noun and ``acl`` terms are added to the LF.

    Args:
        text: the sentence, tokenised on whitespace.
        phi: the COGS logical form.
        _type: the example's type label.

    Returns:
        ``(sentence, logical_form, type)``. The type is replaced by
        ``subj_``/``obj_participle_verb_phrase[_nested]`` for augmented examples.

    Relies on the module-level ``participial_prob``, ``verb_pp_map`` and
    ``noun_pool``. The order of the ``random`` calls determines the output for a
    given seed.
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect
    def_terms = []
    role_terms = []
    nmod_terms = []
    role_set = set([])
    # Argument of the (last seen) 'theme' role; None when there is no theme.
    object_entvar = None
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
            if d['role'] == "theme":
                object_entvar = d['entvar']
                
            role_set.add(d['role'])
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))
    rest_terms = role_terms
    
    # True when the first token is not an article ('A' / 'The').
    start_with_pp = False if text_split[0] in {'A', 'The'} else True
    
    # Not eligible for augmentation: no leading article, or the theme is not a variable.
    if start_with_pp or (object_entvar is not None and "x _ " not in object_entvar):
        # combine
        def_terms = " ; ".join(def_terms)
        if def_terms == "":
            terms = " AND ".join(rest_terms)
        elif " AND ".join(rest_terms) == "":
            terms = def_terms
        else:
            terms = def_terms + " ; " + " AND ".join(rest_terms)
        return text, terms, _type
    
    # Eligible, but only a participial_prob fraction of examples is augmented.
    if random.random() >= participial_prob:
        # combine
        def_terms = " ; ".join(def_terms)
        if def_terms == "":
            terms = " AND ".join(rest_terms)
        elif " AND ".join(rest_terms) == "":
            terms = def_terms
        else:
            terms = def_terms + " ; " + " AND ".join(rest_terms)
        return text, terms, _type
    
    # agent -> subject
    # theme -> object
    if "theme" in phi:
        # only one object if it exists!
        assert 1 == phi.split().count("theme")
        modify_subject = True if random.random() < 0.5 else False
    else:
        modify_subject = True
    
    # reindex!
    # Shift every numeric token greater than `offset` by `distance`.
    def reindex_terms(terms, offset=1, distance=3):
        new_terms = []
        for term in terms:
            new_term = []
            for t in term.split():
                if t.isnumeric() and int(t) > offset:
                    new_term += [str(int(t)+distance)]
                else:
                    new_term += [t]
            new_terms += [" ".join(new_term)]
        return new_terms
    
    # we need to decide whether to add to subj or obj.
    second_text_pp = False
    if modify_subject:
        # start_with_pp is always False here (the True case returned above), so
        # the subject is token 1, right after the article.
        subject = text_split[0] if start_with_pp else text_split[1]
        verb_pp = random.choice(list(verb_pp_map.keys()))
        noun_pp = random.choice(list(noun_pool))
        articles = 'a' if random.random() < 0.5 else 'the'
        text_pp = [verb_pp_map[verb_pp], articles, noun_pp]
        # The inserted noun lands at token 4 (subject at 1, then verb and article).
        if articles == 'a':
            pp_np_term =  [f"{noun_pp} ( x _ 4 )"]
        elif articles == 'the':
            pp_np_term = [f"* {noun_pp} ( x _ 4 )"]
        pp_role_term = [f"{subject} . acl . {verb_pp} ( x _ 1 , x _ 4 )"]
        
        # Optional second, nested phrase attached to the first inserted noun.
        if random.random() < 0.5:
            second_text_pp = True
            second_verb_pp = random.choice(list(verb_pp_map.keys()))
            second_noun_pp = random.choice(list(noun_pool))
            second_articles = 'a' if random.random() < 0.5 else 'the'
            text_pp += [verb_pp_map[second_verb_pp], second_articles, second_noun_pp]
            if second_articles == 'a':
                pp_np_term += [f"{second_noun_pp} ( x _ 7 )"]
            elif second_articles == 'the':
                pp_np_term += [f"* {second_noun_pp} ( x _ 7 )"]
            pp_role_term += [f"{noun_pp} . acl . {second_verb_pp} ( x _ 4 , x _ 7 )"]
            
        def_terms = reindex_terms(def_terms, distance=6 if second_text_pp else 3)
        rest_terms = reindex_terms(rest_terms, distance=6 if second_text_pp else 3)
        
        # Insertion point: right after the subject noun.
        object_pos = 1
        
    else:
        # locate the obj
        assert "x _ " in object_entvar
        object_pos = int(object_entvar.split()[-1])
        _object = text_split[object_pos]
        
        verb_pp = random.choice(list(verb_pp_map.keys()))
        noun_pp = random.choice(list(noun_pool))
        articles = 'a' if random.random() < 0.5 else 'the'
        text_pp = [verb_pp_map[verb_pp], articles, noun_pp]
        if articles == 'a':
            pp_np_term = [f"{noun_pp} ( x _ {object_pos+3} )"]
        elif articles == 'the':
            pp_np_term = [f"* {noun_pp} ( x _ {object_pos+3} )"]
        pp_role_term = [f"{_object} . acl . {verb_pp} ( x _ {object_pos} , x _ {object_pos+3} )"]
        
        if random.random() < 0.5:
            second_text_pp = True
            second_verb_pp = random.choice(list(verb_pp_map.keys()))
            second_noun_pp = random.choice(list(noun_pool))
            second_articles = 'a' if random.random() < 0.5 else 'the'
            text_pp += [verb_pp_map[second_verb_pp], second_articles, second_noun_pp]
            if second_articles == 'a':
                pp_np_term += [f"{second_noun_pp} ( x _ {object_pos+6} )"]
            elif second_articles == 'the':
                pp_np_term += [f"* {second_noun_pp} ( x _ {object_pos+6} )"]
            pp_role_term += [f"{noun_pp} . acl . {second_verb_pp} ( x _ {object_pos+3} , x _ {object_pos+6} )"]
 
        def_terms = reindex_terms(def_terms, object_pos, distance=6 if second_text_pp else 3)
        rest_terms = reindex_terms(rest_terms, object_pos, distance=6 if second_text_pp else 3)
    
    # Definite ("*") inserted nouns join the prefix terms; the rest join the conjuncts.
    for np_t in pp_np_term:
        if np_t[0] == "*":
            def_terms += [np_t]
        else:
            rest_terms += [np_t]
    rest_terms += pp_role_term
    # Splice the new phrase(s) into the sentence right after the modified noun.
    text_split = text_split[:object_pos+1] + text_pp + text_split[object_pos+1:]
    new_text = " ".join(text_split)
    def_terms.sort(key = lambda x: int(x.split()[-2]))
    # Sort key = (first-argument index, second-argument index); unary terms use -1
    # and a named second argument is ordered by its position in the new sentence.
    # NOTE(review): every term built above ends with ")", so the
    # `x.split()[-1] == "acl"` branches look unreachable - confirm before relying
    # on them.
    rest_terms.sort(
        key = lambda x: (
            int(x.split()[-7]) if x.split()[-1] == "acl" else int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), 
            int(x.split()[-3]) if x.split()[-1] == "acl" else -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else new_text.split().index(x.split()[-2])
        )
    )
    # combine
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    if second_text_pp:
        return new_text, terms, "subj_participle_verb_phrase_nested" if modify_subject else "obj_participle_verb_phrase_nested"
    else:
        return new_text, terms, "subj_participle_verb_phrase" if modify_subject else "obj_participle_verb_phrase"
    

In [None]:
# Probability that an eligible training example receives a participial phrase
# (translate() skips augmentation when random.random() >= participial_prob).
participial_prob = 0.15
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

# verb_pp_map: keys are used as verb predicates in the LF, values are the
# strings inserted into the sentence (see translate()).
with open('./verb_pp_map.json', 'r') as openfile:
    verb_pp_map = json.load(openfile)
# First pass over train only fills noun_pool / v_pool; the frame is unchanged.
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(
    lambda x: collect_nv(*x), axis=1, result_type='expand'
)
# Second pass augments the training examples.
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(
    lambda x: translate(*x), axis=1, result_type='expand'
)

# dev / test / gen are only re-serialised, never augmented.
dev_df[['sentence', 'LF', 'type']] = dev_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
test_df[['sentence', 'LF', 'type']] = test_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
gen_df[['sentence', 'LF', 'type']] = gen_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')

In [None]:
# Write the splits as headerless TSVs; the ./cogs_participle_verb/ directory is
# assumed to exist already.
dataset_postfix = "participle_verb"
train_df.to_csv(f'./cogs_{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./cogs_{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./cogs_{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./cogs_{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)

### Participial Verb Phrases (Easy)

In [None]:
# Re-seed so this section is reproducible on its own (set_seed comes from the
# utils star-imports; presumably it seeds the RNGs used below - confirm).
set_seed(seed)

# Fresh, empty pools for this section; collect_nv fills them with noun ('np')
# and verb ('role') predicates.
noun_pool = set()
v_pool = set()
def collect_nv(text, phi, _type):
    """Collect the noun and verb predicates of one example into the global pools.

    Single-token logical forms and logical forms containing "LAMBDA" are
    ignored. Otherwise every conjunct of ``phi`` (split on ``AND`` / ``;``) is
    parsed; predicates of 'role' terms are added to ``v_pool`` and predicates of
    'np' terms to ``noun_pool``.

    Args:
        text: the sentence (not inspected; passed through).
        phi: the COGS logical form.
        _type: the example's type label (passed through).

    Returns:
        ``(text, phi, _type)`` unchanged, so the function can be used with
        ``DataFrame.apply(..., result_type='expand')``.

    Raises:
        Exception: if a conjunct matches none of ``np_re``, ``pred_re``, ``mod_re``.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # Parse everything first so a malformed conjunct raises before any pool is
    # touched.
    data = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)

    for d in data:
        if d['type'] == 'role':
            v_pool.add(d['pred'])
        elif d['type'] == 'np':
            noun_pool.add(d['pred'])

    return text, phi, _type

def translate_regular(text, phi, _type):
    """Re-serialise a COGS logical form without changing the sentence.

    Single-token logical forms and logical forms containing "LAMBDA" are
    returned untouched. Otherwise the conjuncts are parsed and written back as
    ``"<definite terms joined by ' ; '> ; <remaining terms joined by ' AND '>"``.
    Definite ("*") noun terms are de-duplicated and sorted by variable index;
    the remaining terms keep their original order.

    Args:
        text: the sentence (passed through).
        phi: the COGS logical form.
        _type: the example's type label (passed through).

    Returns:
        ``(text, terms, _type)`` where ``terms`` is the re-serialised logical form.

    Raises:
        Exception: if a conjunct matches none of ``np_re``, ``pred_re``, ``mod_re``.
        AssertionError: if a modifier's first argument is not an ``x _`` variable.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # parse
    data = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)

    # collect
    def_terms = []
    role_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        elif d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]

    # De-duplicate definite terms, then order them by variable index
    # (the second-to-last token of "* noun ( x _ k )").
    def_terms = [*set(def_terms)]
    def_terms.sort(key=lambda x: int(x.split()[-2]))

    # combine, leaving out whichever side is empty
    sections = [" ; ".join(def_terms), " AND ".join(role_terms)]
    terms = " ; ".join(section for section in sections if section)

    return text, terms, _type

def translate(text, phi, _type):
    """Re-render a logical form and, with probability ``participial_prob``, splice
    one or two generated modifier phrases after the subject or the object noun.

    Flow:
      1. One-token LFs and LFs containing "LAMBDA" are returned unchanged.
      2. The LF is parsed conjunct by conjunct and re-rendered as
         ``<starred noun terms> ; <other terms joined by AND>``.
      3. The example is returned in that plain form (original ``_type``) when the
         sentence does not start with 'A'/'The', when the theme argument is not
         an ``x _ N`` variable, or when the random draw is >= ``participial_prob``.
      4. Otherwise a phrase ``verb_pp_map[verb] + article + noun`` is inserted
         after the chosen noun (optionally followed by a second, nested phrase),
         every later index in the LF is shifted by 3 or 6, and matching noun and
         ``nmod`` terms are added.

    Reads the module-level names ``np_re``, ``pred_re``, ``mod_re``, ``parse_np``,
    ``parse_pred``, ``parse_mod``, ``participial_prob``, ``verb_pp_map`` and
    ``noun_pool``, and draws from the global ``random`` state, so the output
    depends on the RNG state at call time.

    Returns:
        ``(sentence, LF, type)``. When a phrase was inserted the type is one of
        "subj_participle_verb_phrase", "obj_participle_verb_phrase" or their
        "_nested" variants.

    Raises:
        Exception: a conjunct matched none of the three patterns.
        AssertionError: from the internal sanity checks (a ``mod`` term whose
            first argument is not an ``x _`` variable, or "theme" not occurring
            exactly once as a token when it occurs in ``phi``).
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        # The first matching pattern decides which parser handles the conjunct.
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect
    def_terms = []
    role_terms = []
    nmod_terms = []  # never filled or read in this function
    role_set = set([])  # filled below but never read in this function
    object_entvar = None  # second argument of the last "theme" role seen, if any
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
            if d['role'] == "theme":
                object_entvar = d['entvar']
                
            role_set.add(d['role'])
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms
    # De-duplicate, then order by the numeric index (second-to-last token of each term).
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    rest_terms = role_terms
    
    # Despite its name, this flag is simply "the first word is neither 'A' nor 'The'".
    start_with_pp = False if text_split[0] in {'A', 'The'} else True
    
    # No augmentation when the sentence does not open with an article, or when the
    # theme argument is not an "x _ N" variable: return the plain re-rendering.
    if start_with_pp or (object_entvar is not None and "x _ " not in object_entvar):
        # combine
        def_terms = " ; ".join(def_terms)
        if def_terms == "":
            terms = " AND ".join(rest_terms)
        elif " AND ".join(rest_terms) == "":
            terms = def_terms
        else:
            terms = def_terms + " ; " + " AND ".join(rest_terms)
        return text, terms, _type
    
    # Only a participial_prob fraction of the remaining examples is augmented;
    # the rest are returned in the same plain form as above.
    if random.random() >= participial_prob:
        # combine
        def_terms = " ; ".join(def_terms)
        if def_terms == "":
            terms = " AND ".join(rest_terms)
        elif " AND ".join(rest_terms) == "":
            terms = def_terms
        else:
            terms = def_terms + " ; " + " AND ".join(rest_terms)
        return text, terms, _type
    
    # agent -> subject
    # theme -> object
    # Note: `"theme" in phi` is a substring test, while the assert counts whole tokens.
    if "theme" in phi:
        # only one object if it exists!
        assert 1 == phi.split().count("theme")
        # With an object present, pick subject or object with equal probability.
        modify_subject = True if random.random() < 0.5 else False
    else:
        modify_subject = True
    
    # reindex!
    def reindex_terms(terms, offset=1, distance=3):
        """Return copies of `terms` in which every numeric token > offset is increased by distance."""
        new_terms = []
        for term in terms:
            new_term = []
            for t in term.split():
                if t.isnumeric() and int(t) > offset:
                    new_term += [str(int(t)+distance)]
                else:
                    new_term += [t]
            new_terms += [" ".join(new_term)]
        return new_terms
    
    # we need to decide whether to add to subj or obj.
    second_text_pp = False  # becomes True when a second, nested phrase is added
    if modify_subject:
        # start_with_pp is always False here (the True case returned above),
        # so this always picks text_split[1], the word right after the article.
        subject = text_split[0] if start_with_pp else text_split[1]
        verb_pp = random.choice(list(verb_pp_map.keys()))
        noun_pp = random.choice(list(noun_pool))
        articles = 'a' if random.random() < 0.5 else 'the'
        # Inserted words: verb_pp_map[verb_pp], then the article, then the noun.
        # The head noun is at index 1, so the new noun lands at index 4.
        text_pp = [verb_pp_map[verb_pp], articles, noun_pp]
        if articles == 'a':
            pp_np_term =  [f"{noun_pp} ( x _ 4 )"]
        elif articles == 'the':
            pp_np_term = [f"* {noun_pp} ( x _ 4 )"]
        pp_role_term = [f"{subject} . nmod . {verb_pp} ( x _ 1 , x _ 4 )"]
        
        # Half of the time nest a second phrase under the first new noun (index 7).
        if random.random() < 0.5:
            second_text_pp = True
            second_verb_pp = random.choice(list(verb_pp_map.keys()))
            second_noun_pp = random.choice(list(noun_pool))
            second_articles = 'a' if random.random() < 0.5 else 'the'
            text_pp += [verb_pp_map[second_verb_pp], second_articles, second_noun_pp]
            if second_articles == 'a':
                pp_np_term += [f"{second_noun_pp} ( x _ 7 )"]
            elif second_articles == 'the':
                pp_np_term += [f"* {second_noun_pp} ( x _ 7 )"]
            pp_role_term += [f"{noun_pp} . nmod . {second_verb_pp} ( x _ 4 , x _ 7 )"]
            
        # Shift every original index > 1 by the number of inserted words (3 or 6).
        def_terms = reindex_terms(def_terms, distance=6 if second_text_pp else 3)
        rest_terms = reindex_terms(rest_terms, distance=6 if second_text_pp else 3)
        
        # Insertion point used further down: right after token 1.
        object_pos = 1
        
    else:
        # locate the obj
        assert "x _ " in object_entvar
        object_pos = int(object_entvar.split()[-1])
        _object = text_split[object_pos]
        
        # Same construction as the subject case, with indices relative to object_pos.
        verb_pp = random.choice(list(verb_pp_map.keys()))
        noun_pp = random.choice(list(noun_pool))
        articles = 'a' if random.random() < 0.5 else 'the'
        text_pp = [verb_pp_map[verb_pp], articles, noun_pp]
        if articles == 'a':
            pp_np_term = [f"{noun_pp} ( x _ {object_pos+3} )"]
        elif articles == 'the':
            pp_np_term = [f"* {noun_pp} ( x _ {object_pos+3} )"]
        pp_role_term = [f"{_object} . nmod . {verb_pp} ( x _ {object_pos} , x _ {object_pos+3} )"]
        
        if random.random() < 0.5:
            second_text_pp = True
            second_verb_pp = random.choice(list(verb_pp_map.keys()))
            second_noun_pp = random.choice(list(noun_pool))
            second_articles = 'a' if random.random() < 0.5 else 'the'
            text_pp += [verb_pp_map[second_verb_pp], second_articles, second_noun_pp]
            if second_articles == 'a':
                pp_np_term += [f"{second_noun_pp} ( x _ {object_pos+6} )"]
            elif second_articles == 'the':
                pp_np_term += [f"* {second_noun_pp} ( x _ {object_pos+6} )"]
            pp_role_term += [f"{noun_pp} . nmod . {second_verb_pp} ( x _ {object_pos+3} , x _ {object_pos+6} )"]
 
        # Only indices after the object move.
        def_terms = reindex_terms(def_terms, object_pos, distance=6 if second_text_pp else 3)
        rest_terms = reindex_terms(rest_terms, object_pos, distance=6 if second_text_pp else 3)
    
    # Route the new noun terms: starred ones join the prefix, the others the body.
    for np_t in pp_np_term:
        if np_t[0] == "*":
            def_terms += [np_t]
        else:
            rest_terms += [np_t]
    rest_terms += pp_role_term
    # Splice the generated words into the sentence right after the modified noun.
    text_split = text_split[:object_pos+1] + text_pp + text_split[object_pos+1:]
    new_text = " ".join(text_split)
    def_terms.sort(key = lambda x: int(x.split()[-2]))   
    # Two-part sort key for body terms, read from the token layout of each term:
    #   - split()[-5] == "("            -> noun term `pred ( x _ N )`: key (N, -1)
    #   - split()[-2] is numeric        -> two-variable term `... ( x _ A , x _ B )`: key (A, B)
    #   - otherwise                     -> second argument is a word: key (A, position of
    #                                      that word in the new sentence)
    # The leading `split()[-1] == "nmod"` test never fires for the terms built in this
    # function, since each of them ends with ")".
    rest_terms.sort(
        key = lambda x: (
            int(x.split()[-7]) if x.split()[-1] == "nmod" else int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), 
            int(x.split()[-3]) if x.split()[-1] == "nmod" else -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else new_text.split().index(x.split()[-2])
        )
    )  
    # combine
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # The type label records which noun was modified and whether the phrase is nested.
    if second_text_pp:
        return new_text, terms, "subj_participle_verb_phrase_nested" if modify_subject else "obj_participle_verb_phrase_nested"
    else:
        return new_text, terms, "subj_participle_verb_phrase" if modify_subject else "obj_participle_verb_phrase"
    

In [None]:
# Probability threshold read by translate(): an eligible training example is
# augmented only when random.random() < participial_prob.
participial_prob = 0.15
# Reload the raw COGS splits (tab-separated, no header row).
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

# verb_pp_map is read as a module-level global inside translate().
with open('./verb_pp_map.json', 'r') as openfile:
    verb_pp_map = json.load(openfile)
# First pass over train: collect_nv (defined above) is presumably run for its side
# effect of filling the noun/verb pools that translate() samples from — verify
# against its definition. Its return value is written back into the same columns.
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(
    lambda x: collect_nv(*x), axis=1, result_type='expand'
)
# Second pass over train: the stochastic augmentation.
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(
    lambda x: translate(*x), axis=1, result_type='expand'
)

# dev / test / gen are only re-rendered (translate_regular), never augmented.
dev_df[['sentence', 'LF', 'type']] = dev_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
test_df[['sentence', 'LF', 'type']] = test_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
gen_df[['sentence', 'LF', 'type']] = gen_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')

In [None]:
# Write the four splits to ./cogs_participle_verb_easy/ as header-less TSV files
# (same three-column layout as the input). Assumes the directory already exists.
dataset_postfix = "participle_verb_easy"
train_df.to_csv(f'./cogs_{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./cogs_{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./cogs_{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./cogs_{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)

### Set-based evaluation

In [None]:
# Load the ReCOGS splits (tab-separated, no header row). This rebinds train_df,
# dev_df, test_df and gen_df, replacing the frames built in the previous section.
train_df = pd.read_csv("./recogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./recogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./recogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./recogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

In [None]:
# Sanity check: print every non-LAMBDA logical form in train and gen for which
# translate_invariant_form_neoD returns an empty dict. No output means none did.
# (translate_invariant_form_neoD is defined outside this cell; presumably it
# comes from the star-imported utils — verify.)
for lf in train_df["LF"]:
    if "LAMBDA" not in lf and translate_invariant_form_neoD(lf) == {}:
        print(lf)
for lf in gen_df["LF"]:
    if "LAMBDA" not in lf and translate_invariant_form_neoD(lf) == {}:
        print(lf)

### ReCOGS without coindexing change

In [None]:
# Seed before the stochastic augmentations below, which draw from `random`.
# set_seed is not defined in this notebook's visible cells; presumably it comes
# from the star-imported utils and seeds the `random` module — verify.
set_seed(seed)
def translate_regular(text, phi, _type):
    """Re-render one example's logical form without augmenting it.

    A one-token LF, or any LF containing ``LAMBDA``, is handed back untouched.
    Every other LF is split on ``AND`` / ``;`` into conjuncts, each conjunct is
    parsed by ``parse_np`` / ``parse_pred`` / ``parse_mod`` (the first regex
    that matches decides which), and the parsed pieces are rendered again as
    ``<starred noun terms, de-duplicated and sorted by index> ; <all other
    terms joined by AND>``.

    Returns:
        ``(text, LF, _type)``; only the LF can differ from the input.

    Raises:
        Exception: a conjunct matched none of the three patterns.
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # Phase 1: parse every conjunct before rendering anything.
    parsed = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            parsed.append(parse_np(conj))
        elif pred_re.search(conj):
            parsed.append(parse_pred(conj))
        elif mod_re.search(conj):
            parsed.append(parse_mod(conj))
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")

    # Phase 2: render. Starred nouns go to the prefix, everything else to the body
    # (body terms keep the order in which they appeared in the input LF).
    starred = []
    body = []
    for d in parsed:
        kind = d['type']
        if kind == 'np':
            if d['definiteness'] == '*':
                starred.append(f"* {d['pred']} ( {d['entvar']} )")
            else:
                body.append(f"{d['pred']} ( {d['entvar']} )")
        elif kind == 'role':
            body.append(f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )")
        elif kind == 'mod':
            assert "x _" in d['e1']
            body.append(f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )")

    # The second-to-last token of a starred term is its numeric index.
    prefix = " ; ".join(sorted(set(starred), key=lambda term: int(term.split()[-2])))
    suffix = " AND ".join(body)

    # Glue the non-empty halves together with " ; ".
    terms = " ; ".join(part for part in (prefix, suffix) if part != "")
    return text, terms, _type
        
def translate(text, phi, _type):
    """Re-render a logical form and, with probability ``proposing_prob``, move the
    noun phrase carrying an ``nmod`` modifier to the front of the sentence.

    One-token LFs and LFs containing "LAMBDA" are returned unchanged. Every
    other LF is parsed and re-rendered exactly as in ``translate_regular``. If
    the result contains an ``nmod`` token and the random draw is below
    ``proposing_prob``, the modified phrase is moved to the sentence start and
    all numeric indices in the LF are remapped to the new word positions.

    Reads the module-level names ``np_re``, ``pred_re``, ``mod_re``,
    ``parse_np``, ``parse_pred``, ``parse_mod`` and ``proposing_prob``, and draws
    from the global ``random`` state.

    Returns:
        ``(sentence, LF, type)``; the type becomes "preposing_1" or "preposing_2"
        (one or two ``nmod`` tokens) when the phrase was moved.

    Raises:
        Exception: a conjunct matched none of the three patterns.
        AssertionError: a ``mod`` term whose first argument is not an ``x _``
            variable, or an ``nmod`` count other than 1 or 2 on the preposing path.
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse
    text_split = text.split()  # not used below
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        # The first matching pattern decides which parser handles the conjunct.
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect
    def_terms = []
    role_terms = []
    nmod_terms = []  # never filled or read in this function
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms
    # De-duplicate, then order by the numeric index (second-to-last token of each term).
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    rest_terms = role_terms
    
    # combine
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # Only examples whose LF contains an nmod term can be preposed.
    if "nmod" not in terms.split():
        return text, terms, _type
    
    # Prepose only a proposing_prob fraction of the eligible examples.
    if random.random() >= proposing_prob:
        return text, terms, _type
    
    # upper_bound = how many tokens after the first modifier word still belong to
    # the moved phrase: 3 for a single modifier, 6 when a second one follows.
    if terms.split().count("nmod") == 2:
        upper_bound = 6
        return_type = "preposing_2"
    elif terms.split().count("nmod") == 1:
        upper_bound = 3
        return_type = "preposing_1"
    else:
        assert False
        
    # Token two places after the first "nmod" in the rendered LF, i.e. the
    # modifier's own pred word in `nppred . nmod . pred ( ... )`.
    nmod = terms.split()[terms.split().index("nmod")+2]
    # The moved phrase runs from two tokens before the first occurrence of that
    # word in the sentence up to upper_bound tokens after it.
    pre_phrase = text.split()[text.split().index(nmod)-2 : text.split().index(nmod)+upper_bound]
    pre_phrase[0] = pre_phrase[0].capitalize()
    pre_text = text.split()[:text.split().index(nmod)-2]
    # NOTE(review): if the phrase starts at token 0, pre_text is empty and
    # pre_text[0] raises IndexError — confirm that case cannot occur in the data.
    if pre_text[0] in ["The", "A"]:
        pre_text[0] = pre_text[0].lower()
    post_text = text.split()[text.split().index(nmod)+upper_bound:]
    # New word order: moved phrase, then what preceded it, then what followed it.
    pre_text = pre_phrase + pre_text + post_text
    pre_text = " ".join(pre_text)

    # index_map: old token position (as a string) -> new token position (as a string).
    index_map = {}
    idx = 0
    # Tokens of the moved phrase now occupy positions 0 .. upper_bound+1.
    for i in range(text.split().index(nmod)-2, text.split().index(nmod)+upper_bound):
        index_map[f"{i}"] = f"{idx}"
        idx += 1
    # Tokens that preceded the phrase are pushed right by the phrase length.
    for i in range(text.split().index(nmod)-2):
        ii = 2+upper_bound+i
        index_map[f"{i}"] = f"{ii}"
        idx += 1
    # Tokens after the phrase: idx now equals index(nmod)+upper_bound, so these
    # positions map to themselves.
    for i in range(text.split().index(nmod)+upper_bound, len(text.split())):
        ii = idx+(i-(text.split().index(nmod)+upper_bound))
        index_map[f"{i}"] = f"{ii}"  
            
    # now handle LF
    # Rewrite every numeric token through index_map.
    pre_terms = []
    for t in terms.split():
        if t.isnumeric():
            pre_terms += [str(index_map[str(int(t))])]
        else:
            pre_terms += [t]
    pre_terms = " ".join(pre_terms)

    # Everything before the last " ; " is the prefix of starred noun terms; the
    # last segment is the AND-joined body. Both are re-sorted by their new indices.
    pre_terms_def = pre_terms.split(" ; ")[:-1]
    pre_terms_def.sort(key = lambda x: int(x.split()[-2]))  
    pre_terms_role = pre_terms.split(" ; ")[-1].split(" AND ")
    # Body sort key: noun terms (split()[-5] == "(") sort by (index, -1); two-variable
    # terms by (first index, second index); terms whose second argument is a word by
    # (first index, position of that word in the new sentence).
    pre_terms_role.sort(key = lambda x: (int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else pre_text.split().index(x.split()[-2])))  
    pre_terms_role = " AND ".join(pre_terms_role)
    pre_terms = " ; ".join(pre_terms_def + [pre_terms_role])

    return pre_text, pre_terms, return_type

def add_um(sentence):
    """Sprinkle "um" fillers between the words of ``sentence``.

    After every word except the first and the last two, a coin flip
    (``random.random() > 0.5``) decides whether one to three "um" tokens are
    inserted. Draws come from the global ``random`` state.

    Returns:
        ``(new_sentence, mapping)`` where ``mapping[i]`` is the position in the
        new sentence of the word that was at position ``i`` in the original.
    """
    tokens = sentence.split()
    last_eligible = len(tokens) - 2
    out_tokens = []
    position_of = {}
    for idx, token in enumerate(tokens):
        position_of[idx] = len(out_tokens)
        out_tokens.append(token)
        # Fillers may only follow words strictly between the first word and the
        # final two tokens.
        if not (0 < idx < last_eligible):
            continue
        if random.random() > 0.5:
            out_tokens.extend(["um"] * random.choice([1,2,3]))
    return " ".join(out_tokens), position_of

def sprinkle(text, phi, _type):
    """With probability ``sprinkle_prob``, insert "um" fillers into the sentence
    and remap the numeric indices in the logical form to the new word positions.

    Examples whose type contains "preposition" are always returned unchanged.
    NOTE(review): the preposing ``translate`` in this notebook labels its output
    "preposing_1" / "preposing_2", which this substring test does not match —
    confirm whether preposed examples are meant to be skipped here.

    Returns:
        ``(text, phi, _type)`` unchanged, or ``(um_text, um_phi, f"{_type}+sprinkle")``.
    """
    # Short-circuit keeps the random draw from happening for skipped types.
    if "preposition" in _type or random.random() >= sprinkle_prob:
        return text, phi, _type

    um_text, token_mapping = add_um(text)
    # Every numeric token is an old word position; look up its new position.
    um_phi = " ".join(
        str(token_mapping[int(tok)]) if tok.isnumeric() else tok
        for tok in phi.split()
    )
    return um_text, um_phi, f"{_type}+sprinkle"
            
# Start again from the raw COGS splits (tab-separated, no header row).
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

# Thresholds read as globals by translate() and sprinkle() respectively.
proposing_prob = 0.05
sprinkle_prob = 0.05
# Train: preposing first, then filler insertion on the result (order matters,
# because sprinkle remaps the indices that translate has already rewritten).
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(lambda x: translate(*x), axis=1, result_type='expand')
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(lambda x: sprinkle(*x), axis=1, result_type='expand')
# dev / test / gen are only re-rendered, never augmented.
dev_df[['sentence', 'LF', 'type']] = dev_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
test_df[['sentence', 'LF', 'type']] = test_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
gen_df[['sentence', 'LF', 'type']] = gen_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')

In [None]:
# Collect every purely numeric key of the target vocabulary into a list.
# NOTE(review): nothing in the cells visible from here on reads
# existing_digit_pool; it may be a leftover from sampling variable names at random.
existing_digit_pool = set([])
# loading target vocab to random sample our variable names
for k, v in load_vocab("./model/tgt_vocab.txt").items():
    if k.isnumeric():
        existing_digit_pool.add(k)
existing_digit_pool = list(existing_digit_pool)

def translate(text, phi):
    """Convert one (sentence, logical form) pair to the positional-index format.

    * A one-token LF ``w`` becomes ``LAMBDA a . w ( a )``.
    * A LAMBDA form with exactly seven tokens is kept as is; any other LAMBDA
      form is split on the word ``text`` and rebuilt so that the predicate gets
      its own ``text ( var ) AND`` conjunct ahead of the role conjuncts.
    * Any other LF is parsed conjunct by conjunct. Arguments that are not
      ``x _ N`` variables are replaced by the position of that word in the
      sentence and get their own ``word ( x _ N )`` term. All noun terms
      (starred or not) form the ``;``-separated prefix, sorted by index; each
      predicate contributes one ``pred ( event )`` term followed by its
      ``role ( event , arg )`` terms. Finally the ``x`` and ``_`` tokens are
      dropped, leaving bare numbers as variables.

    Returns:
        ``(text, LF)``.

    Raises:
        Exception: a conjunct matched none of the three patterns.
        AssertionError: a non-variable argument does not occur exactly once in
            the sentence.
    """
    token_count = len(phi.split())
    if token_count == 1:
        return text, f"LAMBDA a . {phi} ( a )"

    if "LAMBDA" in phi:
        if token_count == 7:
            return text, phi
        cleaned_phi = []
        for chunk in phi.split(text):
            if "LAMBDA" in chunk:
                cleaned_phi.append(chunk.strip())
                continue
            # Role chunk such as "agent ( e , a )": its third token is the event variable.
            stripped = chunk.strip(" .")
            verb_args = stripped.split()[2]
            cleaned_phi.append(stripped)
        rebuilt = cleaned_phi[:1] + [f"{text} ( {verb_args} ) AND"] + cleaned_phi[1:]
        return text, " ".join(rebuilt)

    # parse
    text_split = text.split()
    parsed = []
    for conj in re.split(r"\s*(?:AND|;)\s*", phi):
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
            if "x _" not in d['entvar']:
                # The argument is a bare word: remember it, then point the
                # variable at that word's (unique) position in the sentence.
                d['entvar_name'] = d['entvar']
                assert text_split.count(d['entvar']) == 1
                name_idx = text_split.index(d['entvar'])
                d['entvar'] = f"x _ {name_idx}"
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        parsed.append(d)

    # collect
    noun_terms = []
    body_terms = []
    for d in parsed:
        kind = d['type']
        if kind == 'np':
            if d['definiteness'] == '*':
                noun_terms.append(f"* {d['pred']} ( {d['entvar']} )")
            else:
                noun_terms.append(f"{d['pred']} ( {d['entvar']} )")
        elif kind == 'role':
            # Emit the predicate's own term once, before its first role term.
            if f"{d['pred']} ( {d['eventvar']} )" not in body_terms:
                body_terms.append(f"{d['pred']} ( {d['eventvar']} )")
            body_terms.append(f"{d['role']} ( {d['eventvar']} , {d['entvar']} )")
            if "entvar_name" in d:
                noun_terms.append(f"{d['entvar_name']} ( {d['entvar']} )")
        elif kind == 'mod':
            body_terms.append(f"nmod . {d['pred']} ( {d['e1']} , {d['e2']} )")

    # De-duplicate the prefix and order it by index (second-to-last token).
    prefix = " ; ".join(sorted(set(noun_terms), key=lambda term: int(term.split()[-2])))
    suffix = " AND ".join(body_terms)
    terms = " ; ".join(part for part in (prefix, suffix) if part != "")

    # Drop the "x" and "_" tokens so that variables become bare numbers.
    terms = " ".join(tok for tok in terms.split() if tok not in ("_", "x"))
    return text, terms

In [None]:
# Parameters of the example-concatenation cell further down:
# sampled_n = number of passes over the data, append_k = concatenated examples per pass.
sampled_n = 1
append_k = 3072

# `translate` here is the two-argument version defined in the previous cell; it
# shadows the earlier three-argument definitions of the same name, so this cell
# only works when the cells are run in order. All four splits are converted.
train_df[['sentence', 'LF']] = train_df[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')
dev_df[['sentence', 'LF']] = dev_df[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')
test_df[['sentence', 'LF']] = test_df[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')
gen_df[['sentence', 'LF']] = gen_df[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')

In [None]:
def reindex(LFs, initial_indexes):
    """Merge several logical forms into one, shifting each one's numeric indices.

    Each LF is expected in the form ``prefix_1 ; ... ; prefix_k ; body`` where
    ``body`` is a list of terms joined by `` AND ``. Every numeric token of
    ``LFs[i]`` is increased by ``initial_indexes[i]`` (an LF with shift 0 is
    used verbatim). The merged LF lists all prefixes first, in input order,
    followed by all body terms joined by `` AND ``.

    Args:
        LFs: list of logical-form strings.
        initial_indexes: one integer shift per LF.

    Returns:
        The merged logical-form string. When none of the inputs has a `` ; ``
        prefix, only the AND-joined body is returned; previously the result
        started with a stray `` ; `` separator in that case.

    Raises:
        IndexError: ``initial_indexes`` is shorter than ``LFs``.
    """
    prefix_terms = []
    body_terms = []
    for i, lf in enumerate(LFs):
        shift = initial_indexes[i]
        if shift != 0:
            # Only numeric tokens are indices; every other token is copied as is.
            lf = " ".join(
                str(int(tok) + shift) if tok.isnumeric() else tok
                for tok in lf.split()
            )
        # The last " ; "-separated segment is the body, the rest is the prefix.
        *prefix, body = lf.split(" ; ")
        prefix_terms.extend(prefix)
        body_terms.extend(body.split(" AND "))

    merged_body = " AND ".join(body_terms)
    if not prefix_terms:
        # Nothing to put in front of the body: do not emit a leading " ; ".
        return merged_body
    return " ; ".join(prefix_terms) + " ; " + merged_body

# Build `append_k` long training examples by gluing three sentences together.
# Offsets advance in steps of 6 while each step consumes three rows (offsets 0, 1
# and 2 counted from the end), so the next three rows are skipped every time.
start_indexes = [i*6 for i in range(append_k)]
append_data = []

for i in range(sampled_n):
    # Ascending sort by sentence length in characters, so negative positions
    # (.iloc[-1], .iloc[-2], ...) address the longest sentences.
    sorted_train_df = train_df.sort_values(by="sentence", key=lambda x: x.str.len())
    for start_index in start_indexes:
        # The 2nd and 3rd sentence no longer start the example, so a leading
        # 'The' / 'A' is lower-cased; any other first word keeps its first character.
        conj_1 = sorted_train_df.iloc[-2-start_index].sentence
        if conj_1.split()[0] in {'The', 'A'}:
            conj_1_first = conj_1[0].lower()
        else:
            conj_1_first = conj_1[0]
            
        conj_2 = sorted_train_df.iloc[-3-start_index].sentence
        if conj_2.split()[0] in {'The', 'A'}:
            conj_2_first = conj_2[0].lower()
        else:
            conj_2_first = conj_2[0]
            
        # Sentence: sentence 1 and sentence 2 each lose their last character
        # (presumably the final "." — verify against the data); sentence 3 keeps it.
        # LF: reindex() shifts sentence 2's indices by the word count of trimmed
        # sentence 1, and sentence 3's by the word counts of trimmed sentences 1 + 2.
        append_data += [
            [sorted_train_df.iloc[-1-start_index].sentence[:-1]+\
            conj_1_first+\
            sorted_train_df.iloc[-2-start_index].sentence[1:-1]+\
            conj_2_first+\
            sorted_train_df.iloc[-3-start_index].sentence[1:],
            reindex(
                [
                    sorted_train_df.iloc[-1-start_index].LF,
                    sorted_train_df.iloc[-2-start_index].LF,
                    sorted_train_df.iloc[-3-start_index].LF
                ],
                [
                    0,
                    len(sorted_train_df.iloc[-1-start_index].sentence[:-1].strip().split()),
                    len(sorted_train_df.iloc[-1-start_index].sentence[:-1].strip().split())+
                    len(sorted_train_df.iloc[-2-start_index].sentence[:-1].strip().split())
                ]
            ),
            'concat']
        ]
        
# One row per concatenated example, all labelled with type 'concat'.
append_data = pd.DataFrame(append_data, columns =['sentence', 'LF', 'type'])

In [None]:
# Append the concatenated examples to the training split and drop exact duplicate rows.
# Not idempotent: re-running this cell concatenates append_data onto the already
# extended train_df (the duplicates are then removed again by drop_duplicates).
train_df = pd.concat([train_df, append_data])
train_df = train_df.drop_duplicates()

# Write the four splits to ./recogs_positional_index/ as header-less TSV files.
# Assumes the directory already exists.
dataset_postfix = "recogs_positional_index"
train_df.to_csv(f'./{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)

### Verification of paraphrased participle verb from ChatGPT

In [None]:
# Check the verb -> paraphrase map in ./verb_pp_map.json: every token of every
# paraphrase must be tagged VERB by spaCy, except for the keys that were checked
# by hand. For each VERB token the pair (key, lemma) is recorded in lemma_mapping
# so that the next cell can compare the key with the lemma.
nlp = spacy.load("en_core_web_sm")
# The context manager closes the file handle as soon as the JSON is parsed
# (a bare open() left it open for the rest of the kernel session).
with open('./verb_pp_map.json') as f:
    data = json.load(f)
lemma_mapping = []
# Keys whose paraphrase is known to get a non-VERB tag and was verified manually.
hand_checks = {"nurse", "bless", "ship"}
for k, v in data.items():
    # Running the full spaCy pipeline per value is more work than strictly needed
    # for a POS tag and a lemma, but the map is small.
    doc = nlp(v)
    for token in doc:
        if token.pos_ == "VERB":
            lemma_mapping += [(k, token.lemma_)]
        else:
            if k not in hand_checks:
                # Show the offending entry, then stop so that it gets inspected.
                print(k, v)
                assert False # hand check.

In [None]:
# Each pair is (key from verb_pp_map.json, spaCy lemma of a VERB token in its
# paraphrase). The two must be identical; the first mismatch is printed and
# stops the cell.
for p in lemma_mapping:
    if p[0] != p[1]:
        print(p)
        assert False