## Our reformatting functions
- Redundant token removal
- Example concatenation for longer training sequences
- Preposing
- Preposing + Interjection
- ReCOGS

In [None]:
import pandas as pd
import re, random, copy

import pandas as pd
import re, random, copy

from __future__ import absolute_import, division, print_function

import collections
import unicodedata
import torch
import six
from torch.utils.data import Dataset
import random
import numpy as np


# Noun-phrase conjunct: an optional leading `*`, a predicate word, then a
# parenthesised argument.
# Groups: (star-or-None, predicate, argument).
np_re = re.compile(r"""
    ^
    \s*(\*)?
    \s*(\w+?)\s*
    \(
    \s*(.+?)\s*
    \)
    \s*$""", re.VERBOSE)

# Two-part conjunct of the form `word . word ( arg , arg )`.
# Groups: (first word, second word, first argument, second argument).
pred_re = re.compile(r"""
    ^
    \s*(\w+?)\s*
    \.
    \s*(\w+?)\s*
    \(
    \s*(.+?)\s*
    ,
    \s*(.+?)\s*
    \)
    \s*$""", re.VERBOSE)

# Three-part conjunct of the form `word . word . word ( arg , arg )`.
# Groups: (first word, second word, third word, first argument, second argument).
mod_re = re.compile(r"""
    ^
    \s*(\w+?)\s*
    \.
    \s*(\w+?)\s*
    \.
    \s*(\w+?)\s*
    \(
    \s*(.+?)\s*
    ,
    \s*(.+?)\s*
    \)
    \s*$""", re.VERBOSE)


def parse_np(phi):
    """Parse a noun-phrase conjunct with `np_re` into a dict.

    The returned dict has the keys 'type' (always 'np'), 'definiteness'
    ('*' when the conjunct carries a leading star, otherwise ''), 'pred'
    and 'entvar'.
    """
    star, noun, variable = np_re.search(phi).groups()
    return {
        'type': 'np',
        'definiteness': '*' if star is not None else '',
        'pred': noun,
        'entvar': variable,
    }

def parse_pred(phi):
    """Parse a `pred . role ( eventvar , entvar )` conjunct with `pred_re` into a dict."""
    found = pred_re.search(phi)
    return {
        'type': 'role',
        'role': found.group(2),
        'pred': found.group(1),
        'entvar': found.group(4),
        'eventvar': found.group(3),
    }

def parse_mod(phi):
    """Parse a `nppred . rel . pred ( e1 , e2 )` conjunct with `mod_re` into a dict."""
    found = mod_re.search(phi)
    # `rel` is kept in the result even though it is always 'nmod'.
    return {
        'type': 'mod',
        'rel': found.group(2),
        'pred': found.group(3),
        'nppred': found.group(1),
        'e1': found.group(4),
        'e2': found.group(5),
    }

def translate_entity_simplied(entvar, data):
    """Render the entity bound to `entvar` in a long and a short form.

    `data` is a list of parsed conjunct dicts. The first 'np' entry whose
    'entvar' equals `entvar` is rendered as
    ("<definiteness> <entvar> ( <pred> )", "<definiteness> <pred>").
    When no such entry exists, `entvar` itself is returned twice.
    """
    candidates = [
        item for item in data
        if item['type'] == 'np' and item.get("entvar") == entvar
    ]
    if candidates:
        np_entry = candidates[0]
        marker = np_entry['definiteness']
        full_form = f"{marker} {np_entry['entvar']} ( {np_entry['pred']} )"
        short_form = f"{marker} {np_entry['pred']}"
        return full_form, short_form
    return entvar, entvar

def translate_entity(entvar, data):
    """Render the entity bound to `entvar` as "<definiteness> <entvar> ( <pred> )".

    `data` is a list of parsed conjunct dicts; the first 'np' entry whose
    'entvar' equals `entvar` is used. When there is none, `entvar` is
    returned unchanged.
    """
    candidates = [
        item for item in data
        if item['type'] == 'np' and item.get("entvar") == entvar
    ]
    if candidates:
        np_entry = candidates[0]
        return f"{np_entry['definiteness']} {np_entry['entvar']} ( {np_entry['pred']} )"
    return entvar
    
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input.

    Args:
        text: a `str` (returned unchanged) or `bytes` (decoded as UTF-8,
            silently dropping undecodable bytes).

    Returns:
        The text as `str`.

    Raises:
        ValueError: if `text` is neither `str` nor `bytes`.
    """
    # This file uses f-strings and therefore only runs on Python 3, so the
    # former Python 2 branch (which referenced the undefined name `unicode`)
    # was unreachable and has been dropped.
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))

def convert_tokens_to_ids(vocab, tokens):
    """Map each token to its id in `vocab`, using the '[UNK]' id for unknown tokens."""
    # '[UNK]' is only looked up when an unknown token is actually met.
    return [
        vocab[token] if token in vocab else vocab['[UNK]']
        for token in tokens
    ]
        
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary.

    Args:
        vocab_file: path of a text file holding one token per line.

    Returns:
        An `OrderedDict` mapping each whitespace-stripped line to its
        zero-based line number. Blank lines are kept (as the token ''), and
        a repeated token keeps the index of its last occurrence.
    """
    vocab = collections.OrderedDict()
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding. Text-mode reads already yield `str`, so no further
    # conversion of each line is needed.
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.strip()] = index
    return vocab

### Redundant Token Removal
TODO: the set of tokens to drop is hard-coded as `removing_set` in `token_removal`; to produce a different variant, change that set.

In [None]:
def token_removal(text, phi, removing_set=frozenset({'x', '_', '(', ')', ','})):
    """Drop redundant tokens from a logical form.

    Args:
        text: the source sentence; unused, accepted so a (sentence, LF) row
            can be splatted into the call.
        phi: the logical form as a whitespace-separated token string.
        removing_set: tokens to delete. Defaults to the original hard-coded
            set {'x', '_', '(', ')', ','}; pass another collection to
            produce a different variant.

    Returns:
        `phi` with every token found in `removing_set` removed and the
        remaining tokens joined by single spaces.
    """
    kept = [token for token in phi.split() if token not in removing_set]
    return " ".join(kept)
# Load the four splits as headerless TSV files with the columns
# sentence / LF / type.
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

# Rewrite the LF column of every split row by row; each (sentence, LF) pair
# is unpacked into token_removal(text, phi).
train_df['LF'] = train_df[['sentence', 'LF']].apply(lambda x: token_removal(*x), axis=1,)
dev_df['LF'] = dev_df[['sentence', 'LF']].apply(lambda x: token_removal(*x), axis=1,)
test_df['LF'] = test_df[['sentence', 'LF']].apply(lambda x: token_removal(*x), axis=1,)
gen_df['LF'] = gen_df[['sentence', 'LF']].apply(lambda x: token_removal(*x), axis=1,)

In [None]:
# Write the token-removed splits as headerless, index-free TSV files; the
# postfix names the variant in each file name.
# NOTE(review): the target directory ./cogs_token_removal is not created
# here and presumably must already exist - confirm.
dataset_postfix = "remove_x_(,)"
train_df.to_csv(f'./cogs_token_removal/train_{dataset_postfix}.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./cogs_token_removal/dev_{dataset_postfix}.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./cogs_token_removal/test_{dataset_postfix}.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./cogs_token_removal/gen_{dataset_postfix}.tsv', sep='\t', index=False, header=False)

### Example Concatenations

In [None]:
def reindex(LFs, initial_indexes):
    """Merge several logical forms into one, shifting their numeric indices.

    Args:
        LFs: logical-form strings, in the order their sentences are
            concatenated.
        initial_indexes: for each LF, the offset added to every numeric
            token in it (0 leaves that LF untouched).

    Returns:
        One logical form. All ' ; '-separated items containing '*' come
        first, joined by ' ; '. The remaining items of each LF follow, with
        'AND' placed between consecutive LFs. An empty prefix or body no
        longer leaves a dangling ' ; ' in the result.
    """
    prefix_items = []
    body_items = []
    for i, lf in enumerate(LFs):
        offset = initial_indexes[i]
        if offset != 0:
            # Shift every numeric token by the number of words that precede
            # this sentence in the concatenation.
            lf = " ".join(
                str(int(token) + offset) if token.isnumeric() else token
                for token in lf.split()
            )

        for item in lf.split(" ; "):
            if "*" in item:
                prefix_items.append(item)
            else:
                body_items.append(item)
        # Separator between the bodies of consecutive LFs.
        body_items.append("AND")
    # Drop the separator that trails the last LF.
    body_items = body_items[:-1]

    prefix = " ; ".join(prefix_items)
    body = " ".join(body_items)
    # Join only the non-empty parts so the result never starts or ends with
    # a stray ' ; '.
    return " ; ".join(part for part in (prefix, body) if part)

In [None]:
# Build one augmented training set per value of k: the original training
# data plus k examples, each made by concatenating three training sentences.
append_ks = [256, 512, 1024, 2048, 3072]
for append_k in append_ks:
    train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
    # Keep the untouched training data; rows of type "primitive" are
    # excluded only from the pool used for concatenation.
    train_df_org = train_df.copy()
    train_df = train_df[train_df["type"] != "primitive"]
    dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
    test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
    gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])
    dataset_postfix = f"k_{append_k}"
    append_data = []
    # Triples are taken from the long end of the length-sorted frame, one
    # triple every 6 rows.
    start_indexes = [i*6 for i in range(append_k)]
    # Sort by sentence length in characters (ascending).
    sorted_train_df = train_df.sort_values(by="sentence", key=lambda x: x.str.len())
    for start_index in start_indexes:
        # Sentence: the first two sentences lose their last character, and
        # the second and third get a lower-cased first character.
        # LF: merged by reindex(); the offsets are the word counts of the
        # preceding sentences (computed after dropping the last character).
        append_data += [
            [sorted_train_df.iloc[-1-start_index].sentence[:-1]+\
            sorted_train_df.iloc[-2-start_index].sentence[0].lower()+\
            sorted_train_df.iloc[-2-start_index].sentence[1:-1]+\
            sorted_train_df.iloc[-3-start_index].sentence[0].lower()+\
            sorted_train_df.iloc[-3-start_index].sentence[1:],
            reindex(
                [
                    sorted_train_df.iloc[-1-start_index].LF,
                    sorted_train_df.iloc[-2-start_index].LF,
                    sorted_train_df.iloc[-3-start_index].LF
                ],
                [
                    0,
                    len(sorted_train_df.iloc[-1-start_index].sentence[:-1].strip().split()),
                    len(sorted_train_df.iloc[-1-start_index].sentence[:-1].strip().split())+
                    len(sorted_train_df.iloc[-2-start_index].sentence[:-1].strip().split())
                ]
            ),
            'concat']
        ]
    append_df = pd.DataFrame(append_data, columns =['sentence', 'LF', 'type'])
    # The concatenated examples are appended to the full original training set.
    train_df = pd.concat([train_df_org, append_df])
    train_df.to_csv(f'./cogs_concat/train_{dataset_postfix}.tsv', sep='\t', index=False, header=False)
    # dev/test/gen are written unchanged under the same postfix.
    dev_df.to_csv(f'./cogs_concat/dev_{dataset_postfix}.tsv', sep='\t', index=False, header=False)
    test_df.to_csv(f'./cogs_concat/test_{dataset_postfix}.tsv', sep='\t', index=False, header=False)
    gen_df.to_csv(f'./cogs_concat/gen_{dataset_postfix}.tsv', sep='\t', index=False, header=False)
    
    # Report the longest sentence and longest LF of the augmented training
    # set, measured in whitespace-separated tokens.
    max_s = max(train_df['sentence'].str.split().apply(len))
    max_lf = max(train_df['LF'].str.split().apply(len))
    print(max_s, max_lf)

### Preposing

In [None]:
def translate_regular(text, phi, _type):
    """Re-emit a logical form in normalised order without changing the sentence.

    Single-token LFs and LFs containing "LAMBDA" are returned untouched.
    Otherwise every conjunct is parsed and re-rendered: starred noun terms
    are de-duplicated, sorted by their index and joined with " ; "; all
    other terms keep their order and are joined with " AND ".

    Returns the tuple (text, new LF, _type).
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # parse: the first pattern that recognises a conjunct decides its parser
    dispatch = ((np_re, parse_np), (pred_re, parse_pred), (mod_re, parse_mod))
    parsed = []
    for conjunct in re.split(r"\s*(?:AND|;)\s*", phi):
        for pattern, parser in dispatch:
            if pattern.search(conjunct):
                parsed.append(parser(conjunct))
                break
        else:
            raise Exception(f"Conjunct could not be parsed: {conjunct}")

    # collect
    starred = []
    body = []
    for entry in parsed:
        kind = entry['type']
        if kind == 'np':
            rendered = f"{entry['pred']} ( {entry['entvar']} )"
            if entry['definiteness'] == '*':
                starred.append(f"* {rendered}")
            else:
                body.append(rendered)
        elif kind == 'role':
            body.append(f"{entry['pred']} . {entry['role']} ( {entry['eventvar']} , {entry['entvar']} )")
        elif kind == 'mod':
            assert "x _" in entry['e1']
            body.append(f"{entry['nppred']} . nmod . {entry['pred']} ( {entry['e1']} , {entry['e2']} )")

    # de-duplicate the starred terms and order them by their numeric index
    starred = sorted(set(starred), key=lambda term: int(term.split()[-2]))

    # combine, leaving out whichever side is empty
    prefix = " ; ".join(starred)
    conjunction = " AND ".join(body)
    terms = " ; ".join(part for part in (prefix, conjunction) if part != "")

    return text, terms, _type
        
def translate(text, phi, _type):
    """Normalise a logical form and, at random, prepose its modified noun phrase.

    Single-token LFs and LFs containing "LAMBDA" are returned untouched.
    Otherwise the LF is parsed and re-emitted (starred noun terms first,
    sorted by index; other terms joined with " AND "). If the result holds an
    "nmod" term, then with probability `proposing_prob` (a module-level
    global) the phrase around the first nmod word is moved to the front of
    the sentence and all numeric indices in the LF are remapped to the new
    word positions.

    Returns (text, LF, type); type becomes "preposing_1" or "preposing_2"
    when the sentence was rewritten, otherwise `_type` is passed through.
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect
    def_terms = []
    role_terms = []
    nmod_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms
    # (duplicates are removed first; the key is the numeric index, i.e. the
    # second-to-last token of each term)
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    rest_terms = role_terms
    
    # combine
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # Only sentences whose LF contains an nmod term can be preposed.
    if "nmod" not in terms.split():
        return text, terms, _type
    
    # Prepose with probability `proposing_prob`; otherwise keep the sentence.
    if random.random() >= proposing_prob:
        return text, terms, _type
    
    # `upper_bound` is how many words, counted from the nmod word itself, are
    # included in the moved phrase (3 for one nmod term, 6 for two).
    if terms.split().count("nmod") == 2:
        upper_bound = 6
        return_type = "preposing_2"
    elif terms.split().count("nmod") == 1:
        upper_bound = 3
        return_type = "preposing_1"
    else:
        assert False
        
    # `nmod` is the token two positions after the first "nmod" in the LF,
    # i.e. the word following "nmod ." in "<noun> . nmod . <word> ( ... )".
    nmod = terms.split()[terms.split().index("nmod")+2]
    # The moved phrase starts two words before the first occurrence of that
    # word in the sentence.
    pre_phrase = text.split()[text.split().index(nmod)-2 : text.split().index(nmod)+upper_bound]
    pre_phrase[0] = pre_phrase[0].capitalize()
    pre_text = text.split()[:text.split().index(nmod)-2]
    # NOTE(review): `pre_text` is empty when the moved phrase starts the
    # sentence, in which case `pre_text[0]` raises IndexError.
    if pre_text[0] in ["The", "A"]:
        pre_text[0] = pre_text[0].lower()
    post_text = text.split()[text.split().index(nmod)+upper_bound:]
    pre_text = pre_phrase + pre_text + post_text
    pre_text = " ".join(pre_text)

    # Map every old word position (as a string) to its position in the
    # preposed sentence: the moved phrase first, then the words that used to
    # precede it, then the words that followed it.
    index_map = {}
    idx = 0
    for i in range(text.split().index(nmod)-2, text.split().index(nmod)+upper_bound):
        index_map[f"{i}"] = f"{idx}"
        idx += 1
    for i in range(text.split().index(nmod)-2):
        # shifted right by the length of the moved phrase (2 + upper_bound)
        ii = 2+upper_bound+i
        index_map[f"{i}"] = f"{ii}"
        idx += 1
    for i in range(text.split().index(nmod)+upper_bound, len(text.split())):
        ii = idx+(i-(text.split().index(nmod)+upper_bound))
        index_map[f"{i}"] = f"{ii}"  
            
    # now handle LF
    pre_terms = []
    for t in terms.split():
        if t.isnumeric():
            pre_terms += [str(index_map[str(int(t))])]
        else:
            pre_terms += [t]
    pre_terms = " ".join(pre_terms)

    # Everything before the last " ; " is treated as starred noun terms and
    # re-sorted by the new index.
    pre_terms_def = pre_terms.split(" ; ")[:-1]
    pre_terms_def.sort(key = lambda x: int(x.split()[-2]))  
    pre_terms_role = pre_terms.split(" ; ")[-1].split(" AND ")
    # Sort key is (first index, second key):
    #   "<pred> ( x _ N )"             -> (N, -1)
    #   "... ( x _ E , x _ M )"        -> (E, M)
    #   "... ( x _ E , <word> )"       -> (E, position of <word> in the new sentence)
    pre_terms_role.sort(key = lambda x: (int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else pre_text.split().index(x.split()[-2])))  
    pre_terms_role = " AND ".join(pre_terms_role)
    pre_terms = " ; ".join(pre_terms_def + [pre_terms_role])

    return pre_text, pre_terms, return_type

# Reload the original splits (headerless TSV: sentence / LF / type).
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

# Read as a global inside translate(): the probability of preposing an
# eligible training example.
proposing_prob = 0.05
# Only the training split goes through translate() (which may prepose);
# dev/test/gen go through translate_regular(), which never rewrites the sentence.
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(lambda x: translate(*x), axis=1, result_type='expand')
dev_df[['sentence', 'LF', 'type']] = dev_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
test_df[['sentence', 'LF', 'type']] = test_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
gen_df[['sentence', 'LF', 'type']] = gen_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')

In [None]:
# Write the four splits to ./cogs_preposing/ as headerless, index-free TSV files.
dataset_postfix = "preposing"
train_df.to_csv(f'./cogs_{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./cogs_{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./cogs_{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./cogs_{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)

### Preposing + Sprinkles (Interjection)

In [None]:
def translate_regular(text, phi, _type):
    """Re-emit a logical form in normalised order without changing the sentence.

    Single-token LFs and LFs containing "LAMBDA" are returned untouched.
    Otherwise every conjunct is parsed and re-rendered: starred noun terms
    are de-duplicated, sorted by their index and joined with " ; "; all
    other terms keep their order and are joined with " AND ".

    Returns the tuple (text, new LF, _type).
    """
    if len(phi.split()) == 1 or "LAMBDA" in phi:
        return text, phi, _type

    # parse: the first pattern that recognises a conjunct decides its parser
    dispatch = ((np_re, parse_np), (pred_re, parse_pred), (mod_re, parse_mod))
    parsed = []
    for conjunct in re.split(r"\s*(?:AND|;)\s*", phi):
        for pattern, parser in dispatch:
            if pattern.search(conjunct):
                parsed.append(parser(conjunct))
                break
        else:
            raise Exception(f"Conjunct could not be parsed: {conjunct}")

    # collect
    starred = []
    body = []
    for entry in parsed:
        kind = entry['type']
        if kind == 'np':
            rendered = f"{entry['pred']} ( {entry['entvar']} )"
            if entry['definiteness'] == '*':
                starred.append(f"* {rendered}")
            else:
                body.append(rendered)
        elif kind == 'role':
            body.append(f"{entry['pred']} . {entry['role']} ( {entry['eventvar']} , {entry['entvar']} )")
        elif kind == 'mod':
            assert "x _" in entry['e1']
            body.append(f"{entry['nppred']} . nmod . {entry['pred']} ( {entry['e1']} , {entry['e2']} )")

    # de-duplicate the starred terms and order them by their numeric index
    starred = sorted(set(starred), key=lambda term: int(term.split()[-2]))

    # combine, leaving out whichever side is empty
    prefix = " ; ".join(starred)
    conjunction = " AND ".join(body)
    terms = " ; ".join(part for part in (prefix, conjunction) if part != "")

    return text, terms, _type
        
def translate(text, phi, _type):
    """Normalise a logical form and, at random, prepose its modified noun phrase.

    Single-token LFs and LFs containing "LAMBDA" are returned untouched.
    Otherwise the LF is parsed and re-emitted (starred noun terms first,
    sorted by index; other terms joined with " AND "). If the result holds an
    "nmod" term, then with probability `proposing_prob` (a module-level
    global) the phrase around the first nmod word is moved to the front of
    the sentence and all numeric indices in the LF are remapped to the new
    word positions.

    Returns (text, LF, type); type becomes "preposing_1" or "preposing_2"
    when the sentence was rewritten, otherwise `_type` is passed through.
    """
    
    if len(phi.split()) == 1:
        return text, phi, _type
    elif "LAMBDA" in phi:
        return text, phi, _type
    
    # parse
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect
    def_terms = []
    role_terms = []
    nmod_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                role_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['pred']} . {d['role']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            assert "x _" in d['e1']
            role_terms += [f"{d['nppred']} . nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms
    # (duplicates are removed first; the key is the numeric index, i.e. the
    # second-to-last token of each term)
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    rest_terms = role_terms
    
    # combine
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # Only sentences whose LF contains an nmod term can be preposed.
    if "nmod" not in terms.split():
        return text, terms, _type
    
    # Prepose with probability `proposing_prob`; otherwise keep the sentence.
    if random.random() >= proposing_prob:
        return text, terms, _type
    
    # `upper_bound` is how many words, counted from the nmod word itself, are
    # included in the moved phrase (3 for one nmod term, 6 for two).
    if terms.split().count("nmod") == 2:
        upper_bound = 6
        return_type = "preposing_2"
    elif terms.split().count("nmod") == 1:
        upper_bound = 3
        return_type = "preposing_1"
    else:
        assert False
        
    # `nmod` is the token two positions after the first "nmod" in the LF,
    # i.e. the word following "nmod ." in "<noun> . nmod . <word> ( ... )".
    nmod = terms.split()[terms.split().index("nmod")+2]
    # The moved phrase starts two words before the first occurrence of that
    # word in the sentence.
    pre_phrase = text.split()[text.split().index(nmod)-2 : text.split().index(nmod)+upper_bound]
    pre_phrase[0] = pre_phrase[0].capitalize()
    pre_text = text.split()[:text.split().index(nmod)-2]
    # NOTE(review): `pre_text` is empty when the moved phrase starts the
    # sentence, in which case `pre_text[0]` raises IndexError.
    if pre_text[0] in ["The", "A"]:
        pre_text[0] = pre_text[0].lower()
    post_text = text.split()[text.split().index(nmod)+upper_bound:]
    pre_text = pre_phrase + pre_text + post_text
    pre_text = " ".join(pre_text)

    # Map every old word position (as a string) to its position in the
    # preposed sentence: the moved phrase first, then the words that used to
    # precede it, then the words that followed it.
    index_map = {}
    idx = 0
    for i in range(text.split().index(nmod)-2, text.split().index(nmod)+upper_bound):
        index_map[f"{i}"] = f"{idx}"
        idx += 1
    for i in range(text.split().index(nmod)-2):
        # shifted right by the length of the moved phrase (2 + upper_bound)
        ii = 2+upper_bound+i
        index_map[f"{i}"] = f"{ii}"
        idx += 1
    for i in range(text.split().index(nmod)+upper_bound, len(text.split())):
        ii = idx+(i-(text.split().index(nmod)+upper_bound))
        index_map[f"{i}"] = f"{ii}"  
            
    # now handle LF
    pre_terms = []
    for t in terms.split():
        if t.isnumeric():
            pre_terms += [str(index_map[str(int(t))])]
        else:
            pre_terms += [t]
    pre_terms = " ".join(pre_terms)

    # Everything before the last " ; " is treated as starred noun terms and
    # re-sorted by the new index.
    pre_terms_def = pre_terms.split(" ; ")[:-1]
    pre_terms_def.sort(key = lambda x: int(x.split()[-2]))  
    pre_terms_role = pre_terms.split(" ; ")[-1].split(" AND ")
    # Sort key is (first index, second key):
    #   "<pred> ( x _ N )"             -> (N, -1)
    #   "... ( x _ E , x _ M )"        -> (E, M)
    #   "... ( x _ E , <word> )"       -> (E, position of <word> in the new sentence)
    pre_terms_role.sort(key = lambda x: (int(x.split()[-2]) if x.split()[-5] == "(" else int(x.split()[-6]) if x.split()[-2].isnumeric() else int(x.split()[-4]), -1 if x.split()[-5] == "(" else int(x.split()[-2]) if x.split()[-2].isnumeric() else pre_text.split().index(x.split()[-2])))  
    pre_terms_role = " AND ".join(pre_terms_role)
    pre_terms = " ; ".join(pre_terms_def + [pre_terms_role])

    return pre_text, pre_terms, return_type

def add_um(sentence):
    """Randomly insert runs of "um" after words of `sentence`.

    After every word except the first and the last two, one to three "um"
    tokens are inserted with probability 0.5.

    Returns (new sentence, mapping), where mapping[i] is the position in the
    new sentence of the word originally at position i.
    """
    tokens = sentence.split()
    output = []
    positions = {}
    last_eligible = len(tokens) - 2
    for position, token in enumerate(tokens):
        positions[position] = len(output)
        output.append(token)
        # The random draws happen only for eligible positions, in this order.
        if 0 < position < last_eligible and random.random() > 0.5:
            output.extend(["um"] * random.choice([1, 2, 3]))
    return " ".join(output), positions

def sprinkle(text, phi, _type):
    """With probability `sprinkle_prob`, insert "um" tokens and re-index the LF.

    Args:
        text: the sentence.
        phi: its logical form, whose numeric tokens are word positions in `text`.
        _type: the example type.

    Returns:
        (text, phi, _type) unchanged when the example was already preposed
        or the random draw says to skip it. Otherwise the sentence with
        "um"s inserted, the LF with every numeric token remapped to the new
        word position, and the type "sprinkle".
    """
    # translate() labels preposed examples "preposing_1"/"preposing_2". The
    # original check looked only for "preposition", which never matches
    # those labels; it is kept alongside the intended check.
    if "preposition" in _type or "preposing" in _type:
        return text, phi, _type

    # `sprinkle_prob` is a module-level global set by the calling cell.
    if random.random() >= sprinkle_prob:
        return text, phi, _type

    um_text, token_mapping = add_um(text)
    # Remap each numeric token through the old-position -> new-position map.
    um_phi = " ".join(
        str(token_mapping[int(t)]) if t.isnumeric() else t
        for t in phi.split()
    )

    return um_text, um_phi, "sprinkle"
            
# Reload the original splits (headerless TSV: sentence / LF / type).
train_df = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])

# Globals read inside translate() and sprinkle() respectively.
proposing_prob = 0.05
sprinkle_prob = 0.05
# Training data: preposing first, then "um" insertion on the result.
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(lambda x: translate(*x), axis=1, result_type='expand')
train_df[['sentence', 'LF', 'type']] = train_df[['sentence', 'LF', 'type']].apply(lambda x: sprinkle(*x), axis=1, result_type='expand')
# dev/test/gen only get the LF normalisation of translate_regular().
dev_df[['sentence', 'LF', 'type']] = dev_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
test_df[['sentence', 'LF', 'type']] = test_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')
gen_df[['sentence', 'LF', 'type']] = gen_df[['sentence', 'LF', 'type']].apply(lambda x: translate_regular(*x), axis=1, result_type='expand')

In [None]:
# Write the four splits to ./cogs_preposing+sprinkles/ as headerless,
# index-free TSV files.
dataset_postfix = "preposing+sprinkles"
train_df.to_csv(f'./cogs_{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./cogs_{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./cogs_{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./cogs_{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)

### ReCOGS (Number of resampling iterations = 5)
It seems that the performance gain from increasing the number of resampling iterations diminishes quickly once the number exceeds 10. We try 5 here.

In [None]:
# Pool of variable names to sample from: every purely numeric token found in
# the target vocabulary file.
existing_digit_pool = list({
    token
    for token in load_vocab("./cogs/tgt_vocab.txt")
    if token.isnumeric()
})

def translate(text, phi):
    """Convert one (sentence, logical form) pair into the ReCOGS-style format.

    Single-token LFs are returned untouched. In LAMBDA forms, each token
    equal to `text` is swapped with the token two positions later. All other
    LFs are parsed and re-emitted with noun terms first, "x" and "_" tokens
    dropped, and every numeric index replaced by a value sampled at random
    from the module-level `existing_digit_pool`.

    Returns the tuple (text, new LF); the sentence is never modified.
    """
    
    if len(phi.split()) == 1:
        return text, phi
    elif "LAMBDA" in phi:
        phi_split = phi.split()
        # A LAMBDA form of exactly 7 tokens is left as it is.
        if len(phi_split) == 7:
            return text, phi
        # Find every position where the LF token equals the whole input text.
        v_pos = []
        idx = 0
        for t in phi_split:
            if t == text:
                v_pos += [idx]
            idx += 1
        # Swap "<text> . <token>" into "<token> . <text>".
        for p in v_pos:
            phi_split[p] = phi_split[p+2]
            phi_split[p+2] = text
        
        return text, " ".join(phi_split)
    
    # parse
    text_split = text.split()
    data = []    
    conjs = re.split(r"\s*(?:AND|;)\s*", phi)
    for conj in conjs: 
        if np_re.search(conj):
            d = parse_np(conj)
        elif pred_re.search(conj):
            d = parse_pred(conj)
        elif mod_re.search(conj):
            d = parse_mod(conj)
        else:
            raise Exception(f"Conjunct could not be parsed: {conj}")
        data.append(d)
    
    # collect
    # Both starred and unstarred noun terms go into `def_terms`; role terms
    # are emitted as "<role> . <pred>" and modifier terms lose their noun.
    def_terms = []
    role_terms = []
    nmod_terms = []
    for d in data:
        if d['type'] == 'np':
            if d['definiteness'] == '*':
                def_terms += [f"* {d['pred']} ( {d['entvar']} )"]
            else:
                def_terms += [f"{d['pred']} ( {d['entvar']} )"]
        if d['type'] == 'role':
            role_terms += [f"{d['role']} . {d['pred']} ( {d['eventvar']} , {d['entvar']} )"]
        elif d['type'] == 'mod':
            nmod_terms += [f"nmod . {d['pred']} ( {d['e1']} , {d['e2']} )"]
            
    # sort def_terms
    # (duplicates are removed first; the key is the numeric index, i.e. the
    # second-to-last token of each term)
    def_terms = [*set(def_terms)]
    def_terms.sort(key = lambda x: int(x.split()[-2]))    
    # Modifier terms are placed before role terms.
    rest_terms = nmod_terms + role_terms
    
    # combine
    def_terms = " ; ".join(def_terms)
    if def_terms == "":
        terms = " AND ".join(rest_terms)
    elif " AND ".join(rest_terms) == "":
        terms = def_terms
    else:
        terms = def_terms + " ; " + " AND ".join(rest_terms)
    
    # final step, remove biases
    # Each distinct numeric token is mapped to a distinct value drawn without
    # replacement from `existing_digit_pool`.
    current_digit_pool = set([])
    for t in terms.split():
        if t.isnumeric():
            current_digit_pool.add(t)
    current_digit_pool = list(current_digit_pool)
    random.shuffle(current_digit_pool)
    sample_random_digit = random.sample(existing_digit_pool, k=len(current_digit_pool))
    digit_mapping = dict(zip(current_digit_pool, sample_random_digit))
    
    new_terms = []
    for t in terms.split():
        # Drop the "x" and "_" tokens.
        if t == "_" or t == "x":
            continue
        if t.isnumeric():
            new_terms += [digit_mapping[t]]
        else:
            new_terms += [t]

    terms = " ".join(new_terms)
    return text, terms

In [None]:
# Number of independently re-sampled copies of the training set.
sampled_n = 5
# Number of concatenated examples built per copy in a later cell.
append_k = 3072

# translate() draws fresh random indices on every call, so each pass over
# the training file yields a differently indexed copy.
train_dfs = []
for i in range(sampled_n):
    train_df_i = pd.read_csv("./cogs/train.tsv", sep="\t", names=['sentence', 'LF', 'type'])
    train_df_i[['sentence', 'LF']] = train_df_i[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')
    train_dfs += [train_df_i]
    
# dev/test/gen are converted once each.
dev_df = pd.read_csv("./cogs/dev.tsv", sep="\t", names=['sentence', 'LF', 'type'])
test_df = pd.read_csv("./cogs/test.tsv", sep="\t", names=['sentence', 'LF', 'type'])
gen_df = pd.read_csv("./cogs/gen.tsv", sep="\t", names=['sentence', 'LF', 'type'])
dev_df[['sentence', 'LF']] = dev_df[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')
test_df[['sentence', 'LF']] = test_df[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')
gen_df[['sentence', 'LF']] = gen_df[['sentence', 'LF']].apply(lambda x: translate(*x), axis=1, result_type='expand')

In [None]:
def reindex(LFs, existing_digit_pool):
    """Merge several logical forms into one, giving every index a fresh random value.

    Args:
        LFs: logical-form strings of the sentences being concatenated.
        existing_digit_pool: list of string tokens to sample new indices
            from. It must hold at least as many entries as there are
            distinct (LF position, index) pairs, otherwise `random.sample`
            raises ValueError.

    Returns:
        One logical form. The ' ; '-separated prefix terms of all LFs come
        first. The ' AND '-separated body terms follow, with terms
        containing "nmod" ahead of the others. An empty prefix or body no
        longer leaves a dangling ' ; ' in the result.
    """
    # The same number in different LFs refers to different things, so the
    # indices are keyed by (LF position, number).
    occurrences = set()
    for lf_idx, lf in enumerate(LFs):
        for token in lf.split():
            if token.isnumeric():
                occurrences.add((lf_idx, int(token)))
    replacements = random.sample(existing_digit_pool, k=len(occurrences))
    digit_map = dict(zip(occurrences, replacements))

    prefix_terms = []
    nmod_terms = []
    verb_terms = []
    for lf_idx, lf in enumerate(LFs):
        renamed = " ".join(
            digit_map[(lf_idx, int(token))] if token.isnumeric() else token
            for token in lf.split()
        )
        # Everything before the last ' ; ' is prefix; the last segment is
        # the ' AND '-joined body.
        *prefix, body = renamed.split(" ; ")
        prefix_terms.extend(prefix)
        for term in body.split(" AND "):
            if "nmod" in term:
                nmod_terms.append(term)
            else:
                verb_terms.append(term)

    prefix_str = " ; ".join(prefix_terms)
    body_str = " AND ".join(nmod_terms + verb_terms)
    # Join only the non-empty parts so the result never starts or ends with
    # a stray ' ; '.
    return " ; ".join(part for part in (prefix_str, body_str) if part)

# Triples are taken from the long end of each length-sorted training copy,
# one triple every 6 rows.
start_indexes = [i*6 for i in range(append_k)]
append_data = []

for i in range(sampled_n):
    # Sort by sentence length in characters (ascending).
    train_df_sorted = train_dfs[i].sort_values(by="sentence", key=lambda x: x.str.len())
    for start_index in start_indexes:
        # Sentence: the first two sentences lose their last character and
        # are followed by ", "; the second and third get a lower-cased first
        # character.
        # LF: merged by reindex(), which re-samples every index from
        # `existing_digit_pool`.
        append_data += [
            [train_df_sorted.iloc[-1-start_index].sentence[:-1]+", "+\
            train_df_sorted.iloc[-2-start_index].sentence[0].lower()+\
            train_df_sorted.iloc[-2-start_index].sentence[1:-1]+", "+\
            train_df_sorted.iloc[-3-start_index].sentence[0].lower()+\
            train_df_sorted.iloc[-3-start_index].sentence[1:],
            reindex(
                [
                    train_df_sorted.iloc[-1-start_index].LF,
                    train_df_sorted.iloc[-2-start_index].LF,
                    train_df_sorted.iloc[-3-start_index].LF
                ], existing_digit_pool
            ),
            'length_ood']
        ]
# From here on `append_data` is a DataFrame rather than a list of rows.
append_data = pd.DataFrame(append_data, columns =['sentence', 'LF', 'type'])

In [None]:
# Final ReCOGS training set: the re-sampled copies of the training data plus
# the 'length_ood' concatenations held in `append_data`.
# The original code concatenated `append_df`, a name this section never
# assigns (it is a leftover from the example-concatenation section), so the
# frame built for ReCOGS was never written out.
train_df = pd.concat(train_dfs)
train_df = pd.concat([train_df, append_data])

# Write the four splits to ./recogs/ as headerless, index-free TSV files.
dataset_postfix = "recogs"
train_df.to_csv(f'./{dataset_postfix}/train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(f'./{dataset_postfix}/dev.tsv', sep='\t', index=False, header=False)
test_df.to_csv(f'./{dataset_postfix}/test.tsv', sep='\t', index=False, header=False)
gen_df.to_csv(f'./{dataset_postfix}/gen.tsv', sep='\t', index=False, header=False)