In [1]:
import pandas as pd
import itertools
import random
import json

import pyinflect
import spacy
from spacy.tokens import Doc, Span, Token

In [2]:
FOLDER = "gap"

In [4]:
data  = {
    "subj": [
        "we",
        "they",
        "he",
        "she",
        "you"
    ],
    "prefix_verb": [
        "know"
    ],
    "verb": [
     'acknowledge',
     'believe',
     'determine',
     'discover',
     'hold',
     'know',
     'mention',
     'notice',
     'observe',
     'recognize',
     'recommend',
     'remember',
     'require',
     'reveal',
     'show',
     'suspect',
     'understand',
     'love'
    ],
    "object": [
        "someone",
        "everyone",
        "them",
        "her",
        "him",
        "ourselves",
        "myself"
    ],
    "continuation": [
        "by the deadline",
        "last semester",
        "last year",
        "last week",
        "in the middle of the night",
        "after the shocking incident",
        "over the summer",
        "over the past decade",
        "during the financial crisis",
        "last semester",
        "last week",
        "last winter",
        "earlier that week",
        "last month",
        "before the trial"
    ],
}


In [5]:
model = "en_core_web_lg"
nlp = spacy.load(model)

def get_parenthetical():
    s, v = inflect("who", random.choice(verbs))
    out = [s, v, random.choice(data["object"])]
    return " ".join(out)

def inflect(noun, verb):
    sent = " ".join([noun, verb])
    doc = nlp(sent)
    inflection = doc[1].tag_ if doc[1].tag_ in ['VBD', 'VB', 'VBG'] else 'VBD'
    vi = doc[1]._.inflect(inflection)
    if vi is None:
        return noun, verb
    else:
        return noun, vi

def i_me(sent):
    words = set(sent.split())
    if "I" in words and "me" in words:
        return sent.replace("me", "myself")
    return sent

def we_us(sent):
    words = set(sent.split())
    if "we" in words and "us" in words:
        return sent.replace("us", "ourselves")
    return sent

def fix(sent):
    sent = i_me(sent)
    sent = we_us(sent)
    return sent


def stringify(sent):
    sent = " ".join(sent).replace(' ,', ',')
    sent = fix(sent)
    sent = sent[0].upper() + sent[1:]
    return sent 

def complement(prev_subjs, prev_verbs):
    subjs = [s for s in data['subj'] if s not in prev_subjs]
    subj = random.choice(subjs)
    
    verbs = [v for v in data['verb'] if v not in prev_verbs]
    verb = random.choice(verbs)
    
    return inflect(subj, verb)

def get_parts(N, words, splice_obj = False):
    prefix_subj = "I"  # random.choice(data['subj'])
    prefix_verb = random.choice(data['prefix_verb'])

    if splice_obj:
        splice_obj = random.choice(data['object']) # [cp_2_verb]
        embeds, parenthetical_count = get_embeds_splice_obj(N, words, splice_obj)
    else:
        embeds, parenthetical_count = get_embeds(N, words)

    obj = random.choice(data['object']) # [cp_2_verb]

    continuation = random.choice(data['continuation'])
    info = {
        'parenthetical_count': parenthetical_count,
        'clause_count': N
    }
    return prefix_subj, prefix_verb, embeds, obj, continuation, info

def get_embeds(N, words):
    embeds = []
    P = 1 / (N * 2)
    parenthetical_count = 0
    for i in range(N):
        if i < N:
            embeds.append(words[i])
        s, v = complement([], [])
        if random.random() < P and parenthetical_count == 0:
            parenthetical = get_parenthetical()
            embeds.extend([s, parenthetical, v])
            parenthetical_count += 1
        else:
            embeds.extend([s, v])
    return embeds, parenthetical_count

def get_embeds_splice_obj(N, words, obj):
    embeds = []
    P = 1 / (N * 2)
    parenthetical_count = 0
    # For instance, if N is 2, then its 0. If N is 3, then its 1 or 2.
    if N == 2:
        splice_level = 0
        words = ["who", "that"]
        
    elif N == 3:
        if random.random() < 0.67:
            splice_level = 1
            words = random.choice([
               ["who", "that", "that"],
               ["that", "who", "that"]
            ])
        else:
            splice_level = 0
            words = ["who", "that", "that"]
    else:
        assert False, f"Expected N <= 3, but N = {N}, MAX = {MAX}."
    for i in range(N):
        if i < N:
            embeds.append(words[i])
        s, v = complement([], [])
        if random.random() < P and parenthetical_count == 0:
            parenthetical = get_parenthetical()
            embeds.extend([s, parenthetical, v])
            parenthetical_count += 1
        else:
            embeds.extend([s, v])
        if splice_level == i:
            embeds.append(obj)
    return embeds, parenthetical_count

In [6]:
MAX = 3

def S_wh_gap():
    N = random.randint(1, MAX)
    words = ["that"] * (N - 1) + ["who"]
    random.shuffle(words)
    prefix_subj, prefix_verb, embeds, obj, continuation, info = get_parts(N, words)
    return [prefix_subj, prefix_verb] + embeds + [continuation], info

def S_that_no_gap():
    N = random.randint(1, MAX)
    words = ["that"] * (N)
    random.shuffle(words)
    prefix_subj, prefix_verb, embeds, obj, continuation, info = get_parts(N, words)
    return [prefix_subj, prefix_verb] + embeds + [obj, continuation], info

def S_wh_no_gap():
    N = random.randint(1, MAX)
    words = ["that"] * (N - 1) + ["who"]
    random.shuffle(words)
    prefix_subj, prefix_verb, embeds, obj, continuation, info = get_parts(N, words)
    return [prefix_subj, prefix_verb] + embeds + [obj, continuation], info

def S_that_gap():
    N = random.randint(1, MAX)
    words = ["that"] * (N)
    random.shuffle(words)
    prefix_subj, prefix_verb, embeds, obj, continuation, info = get_parts(N, words)
    return [prefix_subj, prefix_verb] + embeds + [continuation], info

def S_wh_gap_obj():
    # NOTE: This setup doesn't work with only one clause -- it folds into `S_wh_no_gap`.
    N = random.randint(1 + 1, MAX)
    words = ["that"] * (N - 1) + ["who"]
    random.shuffle(words)
    prefix_subj, prefix_verb, embeds, obj, continuation, info = get_parts(N, words, splice_obj = True)
    return [prefix_subj, prefix_verb] + embeds + [continuation], info


filler_templates = [  
    ('S_wh_gap', 'both', 'yes', S_wh_gap),
    ('S_that_no_gap', 'both', 'yes', S_that_no_gap),
    ('S_wh_no_gap', 'neither', 'no', S_wh_no_gap),
    ('S_that_gap', 'neither', 'no', S_that_gap),
    ('S_wh_gap_obj', 'bad-only', 'no', S_wh_gap_obj),
]

count = 2500
output = []

for name, section, acceptable, template in filler_templates:
    for _ in range(count):
        parts, info = template()
        sent = stringify(parts)
        output.append({
            **{
            "sentence": sent,
            "section": section,
            "acceptable": acceptable,
            "template": name
            }, 
             **info,
        })

df = pd.DataFrame(output)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
df = df.sort_values(["acceptable", "section", "template", "parenthetical_count", "clause_count"])
df.sample(10)

Unnamed: 0,sentence,section,acceptable,template,parenthetical_count,clause_count
10541,I know who we recognized someone that they who noticed myself loved over the summer,bad-only,no,S_wh_gap_obj,1,2
5299,I know that he knew that you who revealed everyone recognized who we knew her last winter,neither,no,S_wh_no_gap,1,3
9571,I know that you recognized over the summer,neither,no,S_that_gap,0,1
5202,I know that she observed that she who knew him loved who you understood myself over the past decade,neither,no,S_wh_no_gap,1,3
9346,I know that they observed over the past decade,neither,no,S_that_gap,0,1
7097,I know that he knew that we who recommended her noticed who she observed myself last semester,neither,no,S_wh_no_gap,1,3
10366,I know that he acknowledged who you suspected someone that we acknowledged before the trial,bad-only,no,S_wh_gap_obj,0,3
10420,I know who they recommended myself that we knew that he determined last week,bad-only,no,S_wh_gap_obj,0,3
1913,I know that she held who they understood that he acknowledged last semester,both,yes,S_wh_gap,0,3
3094,I know that we who mentioned myself observed someone last semester,both,yes,S_that_no_gap,1,1


In [8]:
df = df.drop_duplicates('sentence')
df["label"] = (df.acceptable == "yes").astype(int)
df.to_csv(f"filler-gap-{count}.tsv", index=False, sep='\t')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
templates = ['S_wh_gap', 'S_that_no_gap', 'S_wh_no_gap', 'S_that_gap']
bad_only = ['S_wh_gap_obj']

SPLIT_SIZE = 1000

train = []
test = []
from sklearn.model_selection import train_test_split
for t in templates:
    x = df[df.template == t]
    _train, _test = train_test_split(x, test_size=0.5)
    train.append(_train.sample(SPLIT_SIZE))
    test.append(_test.sample(SPLIT_SIZE))
    
train_df = pd.concat(train)
test_df = pd.concat(test)

In [10]:
TOTAL_SIZE = len(train_df) 

SIZE_ORIG_1, SIZE_NEW_1 = round(TOTAL_SIZE * 0.99), round(TOTAL_SIZE * 0.01)
SIZE_ORIG_5, SIZE_NEW_5 = round(TOTAL_SIZE * 0.99), round(TOTAL_SIZE * 0.01)

# train_bad = 

t = 'S_wh_gap_obj'
x = df[df.template == t]
train_bad, test_bad = train_test_split(x, test_size=0.5)
train_bad, test_bad = train_bad.sample(SPLIT_SIZE), test_bad.sample(SPLIT_SIZE)

all_train = pd.concat([train_df, train_bad])
test = pd.concat([test_df, test_bad])

In [11]:
def to_jsonl(df, path):
    with open(path, 'w') as f:
        f.write(df.to_json(orient='records', lines=True))

In [14]:
# both / weak ! [weak]
_weak_both_train = all_train[all_train.section == 'both'].sample(1000)
_weak_weak_train = all_train[all_train.section == 'bad-only']
_weak_both_test = test[test.section == 'both'].sample(1000)
_weak_weak_test = test[test.section == 'bad-only']

_weak_probing_train = pd.concat([_weak_both_train, _weak_weak_train])
_weak_probing_test = pd.concat([_weak_both_test, _weak_weak_test])

to_jsonl(_weak_probing_train, f"{FOLDER}/gap_probing_weak_train.jsonl")
to_jsonl(_weak_probing_test, f"{FOLDER}/gap_probing_weak_val.jsonl")

In [17]:
# both / neither ! [strong]
_strong_both_train = all_train[all_train.section == 'both'].sample(1000)
_strong_neither_train = all_train[all_train.section == 'neither'].sample(1000)
_strong_both_test = test[test.section == 'both'].sample(1000)
_strong_neither_test = test[test.section == 'neither'].sample(1000)

_strong_probing_train = pd.concat([_strong_both_train, _strong_neither_train])
_strong_probing_test = pd.concat([_strong_both_test, _strong_neither_test])

to_jsonl(_strong_probing_train, f"{FOLDER}/gap_probing_strong_train.jsonl")
to_jsonl(_strong_probing_test, f"{FOLDER}/gap_probing_strong_val.jsonl")

In [18]:
_strong_both_train = all_train[all_train.section == 'both']
_strong_neither_train = all_train[all_train.section == 'neither']
_strong_both_test = test[test.section == 'both']
_strong_neither_test = test[test.section == 'neither']

_strong_probing_train = pd.concat([_strong_both_train, _strong_neither_train])
_strong_probing_test = pd.concat([_strong_both_test, _strong_neither_test])

to_jsonl(_strong_probing_train, f"{FOLDER}/gap_finetune_0_train.jsonl")
to_jsonl(_strong_probing_test, f"{FOLDER}/gap_finetune_0_val.jsonl")

gap_finetune_1_train = pd.concat([_strong_probing_train.sample(SIZE_ORIG_1), train_bad.sample(SIZE_NEW_1)])
gap_finetune_1_val = pd.concat([_strong_probing_test.sample(SIZE_ORIG_1), test_bad.sample(SIZE_NEW_1)])

to_jsonl(gap_finetune_1_train, f"{FOLDER}/gap_finetune_1_train.jsonl")
to_jsonl(gap_finetune_1_val, f"{FOLDER}/gap_finetune_1_val.jsonl")

gap_finetune_5_train = pd.concat([_strong_probing_train.sample(SIZE_ORIG_5), train_bad.sample(SIZE_NEW_5)])
gap_finetune_5_val = pd.concat([_strong_probing_test.sample(SIZE_ORIG_5), test_bad.sample(SIZE_NEW_5)])

to_jsonl(gap_finetune_5_train, f"{FOLDER}/gap_finetune_5_train.jsonl")
to_jsonl(gap_finetune_5_val, f"{FOLDER}/gap_finetune_5_val.jsonl")

to_jsonl(test, f"{FOLDER}/gap_test.jsonl")

In [None]:
# weak / neither ?
