In [2]:
import pickle, json
import spacy
from collections import defaultdict
from collections import Counter
import random
from sklearn.model_selection import train_test_split
import pandas as pd, numpy as np

In [4]:
# Load the spaCy pipeline named 'en_core_web_md'. In the visible notebook the
# only consumer of `nlp` is the commented-out POS-tag exploration further down.
nlp = spacy.load('en_core_web_md')

## Section 1: Generate Negative Options

In [36]:
# dd = defaultdict(list)
# for o in data:
#     context = o['context']
#     arg2 = o['qas'][0]['answers'][0]['text']
# #     index = context.index(arg2)
#     relation = o['qas'][0]['question']
#     pos_tags = '_'.join([o.pos_ for o in nlp(arg2)])
#     dd[relation].append(pos_tags)

In [355]:
# Load the model predictions (a JSON object, iterated with .items() later on)
# and the list of QA samples those predictions are paired with.
with open('adversarial-outputs/spanbert-squad/predictions_test.json') as f:
    predictions = json.load(f)
# `open(...)` was missing here in the original cell; a bare string cannot be
# used as a context manager, so the cell failed before reading anything.
with open('original_data/relations_only/adversarial_train_and_test.json') as f:
    data = json.load(f)

In [356]:
# Group the predicted spans by "<dialog id>+<relation>"; these lists are the
# pool of negative options used by the later sections.
dd = defaultdict(list)
for (sample_id, predicted), sample in zip(predictions.items(), data):
    # Predictions that are empty or a lone full stop carry no content.
    if predicted in ('.', ''):
        continue
    first_qa = sample['qas'][0]
    gold_text = first_qa['answers'][0]['text']
    passage = sample['context']
    relation = first_qa['question']
    # Samples whose recorded answer appears verbatim in the context are skipped.
    if gold_text in passage:
        continue
    # The dialog id is the first three '-'-separated parts of the prediction
    # key, re-joined with spaces.
    dialog_id = ' '.join(sample_id.split('-')[:3])
    dd[f'{dialog_id}+{relation}'].append(predicted)

In [357]:
# Persist the negative-option pool. The context manager guarantees the file is
# flushed and closed; the original passed an anonymous handle that was never
# closed explicitly.
with open('negative_options.json', 'w') as f:
    json.dump(dd, f, indent=4)

## Section 2: Generate iteration 0 Adversarial dataset

In [358]:
def _csv_quote(text):
    """Wrap ``text`` in double quotes, doubling any embedded double quote.

    Doubling is the standard CSV escape and is what ``pandas.read_csv`` expects
    by default, so a quote inside a question or an option no longer breaks the
    row. Text without double quotes is emitted exactly as before.
    """
    return '"' + text.replace('"', '""') + '"'


# NOTE(review): absolute, machine-specific paths - consider a configurable data dir.
# Files are opened with `with` so the handles are closed after loading.
with open('/home/emrys/Data/csk_data/csk_qa_22_01/data/qa/train_and_test.json') as f:
    qa_data = json.load(f)
with open('/home/emrys/Data/csk_data/csk_qa_22_01/data/qa_old/train_and_test.json') as f:
    qa_data_old = json.load(f)

# Every answer text found in the old QA file, grouped by "<dialog id>+<question>".
dd_ans = defaultdict(list)
for d in qa_data_old:
    dialog_id = ' '.join(d['qas'][0]['id'].split('-')[:3])
    question = d['qas'][0]['question']
    dd_ans[f"{dialog_id}+{question}"].append(d['qas'][0]['answers'][0]['text'])


# One CSV row (as a string ending in '\n') per sample that has enough distractors.
samples = []
for index, (d, d_old) in enumerate(zip(qa_data, qa_data_old)):
    relation = d['qas'][0]['question']
    dialog_id = ' '.join(d['qas'][0]['id'].split('-')[:3])
    options = dd[f'{dialog_id}+{relation}']
    question = d_old['qas'][0]['question']

    # Drop every candidate that is a recorded answer for this dialog/question,
    # then de-duplicate.
    correct_answers = dd_ans[f"{dialog_id}+{question}"]
    options = list(set([o for o in options if o not in correct_answers]))

    # Three distractors are needed next to the correct answer.
    if len(options) < 3:
        continue
    options = random.sample(options, 3)
    # Position (0..3) of the correct answer among the four endings; also the label.
    insert_position = random.randint(0, 3)
    correct_ans = d['qas'][0]['answers'][0]['text']
    options.insert(insert_position, correct_ans)

    # Double quotes in the context are turned into single quotes, as before.
    context = d['context'].replace("\"", "\'")

    # Field order matches the header written with the CSV files:
    # <unnamed>,video-id,fold-ind,startphrase,sent1,sent2,gold-source,ending0..3,label
    fields = [
        f"{index}",
        f"{d['qas'][0]['id']}",
        "0",
        _csv_quote(question),        # startphrase: question text from the old file
        f"\"{context}\"",            # sent1
        f"{relation}",               # sent2 (written unquoted, as in the original)
        "gen",
        *[_csv_quote(o) for o in options],
        f"{insert_position}",
    ]
    samples.append(','.join(fields) + '\n')

In [359]:
# Randomly split the iteration-0 rows, half for training and half for validation.
# NOTE(review): no random_state is passed, so the split differs on every run.
train, test = train_test_split(samples, test_size=0.5)

In [360]:
# Write each split to its own CSV file: the shared header line first, then the
# pre-formatted rows (every row already ends with a newline).
csv_header = ",video-id,fold-ind,startphrase,sent1,sent2,gold-source,ending0,ending1,ending2,ending3,label\n"
for csv_path, rows in (('val_iter0.csv', test), ('train_iter0.csv', train)):
    with open(csv_path, 'w') as f:
        f.write(csv_header)
        f.writelines(rows)

## Section 3: Generate later iterations 1-35 (each run turns iteration `i` into iteration `i+1`)

In [2]:
# Iteration to read from: the cells of this section load the iteration-`i` CSV
# files and write the iteration-`i + 1` files.
# NOTE(review): presumably bumped by hand between runs - confirm.
i = 0 # stands for the 0th iteration

In [423]:
# Load the validation and training splits of iteration `i`.
test = pd.read_csv(f'../dataset/val_iter{i}.csv')
train = pd.read_csv(f'../dataset/train_iter{i}.csv')
# this generated file may be in the corresponding model folder
# It is indexed by validation row below and its entries are used as the number
# of the ending column ('ending<k>') to replace.
replace_index = np.load('../replace_index.npy')

In [416]:
# For every validation row, replace the ending chosen by `replace_index` with a
# new negative option, unless the chosen ending is the gold answer or no unused
# option is left.
for idx in range(len(test)):
    # int() keeps the column name well-formed ("ending2") regardless of the
    # numeric dtype the array was saved with.
    rep_i = int(replace_index[idx])
    gold = test.loc[idx, 'label']
    if rep_i == gold:
        # Never overwrite the correct answer.
        continue

    dialog_id = ' '.join(test.loc[idx, 'video-id'].split('-')[:3])
    relation = test.loc[idx, 'sent2']
    options = dd[f'{dialog_id}+{relation}']

    # Remove every recorded answer for this dialog/question, then de-duplicate.
    question = test.loc[idx, 'startphrase']
    correct_answers = dd_ans[f"{dialog_id}+{question}"]
    options = list(set([o for o in options if o not in correct_answers]))

    # Remove options already shown in this row. The comprehension variable is
    # `k` so it cannot be confused with the notebook-level iteration counter `i`.
    existing_answers = [test.loc[idx, f'ending{k}'] for k in range(4)]
    options = [o for o in options if o not in existing_answers]

    if len(options) > 0:
        # random.choice yields the option string itself. The original used
        # random.sample(options, 1), which returns a one-element list and left
        # it to pandas to unwrap (or reject) a list assigned to a single cell.
        test.loc[idx, f'ending{rep_i}'] = random.choice(options)

In [417]:
# Pool the training rows and the (just modified) validation rows, then draw a
# new random split with one third of the rows held out for validation.
# NOTE(review): no random_state is passed, so the split differs on every run.
train_and_test = pd.concat([train, test])
train, test = train_test_split(train_and_test, test_size=1/3)

In [418]:
# Save the new split as iteration i + 1; index=False keeps the DataFrame index
# out of the files.
train.to_csv(f'../dataset/train_iter{i+1}.csv', index=False)
test.to_csv(f'../dataset/val_iter{i+1}.csv', index=False)

## Section 4: Data split

In [455]:
# Iteration whose files are re-split into folds below.
it = 35
train = pd.read_csv(f'../dataset/train_iter{it}.csv')
test = pd.read_csv(f'../dataset/val_iter{it}.csv')
# Stack the training rows on top of the validation rows and renumber the rows
# from 0 so that every row has a unique index label.
train_and_test = pd.concat([train, test]).reset_index(drop=True)

In [456]:
def _read_id_lines(path):
    """Return the lines of the text file at ``path`` without their line endings.

    The file is opened in a ``with`` block so the handle is closed right after
    reading; the original cell left ten handles open.
    """
    with open(path) as id_file:
        return id_file.read().splitlines()


# Directory holding the per-fold id lists (files fold1..fold5, train and test).
idsf = '../dataset/ids/'

fold1_train, fold2_train, fold3_train, fold4_train, fold5_train = [
    _read_id_lines(f'{idsf}fold{k}_train.txt') for k in range(1, 6)
]

fold1_test, fold2_test, fold3_test, fold4_test, fold5_test = [
    _read_id_lines(f'{idsf}fold{k}_test.txt') for k in range(1, 6)
]

In [457]:
def get_train_test_index(train_index, test_index, frame=None):
    """Split the row labels of a frame into train and test by dialog id.

    The dialog id of a row is derived from its 'video-id' value split on '-':
    the first two parts when there are exactly four parts, otherwise the first
    three parts, re-joined with '-'.

    Args:
        train_index: iterable of dialog ids that belong to the training fold.
        test_index: accepted for call compatibility but not consulted; every
            row whose dialog id is not in ``train_index`` goes to the test side.
        frame: DataFrame with a 'video-id' column. Defaults to the notebook-level
            ``train_and_test`` frame, which is what the original always used.

    Returns:
        (train_idx, test_idx): two lists of index labels of ``frame``, in row order.
    """
    if frame is None:
        frame = train_and_test
    # Set membership is O(1); the original searched the id list for every row.
    train_dialogs = set(train_index)

    train_idx, test_idx = [], []
    # Only the 'video-id' column is needed, so iterate that Series instead of
    # materialising every full row with iterrows().
    for idx, video_id in frame['video-id'].items():
        parts = video_id.split('-')
        n_parts = 2 if len(parts) == 4 else 3
        dialog_id = '-'.join(parts[:n_parts])
        if dialog_id in train_dialogs:
            train_idx.append(idx)
        else:
            test_idx.append(idx)
    return train_idx, test_idx

In [459]:
train_folds = [fold1_train, fold2_train, fold3_train, fold4_train, fold5_train]
test_folds = [fold1_test, fold2_test, fold3_test, fold4_test, fold5_test]
# Write one train/test CSV pair per fold. The fold suffix in the file names is
# the 0-based position in the lists above (fold0..fold4).
for i, (train_fold, test_fold) in enumerate(zip(train_folds, test_folds)):
    train_idx, test_idx = get_train_test_index(train_fold, test_fold)
    # .copy() gives each fold its own frame, so the in-place column swap below
    # neither raises SettingWithCopyWarning nor depends on view/copy semantics.
    train = train_and_test.loc[train_idx].copy()
    test = train_and_test.loc[test_idx].copy()
    # Exchange the contents of the 'sent2' and 'startphrase' columns; .values
    # drops the column labels so the assignment is positional.
    train.loc[:,['sent2', 'startphrase']] = train.loc[:,['startphrase','sent2']].values
    test.loc[:,['sent2', 'startphrase']] = test.loc[:,['startphrase','sent2']].values
    train.to_csv(f'../dataset/train_iter{it}_fold{i}.csv', index=False)
    test.to_csv(f'../dataset/test_iter{it}_fold{i}.csv', index=False)

## Optional: Generate dataset with only Questions (sent1 replaced by the placeholder `fake_sent1`)

In [453]:
# Question-only variant: for each of the five folds, overwrite 'sent1' with a
# constant placeholder and save the result with a '_Q' suffix.
# The original body started with a leftover `i = 4`, which made every pass of
# the loop read and rewrite fold 4 and never produced folds 0-3.
# NOTE(review): the inputs are hard-coded to iter10 while the outputs are named
# with the notebook-level `it` - confirm both refer to the same iteration.
for i in range(5):
    train = pd.read_csv(f'/home/emrys/Github/csk_mcq/dataset/csk_mcq_22_01/train_iter10_fold{i}.csv')
    test = pd.read_csv(f'/home/emrys/Github/csk_mcq/dataset/csk_mcq_22_01/test_iter10_fold{i}.csv')
    train.loc[:, 'sent1'] = 'fake_sent1'
    test.loc[:, 'sent1'] = 'fake_sent1'
    train.to_csv(f'train_iter{it}_fold{i}_Q.csv', index=False)
    test.to_csv(f'test_iter{it}_fold{i}_Q.csv', index=False)

## Generate dataset with only Endings (not included in paper)

In [445]:
# Ending-only variant: for each of the five folds, blank out both 'sent1' and
# 'sent2' with constant placeholders and save the result with a '_S' suffix.
for i in range(5):
    train = pd.read_csv(f'/home/emrys/Github/csk_mcq/dataset/csk_mcq_22_01/train_iter10_fold{i}.csv')
    test = pd.read_csv(f'/home/emrys/Github/csk_mcq/dataset/csk_mcq_22_01/test_iter10_fold{i}.csv')
    for column, placeholder in (('sent1', 'fake_sent1'), ('sent2', 'fake_sent2')):
        for frame in (train, test):
            # `frame` is the same object as `train` / `test`, so this edits them in place.
            frame.loc[:, column] = placeholder
    train.to_csv(f'train_iter{it}_fold{i}_S.csv', index=False)
    test.to_csv(f'test_iter{it}_fold{i}_S.csv', index=False)

## Check predictions (Not used in paper)

In [255]:
# Load two saved NumPy arrays from the multiple-choice example folder.
# Note that this rebinds `predictions`, which earlier in the notebook held the
# JSON predictions loaded in Section 1.
# NOTE(review): presumably per-option scores and gold label ids - confirm
# against the script that writes these files.
predictions = np.load('/home/emrys/Github/transformers/examples/multiple-choice/predictions.npy')
labels = np.load('/home/emrys/Github/transformers/examples/multiple-choice/label_ids.npy')

In [446]:
# test.loc[predictions.argmax(1) != labels]

## Check negative options (Not used in paper)

In [447]:
# Reload the negative-option pool from disk; the `with` block closes the file,
# which the original anonymous open(...) call never did.
with open('negative_options.json') as f:
    negative_options = json.load(f)

In [449]:
# Total number of negative options over all keys.
count = sum(len(option_list) for option_list in negative_options.values())

In [450]:
# Average number of negative options per key of `negative_options`.
count / len(negative_options)

20.764682850430695