In [27]:
import json
import numpy as np
# Seed NumPy's global random generator so that random draws are repeatable.
np.random.seed(4)

In [2]:
# Read the full CFQ dataset into memory.
# The file is decoded explicitly as UTF-8 (the encoding JSON files are expected to
# use) instead of relying on the platform's locale default, which differs between
# operating systems.
with open("cfq/dataset.json", "r", encoding="utf-8") as injson:
    inlist = json.load(injson)

# The file handle is no longer needed for counting / reporting, so do it after closing.
total_entries = len(inlist)
print(f"Len of dataset: {total_entries}")

Len of dataset: 239357


In [5]:
# Build a slimmed-down copy of the CFQ dataset. Every record keeps only:
#   - the entry's position in the dataset ('questionIdx'),
#   - the question with brackets, the question pattern and the question template,
#   - the SPARQL query and the SPARQL pattern,
#   - the expected response,
#   - the recursion depth.
simplified_cfq = [
    {
        'questionIdx': position,
        'questionWithBrackets': entry['questionWithBrackets'],
        'questionPatternModEntities': entry['questionPatternModEntities'],
        'questionTemplate': entry["complexityMeasures"]["questionTemplate"],
        'sparql': entry['sparql'],
        'sparqlPattern': entry['sparqlPattern'],
        'expectedResponse': entry['expectedResponse'],
        'recursionDepth': entry['complexityMeasures']['recursionDepth'],
    }
    for position, entry in enumerate(inlist)
]

In [6]:
# Persist the simplified dataset to disk as a single JSON document.
with open('cfq/simplified_cfq.json', 'w') as simplified_file:
    json.dump(simplified_cfq, simplified_file)

In [7]:
# Load the MCD1 split: a JSON object holding the dataset indices that belong to
# the train, dev and test sets.
with open('cfq/splits/mcd1.json', 'r') as split_file:
    splits = json.load(split_file)

# Report the size of each split, one line per split.
split_reports = (
    ("train set contains {} entries", 'trainIdxs'),
    ("dev set contains {} entries", 'devIdxs'),
    ("test set contains {} entries", 'testIdxs'),
)
for message, split_key in split_reports:
    print(message.format(len(splits[split_key])))

train set contains 95743 entries
dev set contains 11968 entries
test set contains 11968 entries


In [8]:
# Materialise the train, dev and test sets by looking up every index of the
# split in the simplified dataset (order of the split indices is preserved).
trainset = [simplified_cfq[idx] for idx in splits['trainIdxs']]

devset = [simplified_cfq[idx] for idx in splits['devIdxs']]

testset = [simplified_cfq[idx] for idx in splits['testIdxs']]


In [9]:
# Group the train, dev and test sets by complexity (recursion depth).

# Every recursion depth that occurs anywhere in the simplified dataset.
complexity = set(entry['recursionDepth'] for entry in simplified_cfq)

# Snapshot of those depths under a dedicated name, in the same iteration order as
# the set. Later cells reuse the name `complexity` as a loop variable, which rebinds
# it to a single depth value; reading the snapshot keeps get_dict_by_complexity
# working when it is called (or this notebook is partially re-run) after those cells.
ALL_RECURSION_DEPTHS = tuple(complexity)

def get_dict_by_complexity(input):
    """Group question records by their recursion depth.

    Args:
        input: iterable of records, each a mapping with a 'recursionDepth' key.

    Returns:
        A dict with one key per recursion depth found in the whole dataset; each
        value is the list of records from `input` having that depth, in their
        original order. Depths that do not occur in `input` map to an empty list.

    Raises:
        KeyError: if a record has a recursion depth that is not present in the
            dataset-wide set of depths.
    """
    dict_by_complexity = {depth: [] for depth in ALL_RECURSION_DEPTHS}
    for record in input:
        dict_by_complexity[record['recursionDepth']].append(record)
    return dict_by_complexity

train_dict_by_complexity = get_dict_by_complexity(trainset)
dev_dict_by_complexity = get_dict_by_complexity(devset)
test_dict_by_complexity = get_dict_by_complexity(testset)

In [10]:
# Work out how many questions each multiCFQ split should hold: a budget of 2000
# questions is shared between train, dev and test in proportion to the sizes of
# the MCD1 splits, and each share is rounded up.
# Expected result: 1600 sentences in the train set, 201 in dev, 201 in test.
split_total = len(trainset) + len(devset) + len(testset)

num_train_sample = np.ceil(2000 * len(trainset) / split_total)
num_dev_sample = np.ceil(2000 * len(devset) / split_total)
num_test_sample = np.ceil(2000 * len(testset) / split_total)

print(f'multiCFQ should have {num_train_sample} sentences in train set, {num_dev_sample} in dev set, {num_test_sample} in test set. ')

multiCFQ should have 1600.0 sentences in train set, 201.0 in dev set, 201.0 in test set. 


In [11]:
# Number of questions to draw for multiCFQ per split, keyed by complexity
# (recursion depth).

def _sample_counts_by_depth(questions_by_depth, split_sample_total, split_size):
    """Share a split's sample budget across recursion depths.

    Each depth receives a share of `split_sample_total` proportional to the
    fraction of the split's questions that have that depth, rounded up to an
    integer. Because every share is rounded up, the counts may add up to a
    little more than `split_sample_total`.

    Args:
        questions_by_depth: dict mapping recursion depth -> list of questions.
        split_sample_total: number of questions wanted for the whole split.
        split_size: total number of questions in the split.

    Returns:
        dict mapping recursion depth -> int number of questions to sample.
    """
    # A comprehension keeps the loop variables local, so the notebook-level name
    # `complexity` (the set of depths used by get_dict_by_complexity) is not
    # overwritten by this cell.
    return {
        depth: int(np.ceil(split_sample_total * len(questions) / split_size))
        for depth, questions in questions_by_depth.items()
    }

multicfq_train_sample_num_to_complexity = _sample_counts_by_depth(
    train_dict_by_complexity, num_train_sample, len(trainset))

multicfq_dev_sample_num_to_complexity = _sample_counts_by_depth(
    dev_dict_by_complexity, num_dev_sample, len(devset))

multicfq_test_sample_num_to_complexity = _sample_counts_by_depth(
    test_dict_by_complexity, num_test_sample, len(testset))

In [37]:
# Draw the multiCFQ train, dev and test questions, keyed by complexity
# (recursion depth).
# Reseed so this cell gives the same draw every time it is run on its own.
np.random.seed(4)

def _sample_questions_by_depth(questions_by_depth, sample_num_by_depth):
    """Randomly pick questions for every recursion depth, without repeats.

    Args:
        questions_by_depth: dict mapping recursion depth -> list of questions.
        sample_num_by_depth: dict mapping recursion depth -> number of questions
            to draw for that depth.

    Returns:
        dict mapping recursion depth -> NumPy array of the drawn questions.
    """
    sampled = {}
    for depth, questions in questions_by_depth.items():
        # Cannot draw more distinct questions than exist for this depth.
        sample_size = min(sample_num_by_depth[depth], len(questions))
        # replace=False: np.random.choice samples WITH replacement by default,
        # which would let the same question appear several times in the subset.
        sampled[depth] = np.random.choice(questions, size=sample_size, replace=False)
    return sampled

multicfq_train_by_complexity = _sample_questions_by_depth(
    train_dict_by_complexity, multicfq_train_sample_num_to_complexity)

multicfq_dev_by_complexity = _sample_questions_by_depth(
    dev_dict_by_complexity, multicfq_dev_sample_num_to_complexity)

multicfq_test_by_complexity = _sample_questions_by_depth(
    test_dict_by_complexity, multicfq_test_sample_num_to_complexity)

In [38]:
# Flatten the per-complexity samples into one list per multiCFQ split, keeping
# the order of the complexity groups and of the questions inside each group.
multicfq_train = [
    question
    for group in multicfq_train_by_complexity.values()
    for question in group
]

multicfq_dev = [
    question
    for group in multicfq_dev_by_complexity.values()
    for question in group
]

multicfq_test = [
    question
    for group in multicfq_test_by_complexity.values()
    for question in group
]

In [39]:
# Write the multiCFQ train, dev and test sets to disk, one JSON file per split.
multicfq_outputs = (
    ('multicfq/multicfq_train.json', multicfq_train),
    ('multicfq/multicfq_dev.json', multicfq_dev),
    ('multicfq/multicfq_test.json', multicfq_test),
)
for output_path, split_questions in multicfq_outputs:
    with open(output_path, 'w') as output_file:
        json.dump(split_questions, output_file)