In [1]:
import os
import json
import random

In [2]:
# Location of the CLEVR v1 training-questions JSON file (relative path).
train_q_file = "../../data/CLEVR_v1/data/CLEVR_train_questions.json"

In [3]:
# Load the whole question file. The encoding is given explicitly so the read
# does not depend on the platform's default text encoding.
with open(train_q_file, encoding="utf-8") as f:
    train_questions = json.load(f)

In [4]:
# Top-level 'info' entry of the loaded file, kept aside so it can be reused as-is.
info = train_questions['info']

In [5]:
# The question records of the loaded file.
questions = train_questions['questions']

In [6]:
# Index every question record by its 'question_index' for lookup by id.
questions_dict = {record['question_index']: record for record in questions}

## Subsampling the CLEVR dataset to keep 10% of the AND and EQUALS questions

In [7]:
def get_questions_with_word(word, questions=None):
    """Return the ids of all questions whose text contains ``word``.

    The match is a plain, case-sensitive substring test on the record's
    ``'question'`` text, so pass surrounding spaces (e.g. ``" and "``) to
    avoid matching inside longer words.

    Args:
        word: Substring to look for.
        questions: Optional mapping of id -> question record to search.
            When omitted, the notebook-level ``questions_dict`` is used.

    Returns:
        A set holding the key of every matching record.
    """
    if questions is None:
        # Default to the notebook-level index so existing calls keep working.
        questions = questions_dict
    return {k for k, v in questions.items() if word in v['question']}

In [8]:
def get_total_word_count_and_data(subsampled_questions_dict):
    """Tokenize every question and return the tokens with their total count.

    Each record's 'question' text has leading/trailing '?' characters
    removed, is lower-cased, and is split on single spaces.

    Returns:
        (data, total_word_count): the flat list of tokens over all records,
        in the dict's iteration order, and the number of tokens in that list.
    """
    tokens = []
    for record in subsampled_questions_dict.values():
        normalized = record['question'].strip('?').lower()
        tokens.extend(normalized.split(" "))
    # Every token is appended exactly once, so the total is the list length.
    return tokens, len(tokens)

def get_token_counts(data):
    """Count occurrences of the tracked tokens in a flat token list.

    "and", "or", "more", "fewer", "behind" and "same" are counted on every
    occurrence. "front" is counted under the key "in front", and only when
    the token immediately before it is "in".

    Returns:
        A dict mapping each tracked key to its count.
    """
    counts = dict.fromkeys(
        ("and", "or", "more", "fewer", "behind", "in front", "same"), 0
    )
    single_tokens = {"and", "or", "more", "fewer", "behind", "same"}
    previous = ""
    for token in data:
        if token == "front":
            # A bare "front" without a preceding "in" is ignored.
            if previous == "in":
                counts["in front"] += 1
        elif token in single_tokens:
            counts[token] += 1
        previous = token
    return counts

In [9]:
# Token list and total token count over the full (unsampled) question set.
data, total_word_count = get_total_word_count_and_data(questions_dict)
total_word_count

12868670

In [10]:
# Occurrences of each tracked token in the token list computed above.
token_count_dict = get_token_counts(data)
token_count_dict

{'and': 81506,
 'or': 63214,
 'more': 11570,
 'fewer': 11851,
 'behind': 147409,
 'in front': 147506,
 'same': 356333}

In [11]:
# Normalize each tracked token's count by the total over all tracked tokens.
sum_count = sum(token_count_dict.values())
token_prop_dict = {
    token: count / sum_count for token, count in token_count_dict.items()
}
token_prop_dict

{'and': 0.0994716795075355,
 'or': 0.07714772836833299,
 'more': 0.014120277426228569,
 'fewer': 0.014463215884030663,
 'behind': 0.17990112144536965,
 'in front': 0.18001950233649708,
 'same': 0.43487647503200555}

### AND questions

In [12]:
# Ids of questions whose text contains " and " (substring match, spaces included).
and_questions = get_questions_with_word(" and ")

In [14]:
# One tenth of the number of AND questions.
len(and_questions) / 10

8150.6

In [23]:
# Keep a random 10% of the AND questions.
# - random.sample() needs a sequence (sets are rejected on Python >= 3.11), so
#   the ids are sorted first, which also fixes the population order
#   (assumes the ids are mutually comparable, e.g. ints — TODO confirm).
# - A locally seeded generator makes the draw reproducible across kernel restarts.
# - The sample size is derived from the data instead of a hand-copied constant.
keep_and_questions = set(
    random.Random(0).sample(sorted(and_questions), round(len(and_questions) / 10))
)

### EQUALS questions

In [16]:
# Ids of questions matching any of the three "equal number" phrasings.
equal_questions = set().union(
    get_questions_with_word(" same as the number "),
    get_questions_with_word(" same number "),
    get_questions_with_word(" equal "),
)

In [17]:
# One tenth of the number of EQUALS questions.
len(equal_questions) / 10

1594.2

In [22]:
# Keep a random 10% of the EQUALS questions.
# - random.sample() needs a sequence (sets are rejected on Python >= 3.11), so
#   the ids are sorted first, which also fixes the population order
#   (assumes the ids are mutually comparable, e.g. ints — TODO confirm).
# - A locally seeded generator makes the draw reproducible across kernel restarts.
# - The sample size is derived from the data instead of a hand-copied constant.
keep_equal_questions = set(
    random.Random(0).sample(sorted(equal_questions), round(len(equal_questions) / 10))
)

### Subsampled corpus

In [19]:
# Every question that is an AND question, an EQUALS question, or both.
union_remove_questions = and_questions.union(equal_questions)

In [20]:
# Start from the ids of all questions.
keep_questions = set(questions_dict)

In [24]:
# Drop every AND / EQUALS question, then add back only the sampled ones.
keep_questions = keep_questions.difference(union_remove_questions).union(
    keep_and_questions, keep_equal_questions
)

In [25]:
# Restrict the id -> record index to the ids that are kept.
subsampled_questions_dict = dict(
    (qid, questions_dict[qid]) for qid in keep_questions
)

In [26]:
# Number of questions remaining after subsampling.
len(subsampled_questions_dict)

622793

In [27]:
# Token list and total token count, recomputed over the subsampled questions.
# Note: this rebinds `data` and `total_word_count` from the earlier full-set cell.
data, total_word_count = get_total_word_count_and_data(subsampled_questions_dict)
total_word_count

11135280

In [28]:
# Tracked-token counts for the current `data` token list.
# Note: this rebinds `token_count_dict` from the earlier cell.
token_count_dict = get_token_counts(data)
token_count_dict

{'and': 9096,
 'or': 63214,
 'more': 11570,
 'fewer': 11851,
 'behind': 118426,
 'in front': 118324,
 'same': 318863}

In [29]:
# Total number of tracked-token occurrences.
sum_count = sum(token_count_dict.values())

In [30]:
# Share of each tracked token among all tracked-token occurrences.
token_prop_dict = {
    token: count / sum_count for token, count in token_count_dict.items()
}
token_prop_dict

{'and': 0.013964970890957773,
 'or': 0.09705163477363728,
 'more': 0.017763271021150114,
 'fewer': 0.018194686678621435,
 'behind': 0.18181790267508413,
 'in front': 0.18166130339728315,
 'same': 0.4895462305632661}

### Save new dataset

In [31]:
# Collect the kept records in ascending id order. Iterating the set directly
# would leave the record order of the saved file up to set iteration order,
# which is not guaranteed.
new_questions = [questions_dict[x] for x in sorted(keep_questions)]

In [32]:
# Number of records that will be written to the new dataset.
len(new_questions)

622793

In [33]:
# Renumber question ids 0..N-1 for the new dataset.
# Fresh dicts are built instead of assigning into the existing ones: the
# entries of new_questions are the very objects stored in questions_dict, so
# an in-place write would silently overwrite the original 'question_index'
# values there too. Re-running this cell yields the same result.
new_questions = [
    dict(q, question_index=i) for i, q in enumerate(new_questions)
]

In [34]:
# Same top-level layout as the source file: 'info' plus the list of questions.
new_train_questions = dict(info=info, questions=new_questions)

In [36]:
new_train_q_file = "../../data/CLEVR_10AND10EQUAL/data/CLEVR_train_questions.json"
# Create the output directory first; open(..., "w") raises if it is missing.
os.makedirs(os.path.dirname(new_train_q_file), exist_ok=True)
with open(new_train_q_file, "w") as f:
    json.dump(new_train_questions, f)
