In [1]:
import os
import json
import random

In [3]:
# Relative path to the CLEVR v1 training-questions JSON file loaded in the next cell.
train_q_file = "../../data/CLEVR_v1/data/CLEVR_train_questions.json"

In [4]:
# Parse the whole questions file into memory. The encoding is pinned so that
# decoding does not depend on the platform's locale default.
with open(train_q_file, encoding="utf-8") as f:
    train_questions = json.load(f)

In [5]:
# Top-level 'info' entry of the loaded file; it is passed through unchanged
# into the filtered dataset assembled near the end of the notebook.
info = train_questions['info']

In [6]:
# Question records; the cells below read each record's 'question_index' and
# 'question' entries.
questions = train_questions['questions']

In [7]:
# Index every question record by its 'question_index' for direct lookup.
# Written as a comprehension so no loop variable is left behind in the
# notebook namespace. Values are the same record objects as in `questions`.
questions_dict = {q['question_index']: q for q in questions}

## Sampling CLEVR dataset without AND or EQUALS questions

In [2]:
def get_questions_with_word(word, questions=None):
    """Return the set of question indices whose text contains ``word``.

    The match is a plain, case-sensitive substring test on each record's
    'question' string. Pad the word with spaces (e.g. ``" and "``) to avoid
    matching inside longer words.

    Args:
        word: Substring to search for.
        questions: Optional mapping of question index -> question record.
            When omitted, the notebook-level ``questions_dict`` is used; it is
            looked up at call time, so the result follows whatever that name
            is currently bound to.

    Returns:
        Set of keys whose record's 'question' string contains ``word``.
    """
    if questions is None:
        # Fall back to the notebook global to stay compatible with existing calls.
        questions = questions_dict
    return {idx for idx, record in questions.items() if word in record['question']}

In [3]:
def get_total_word_count_and_data(subsampled_questions_dict):
    """Flatten all question texts into one token list and count the tokens.

    Each record's 'question' string has leading/trailing '?' characters
    stripped, is lowercased, and is split on single spaces. Consecutive spaces
    therefore yield empty-string tokens, and other punctuation stays attached
    to its word.

    Args:
        subsampled_questions_dict: Mapping whose values are records with a
            'question' string. The keys are not used.

    Returns:
        A ``(data, total_word_count)`` tuple: the flat token list and its
        length. Note the order is the reverse of what the function name suggests.
    """
    data = []
    for record in subsampled_questions_dict.values():
        tokens = record['question'].strip('?').lower().split(" ")
        data.extend(tokens)
    # Every token is appended to `data`, so its length is the total count.
    return data, len(data)

def get_token_counts(data):
    """Count occurrences of a fixed set of function words in a token sequence.

    The single-word tokens "and", "or", "more", "fewer", "behind" and "same"
    are counted on exact match. "in front" is counted whenever the token
    "front" directly follows the token "in"; a "front" without a preceding
    "in" is ignored. ``data`` is treated as one flat sequence, so the
    "in"/"front" check has no notion of question boundaries.

    Args:
        data: Iterable of word tokens.

    Returns:
        Dict mapping each tracked token (including "in front") to its count.
    """
    single_word_tokens = {"and", "or", "more", "fewer", "behind", "same"}
    token_count_dict = dict.fromkeys(
        ("and", "or", "more", "fewer", "behind", "in front", "same"), 0
    )
    previous = ""
    for current in data:
        if current == "front":
            # Only the bigram "in front" is tracked, never a bare "front".
            if previous == "in":
                token_count_dict["in front"] += 1
        elif current in single_word_tokens:
            token_count_dict[current] += 1
        previous = current
    return token_count_dict

In [10]:
# Tokenize the full training set. The helper returns (token list, token count);
# the last line displays the count.
data, total_word_count = get_total_word_count_and_data(questions_dict)
total_word_count

12868670

In [11]:
# Count the tracked function words in the token list built by the previous cell.
token_count_dict = get_token_counts(data)
token_count_dict

{'and': 81506,
 'or': 63214,
 'more': 11570,
 'fewer': 11851,
 'behind': 147409,
 'in front': 147506,
 'same': 356333}

In [12]:
# Normalize each tracked token's count by the total over all tracked tokens,
# so the proportions sum to 1 across this token set (not across the corpus).
sum_count = sum(token_count_dict.values())
token_prop_dict = {
    token: count / sum_count
    for token, count in token_count_dict.items()
}
token_prop_dict

{'and': 0.0994716795075355,
 'or': 0.07714772836833299,
 'more': 0.014120277426228569,
 'fewer': 0.014463215884030663,
 'behind': 0.17990112144536965,
 'in front': 0.18001950233649708,
 'same': 0.43487647503200555}

### AND questions

In [13]:
# Indices of questions containing " and ". The helper does a substring test,
# so the surrounding spaces keep it from matching inside longer words.
and_questions = get_questions_with_word(" and ")

### EQUALS questions

In [14]:
# Indices of questions containing any of the three "equal number" phrasings.
equal_questions = set().union(
    get_questions_with_word(" same as the number "),
    get_questions_with_word(" same number "),
    get_questions_with_word(" equal "),
)

### Subsampled corpus

In [15]:
# Every question index that matches either filter and should be dropped.
union_remove_questions = and_questions.union(equal_questions)

In [18]:
# Start from the full set of question indices.
keep_questions = set(questions_dict)

In [19]:
# Drop the indices flagged for removal; what remains is the subsample.
keep_questions = keep_questions.difference(union_remove_questions)

In [25]:
# Restrict the index to the kept questions. Values are the same record objects
# held by questions_dict (no copy is made); keys are still the original indices.
subsampled_questions_dict = {x : questions_dict[x] for x in keep_questions}

In [26]:
# Number of questions remaining after filtering.
len(subsampled_questions_dict)

613163

In [21]:
# Re-tokenize using only the kept questions. This rebinds `data` and
# `total_word_count`, replacing the full-corpus values computed earlier.
data, total_word_count = get_total_word_count_and_data(subsampled_questions_dict)
total_word_count

10917959

In [22]:
# Recount the tracked tokens on the subsample; this rebinds `token_count_dict`.
token_count_dict = get_token_counts(data)
token_count_dict

{'and': 0,
 'or': 63214,
 'more': 11570,
 'fewer': 11851,
 'behind': 114782,
 'in front': 114679,
 'same': 314200}

In [23]:
# Total occurrences across all tracked tokens in the subsample.
sum_count = sum(token_count_dict.values())

In [24]:
# Share of each tracked token relative to the total over all tracked tokens.
token_prop_dict = {
    token: count / sum_count
    for token, count in token_count_dict.items()
}
token_prop_dict

{'and': 0.0,
 'or': 0.10029256095548758,
 'more': 0.018356454745072158,
 'fewer': 0.01880227702539759,
 'behind': 0.18210808889791463,
 'in front': 0.18194467361366723,
 'same': 0.4984959447624608}

### Save new dataset

In [27]:
# Materialize the kept records in ascending original-index order. Iterating the
# set directly yields an arbitrary order, and the renumbering step below assigns
# new ids by list position, so that order would end up in the saved file.
new_questions = [questions_dict[x] for x in sorted(keep_questions)]

In [28]:
# One record per kept index, so this should equal the size of the subsample.
len(new_questions)

613163

In [29]:
# Renumber question ids for the new dataset so they run 0..N-1 in list order.
# Each record is shallow-copied with the new index rather than edited in place:
# the originals are the same objects held by questions_dict and
# subsampled_questions_dict, and rewriting their 'question_index' would leave
# those dicts keyed by indices that no longer match their records.
new_questions = [dict(q, question_index=i) for i, q in enumerate(new_questions)]

In [30]:
# Assemble the output using the same two top-level keys read from the source
# file: 'info' (unchanged) and 'questions' (the filtered, renumbered records).
new_train_questions = {'info': info, 'questions': new_questions}

In [31]:
new_train_q_file = "../../data/CLEVR_noANDnoEQUAL/data/CLEVR_train_questions.json"
# open(..., "w") does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(new_train_q_file), exist_ok=True)
with open(new_train_q_file, "w") as f:
    json.dump(new_train_questions, f)


In [10]:
# NOTE: this reuses `new_train_q_file` for a different dataset
# (CLEVR_CHILDESfreq) and rebinds `train_questions`; the cells after this one
# describe that dataset, not the one saved above. The file is not written by
# any cell visible in this notebook, so it must already exist on disk.
new_train_q_file = "../../data/CLEVR_CHILDESfreq/data/CLEVR_train_questions.json"

# Encoding is pinned so decoding does not depend on the platform's locale default.
with open(new_train_q_file, encoding="utf-8") as f:
    train_questions = json.load(f)

In [11]:
# Rebinds `questions` to the records of the file loaded in the previous cell.
questions = train_questions['questions']

In [12]:
# Rebuild the index for the currently loaded `questions`, replacing the earlier
# `questions_dict`. Written as a comprehension so no loop variable is left
# behind in the notebook namespace.
questions_dict = {q['question_index']: q for q in questions}

In [13]:
# Tokenize the currently indexed questions; this rebinds `data` and
# `total_word_count`.
data, total_word_count = get_total_word_count_and_data(questions_dict)
total_word_count

9652086

In [14]:
# Count the tracked tokens in the token list from the previous cell.
token_count_dict = get_token_counts(data)
token_count_dict

{'and': 81506,
 'or': 8610,
 'more': 11570,
 'fewer': 105,
 'behind': 113881,
 'in front': 39260,
 'same': 335667}