In [1]:
import os
import json
import random

In [2]:
train_q_file = "../../data/CLEVR_v1/data/CLEVR_train_questions.json"

In [3]:
with open(train_q_file) as f:
    train_questions = json.load(f)

In [4]:
info = train_questions['info']
questions = train_questions['questions']

In [5]:
questions_dict = {}
for q in questions:
    questions_dict[q['question_index']] = q

# Sampling a CHILDES-like CLEVR dataset by word pair

In [6]:
def get_questions_with_word(word):
    word_questions = set()
    for k,v in questions_dict.items():
        if word in v['question']:
            word_questions.add(k)
    return word_questions

In [7]:
def get_total_word_count_and_data(subsampled_questions_dict):
    total_word_count = 0
    data = []
    for k,q in subsampled_questions_dict.items():
        line = q['question']
        line = line.strip('?').lower()
        line_as_list = line.split(" ")
        data += line_as_list
        total_word_count += len(line_as_list)
    return data, total_word_count

def get_token_counts(data):
    word_set = {"and", "or", "more", "fewer", "behind", "front", "same"}
    token_count_dict = {"and": 0, "or": 0, "more": 0, "fewer": 0, "behind": 0, "in front": 0, "same": 0}
    prev_word = ""
    for word in data:
        if word in word_set:
            if word == "front":
                if prev_word == "in":
                    token_count_dict["in front"] += 1
            else:
                token_count_dict[word] += 1
        prev_word = word
    return token_count_dict

## AND - OR

CLEVR COUNTS
AND 81506
OR 63214
total 144720

In [8]:
# CLEVR AND proportion
81506 / 144720

0.5631978993919292

In [9]:
# CLEVR OR proportion
63214 / 144720

0.43680210060807073

CHILDES COUNTS
AND 217497
OR 22975
total 240472

In [10]:
# CHILDES AND proportion
and_prop = 217497 / 240472

In [11]:
# CHILDES OR proportion
or_prop = 22975 / 240472

In [12]:
and_questions = get_questions_with_word(" and ")
len(and_questions)

81506

In [13]:
or_questions = get_questions_with_word(" or ")
len(or_questions)

63214

In [14]:
# how many questions at there intersection?
len(and_questions & or_questions)

0

In [15]:
#Downsample OR question 
n_or = 81506 * or_prop / and_prop

In [16]:
or_questions_subsample = set(random.sample(or_questions, int(round(n_or,0))))

In [17]:
len(or_questions_subsample)

8610

In [18]:
and_or_questions = and_questions | or_questions_subsample

## MORE - LESS

CLEVR COUNTS
MORE 11570
LESS/FEWER 11851
total 23421

In [19]:
# CLEVR MORE proportion
11570 / 23421

0.4940011101148542

In [20]:
# CLEVR MORE proportion
11851 / 23421

0.5059988898851459

CHILDES COUNTS
MORE 23406
LESS/FEWER 212
total 23618

In [21]:
# CHILDES MORE proportion
more_prop = 23406 / 23618

In [22]:
# CHILDES LESS proportion
less_prop = 212 / 23618

In [23]:
more_questions = get_questions_with_word(" more ")
len(more_questions)

11570

In [24]:
less_questions = get_questions_with_word(" fewer ")
len(less_questions)

11851

In [25]:
# intersection with AND and OR ?
len(less_questions & and_or_questions)

0

In [26]:
# intersection with MORE ?
len(less_questions & more_questions)

0

In [27]:
#Downsample LESS question 
n_less = 11570 * less_prop / more_prop

In [28]:
less_questions_subsample = set(random.sample(less_questions, int(round(n_less,0))))

In [29]:
more_less_questions = more_questions | less_questions_subsample

## BEHIND - FRONT

CLEVR COUNTS
BEHIND 147409
FRONT 147506
total 294915

In [30]:
# CLEVR BEHIND proportion
147409 / 294915

0.49983554583524065

In [31]:
# CLEVR FRONT proportion
147506 / 294915

0.5001644541647593

CHILDES COUNTS
BEHIND 2954
IN FRONT 756
total 3710

In [32]:
# CHILDES BEHIND proportion
behind_prop = 2954 / 3710

In [33]:
# CHILDES IN FRONT proportion
front_prop = 756 / 3710

In [34]:
front_prop

0.2037735849056604

In [35]:
behind_questions = get_questions_with_word(" behind ")
len(behind_questions)

134723

In [36]:
# remove already removed OR, LESS questions
behind_questions_subsample = behind_questions - (or_questions - or_questions_subsample) - (less_questions - less_questions_subsample)
len(behind_questions_subsample)

125621

In [37]:
front_questions = get_questions_with_word(" front ")
len(front_questions)

135110

In [38]:
# remove already removed OR questions
front_questions_subsample = front_questions - (or_questions - or_questions_subsample) - (less_questions - less_questions_subsample)
len(front_questions_subsample)

126268

In [39]:
# how many questions at there intersection of FRONT and BEHIND? Quite a few ...
len(behind_questions_subsample & front_questions_subsample)

35301

In [40]:
# how many questions at there intersection of FRONT with all other words? Quite a few ...
len(front_questions_subsample & (and_or_questions | more_less_questions | behind_questions_subsample))

59885

In [41]:
len(front_questions_subsample & (and_or_questions | more_less_questions))

33941

In [42]:
#Downsample FRONT question 
n_front = 125665 * front_prop / behind_prop

In [43]:
# n front is smaller than the intersection with all other words, so I will need to subsample BEHIND questions as well
n_front

32160.710900473932

In [44]:
behind_front_intersect = behind_questions_subsample & front_questions_subsample - (and_or_questions | more_less_questions)
len(behind_front_intersect)

25944

In [45]:
behind_questions_subsample = behind_questions_subsample - behind_front_intersect
front_questions_subsample = front_questions_subsample - behind_front_intersect

In [46]:
len(behind_questions_subsample)

99677

In [47]:
len(front_questions_subsample)

100324

In [48]:
n_front = ((99721) * front_prop / behind_prop)

In [49]:
# since n_front is quite small I will and 
n_front

25521.014218009477

In [50]:
len(front_questions_subsample & (and_or_questions | more_less_questions))

33941

In [51]:
behind_front_questions = behind_questions_subsample

## SAME

In [52]:
same_questions = get_questions_with_word(" same ")
len(same_questions)

356333

In [53]:
# remove intersection with FRONT that dont also contain other words

In [54]:
same_questions = same_questions - front_questions_subsample

### Check new frequencies ...

In [55]:
union_questions = and_or_questions | more_less_questions | behind_front_questions | same_questions

In [56]:
len(union_questions)

445877

In [57]:
subsampled_questions_dict = {x : questions_dict[x] for x in union_questions}

In [58]:
data, total_word_count = get_total_word_count_and_data(subsampled_questions_dict)

In [59]:
total_word_count

8263441

In [60]:
token_count_dict = get_token_counts(data)

In [61]:
token_count_dict

{'and': 81506,
 'or': 8610,
 'more': 11570,
 'fewer': 105,
 'behind': 113838,
 'in front': 39261,
 'same': 335667}

In [62]:
# AND prop
print(81506 / (81506 + 8610) , and_prop)

0.9044564783168361 0.904458731162048


In [63]:
# MORE prop
print(11570 / (11570 + 105), more_prop)

0.9910064239828694 0.9910237954102803


In [64]:
# BEHIND prop
print(113881 / (113881 + 39260) , behind_prop)

0.743634950796978 0.7962264150943397


## Close enough! Save new dataset

In [65]:
new_questions = [questions_dict[x] for x in union_questions]

In [66]:
# now renumber questions ids for new dataset
i = 0
for q in new_questions:
    q['question_index'] = i
    i+=1

In [130]:
new_train_questions = {'info':info, 'questions': new_questions}

In [134]:
new_train_q_file = "../../data/CLEVR_CHILDESfreq/data/CLEVR_train_questions.json"
with open(new_train_q_file, "w") as f:
    json.dump(new_train_questions, f)


### Forgot to add other non word questions last time ....

Adding these questions to rerun experiment 2

In [67]:
keep_questions = set(questions_dict.keys())

In [68]:
word_questions = (and_questions|or_questions|more_questions|less_questions|behind_questions|front_questions|same_questions)
len(word_questions)

600185

In [69]:
nonword_questions = keep_questions - word_questions
subsampled_questions_dict = {x : questions_dict[x] for x in nonword_questions}
data, total_word_count = get_total_word_count_and_data(subsampled_questions_dict)
token_count_dict = get_token_counts(data)
token_count_dict

{'and': 0,
 'or': 0,
 'more': 0,
 'fewer': 0,
 'behind': 0,
 'in front': 0,
 'same': 0}

In [70]:
new_questions = [questions_dict[x] for x in nonword_questions]

In [78]:
train_q_file = "../../data/CLEVR_CHILDESfreq/data/CLEVR_train_questions.json"

with open(train_q_file) as f:
    train_questions = json.load(f)

info = train_questions['info']
questions = train_questions['questions']

In [82]:
len(questions)

445877

In [80]:
questions = questions[0:-1]

In [81]:
questions[-1]

{'image_index': 69999,
 'program': [{'inputs': [], 'function': 'scene', 'value_inputs': []},
  {'inputs': [0], 'function': 'filter_material', 'value_inputs': ['metal']},
  {'inputs': [1], 'function': 'filter_shape', 'value_inputs': ['sphere']},
  {'inputs': [2], 'function': 'unique', 'value_inputs': []},
  {'inputs': [3], 'function': 'same_size', 'value_inputs': []},
  {'inputs': [4], 'function': 'filter_color', 'value_inputs': ['brown']},
  {'inputs': [5], 'function': 'filter_material', 'value_inputs': ['rubber']},
  {'inputs': [6], 'function': 'filter_shape', 'value_inputs': ['cylinder']},
  {'inputs': [7], 'function': 'exist', 'value_inputs': []}],
 'question_index': 445876,
 'image_filename': 'CLEVR_train_069999.png',
 'question_family_index': 44,
 'split': 'train',
 'answer': 'no',
 'question': 'Is there a brown rubber cylinder that has the same size as the metal ball?'}

In [83]:
i = questions[-1]["question_index"] + 1

In [84]:
i

445877

In [85]:
# now renumber questions ids for new dataset
for q in new_questions:
    q['question_index'] = i
    i+=1

In [86]:
questions = questions + new_questions
new_train_questions = {'info':info, 'questions': questions}

In [87]:
len(questions)

545681

In [88]:
with open(train_q_file, "w") as f:
    json.dump(new_train_questions, f)