In [7]:
import os
import json
import random

In [2]:
train_q_file = "../../data/CLEVR_v1/data/CLEVR_train_questions.json"

In [3]:
# JSON text is UTF-8 by specification; pin the encoding so loading the file
# does not depend on the platform's locale default.
with open(train_q_file, encoding="utf-8") as f:
    train_questions = json.load(f)

In [5]:
info = train_questions['info']

In [6]:
questions = train_questions['questions']

In [15]:
# Index every question record by its 'question_index' for direct lookup.
# If an index occurs more than once, the last record wins.
questions_dict = {entry['question_index']: entry for entry in questions}

699989

## Sampling a CHILDES-like CLEVR dataset

As it stands, here are the proportional frequencies of our function words:
WORD      PROP_CLEVR  PROP_CHILDES 
and          0.1         0.8 
or           0.08        0.08 
more         0.01        0.09 
less         0.01        0.0007 
behind       0.18        0.01 
in front     0.18        0.002 
same         0.43        0.01 

Since these words can appear in the same questions, I cannot simply downsample them individually based on these proportions, but have to do this in steps.

In [17]:
def get_questions_with_word(word, questions=None):
    """Return the keys of all questions whose text contains ``word``.

    Matching is a plain, case-sensitive substring test on each record's
    'question' string. Pad the word with spaces (e.g. ``" same "``) to avoid
    matching inside longer words; note that a padded word will not match at
    the very start or end of the question text.

    Args:
        word: Substring to search for.
        questions: Optional mapping of key -> record, where each record has a
            'question' string. Defaults to the notebook-level
            ``questions_dict`` so existing one-argument calls keep working.

    Returns:
        A set with the keys of all matching records.
    """
    if questions is None:
        # Fall back to the notebook's global index of all questions.
        questions = questions_dict
    return {key for key, record in questions.items() if word in record['question']}

In [101]:
def get_total_word_count_and_data(subsampled_questions_dict):
    """Flatten all question texts into one token list and count the tokens.

    Each record's 'question' string has leading/trailing '?' characters
    stripped, is lower-cased and is split on single spaces (so consecutive
    spaces yield empty-string tokens).

    Args:
        subsampled_questions_dict: Mapping of key -> record with a 'question'
            string.

    Returns:
        A tuple ``(data, total_word_count)``: the concatenated token list, in
        the mapping's iteration order, and its length.
    """
    tokens = []
    for record in subsampled_questions_dict.values():
        text = record['question'].strip('?').lower()
        tokens.extend(text.split(" "))
    # Every token was appended exactly once, so the total is the list length.
    return tokens, len(tokens)

def get_token_counts(data):
    """Count occurrences of the tracked function words in a token sequence.

    The single-token words "and", "or", "more", "fewer", "behind" and "same"
    are counted directly. "front" is only counted, under the key "in front",
    when the immediately preceding token is "in".

    Args:
        data: Iterable of word tokens.

    Returns:
        A dict mapping each tracked word (including "in front") to its count.
    """
    single_words = frozenset(("and", "or", "more", "fewer", "behind", "same"))
    counts = dict.fromkeys(
        ("and", "or", "more", "fewer", "behind", "in front", "same"), 0
    )
    previous = ""
    for current in data:
        if current == "front":
            # A bare "front" that does not follow "in" is ignored.
            if previous == "in":
                counts["in front"] += 1
        elif current in single_words:
            counts[current] += 1
        previous = current
    return counts

### Upsampling

Repeat all the 'and' and 'more' items once

In [117]:
data, total_word_count = get_total_word_count_and_data(questions_dict)
token_count_dict = get_token_counts(data)
token_count_dict

{'and': 81506,
 'or': 63214,
 'more': 11570,
 'fewer': 11851,
 'behind': 147409,
 'in front': 147506,
 'same': 356333}

In [118]:
# Upsampling: count every 'and' token twice.
# NOTE: this mutates token_count_dict in place, so re-running this cell
# doubles the count again.
token_count_dict['and'] = token_count_dict['and']*2

In [119]:
# Upsampling: count every 'more' token twice.
# NOTE: this mutates token_count_dict in place, so re-running this cell
# doubles the count again.
token_count_dict['more'] = token_count_dict['more']*2

In [120]:
token_count_dict

{'and': 163012,
 'or': 63214,
 'more': 23140,
 'fewer': 11851,
 'behind': 147409,
 'in front': 147506,
 'same': 356333}

In [121]:
# Total number of tracked function-word tokens (after upsampling).
sum_count = sum(token_count_dict.values())

In [122]:
sum_count

912465

In [123]:
token_prop_dict = {k:(v/sum_count) for k,v in token_count_dict.items()}

In [124]:
token_prop_dict

{'and': 0.17865014000537008,
 'or': 0.06927827368720992,
 'more': 0.025359876817192987,
 'fewer': 0.012987895426125934,
 'behind': 0.16155030603913575,
 'in front': 0.16165661148646798,
 'same': 0.39051689653849736}

### Downsampling

#### SAME

In [72]:
same_questions = get_questions_with_word(" same ")

In [73]:
len(same_questions)

356333

In [137]:
# proportion we need to remove = about 98%
(token_prop_dict['same'] - 0.01)/token_prop_dict['same']
# number of items to sample
(1 - 0.98) * 356333

7126.660000000006

In [138]:
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
same_questions_subsample = set(random.sample(sorted(same_questions), 7126))

In [139]:
same_questions_notincluded = same_questions - same_questions_subsample

#### LESS

In [77]:
less_questions = get_questions_with_word(" fewer ")

In [78]:
len(less_questions)

11851

In [132]:
# proportion we need to remove = about 95%
(token_prop_dict['fewer'] - 0.0007)/token_prop_dict['fewer']
# number of items to sample
(1 - 0.95) * 11851

592.5500000000005

In [133]:
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
less_questions_subsample = set(random.sample(sorted(less_questions), 592))

In [134]:
less_questions_notincluded = less_questions - less_questions_subsample

#### BEHIND

In [82]:
behind_questions = get_questions_with_word(" behind ")

In [83]:
len(behind_questions)

134723

In [141]:
# proportion we need to remove = about 94%
(token_prop_dict['behind'] - 0.01)/token_prop_dict['behind']
# number of items to sample
(1 - 0.94) * 134723

8083.380000000007

In [142]:
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
behind_questions_subsample = set(random.sample(sorted(behind_questions), 8083))

In [143]:
behind_questions_notincluded = behind_questions - behind_questions_subsample

#### IN FRONT

In [87]:
front_questions = get_questions_with_word(" front ")

In [88]:
len(front_questions)

135110

In [146]:
# proportion we need to remove = about 99%
(token_prop_dict['in front'] - 0.002)/token_prop_dict['in front']
# number of items to sample
(1 - 0.99) * 135110

1351.1000000000013

In [147]:
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
front_questions_subsample = set(random.sample(sorted(front_questions), 1351))

In [148]:
front_questions_notincluded = front_questions - front_questions_subsample

#### AND

In [92]:
and_questions = get_questions_with_word(" and ")

In [93]:
len(and_questions)

81506

#### OR

In [94]:
or_questions = get_questions_with_word(" or ")

In [95]:
len(or_questions)

63214

#### MORE

In [96]:
more_questions = get_questions_with_word(" more ")

In [97]:
len(more_questions)

11570

#### UNION SET

In [149]:
union_questions = more_questions | or_questions | and_questions | front_questions_subsample | behind_questions_subsample |less_questions_subsample | same_questions_subsample

In [151]:
# Dataset size after upsampling: the 'and' and 'more' questions are repeated
# once, so their counts are added on top of the union. Derived from the sets
# instead of hard-coded so it stays correct if the data changes.
len(union_questions) + len(and_questions) + len(more_questions)

262904

#### get new frequency prop

In [152]:
subsampled_questions_dict = {x : questions_dict[x] for x in union_questions}

In [153]:
data, total_word_count = get_total_word_count_and_data(subsampled_questions_dict)

In [154]:
total_word_count

3248758

In [159]:
token_count_dict = get_token_counts(data)

In [160]:
token_count_dict

{'and': 81506,
 'or': 63214,
 'more': 11570,
 'fewer': 776,
 'behind': 48772,
 'in front': 44853,
 'same': 44921}

In [161]:
token_count_dict['and'] = token_count_dict['and']*2

In [162]:
token_count_dict['more'] = token_count_dict['more']*2

In [163]:
# Total number of tracked function-word tokens (after upsampling).
sum_count = sum(token_count_dict.values())

In [164]:
sum_count

388688

In [165]:
token_prop_dict = {k:(v/sum_count) for k,v in token_count_dict.items()}

In [166]:
token_prop_dict

{'and': 0.4193903593627794,
 'or': 0.16263429794591033,
 'more': 0.05953361050508377,
 'fewer': 0.0019964598855637425,
 'behind': 0.125478532910715,
 'in front': 0.11539589593710123,
 'same': 0.1155708434528465}

### Downsample again

#### SAME

In [167]:
len(same_questions_subsample)

7126

In [169]:
# proportion we need to remove = about 91%
(token_prop_dict['same'] - 0.01)/token_prop_dict['same']
# number of items to sample
(1 - 0.91) * 7126

641.3399999999998

In [188]:
# Second pass: draw a smaller sample from the full set of 'same' questions.
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
same_questions_subsample = set(random.sample(sorted(same_questions), 641))

In [189]:
same_questions_notincluded = same_questions - same_questions_subsample

#### LESS

In [172]:
len(less_questions_subsample)

592

In [174]:
# proportion we need to remove = about 65%
(token_prop_dict['fewer'] - 0.0007)/token_prop_dict['fewer']
# number of items to sample
(1 - 0.65) * 592

207.2

In [179]:
# Second pass: draw a smaller sample from the full set of 'fewer' questions.
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
less_questions_subsample = set(random.sample(sorted(less_questions), 207))

In [180]:
less_questions_notincluded = less_questions - less_questions_subsample

#### BEHIND

In [183]:
len(behind_questions_subsample)

8083

In [184]:
# proportion we need to remove = about 92%
(token_prop_dict['behind'] - 0.01)/token_prop_dict['behind']
# number of items to sample
(1 - 0.92) * 8083

646.6399999999996

In [190]:
# Second pass: draw a smaller sample from the full set of 'behind' questions.
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
behind_questions_subsample = set(random.sample(sorted(behind_questions), 646))

In [191]:
behind_questions_notincluded = behind_questions - behind_questions_subsample

#### IN FRONT

In [192]:
len(front_questions_subsample)

1351

In [194]:
# proportion we need to remove = about 98%
(token_prop_dict['in front'] - 0.002)/token_prop_dict['in front']
# number of items to sample
(1 - 0.98) * 1351

27.020000000000024

In [195]:
# Second pass: draw a smaller sample from the full set of 'front' questions.
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
front_questions_subsample = set(random.sample(sorted(front_questions), 27))

In [196]:
front_questions_notincluded = front_questions - front_questions_subsample

#### OR

In [197]:
len(or_questions)

63214

In [199]:
# proportion we need to remove = about 50%
(token_prop_dict['or'] - 0.08)/token_prop_dict['or']
# number of items to sample
(1 - 0.5) * 63214

31607.0

In [200]:
# random.sample requires a sequence (passing a set raises TypeError on
# Python 3.11+); sorting also fixes the population order for a given seed.
or_questions_subsample = set(random.sample(sorted(or_questions), 31607))

In [201]:
or_questions_notincluded = or_questions - or_questions_subsample

#### UNION SET

In [202]:
union_questions = more_questions | or_questions_subsample | and_questions | front_questions_subsample | behind_questions_subsample |less_questions_subsample | same_questions_subsample

In [203]:
# Dataset size after upsampling: the 'and' and 'more' questions are repeated
# once, so their counts are added on top of the union. Derived from the sets
# instead of hard-coded so it stays correct if the data changes.
len(union_questions) + len(and_questions) + len(more_questions)

219034

#### get new frequency prop

In [204]:
subsampled_questions_dict = {x : questions_dict[x] for x in union_questions}

In [205]:
data, total_word_count = get_total_word_count_and_data(subsampled_questions_dict)

In [206]:
total_word_count

2515827

In [207]:
token_count_dict = get_token_counts(data)

In [208]:
token_count_dict

{'and': 81506,
 'or': 31627,
 'more': 11570,
 'fewer': 219,
 'behind': 38165,
 'in front': 38022,
 'same': 37508}

In [209]:
token_count_dict['and'] = token_count_dict['and']*2

In [210]:
token_count_dict['more'] = token_count_dict['more']*2

In [211]:
# Total number of tracked function-word tokens (after upsampling).
sum_count = sum(token_count_dict.values())

In [212]:
sum_count

331693

In [213]:
token_prop_dict = {k:(v/sum_count) for k,v in token_count_dict.items()}

In [214]:
token_prop_dict

{'and': 0.4914544473353372,
 'or': 0.09535021842486878,
 'more': 0.06976330522501228,
 'fewer': 0.0006602490857509805,
 'behind': 0.11506121624514234,
 'in front': 0.11463009469599901,
 'same': 0.1130804689878894}

In [None]:
### Remove more behind, in front, same questions that do not have 'and', 'or', 'more', 'fewer'

In [215]:
len(and_questions)

81506

In [216]:
len(and_questions & behind_questions_notincluded)

29775

In [217]:
29775/81506

0.3653105292861875

In [218]:
len(and_questions & front_questions_notincluded)

30214

In [220]:
len(and_questions & behind_questions_notincluded & front_questions_notincluded & same_questions_notincluded)

1454

In [221]:
and_questions_without = and_questions - (and_questions & behind_questions_notincluded & front_questions_notincluded & same_questions_notincluded)

In [222]:
len(and_questions_without)

80052

In [223]:
len(and_questions - (behind_questions_notincluded | front_questions_notincluded | same_questions_notincluded))

9019

In [224]:
9019/81506

0.11065443034868598

In [226]:
!nvidia-smi

Wed Feb  9 08:36:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04    Driver Version: 455.23.04    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3080    Off  | 00000000:01:00.0 Off |                  N/A |
| 30%   46C    P2   144W / 320W |   8509MiB / 10015MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------