# Getting Sketch Partitions for Rating Study

## Fetch the sketch paths from that folder

In [70]:
import os
import json
import pandas as pd
import numpy as np
from IPython.display import clear_output


In [71]:
proj_dir = os.path.abspath('../..')
exp_name = 'classify_iternum'
exp_dir = os.path.join(proj_dir,exp_name)
sketch_dir = os.path.abspath(os.path.join(proj_dir,'sketches'))

full_stim_paths = os.listdir(sketch_dir) # list out all the sketches in that directory
sketches = [i for i in full_stim_paths if i.split('/')[-1] != '.DS_Store']

stimListDir = os.path.abspath('../../experiments/classify_iternum/stimList')

if not os.path.exists(stimListDir):
    os.makedirs(stimListDir)

## Assemble a dataframe from all the sketches

In [72]:
sketch_info = pd.DataFrame(columns = ["orig_gameid","orig_animal","orig_cardinality","orig_trial","orig_cond","orig_version","sketch_url"]) # initialize dataframe

for i in range(len(sketches)): # for every sketch
    name = sketches[i].split('_') # split up its metadata
                                                        #    gameID         animal            cardinality             trialnum       condition      stim_version
    stimurl = "https://iternum-sketches.s3.amazonaws.com/" + name[0] + '_' + name[1] + '_' + str(int(name[2])) + '_' + name[3] + '_' + name[4] + '_' + name[5]

    # following two lines are dead:
#     stimID = name[4].split('_') # ... by multiple delimiters
#     stimurl = "https://iternum-sketches.s3.amazonaws.com/" + name[0] + '_' + stimID[0] + '_' + str(int(stimID[1])-1) + '_' + name[2] + '_' + name[3] + '_' + name[4]     
    row = np.array([name[0],name[1],str(int(name[2])),name[3],name[4],name[5].split('.')[0],stimurl]) # put into relevant column
    sketch_info.loc[len(sketch_info)] = row # now append that to the sketch info dataframe
    
sketch_info


Unnamed: 0,orig_gameid,orig_animal,orig_cardinality,orig_trial,orig_cond,orig_version,sketch_url
0,0074-988d4ee1-5766-47b5-bcbb-49a720aee30d,bear,1,22,number,057,https://iternum-sketches.s3.amazonaws.com/0074...
1,0074-988d4ee1-5766-47b5-bcbb-49a720aee30d,bear,2,31,number,051,https://iternum-sketches.s3.amazonaws.com/0074...
2,0074-988d4ee1-5766-47b5-bcbb-49a720aee30d,bear,3,25,number,062,https://iternum-sketches.s3.amazonaws.com/0074...
3,0074-988d4ee1-5766-47b5-bcbb-49a720aee30d,bear,4,14,number,009,https://iternum-sketches.s3.amazonaws.com/0074...
4,0074-988d4ee1-5766-47b5-bcbb-49a720aee30d,bear,5,6,number,073,https://iternum-sketches.s3.amazonaws.com/0074...
...,...,...,...,...,...,...,...
1947,9628-d7914c9d-2ee9-4706-8d26-93a3fa8008e3,rabbit,4,5,number,092,https://iternum-sketches.s3.amazonaws.com/9628...
1948,9628-d7914c9d-2ee9-4706-8d26-93a3fa8008e3,rabbit,5,16,number,059,https://iternum-sketches.s3.amazonaws.com/9628...
1949,9628-d7914c9d-2ee9-4706-8d26-93a3fa8008e3,rabbit,6,30,number,047,https://iternum-sketches.s3.amazonaws.com/9628...
1950,9628-d7914c9d-2ee9-4706-8d26-93a3fa8008e3,rabbit,7,21,number,081,https://iternum-sketches.s3.amazonaws.com/9628...


In [73]:
sketch_info.iloc[0]['sketch_url']

'https://iternum-sketches.s3.amazonaws.com/0074-988d4ee1-5766-47b5-bcbb-49a720aee30d_bear_1_22_number_057.png'

### make a small list of catch trials, referencing their location on s3

In [74]:
catch_nums = ['https://iternum-recog-catches.s3.amazonaws.com/1.png',
              'https://iternum-recog-catches.s3.amazonaws.com/2.png',
              'https://iternum-recog-catches.s3.amazonaws.com/3.png',
              'https://iternum-recog-catches.s3.amazonaws.com/4.png',
              'https://iternum-recog-catches.s3.amazonaws.com/5.png',
              'https://iternum-recog-catches.s3.amazonaws.com/6.png',
              'https://iternum-recog-catches.s3.amazonaws.com/7.png',
              'https://iternum-recog-catches.s3.amazonaws.com/8.png']

catch_amls = ['https://iternum-recog-catches.s3.amazonaws.com/bear.png',
              'https://iternum-recog-catches.s3.amazonaws.com/deer.png',
              'https://iternum-recog-catches.s3.amazonaws.com/owl.png',
              'https://iternum-recog-catches.s3.amazonaws.com/rabbit.png']



## Assemble the partitions

In [75]:
bag = sketch_info    # the bag of sketches to sample, because sampling without replacement

games = bag.orig_gameid.unique() # we want this to be a unique list of all the games
paradigms = [] # this will be a list of dataframes, each dataframe containing the sketches to be rated by a rater


batch = 0
while len(bag) > 0: # sample from the bag without replacement
    
    # initialize paradigm 
    number_paradigm = {'versionID':batch,        # which partition is it (set of sketches) ?
                       'classify_condition':'number',  # what feature will mturk recognizers be asked to classify?
                       'games':[],               # empty list to be filled with classification games as they happen    
                       'meta':[]}                # the whole [unordered] trial list goes in this 'meta' structure
    shape_paradigm = {'versionID':batch+1,        # which partition is it (set of sketches) ?
                       'classify_condition':'shape',  # what feature will mturk recognizers be asked to classify?
                       'games':[],               # empty list to be filled with classification games as they happen    
                       'meta':[]}                # the whole [unordered] trial list goes in this 'meta' structure

    number_catches = catch_nums
    animal_catches = catch_amls
    random_seed = 619
    for i in range(len(games)): # we want each rater to see [no more than] one sketch from each game
        trial = {} # initialize a dictionary for this rater, 1 game per trial
        
        row = bag[bag['orig_gameid']==games[i]].sample(n=1,replace=False,random_state=random_seed) # sample a sketch at random from the game
        bag = bag.drop(index = row.index) # remove it from the bag
        
        trial["orig_gameid"] = row.iloc[0]["orig_gameid"]
        trial["orig_animal"] = row.iloc[0]["orig_animal"]
        trial["orig_cardinality"] = row.iloc[0]["orig_cardinality"]
        trial["orig_trial"] = row.iloc[0]["orig_trial"]
        trial["orig_cond"] = row.iloc[0]["orig_cond"]
        trial["orig_version"] = row.iloc[0]["orig_version"]
        trial["sketch_url"] = row.iloc[0]["sketch_url"]
        trial["catchTrial"] = False
        
        
        number_paradigm['meta'].append(trial)
        shape_paradigm['meta'].append(trial)
        
        # insert catch trials at regular intervals
        if i in [0,1,2,3]:
            np.random.seed(seed=random_seed + len(bag) + i)
            catch_aml_trial = {}
            catch_num_trial = {}
            num_stim = np.random.choice(number_catches,replace=False)
            aml_stim = np.random.choice(animal_catches,replace=False)
            
            
            # first do the number one
            catch_num_trial["orig_gameid"] = np.nan
            catch_num_trial["orig_animal"] = np.nan
            catch_num_trial["orig_cardinality"] = num_stim.split('/')[-1].split('.')[0]
            catch_num_trial["orig_trial"] = np.nan
            catch_num_trial["orig_cond"] = np.nan
            catch_num_trial["orig_version"] = np.nan
            catch_num_trial["sketch_url"] = num_stim
            catch_num_trial["catchTrial"] = True
            number_paradigm['meta'].append(catch_num_trial)
            
            # then do the animal one
            catch_aml_trial["orig_gameid"] = np.nan
            catch_aml_trial["orig_animal"] = aml_stim.split('/')[-1].split('.')[0]
            catch_aml_trial["orig_cardinality"] = np.nan
            catch_aml_trial["orig_trial"] = np.nan
            catch_aml_trial["orig_cond"] = np.nan
            catch_aml_trial["orig_version"] = np.nan
            catch_aml_trial["sketch_url"] = aml_stim
            catch_aml_trial["catchTrial"] = True
            shape_paradigm['meta'].append(catch_aml_trial)
            
        
        random_seed += 1 # this means that each paradigm is matched in its sequence for number and shape recognizers
    
    # when a paradigm is assembled, put it into the list:
    paradigms.append(number_paradigm) # first put the version for people classifying the number info     
    paradigms.append(shape_paradigm) # then store the exact same data structure but change the classification goal
    
    batch += 2
    
num_partitions = len(paradigms)    
print('We have {} unique partitions.'.format(num_partitions)) # Should be 32*2=64 paradigms of 61 sketches; each rater sees one per game, requiring 64 raters        

# print(paradigms[1].iloc[3,6])    # print one of the urls



We have 64 unique partitions.


In [76]:
for i in paradigms:
    print(i['versionID'], i['classify_condition'],i['meta'][5]['orig_animal'],i['meta'][5]['orig_cardinality'])

# for thing in paradigms[0]['meta']:
#     print(thing,'\n')



0 number nan 3
1 shape rabbit nan
2 number nan 4
3 shape bear nan
4 number nan 4
5 shape deer nan
6 number nan 8
7 shape bear nan
8 number nan 1
9 shape deer nan
10 number nan 6
11 shape deer nan
12 number nan 5
13 shape rabbit nan
14 number nan 6
15 shape owl nan
16 number nan 5
17 shape rabbit nan
18 number nan 6
19 shape rabbit nan
20 number nan 2
21 shape deer nan
22 number nan 5
23 shape bear nan
24 number nan 5
25 shape bear nan
26 number nan 5
27 shape deer nan
28 number nan 6
29 shape owl nan
30 number nan 2
31 shape bear nan
32 number nan 7
33 shape bear nan
34 number nan 4
35 shape bear nan
36 number nan 6
37 shape bear nan
38 number nan 6
39 shape owl nan
40 number nan 8
41 shape bear nan
42 number nan 1
43 shape bear nan
44 number nan 1
45 shape rabbit nan
46 number nan 1
47 shape owl nan
48 number nan 7
49 shape owl nan
50 number nan 6
51 shape bear nan
52 number nan 1
53 shape rabbit nan
54 number nan 4
55 shape bear nan
56 number nan 4
57 shape rabbit nan
58 number nan 4

## Put that datastructure into Mongo

In [77]:
# set vars 
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'

import pymongo as pm
import socket
conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1:27017') 
db = conn['stimuli']
coll = db['iternum_classification']


In [78]:
## now really insert data
# reallyDelete = False
reallyRun = False

## sometimes during beta testing we want to delete what's already in the collection
# if reallyDelete:
#     coll.delete_many({})

## insert the data
if reallyRun:
    for (i,j) in enumerate(paradigms):
        print ('%d of %d uploaded ...' % (i+1,len(paradigms)))
        clear_output(wait=True)
        coll.insert_one(j)

print('Done!')


Done!


In [79]:
# extra notes and things for remembering syntax:
# coll.find_one()
# len(list(coll.find({})))
# print(db.command("collstats", 'iternum_classification'))
# list(coll.find({'games':'8067-374f2239-6bb9-4349-9786-afcfdc2ceb0c'}))[0]['versionID']


## Patch for data lacunae (incomplete or invalid games)

In [82]:
# figure out which partitions need to be done, because we forgot to store versionID in the recog data
# get the list of valid games that form the complement to the set of games we still need to recruit
validGamesList = np.load("../../analysis/datastructures/valid_game_ids.npy",allow_pickle=True)

# find out which games in each partition are actually valid and store them in a dictionary
partitions_with_games = {}
for partition in list(coll.find({})):
    partitions_with_games[partition['versionID']] = list(set(partition['games']).intersection(validGamesList)) #partition['games']

# copy the paradigms defined above for the original task
patch_paradigms = paradigms

# and then populate the copy with only the valid games that don't need to be done all over again
for patch_paradigm in patch_paradigms:
    patch_paradigm['games'] = partitions_with_games[patch_paradigm['versionID']]

# check to see that it looks right:
# for thing in patch_paradigms:
#     print(len(thing['games']))

In [61]:
# set vars 
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'

import pymongo as pm
import socket
conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1:27017') 
db = conn['stimuli']
patch_coll = db['iternum_classification_patching']


In [86]:
## now really insert data
# reallyDelete = False
reallyRun = False

## sometimes during beta testing we want to delete what's already in the collection
# if reallyDelete:
#     patch_coll.delete_many({})

## insert the data
if reallyRun:
    for (i,j) in enumerate(patch_paradigms):
        print ('%d of %d uploaded ...' % (i+1,len(patch_paradigms)))
        clear_output(wait=True)
        patch_coll.insert_one(j)

print('Done!')


Done!


In [94]:
# check to make sure it worked
patch_coll.find_one()