# Getting Sketch Partitions for Rating Study

## Fetch the sketch paths from the sketches folder

In [None]:
import os
import json
import pandas as pd
import numpy as np
from IPython.display import clear_output


In [None]:
# Locate the project root, this experiment's folder and the folder of sketches,
# all resolved relative to the current working directory.
proj_dir = os.path.abspath('../..')
exp_name = 'classify_iternum'
exp_dir = os.path.join(proj_dir, exp_name)
sketch_dir = os.path.abspath(os.path.join(proj_dir, 'sketches'))

# Everything found in the sketch folder, then the same listing without the
# '.DS_Store' entry.
# NOTE(review): os.listdir returns entries in arbitrary order, while the seeded
# sampling later in this notebook depends on row order -- confirm the listing
# order is stable enough for the partitions to be reproducible.
full_stim_paths = os.listdir(sketch_dir)
sketches = []
for stim_path in full_stim_paths:
    if stim_path.split('/')[-1] == '.DS_Store':
        continue
    sketches.append(stim_path)

# Make sure the folder that holds the stimulus lists exists.
stimListDir = os.path.abspath('../../experiments/classify_iternum/stimList')
stim_list_dir_missing = not os.path.exists(stimListDir)
if stim_list_dir_missing:
    os.makedirs(stimListDir)

## Assemble a dataframe from all the sketches

In [None]:
# Build one row of metadata per sketch by parsing its file name, which is
# expected to look like
#   <gameID>_<animal>_<cardinality>_<trialnum>_<condition>_<version>.<ext>
sketch_columns = ["orig_gameid","orig_animal","orig_cardinality","orig_trial","orig_cond","orig_version","sketch_url"]

sketch_rows = []
for sketch_name in sketches: # for every sketch
    name = sketch_name.split('_') # split up its metadata
    cardinality = str(int(name[2])) # round-trip through int so the cardinality is in canonical form

    #                                                                   gameID   animal   cardinality  trialnum condition stim_version (keeps the extension)
    stimurl = "https://iternum-sketches.s3.amazonaws.com/" + '_'.join([name[0], name[1], cardinality, name[3], name[4], name[5]])

    # same fields as the columns above; the version column drops the file extension
    sketch_rows.append([name[0], name[1], cardinality, name[3], name[4], name[5].split('.')[0], stimurl])

# Create the dataframe once from the collected rows instead of enlarging it
# with .loc on every iteration, which gets slow as the number of sketches grows.
sketch_info = pd.DataFrame(sketch_rows, columns=sketch_columns)

sketch_info


In [None]:
# sanity check: show the URL that was built for the first sketch
sketch_info['sketch_url'].iloc[0]

### Make a small list of catch trials, referencing their location on S3

In [None]:
# All catch-trial images sit in the same bucket and differ only in file name.
catch_url_template = 'https://iternum-recog-catches.s3.amazonaws.com/{}.png'

# number catches: 1.png through 8.png
catch_nums = [catch_url_template.format(number) for number in range(1, 9)]

# animal catches: one image per animal
catch_amls = [catch_url_template.format(animal) for animal in ('bear', 'deer', 'owl', 'rabbit')]



## Assemble the partitions

In [None]:
# Split the sketches into partitions by sampling without replacement.
# Every pass of the outer loop draws at most one sketch per game and emits two
# paradigms over that same draw: one asking for 'number' classification (even
# versionID) and one asking for 'shape' classification (the following odd
# versionID). Catch trials are added after the first four games of each pass.
bag = sketch_info    # the bag of sketches still to be sampled; `drop` below rebinds `bag`, so sketch_info itself is left untouched

games = bag.orig_gameid.unique() # we want this to be a unique list of all the games
paradigms = [] # this will be a list of paradigm dicts, each holding the trials to be rated by one rater

# fields copied from the sampled dataframe row into each trial, in this order
sketch_fields = ["orig_gameid","orig_animal","orig_cardinality","orig_trial","orig_cond","orig_version","sketch_url"]

batch = 0
while len(bag) > 0: # sample from the bag without replacement

    # initialize paradigms
    number_paradigm = {'versionID':batch,               # which partition is it (set of sketches) ?
                       'classify_condition':'number',   # what feature will mturk recognizers be asked to classify?
                       'games':[],                      # empty list to be filled with classification games as they happen
                       'meta':[]}                       # the whole [unordered] trial list goes in this 'meta' structure
    shape_paradigm = {'versionID':batch+1,              # which partition is it (set of sketches) ?
                      'classify_condition':'shape',     # what feature will mturk recognizers be asked to classify?
                      'games':[],                       # empty list to be filled with classification games as they happen
                      'meta':[]}                        # the whole [unordered] trial list goes in this 'meta' structure

    number_catches = catch_nums
    animal_catches = catch_amls
    random_seed = 619 # restarted for every pass; incremented once per game below
    for i, game in enumerate(games): # we want each rater to see [no more than] one sketch from each game

        remaining = bag[bag['orig_gameid'] == game]
        # A game with fewer sketches than the others runs out before the bag is
        # empty; sampling from an empty selection raises, so such a game simply
        # contributes no trial to this partition.
        if len(remaining) > 0:
            row = remaining.sample(n=1,replace=False,random_state=random_seed) # sample a sketch at random from the game
            bag = bag.drop(index = row.index) # remove it from the bag

            picked = row.iloc[0]
            trial = {field: picked[field] for field in sketch_fields} # 1 game per trial
            trial["catchTrial"] = False

            # give each paradigm its own dict so that editing a trial in one
            # paradigm can never leak into the other
            number_paradigm['meta'].append(trial)
            shape_paradigm['meta'].append(dict(trial))

        # insert catch trials after each of the first four games
        if i in (0,1,2,3):
            np.random.seed(seed=random_seed + len(bag) + i)
            # draw order matters for reproducibility: number stimulus first, then animal
            num_stim = np.random.choice(number_catches,replace=False)
            aml_stim = np.random.choice(animal_catches,replace=False)

            # first do the number one; the expected answer is the file name without its extension
            catch_num_trial = {"orig_gameid": np.nan,
                               "orig_animal": np.nan,
                               "orig_cardinality": num_stim.split('/')[-1].split('.')[0],
                               "orig_trial": np.nan,
                               "orig_cond": np.nan,
                               "orig_version": np.nan,
                               "sketch_url": num_stim,
                               "catchTrial": True}
            number_paradigm['meta'].append(catch_num_trial)

            # then do the animal one
            catch_aml_trial = {"orig_gameid": np.nan,
                               "orig_animal": aml_stim.split('/')[-1].split('.')[0],
                               "orig_cardinality": np.nan,
                               "orig_trial": np.nan,
                               "orig_cond": np.nan,
                               "orig_version": np.nan,
                               "sketch_url": aml_stim,
                               "catchTrial": True}
            shape_paradigm['meta'].append(catch_aml_trial)

        random_seed += 1 # this means that each paradigm is matched in its sequence for number and shape recognizers

    # when a paradigm is assembled, put it into the list:
    paradigms.append(number_paradigm) # first put the version for people classifying the number info
    paradigms.append(shape_paradigm) # then store the same trial list but with the other classification goal

    batch += 2

num_partitions = len(paradigms)
print('We have {} unique partitions.'.format(num_partitions)) # Should be 32*2=64 paradigms of 61 sketches; each rater sees one per game, requiring 64 raters



In [None]:
# Spot-check every paradigm: its version, its classification goal, and the
# animal and cardinality recorded for the trial at position 5 of its trial list.
for paradigm in paradigms:
    probe_trial = paradigm['meta'][5]
    print(paradigm['versionID'], paradigm['classify_condition'], probe_trial['orig_animal'], probe_trial['orig_cardinality'])

# for thing in paradigms[0]['meta']:
#     print(thing,'\n')



## Put that datastructure into Mongo

In [None]:
# set vars
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' # NOTE(review): not used below; the client connects to 127.0.0.1 -- presumably through a tunnel to this host, confirm

import pymongo as pm
import socket
try:
    from urllib.parse import quote_plus
except ImportError: # Python 2
    from urllib import quote_plus

# Credentials embedded in a MongoDB URI must be percent-escaped, otherwise a
# password containing characters such as ':', '/', '@', '+' or '%' is rejected
# or misread when the URI is parsed.
mongo_uri = 'mongodb://{}:{}@127.0.0.1:27017'.format(quote_plus(user), quote_plus(str(pswd)))
conn = pm.MongoClient(mongo_uri)
db = conn['stimuli']
coll = db['iternum_classification']


In [None]:
## Upload the paradigms to the collection.
## Nothing is written unless reallyRun is switched to True by hand.
# reallyDelete = False
reallyRun = False

## sometimes during beta testing we want to delete what's already in the collection
# if reallyDelete:
#     coll.delete_many({})

## insert the data, one document per paradigm
if reallyRun:
    total = len(paradigms)
    for uploaded, paradigm in enumerate(paradigms, 1):
        print ('%d of %d uploaded ...' % (uploaded, total))
        clear_output(wait=True) # let the next progress line replace this one
        coll.insert_one(paradigm)

print('Done!')


In [None]:
# extra notes and things for remembering syntax:
# coll.find_one()
# len(list(coll.find({})))
# print(db.command("collstats", 'iternum_classification'))
# list(coll.find({'games':'8067-374f2239-6bb9-4349-9786-afcfdc2ceb0c'}))[0]['versionID']


## Patch for data lacunae (incomplete or invalid games)

In [None]:
import copy

# figure out which partitions need to be done, because we forgot to store versionID in the recog data
# get the list of valid games that form the complement to the set of games we still need to recruit
# NOTE(review): allow_pickle=True unpickles whatever the file contains; only load a file you trust
validGamesList = np.load("../../analysis/datastructures/valid_game_ids.npy",allow_pickle=True)

# find out which games in each partition are actually valid and store them in a dictionary
# NOTE: patch_coll is created in the connection cell further down, which has to be run before this one
partitions_with_games = {}
for partition in patch_coll.find({}): # for 1st patch, we looked at coll. For 2nd patch, we look at patch_coll
    partitions_with_games[partition['versionID']] = list(set(partition['games']).intersection(validGamesList))

# take a real (deep) copy of the paradigms defined above for the original task,
# so that filling in 'games' below does not modify the originals
patch_paradigms = copy.deepcopy(paradigms)

# and then populate the copy with only the valid games that don't need to be done all over again
for patch_paradigm in patch_paradigms:
    version_id = patch_paradigm['versionID']
    # A patch collection may hold only some of the partitions (e.g.
    # 'iternum_classification_patching3' holds a single one); partitions that
    # are absent from it keep their 'games' list as it is instead of raising KeyError.
    if version_id in partitions_with_games:
        patch_paradigm['games'] = partitions_with_games[version_id]

# check to see that it looks right:
# for thing in patch_paradigms:
#     print(len(thing['games']))

In [None]:
# set vars
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' # NOTE(review): not used below; the client connects to 127.0.0.1 -- presumably through a tunnel to this host, confirm

import pymongo as pm
import socket
try:
    from urllib.parse import quote_plus
except ImportError: # Python 2
    from urllib import quote_plus

# Credentials embedded in a MongoDB URI must be percent-escaped, otherwise a
# password containing characters such as ':', '/', '@', '+' or '%' is rejected
# or misread when the URI is parsed.
mongo_uri = 'mongodb://{}:{}@127.0.0.1:27017'.format(quote_plus(user), quote_plus(str(pswd)))
conn = pm.MongoClient(mongo_uri)
db = conn['stimuli']
patch_coll = db['iternum_classification_patching3']

# previously did 'iternum_classification_patching', but still had some pesky extras so needed to even it out again
# after doing 'iternum_classification_patching2', there was ONE partition that needed ONE recog still
# made 'iternum_classification_patching3' just to get one more valid game for 63rd partition ('versionID' : 62)

In [None]:
## Upload the patch paradigm to the patch collection.
## Nothing is written unless reallyRun is switched to True by hand.
# reallyDelete = False
reallyRun = False

## sometimes during beta testing we want to delete what's already in the collection
# if reallyDelete:
#     patch_coll2.delete_many({})

# insert the data: only the paradigm at position 62 of patch_paradigms is uploaded
if reallyRun:
    total = len(patch_paradigms)
    for position, paradigm in enumerate(patch_paradigms):
        if position != 62:
            continue
        print ('%d of %d uploaded ...' % (position+1, total))
        clear_output(wait=True)
        patch_coll.insert_one(paradigm)

print('Done!')


In [None]:
# check to make sure it worked: fetch a single document from the patch collection
patch_coll.find_one()

In [None]:

# show how many games are recorded in each document of the patch collection
for stored_partition in patch_coll.find({}):
    n_games = len(stored_partition['games'])
    print(n_games)