# This notebook will: 

1. Save out to file (.js) a JSON object that contains an array of six dictionaries, one for each machine
    - Each dictionary will contain all the attributes corresponding to each machine, e.g., toy_type, variant, list of “Regions of Interest” corresponding to each colored region. 

In [1]:
import os, sys
import numpy as np
from io import BytesIO
from PIL import Image
from skimage import io, img_as_float
import base64

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

import string
import pandas as pd
import json
import re
import pymongo as pm
from glob import glob
from IPython.display import clear_output
import itertools 
from random import sample
import importlib

import boto3
import botocore
import requests

import warnings

In [2]:
# directory & file hierarchy
# (the current working directory is treated as the stimuli directory;
#  the project root is its parent)
stimuli_dir = os.getcwd()
proj_dir = os.path.abspath('..')
ROI_dir = os.path.abspath(os.path.join(stimuli_dir, 'segmented'))

## make the helper module under <proj_dir>/utils importable (added to sys.path only once)
import sys
if sys.path.count(os.path.join(proj_dir, 'utils')) == 0:
    sys.path.append(os.path.join(proj_dir, 'utils'))
import utils as h

In [3]:
## load in ROIs (reload so that edits to rois.py are picked up without restarting the kernel)
import rois as rois
importlib.reload(rois)
from rois import ROIS, GEARS_1, GEARS_2, LEVERS_1, LEVERS_2, PULLEYS_1, PULLEYS_2

## one row per ROI
R = pd.DataFrame(ROIS)

## derived identifiers:
##   roi_id = '<roi_name>_<roi_num>'
##   toy_id = '<toy_type>_<toy_variant>'
R['roi_id'] = ['{}_{}'.format(roi_name, roi_num)
               for roi_name, roi_num in zip(R['roi_name'], R['roi_num'])]
R['toy_id'] = ['{}_{}'.format(toy_type, toy_variant)
               for toy_type, toy_variant in zip(R['toy_type'], R['toy_variant'])]

In [4]:
## inspect R: first rows of the ROI table, including the derived roi_id / toy_id columns
R.head()

Unnamed: 0,causal,functional,roi_color,roi_name,roi_num,toy_type,toy_variant,roi_id,toy_id
0,False,True,#fed541,gear,1,gears,1,gear_1,gears_1
1,False,True,#70d7ff,gear,2,gears,1,gear_2,gears_1
2,True,True,#b355ed,gear,3,gears,1,gear_3,gears_1
3,True,True,#f78968,gear,4,gears,1,gear_4,gears_1
4,False,False,#a8e1b8,background,5,gears,1,background_5,gears_1


In [5]:
## sanity check: within every toy, the number of causal elements equals the number of non-causal ones
from copy import deepcopy

# 'background' and 'light' ROIs do not take part in the comparison
noncausal = R[R.roi_name.isin(['background', 'light'])].index

# work on an independent copy so that R itself is left untouched
causalcheck = deepcopy(R).drop(noncausal)

# compare the True / False counts of the `causal` flag toy by toy
for group, checkcausal in causalcheck.groupby(['toy_type', 'toy_variant']):
    num_True = int((checkcausal['causal'] == True).sum())
    num_False = int((checkcausal['causal'] == False).sum())
    assert num_True == num_False
print('Checks passed for causal and non-causal counts!')
    

Checks passed for causal and non-causal counts!


### generate stim URLs for each segmented machine

In [6]:
# boto3, botocore and utils are already imported in earlier cells; they are
# re-imported here and the helper module is then reloaded so that edits to
# utils.py take effect without restarting the kernel.
import boto3
import botocore
import utils as h
importlib.reload(h)

<module 'utils' from '/Users/hollyhuey/causaldraw_annotations/utils/utils.py'>

In [7]:
## flip to True to (re-)upload the segmented images to S3
reallyRun = False
if not reallyRun:
    ## default path: ask the helper for the urls of the files in ROI_dir
    ## (presumably without uploading anything -- see utils.generate_s3_url)
    ref_urls = h.generate_s3_url(ROI_dir, bucket_name='causaldraw-annotations')
else:
    ## upload stims
    bucket_name = 'causaldraw-annotations'
    path_to_stim = ROI_dir
    ref_urls = h.upload_stims_to_s3(path_to_stim, bucket_name, overwrite=True)

    ## look at ref_urls
    print(ref_urls)

### now generate trial metadata for annotations task

In [8]:
## task params
## number of sketches shown in one annotation session; used further down to
## split each machine's sketches into equally sized batches
num_sketches_per_annot_session = 10

In [9]:
## add segmented reference urls to metadata
## machine id = first two '_'-separated tokens of the url's file name
## (e.g. '.../gears_1_annotations.png' -> 'gears_1')
machineIDs = ['_'.join(i.split('/')[-1].split('_')[:2]) for i in ref_urls]
mid2url = dict(zip(machineIDs,ref_urls))
## explicit dict lookup so that a toy without a url raises KeyError instead of silently producing NaN
R['ref_url'] = R['toy_id'].map(lambda toy_id: mid2url[toy_id])

## also make sure to add unsegmented ref images (image_urls)
image_urls = ['gears_2.png', 
             'pulleys_2.png', 
             'levers_2.png', 
             'pulleys_1.png', 
             'gears_1.png', 
             'levers_1.png']
## key every image by its own file stem ('gears_2.png' -> 'gears_2').
## Pairing image_urls with machineIDs by position would only be correct if
## ref_urls happened to come back in exactly the order hard-coded above.
imid2url = {os.path.splitext(fname)[0]: fname for fname in image_urls}
R['raw_ref_url'] = R['toy_id'].map(lambda toy_id: imid2url[toy_id])

## save out ROI list as tidy dataframe (CSV)
R.to_csv('causaldraw_rois.csv',index=False)

In [10]:
reallyRun = False
if reallyRun:
    ## sanity check: all image urls work
    checkurls = np.unique(R['ref_url'])

    broken_urls = []

    for i, url in enumerate(checkurls):
        try:
            ## timeout so that a stalled connection cannot hang the notebook
            url_ok = requests.get(url, timeout=30).status_code == 200
        except requests.RequestException:
            ## connection errors / timeouts count as a broken url instead of aborting the check
            url_ok = False
        if url_ok:
            print('URL {} of {} works!'.format(i+1, len(checkurls)))
        else: 
            broken_urls.append(url)
        clear_output(wait=True)

    ## report the outcome: the progress lines above are cleared as the loop advances,
    ## so without this summary a broken url would go unnoticed
    print('{} of {} URLs are broken: {}'.format(len(broken_urls), len(checkurls), broken_urls))
    assert len(broken_urls) == 0, 'Some reference image urls could not be fetched'
        
        

In [11]:
reallyRun = False
if reallyRun:
    from IPython.display import HTML, display

    ## sanity check (manual version): view each image
    ## the url list is recomputed here so that this cell does not depend on the
    ## previous cell having been executed with reallyRun = True
    checkurls = np.unique(R['ref_url'])
    for url in checkurls:
        print(url)

    #select any url and copy-paste it into the src="XXX.png" attribute below to check whether the file was uploaded online

    ## check whether image was uploaded online 
    ## (display() is needed: the notebook only auto-renders a bare expression at
    ##  the top level of a cell, not one nested inside an `if` block)
    display(HTML("""
        <img width="320" height="240"src="https://causaldraw-annotations.s3.amazonaws.com/gears_1_annotations.png"> 
    """))

### load in stroke data

In [12]:
## assuming causaldraw repo is in same location on machine as causaldraw_annotations, grab stroke results csv
iterationName = 'run1'
path_to_strokes = (os.path.join(proj_dir,'..', 'causaldraw','results',
                                'csv',iterationName,
                                'causaldraw_stroke_data_{}.csv'.format(iterationName)))
S = pd.read_csv(path_to_strokes)

## preprocessing
S = S.drop(labels=['workerId'],axis=1) # scrub workerId from this csv
## toy_id = '<toy_type>_<toy_variant>'; sketch_id = '<toy_id>.<gameID>'
S['toy_id'] = S.apply(lambda x: '{}_{}'.format(x['toy_type'], x['toy_variant']), axis=1) 
S['sketch_id'] = S.apply(lambda x: '{}.{}'.format(x['toy_id'], x['gameID']), axis=1)
## prefix the original experiment's bookkeeping columns with 'orig_'
## (assignment instead of inplace=True, so the step reads as a plain transformation)
S = S.rename(columns={'trialNum':'orig_trialNum', 'gameID': 'orig_gameID', 
                      'hitID': 'orig_hitID', 'aID': 'orig_aID', 
                      'version': 'orig_version'})
## every game must contribute exactly one sketch of every toy
assert len(np.unique(S['sketch_id'].values))==S['orig_gameID'].nunique() * S['toy_id'].nunique()

## double check that our invalid gameIDs are NOT in S (see causaldraw_analysis.ipynb in causaldraw repo)
invalid_gameids = ['8412-0ff784f1-021a-4e29-a013-0a69fc96dcb2', '4893-26ef64b2-9251-46ed-8f79-7921af54a772']
## check every invalid id, not just one of them
gameids_in_S = set(S['orig_gameID'].unique())
invalid_present = [gid for gid in invalid_gameids if gid in gameids_in_S]
assert len(invalid_present)==0, 'invalid gameIDs still present in S: {}'.format(invalid_present)

In [13]:
## some sanity checks

## a (toy_id, orig_gameID) pair identifies one sketch, i.e. one bundle of strokes
for name, group in S.groupby(['toy_id','orig_gameID']):
    ## all strokes of a sketch have to come from one and the same trial
    assert np.unique(group['orig_trialNum'].values).size==1

## per toy: the number of distinct sketches (N=50) has to equal the number of games
for name, group in S.groupby(['toy_id']):
    assert np.unique(group['sketch_id'].values).size==S['orig_gameID'].nunique()

#### Create copy of S that has extra batch ID columns

In [14]:
### Generating copy of S called S2 that has extra batch ID columns.
### So we can group sets of strokes together into annotation sessions.

## number of sketches of each machine (one per game)
num_sketches_per_machine = S['orig_gameID'].nunique()

## how many batches do we want? 
## if there are N=50 drawings of each machine, and 10 drawings in each annotation session
## then we want 50/10 = 5 batches for each machine ... 30 batches in total for all 6 machines
## Fail early with a clear message when the session size does not divide the number of
## sketches, rather than leaving num_batches undefined (or stale from an earlier run).
assert num_sketches_per_machine % num_sketches_per_annot_session == 0, (
    '{} sketches per machine cannot be split evenly into sessions of {}'
    .format(num_sketches_per_machine, num_sketches_per_annot_session))
num_batches = num_sketches_per_machine // num_sketches_per_annot_session

## collect the augmented groups and concatenate once at the end
batched_groups = []

## iterate over toy_ids, fetching all N=50 unique drawings of each toy
for name, group in S.groupby('toy_id'):
    assert len(np.unique(group['sketch_id'].values))==num_sketches_per_machine

    ## permute using specific random seed
    ## (seed 0 for every machine, so all machines get the same permutation of 0..N-1)
    shuffle_ids = np.random.RandomState(0).permutation(np.arange(num_sketches_per_machine))

    ## create dictionary that maps from (sorted) sketch_ids to shuffle_ids
    skid2shuff = dict(zip(list(np.unique(group['sketch_id'].values)),list(shuffle_ids)))

    ## add shuffle_id column to this group
    group = group.assign(shuffle_id = group['sketch_id'].map(skid2shuff))

    ## generate a batch_id to associate with every batch WITHIN MACHINE 
    ## (integer division replaces np.int(np.floor(...)); np.int no longer exists in current NumPy)
    group = group.assign(batch_id_within_machine = group['shuffle_id'] // num_sketches_per_annot_session)
    assert group['batch_id_within_machine'].nunique() == num_batches

    ## batch id that is unique ACROSS machines: '<toy_id>_batch_<batch_id_within_machine>'
    group = group.assign(batch_id_across_machines = (
        group['toy_id'] + '_batch_' + group['batch_id_within_machine'].astype(str)))

    batched_groups.append(group)

## re-build a copy of S that has the additional columns
S2 = pd.concat(batched_groups, axis=0)

print('Created copy of S called S2 that has extra batch ID columns.')    
assert S2.shape[0]==S.shape[0]
assert S2.shape[1]!=S.shape[1]    
assert S2['batch_id_across_machines'].nunique() == S2['toy_id'].nunique() * S2['batch_id_within_machine'].nunique()

Created copy of S called S2 that has extra batch ID columns.


#### create Meta object

In [None]:
### To remind ourselves: our goal is to make a Meta that is a list of 30 dictionaries
## each of those 30 dictionaries contains an entire annotation session, identified by a
## unique batch_id of the form '<toy_id>_batch_<k>' (the batch_id_across_machines value).
## an annotation session consists of 10 sketches

## So for each batch_id_across machines, we want:
## an OUTER list of trials within a batch,
## where each trial is a dictionary holding
##   'strokes': an INNER list of dictionaries, one per stroke of the sketch
##   'rois':    the list of ROI records of the machine that was drawn

## the key metadata attributes we want
keys_to_keep = ['arcLength','condition','currStrokeNum','demo_dur','demo_seq',
                'endStrokeTime','eventType','file_id','orig_gameID','iterationName',
                'phase','startResponseTime','startStrokeTime','stim_url', 
                'svg','time','toy_type','toy_variant','orig_trialNum','orig_version',
                'toy_id','sketch_id','shuffle_id','batch_id_within_machine',
                'batch_id_across_machines']

Meta = [] 
## group by a scalar key (not a one-element list) so that `name` is the plain batch id string
for name, group in S2.groupby('batch_id_across_machines'):  
    print('Adding {} to Meta'.format(name)) 
    clear_output(wait=True)
    Batch = {} # initialize batch-level dictionary 
    Trials = [] # list of sketches, each holding its strokes & ROI information
    for n,g in group.groupby('sketch_id'): # looping over sketches within a batch
        _g = g[keys_to_keep] # select only metadata we want
        ## named roi_records (not `rois`) so the imported `rois` module is not rebound to a list
        roi_records = R[R['toy_id']==g.toy_id.unique()[0]].to_dict(orient='records')
        strokes = _g.to_dict(orient='records')
        Trials.append({'strokes': strokes, 'rois': roi_records}) # append strokes and rois of this sketch
    Batch['meta'] = Trials # attaching trial list to batch metadata
    Batch['batch_id'] = name
    Batch['games'] = []
    Meta.append(Batch)

print('Done!')

## quick sanity check: length of each batch checks out
assert len(Meta[0]['meta'])==num_sketches_per_annot_session

## sanity check: each sketch is a dictionary containing (data) strokes and (new metadata) roi information
assert list(Meta[0]['meta'][0].keys())==['strokes','rois']

#### save Meta out to file (for redundancy)

In [None]:
# write Meta to '<dataset_name>_meta.js' as a JSON document (kept on disk for redundancy)
dataset_name = 'causaldraw_annotations'
print('Saving out json dictionary out to file...') 
with open('{}_meta.js'.format(dataset_name), 'w') as fout:
    fout.write(json.dumps(Meta))
print('Done!')

#### Load JSON file that we just saved out, back in (not strictly necessary to do this, but anyway)

In [None]:
### next todo is to upload this JSON to initialize the new stimulus collection
## read the file back inside a `with` block so the file handle is closed again
with open('{}_meta.js'.format(dataset_name), mode='r') as fin:
    J = json.load(fin)
print('dataset_name: {}'.format(dataset_name))
print('Length of J is: {}'.format(len(J)))

#### extract stroke data to construct catch trial for causaldraw_annotations

These are trials to use as catch trials:

```
- gears1catch = {'orig_gameID':'4659-683c0845-cf14-4240-bd20-573c8b7926d8', 'orig_trialNum':1}
- gears2catch = {'orig_gameID':'5235-eb1c8b46-bab0-42e7-8fea-bc50850cd0de', 'orig_trialNum':1}
- levers1catch = {'orig_gameID':'3420-94c6faa4-2c3d-4f6f-a409-e9484a130847', 'orig_trialNum':4}
- levers2catch = {'orig_gameID':'8798-4a19eaf6-12f9-4a97-adf2-469707d7995f', 'orig_trialNum':3}
- pulleys1catch = {'orig_gameID':'9226-c136ef9a-0131-4850-bf5f-1beaf85afcc6', 'orig_trialNum':3}
- pulleys2catch = {'orig_gameID':'7873-86193d3b-b7ba-4998-b5c2-eef9e242d7d5', 'orig_trialNum':4}
```

###### single example

In [None]:
## identifier (original gameID + trial number) of the sketch looked up in this single example
gameID_target = None  # placeholder removed below; see gears2catch
del gameID_target
gears2catch = {'orig_gameID':'2989-156d9df6-e309-430b-bbac-e47b6792127e', 'orig_trialNum':0}

## walk through the batches in J ...
for i,j in enumerate(J):
    ## ... and through the annotation trials (sketches) of each batch
    for _i,_j in enumerate(j['meta']):
        ## a sketch is identified by the gameID/trialNum recorded on its first stroke
        gameID_match = (_j['strokes'][0]['orig_gameID']==gears2catch['orig_gameID'])
        trialNum_match = (_j['strokes'][0]['orig_trialNum']==gears2catch['orig_trialNum'])
        if not (gameID_match and trialNum_match):
            continue
        ## found it: copy the sketch's strokes and ROI records into the target dictionary
        gears2catch['strokes'] = _j['strokes']
        gears2catch['rois'] = _j['rois']
        print('Found the matching stroke data and assigned it to our dictionary!')

###### extracting for any machine given J and the corresponding "catch trial dictionary" (aka `catch_dict`)

In [None]:
def extract_matching_strokes(J, catch_dict):
    '''
    Look up the sketch identified by catch_dict in J and attach its data to catch_dict.

    input:
        J = JSON object with metadata for causaldraw_annotations: a list of batches,
            each batch holding a list of trials under 'meta', each trial holding
            a 'strokes' list and a 'rois' list
        catch_dict = dictionary with orig_gameID and orig_trialNum we are using to
            filter for matching stroke data in J
    output:
        catch_dict itself (updated in place) with 'strokes' and 'rois' added when a
        matching sketch was found; returned unchanged, after printing a warning,
        when no sketch matches
    '''
    found_match = False
    ## iterate over J, which contains the batches
    for batch in J:
        ## iterate over batch['meta'], which contains the annotation trials of that batch
        for trial in batch['meta']:
            strokes = trial['strokes']
            ## a sketch without strokes carries no identifier to compare against
            if not strokes:
                continue
            ## grab original gameID/trialNum from first stroke and compare to our target identifier
            first_stroke = strokes[0]
            gameID_match = (first_stroke['orig_gameID']==catch_dict['orig_gameID'])
            trialNum_match = (first_stroke['orig_trialNum']==catch_dict['orig_trialNum'])
            if gameID_match and trialNum_match:
                ## no early exit: if several sketches matched, the last one wins
                catch_dict['strokes'] = strokes
                catch_dict['rois'] = trial['rois']
                found_match = True
                print('Found the matching stroke data and assigned it to our dictionary!')
    if not found_match:
        ## make the miss visible here instead of via a KeyError on catch_dict['strokes'] later on
        print('WARNING: no sketch found for orig_gameID={} / orig_trialNum={}'
              .format(catch_dict['orig_gameID'], catch_dict['orig_trialNum']))
    return catch_dict

In [None]:
## testing function out: catch trial for gears_2
gears2catch = extract_matching_strokes(
    J, {'orig_gameID':'5235-eb1c8b46-bab0-42e7-8fea-bc50850cd0de', 'orig_trialNum':1})
len(gears2catch['strokes'])

#note: removed 1 ambiguous stroke in post-production, so len = 19

In [None]:
## catch trial for gears_1
gears1catch = extract_matching_strokes(
    J, {'orig_gameID':'4659-683c0845-cf14-4240-bd20-573c8b7926d8', 'orig_trialNum':1})
len(gears1catch['strokes'])

#note: removed 2 ambiguous strokes in post-production, so len = 18

In [None]:
## catch trial for pulleys_1
pulleys1catch = extract_matching_strokes(
    J, {'orig_gameID':'9226-c136ef9a-0131-4850-bf5f-1beaf85afcc6', 'orig_trialNum':3})
len(pulleys1catch['strokes'])

#note: removed 2 ambiguous strokes in post-production/also removed 4 strokes in attempt to match stroke num, so len = 22

In [None]:
## catch trial for pulleys_2
pulleys2catch = extract_matching_strokes(
    J, {'orig_gameID':'7873-86193d3b-b7ba-4998-b5c2-eef9e242d7d5', 'orig_trialNum':4})
len(pulleys2catch['strokes'])

#note: 0 strokes removed in post-production, so len = 17

In [None]:
## catch trial for levers_2
levers2catch = extract_matching_strokes(
    J, {'orig_gameID':'8798-4a19eaf6-12f9-4a97-adf2-469707d7995f', 'orig_trialNum':3})
len(levers2catch['strokes'])

#note: 0 strokes removed in post-production, so len = 20

In [None]:
## catch trial for levers_1
levers1catch = extract_matching_strokes(
    J, {'orig_gameID':'3420-94c6faa4-2c3d-4f6f-a409-e9484a130847', 'orig_trialNum':4})
len(levers1catch['strokes'])

#note: 0 strokes removed in post-production, so len = 21

### now insert records in J (same as elements of Meta) into mongodb

In [None]:
# set vars 
## auth.txt contains the password for the sketchloop user; dtype=str keeps the
## value a string even when the password happens to look like a number
auth = pd.read_csv('auth.txt', header = None, dtype = str)
pswd = str(auth.values[0][0])
user = 'sketchloop'
## NOTE(review): host is not used below -- the client connects to 127.0.0.1,
## presumably through an SSH tunnel to this host; confirm
host = 'cogtoolslab.org'

## credentials are passed as keyword arguments instead of being concatenated into
## the URI: inside a URI, reserved characters such as '@', ':' or '/' in the
## password would have to be percent-escaped, as keyword arguments they must not be
conn = pm.MongoClient('mongodb://127.0.0.1:27017', username=user, password=pswd) 
db = conn['stimuli']
coll = db['causaldraw_annotations']

In [None]:
## now really insert data
## NOTE: running this more than once inserts every batch again (duplicates),
## which would break the document-count check in the next cell
reallyRun = False
if reallyRun:
    for (i,j) in enumerate(J):
        print ('%d of %d uploaded ...' % (i+1,len(J)))
        clear_output(wait=True)
        coll.insert_one(j)
    print('Done!')
else:
    ## do not claim success when nothing was uploaded
    print('Skipped: reallyRun is False, nothing was inserted.')

In [None]:
## do we have as many batches as we mean to?
## (30 documents expected: one per annotation batch in J)
assert coll.estimated_document_count()==30

In [None]:
## inspect one of these annotation sessions
#coll.find_one()



#### create minimeta that contains just the metadata for a single sketch

In [None]:
## fetch one annotation batch from the collection and pick its second sketch
one_batch = coll.find_one()
one_sketch = one_batch['meta'][1]
## number of strokes in that sketch
num_strokes = len(one_sketch['strokes'])
## write the single sketch (strokes + rois) out as JSON
with open('../experiments/data/example.json', 'w') as fout:
    fout.write(json.dumps(one_sketch))

In [None]:
## how many documents does the causaldraw_annotations collection report?
coll.estimated_document_count()

In [None]:
## list the collections in the `stimuli` database
db.list_collection_names()

In [None]:
## (same call as in the previous cell) list the collections in the `stimuli` database
db.list_collection_names()

In [None]:
## list the databases visible through this connection
conn.list_database_names()

In [None]:
## BE EXTRA CAREFUL WITH THIS -- this is to drop records in the causaldraw_annotations collection, which is stored
## in the stimuli db. Because all grads' records are stored in the stimuli db, we want to be extra careful with this!!!

### db.drop_collection('causaldraw_annotations')