### Prep sketches for recognition experiment

Goal: To obtain empirical estimates of the recognizability of collaborative ("collab") and solo sketches, for comparison with the VGG-based recognizability estimates.

In [1]:
from __future__ import division

import os
import urllib, cStringIO

import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
import ast

from PIL import Image
import base64
import sys

import json
from IPython.display import clear_output
from collections import Counter

#### define file paths, etc.

In [2]:
# directory & file hierarchy (everything is resolved relative to the parent of the cwd)
proj_dir = os.path.abspath('..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))
sketch_dir = os.path.abspath(os.path.join(proj_dir,'sketches'))
svg_dir = os.path.abspath(os.path.join(sketch_dir,'svg'))
png_dir = os.path.abspath(os.path.join(sketch_dir,'png'))

## add helpers to python path
## The guard must test the same directory that gets appended; previously it tested
## <proj_dir>/analysis but appended <proj_dir>/python, so the path was re-appended
## every time this cell was re-run.
## NOTE(review): confirm that <proj_dir>/python is where the helper modules live.
helper_dir = os.path.join(proj_dir,'python')
if helper_dir not in sys.path:
    sys.path.append(helper_dir)

## make sure the output directories exist before anything tries to write to them
for output_dir in (results_dir, plot_dir, csv_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

## import project helper modules, and reload them so that edits made to the
## helper files are picked up without restarting the kernel
import analysis_helpers as h
import rendering_helpers as srh
if sys.version_info[0]>=3:
    from importlib import reload  ## `reload` is a builtin only on Python 2
reload(h)
reload(srh)

<module 'rendering_helpers' from 'rendering_helpers.pyc'>

#### load in group data csv

note: game '9053-d3a0c1d9-cb81-4bdd-a572-5e38b91b33e9' in pilot1 missing stroke data from the first sheep

In [3]:
# which iteration name should we use?
iterationName = 'pilot2'

## load in sketch-level dataframe for this iteration
sketch_csv_path = os.path.join(csv_dir,'collabdraw_sketch_{}.csv'.format(iterationName))
M = pd.read_csv(sketch_csv_path)

## single-argument print(...) behaves the same on Python 2 and Python 3
print('Number of unique gameIDs = {}'.format(M.gameID.nunique()))

Number of unique gameIDs = 90


#### render sketches using svg data (can be skipped if already rendered)

In [4]:
runThis = 0  ## set to 1 to (re-)render; leave at 0 if the svg/png files already exist
reload(srh)
if runThis:
    ## NOTE(review): `D` is not defined anywhere in this notebook, so this branch
    ## raises NameError on a fresh kernel. It is presumably a stroke-level dataframe
    ## (one row per stroke) -- load it explicitly before enabling this cell.

    ## rendering parameters (constant across sketches)
    padding = 10
    stroke_pct_canvas = 0.02 ## what fraction of image size should a stroke's width be?

    ## render out all svg, one file per (game, trial)
    for name, group in D.groupby(['gameID','trialNum']):

        ## get list of svg
        this_svg, bounds = srh.make_svg_list(group,crop=True)

        ## construct filename:
        ## gameID_className_repetition_trialNum_condition_sketcherId_iterationName
        game_id = np.unique(group['gameID'])[0]
        class_name = np.unique(group['className'])[0]
        repetition = np.unique(group['repetition'])[0]
        trial_num = np.unique(group['trialNum'])[0]
        condition = np.unique(group['condition'])[0]
        sketcher = 'both'
        ## use the notebook-level iteration name instead of a second hard-coded 'pilot2'
        this_fname = '{}_{}_{}_{}_{}_{}_{}'.format(game_id,class_name,repetition,
                                                   trial_num,condition,sketcher,
                                                   iterationName)

        print('Rendering out svg data from game: {}  trial: {}'.format(game_id,trial_num))
        clear_output(wait=True)

        ## render to svg file
        ## builtin int() replaces np.int, which was only an alias and is removed in newer NumPy
        stroke_width = int(np.round(stroke_pct_canvas*bounds))

        srh.render_svg(this_svg,
                     out_dir=svg_dir,
                     viewbox=[0, 0, bounds+padding, bounds+padding],
                     stroke_width = stroke_width,
                     stroke_color = 'black',
                     out_fname= '{}.svg'.format(this_fname))

    ## get svg path list for rendered out svg
    svg_paths = srh.generate_svg_path_list(os.path.join(sketch_dir,'svg'))

    ## convert all svg to png
    srh.svg_to_png(svg_paths,base_dir=sketch_dir)

#### upload pngs to s3 (can be skipped if already uploaded)

In [5]:
import boto
bucket_name = 'collabdraw-collab8-sketches'
path_to_png = os.path.join(sketch_dir,'png')
runThis = 0  ## set to 1 to upload; leave at 0 if the pngs are already on S3
if runThis:
    ## named s3_conn so it cannot be confused with the MongoDB `conn` created later
    s3_conn = boto.connect_s3()

    ## reuse the bucket if it already exists, otherwise create it
    ## (lookup() returns None instead of raising when the bucket is missing)
    b = s3_conn.lookup(bucket_name)
    if b is None:
        b = s3_conn.create_bucket(bucket_name)

    for ind,im in enumerate(os.listdir(path_to_png)):
        if im.endswith('png'):
            print('{} {}'.format(ind, im))
            k = b.new_key(im)
            k.set_contents_from_filename(os.path.join(path_to_png,im))
            k.set_acl('public-read')  ## images are fetched by URL, so they must be publicly readable
            clear_output(wait=True)

print('Done!')

Done!


#### build stimulus dictionary

PNG filenames are built by joining the following fields with underscores: gameID, className, repetition, trialNum, condition, sketcherId, iterationName.

e.g., '0019-78badd4a-e1f8-467e-80fa-bfbcec36e346_bear_0_7_solo_both_pilot2.png'

In [6]:
## collect the filenames of every png found in the rendered-sketch directory
path_to_png = os.path.join(sketch_dir,'png')
png_list = []
for fname in os.listdir(path_to_png):
    if fname.endswith('png'):
        png_list.append(fname)

In [7]:
## columns of the metadata dataframe to expose in the recog experiment stimdict
## (edit this list by hand to add or remove fields)
stimdict_cols = [
    'assignmentId', 'hitId', 'className', 'condition',
    'expDesign', 'firstMover', 'gameID', 'humanStrokes',
    'iterationName', 'numStrokes', 'repetition', 'robotStrokes',
    'sketcherId', 'sketchDuration', 'time', 'trialNum',
]

## keep only those columns, in the order listed above
M2 = M[stimdict_cols]

In [8]:
def build_url(row,url_stem = 'https://s3.amazonaws.com',
              bucket_name = 'BUCKET-NAME',
              gameID = 'GAME-ID-PLACEHOLDER',
              className = 'CLASSNAME',
              repetition = 'REPETITION',
              trialNum = 'TRIAL-NUM',
              condition = 'CONDITION',
              sketcherId = 'SKETCHERID',
              iterationName = 'ITERATION-NAME'):
    """Return the URL of a sketch png: <url_stem>/<bucket_name>/<filename>.png

    The filename is the seven metadata fields joined by underscores, in the order
    gameID, className, repetition, trialNum, condition, sketcherId, iterationName.

    `row` is accepted so the function can be called from DataFrame.apply, but it
    is not read here; every field is passed explicitly by keyword.
    """
    name_fields = (gameID, className, repetition, trialNum,
                   condition, sketcherId, iterationName)
    png_stem = '_'.join('{}'.format(field) for field in name_fields)
    return '{}/{}/{}.png'.format(url_stem, bucket_name, png_stem)
    
## add URL column to dataframe
## sketcherId is fixed to 'both' here rather than taken from the row's own sketcherId column
M2 = M2.assign(url = M2.apply(lambda row: build_url(row,
                                                    bucket_name=bucket_name,
                                                    gameID=row['gameID'],
                                                    className=row['className'],
                                                    repetition=row['repetition'],
                                                    trialNum=row['trialNum'],
                                                    condition=row['condition'],
                                                    sketcherId='both',
                                                    iterationName=row['iterationName']),
                              axis=1))

## add filename column (last path component of the url)
M2 = M2.assign(filename = M2['url'].apply(lambda x: x.split('/')[-1]))

## add empty games column (for sorting in database when sampling sketches to recognize)
## a fresh list per row, so rows never share one mutable list
M2 = M2.assign(games = [[] for _ in range(len(M2))])

## subset by whether this image is in the rendered png list
## (the flag is True for metadata rows whose png filename appears in png_list)
exists_in_metadata = M2['filename'].isin(png_list)
M3 = M2[exists_in_metadata]

## add "shuffler_ind" which determines order in which sketches are pulled out of database
## fixed seed (0) so the order is reproducible across runs
M3 = M3.assign(shuffler_ind = np.random.RandomState(0).permutation(np.arange(M3.shape[0])))

print('Originally {} images in metadata. {} render-able images on S3.'.format(M2.shape[0],M3.shape[0]))

Originally 2876 images in metadata. 2848 render-able images on S3.


In [9]:
## save json dictionary to file (one JSON record per row of M3)
json_filename = 'collabdraw-collab8-recog-stimdict.js'
json_dir = os.path.join(results_dir,'json')

## results/json is not created by the setup cell, so create it here;
## otherwise to_json fails on a fresh checkout
if not os.path.exists(json_dir):
    os.makedirs(json_dir)

json_filepath = os.path.join(json_dir,json_filename)
## to_json returns None when given a path, so there is nothing to keep from the call
M3.to_json(json_filepath,orient='records')
print('Saved stimdict JSON to file.')

Saved stimdict JSON to file.


#### upload stimulus dictionary to mongo

In [10]:
## load JSON back in as a list of records
## `with` closes the file handle; plain 'r' replaces 'ru', which Python 3 rejects as an invalid mode
with open(json_filepath,'r') as json_file:
    J = json.load(json_file)

In [11]:
# set vars
# this auth.txt file contains the password for the sketchloop user;
# dtype=str keeps the password text exactly as written (an all-digit password would
# otherwise be parsed as a number and could not be spliced into the URI)
auth = pd.read_csv('auth_cogtoolslab.txt', header = None, dtype = str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## mongo db server ip address

# have to fix this to be able to analyze from local
# NOTE(review): the client connects to 127.0.0.1 rather than `host` -- presumably the
# notebook runs on the server itself or through a tunnel; confirm.
# NOTE(review): the password is not percent-escaped; a password containing characters
# such as '@', ':' or '/' would make the URI invalid.
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1'.format(user, pswd))
db = conn['stimuli']
coll = db['collabdraw_collab8_recog']

In [12]:
runThis = 1
if runThis:
    num_records = len(J)
    for (i,j) in enumerate(J):
        print ('%d of %d uploaded ...' % (i,num_records))
        clear_output(wait=True)
        ## only insert records that are not in the collection yet, so that re-running
        ## this cell (e.g. Restart & Run All) does not duplicate the stimulus set
        ## NOTE(review): assumes `filename` uniquely identifies a stimulus -- confirm
        if coll.find_one({'filename': j['filename']}) is None:
            coll.insert_one(j)

print('Done!')

Done!


In [13]:
## report how many records the stimulus collection holds
## NOTE: Collection.count() is deprecated in PyMongo >= 3.7 (removed in 4.0);
## count_documents({}) is the replacement once the installed version supports it
num_stimuli = coll.count()
'There are {} records in the stimuli database for this experiment.'.format(num_stimuli)

'There are 2848 records in the stimuli database for this experiment.'