# Generating metadata

### What this notebook does
**Step 1:** Create metadata file, containing a bunch of JSON-formatted trial metadata objects

**Step 2:**  Insert each trial as a record into a mongo database

This assumes that the stimuli have been uploaded to the S3 bucket using `upload_stims_to_s3.ipynb`.

In [171]:
import ast
import itertools
import json
import os
import random
from glob import glob
from urllib.parse import quote, quote_plus

import numpy as np
import pandas as pd
import pymongo as pm
from IPython.display import clear_output
from PIL import Image

In [172]:
def list_files(paths, ext='mp4'):
    """Recursively collect the stimulus files with a given extension.

    Pass a list of folders if there are stimuli in multiple folders.
    Make sure that the containing folder is informative, as the rest of the
    path is ignored in naming.

    Args:
        paths: a single folder, or a list/tuple of folders, searched recursively.
        ext: file extension to match, given without the leading dot.

    Returns:
        A tuple ``(results, names)`` of two parallel lists. ``results`` holds
        the path of every matching file; ``names`` holds
        ``<containing folder>_<file name>`` for the same file, which is the
        filename as uploaded to S3.
    """
    # A bare path is wrapped; lists and tuples of paths are used as given.
    if not isinstance(paths, (list, tuple)):
        paths = [paths]
    results = []
    names = []
    for path in sorted(paths):
        # Walk each tree once so `results` and `names` are built from the same
        # listing and always stay aligned.
        for dirpath, _subdirs, _files in os.walk(path):
            for match in sorted(glob(os.path.join(dirpath, '*.%s' % ext))):
                results.append(match)
                # Take the folder from the matched path (not from `dirpath`), so a
                # root given with a trailing slash still yields its folder name.
                folder = os.path.basename(os.path.dirname(match))
                names.append(folder + '_' + os.path.basename(match))
    return results, names

In [181]:
## Where on disk are the stimuli stored? List one folder per stimulus set.
# e.g. data_dirs = ['./example'] together with bucket_name = 'human-physics-benchmarking-pilot'
data_dirs = [
#     '/Users/dbear/neuroailab/physics_benchmarking/stimuli/pilot_dominoes_SJ025_boxroom/',
#     '/Users/dbear/neuroailab/physics_benchmarking/stimuli/pilot_dominoes_SJ025_tdwroom/'
]
bucket_name = "human-physics-benchmarking-dominoes-pilot"
stim_version = "example"
# dataset name = bucket name and stimulus version joined by an underscore
dataset_name = '%s_%s' % (bucket_name, stim_version)
stimulus_extension = 'mp4'  # file extension of the stims, provided without the dot

## collect the path and the naming of every stimulus
full_stim_paths, filenames = list_files(data_dirs, ext=stimulus_extension)
print('We have %d stimuli to evaluate.' % len(full_stim_paths))

We have 0 stimuli to evaluate.


In [174]:
# filenames

In [175]:
## helper to build stim urls
def build_s3_url(filename, bucket_name):
    """Return the HTTPS URL of an object in an S3 bucket.

    Args:
        filename: object key inside the bucket.
        bucket_name: name of the S3 bucket.

    Returns:
        ``https://<bucket_name>.s3.amazonaws.com/<filename>``, with the key
        percent-encoded.
    """
    # Percent-encode characters that are not valid in a URL path (spaces, '#',
    # '?', ...). '/' is left alone so nested keys keep their structure, and
    # names made of letters, digits, '_', '.', '-' pass through unchanged.
    return 'https://{}.s3.amazonaws.com/{}'.format(bucket_name, quote(filename))

In [176]:
## per-stimulus URL and ID lists, parallel to `filenames`
stim_urls = []
for fname in filenames:
    stim_urls.append(build_s3_url(fname, bucket_name))
# the ID is everything before the first '.' of the uploaded name
stim_IDs = [fname.partition('.')[0] for fname in filenames]

In [177]:
## stack the two lists as rows, flip them into columns, then name the columns
M = pd.DataFrame([stim_IDs, stim_urls]).T.rename(columns={0: 'stim_ID', 1: 'stim_url'})

In [178]:
# preview the first five rows
M.head(5)

Unnamed: 0,stim_ID,stim_url
0,pilot_dominoes_SJ025_boxroom_0000_img,https://human-physics-benchmarking-dominoes-pi...
1,pilot_dominoes_SJ025_boxroom_0001_img,https://human-physics-benchmarking-dominoes-pi...
2,pilot_dominoes_SJ025_boxroom_0002_img,https://human-physics-benchmarking-dominoes-pi...
3,pilot_dominoes_SJ025_boxroom_0003_img,https://human-physics-benchmarking-dominoes-pi...
4,pilot_dominoes_SJ025_boxroom_0004_img,https://human-physics-benchmarking-dominoes-pi...


In [179]:
# Add trial labels to the metadata using the stimulus metadata.json
# Each folder in data_dirs must contain a metadata.json: a sequence of per-trial
# dicts with the keys 'stimulus_name' and 'does_target_contact_zone'.

target_hit_zone_labels = dict()
for _dir in data_dirs:
    # os.path.join works whether or not the folder is written with a trailing slash
    with open(os.path.join(_dir, 'metadata.json'), 'rb') as f:
        trial_metas = json.load(f)

    for meta in trial_metas:
        stim_name = meta['stimulus_name']
        # names that look like the repr of a bytes object (b'...') lose that wrapper
        if stim_name[:2] == "b\'":
            stim_name = stim_name[2:-1]
        # path separators become underscores, then the '_img' suffix is appended;
        # presumably this reproduces the stim_ID naming - verify against list_files
        stim_name = '_'.join([stim_name.replace('/', '_'), 'img'])
        target_hit_zone_labels[stim_name] = meta['does_target_contact_zone']

# labels are summed as numbers, so True counts as one positive
num_positive = sum(target_hit_zone_labels.values())
print("num positive labels: %d" % num_positive)
print("num negative labels: %d" % (len(target_hit_zone_labels) - num_positive))

# Built via transpose (rather than from a dict) so the label column keeps object dtype.
GT = pd.DataFrame([list(target_hit_zone_labels.keys()), list(target_hit_zone_labels.values())]).transpose()
GT.columns = ['stim_ID', 'target_hit_zone_label']
# Only merge when the label column is not there yet, so re-running this cell is harmless.
if all([col not in M.columns for col in GT.columns[1:]]):
    # The merge below is an inner join: stimuli without a metadata entry vanish
    # from M. Say so instead of dropping them silently.
    unlabeled = sorted(set(M['stim_ID']) - set(GT['stim_ID']))
    if unlabeled:
        print("WARNING: dropping %d stimuli without a label, e.g. %s" % (len(unlabeled), unlabeled[:5]))
    M = M.merge(GT, on='stim_ID')
    print("added labels %s" % list(GT.columns[1:]))

num positive labels: 20
num negative labels: 20
added labels Index(['target_hit_zone_label'], dtype='object')


In [180]:
# preview the first ten rows, now including the label column
M.head(10)

Unnamed: 0,stim_ID,stim_url,target_hit_zone_label
0,pilot_dominoes_SJ025_boxroom_0000_img,https://human-physics-benchmarking-dominoes-pi...,True
1,pilot_dominoes_SJ025_boxroom_0001_img,https://human-physics-benchmarking-dominoes-pi...,True
2,pilot_dominoes_SJ025_boxroom_0002_img,https://human-physics-benchmarking-dominoes-pi...,True
3,pilot_dominoes_SJ025_boxroom_0003_img,https://human-physics-benchmarking-dominoes-pi...,True
4,pilot_dominoes_SJ025_boxroom_0004_img,https://human-physics-benchmarking-dominoes-pi...,True
5,pilot_dominoes_SJ025_boxroom_0005_img,https://human-physics-benchmarking-dominoes-pi...,False
6,pilot_dominoes_SJ025_boxroom_0006_img,https://human-physics-benchmarking-dominoes-pi...,True
7,pilot_dominoes_SJ025_boxroom_0007_img,https://human-physics-benchmarking-dominoes-pi...,False
8,pilot_dominoes_SJ025_boxroom_0008_img,https://human-physics-benchmarking-dominoes-pi...,False
9,pilot_dominoes_SJ025_boxroom_0009_img,https://human-physics-benchmarking-dominoes-pi...,False


Ok, so we now have a list of all the stimuli to evaluate. Let's create random orders of them and submit them to the database. Set the number of stimuli shown to one participant (`stims_per_run`) and the number of different orders to generate (`num_of_sets`). Each of these sets can be shown to a participant; the set that has been shown least often is served next, so we can get away with fewer sets than subjects.

In [154]:
# stimuli per participant; the row count of M means every stimulus is shown
stims_per_run = M.shape[0]
# number of shuffled orderings to generate
num_of_sets = 10

In [155]:
# one dict per row of M, keyed by column name
L = M.to_dict('records')

In [131]:
# build `num_of_sets` orderings; each one draws `stims_per_run` stimuli from L without replacement
stim_sets = []
for _ in range(num_of_sets):
    shuffled = random.sample(L, stims_per_run)
    # key every stimulus by its position in the sequence, written as a string
    stim_sets.append(dict((str(position), stim) for position, stim in enumerate(shuffled)))

In [132]:
# sanity check: number of generated stimulus sets (one per iteration over range(num_of_sets))
len(stim_sets)

10

In [108]:
# stim_sets

Now we have a list of sequences of stimuli.

Set up an SSH tunnel to write to MongoDB. Insert your own username in the command below. If you don't have an SSH key set up yet, run `ssh -fNL 27017:127.0.0.1:27017 USERNAME@cogtoolslab.org` in your shell instead.

In [110]:
# -L forwards local port 27017 to 127.0.0.1:27017 as seen from the remote host;
# -f sends ssh to the background and -N runs no remote command.
!ssh -fNL 27017:127.0.0.1:27017 dbear@cogtoolslab.org

In [133]:
# set vars
# auth.txt contains the password for the sketchloop user. Place it in the toplevel of the repo.
auth = pd.read_csv('../auth.txt', header = None)
pswd = auth.values[0][0]  # first field of the first line
user = 'sketchloop'
host = 'cogtoolslab.org' ## cogtoolslab ip address (not used below: the client connects to 127.0.0.1)

# Credentials must be percent-escaped before they are placed in a MongoDB URI;
# otherwise a password containing '@', ':', '/' or '%' corrupts the connection string.
# The client talks to 127.0.0.1 - presumably the local end of the SSH tunnel; confirm it is up.
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1'.format(quote_plus(user), quote_plus(pswd)))
db = conn['stimuli']
coll = db[dataset_name]  # collection named after the dataset

In [None]:
#⚠️drop collection if necessary. 
# db.drop_collection(dataset_name) 

In [138]:
# list the names of the collections currently in the 'stimuli' database, in sorted order
sorted(db.list_collection_names())

['block-construction-silhouette-exp01',
 'block-construction-silhouette-exp02',
 'causaldraw',
 'causaldraw_annotations',
 'causaldraw_annotations_patching',
 'causaldraw_identification',
 'causaldraw_intervention',
 'causaldraw_intervention_patching',
 'collabdraw_collab8_recog',
 'curiotower-tdw',
 'curiotower-tdw-height3Jitter3',
 'curiotower_curiodrop',
 'dominoes-pilot_example',
 'graphical_conventions_object_annotation',
 'graphical_conventions_semantic_mapping',
 'graphical_conventions_semantic_mapping_patching',
 'graphical_conventions_semantic_mapping_spline_version_old',
 'human-physics-benchmarking-dominoes-pilot_example',
 'human-physics-benchmarking-pilot_example',
 'iternum_classification',
 'photodraw2',
 'semantic_parts_graphical_conventions',
 'svg_annotation_sketchpad_basic_allcats',
 'tools_for_block_construction_given_subgoals']

In [135]:
## write every stimulus set to the collection as its own document
total = len(stim_sets)
for count, stim_set in enumerate(stim_sets, start=1):
    coll.insert_one({'stims': stim_set})
    # progress line; clearing the output keeps only the latest one on screen
    print('%d_of_%d' % (count, total))
    clear_output(wait=True)

print('Done inserting records into mongo! The collection name is',dataset_name)

Done inserting records into mongo! The collection name is human-physics-benchmarking-dominoes-pilot_example


In [136]:
# how many documents the collection holds (an estimate, as the method name says)
coll.estimated_document_count()

10

In [137]:
# fetch a single document from the collection to inspect its structure
coll.find_one()

{'_id': ObjectId('603d50dab42167a41b6723b0'),
 'stims': {'0': {'stim_ID': 'pilot_dominoes_SJ025_boxroom_0002_img',
   'stim_url': 'https://human-physics-benchmarking-dominoes-pilot.s3.amazonaws.com/pilot_dominoes_SJ025_boxroom_0002_img.mp4',
   'target_hit_zone_label': True},
  '1': {'stim_ID': 'pilot_dominoes_SJ025_tdwroom_0006_img',
   'stim_url': 'https://human-physics-benchmarking-dominoes-pilot.s3.amazonaws.com/pilot_dominoes_SJ025_tdwroom_0006_img.mp4',
   'target_hit_zone_label': True},
  '2': {'stim_ID': 'pilot_dominoes_SJ025_tdwroom_0011_img',
   'stim_url': 'https://human-physics-benchmarking-dominoes-pilot.s3.amazonaws.com/pilot_dominoes_SJ025_tdwroom_0011_img.mp4',
   'target_hit_zone_label': False},
  '3': {'stim_ID': 'pilot_dominoes_SJ025_tdwroom_0005_img',
   'stim_url': 'https://human-physics-benchmarking-dominoes-pilot.s3.amazonaws.com/pilot_dominoes_SJ025_tdwroom_0005_img.mp4',
   'target_hit_zone_label': False},
  '4': {'stim_ID': 'pilot_dominoes_SJ025_boxroom_0004_i

In [120]:
# list(coll.find())