# Stimulus batching

## Uploading stimuli

Creates metadata for corpus collection on gadget and structure domains.

Partitions train/test data into batches of 25 stimuli (20 train and 5 test).

Multiple families of batches can be specified.



**Original metadata** for each domain are uploaded individually (i.e. with a single sweep of the notebook from here to TOPUP BATCH).

**Topup metadata** are uploaded to a separate database, but done all together.
This uploads a single record for every one participant missing from that batch.

In [475]:
from __future__ import division

import numpy as np
import os, sys
from PIL import Image
import pandas as pd
import json
import pickle
import ast

from matplotlib import pylab, mlab, pyplot
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.path import Path
import matplotlib.patches as patches
%matplotlib inline

from IPython.core.pylabtools import figsize, getfigs

import seaborn as sns
from sklearn.model_selection import StratifiedKFold

import random
from functools import reduce

from scipy.stats import norm
from IPython.display import clear_output

import copy
import importlib

# import urllib library
from urllib.request import urlopen

### Add Paths

## root paths
curr_dir = os.getcwd()
proj_dir = os.path.abspath(os.path.join(curr_dir,'..')) ## u\e relative paths

## add helpers to python path
import sys
if os.path.join(proj_dir, 'stimuli') not in sys.path:
    sys.path.append(os.path.join(proj_dir, 'stimuli'))

### Data storage setup

In [32]:
s3_bucket_path_template = "https://lax-{}-{}-all.s3.amazonaws.com/"

In [403]:
subdomains = {
    'structures' :  ['bridge', 'castle', 'house', 'city'],
    'drawing' :  ['nuts-bolts','wheels','furniture','dials']
}

domains = list(subdomains.keys())

In [404]:
domain = domains[0]
subdomain = subdomains[domain][0]
print(domain + ', ' + subdomain)

structures, bridge


### Get stimulus data from S3

In [421]:
def get_stim_df(domain, subdomain):

    experiment_name = 'lax_{}_{}_10'.format(domain, subdomain)

    # generate bucket path
    s3_bucket_path = s3_bucket_path_template.format(domain, subdomain)

    if domain == 'structures':
        # read manifest data
        df = pd.read_csv(s3_bucket_path + 'df_{}.csv'.format(subdomain))

        # estimated complexity
        df.loc[0:50, 'estimated_complexity'] = 'low'
        df.loc[50:, 'estimated_complexity'] = 'high'

        # assign grouping column
    #     df['group'] = np.where(df['train'], 'train', 'test')
        df['group'] = df['estimated_complexity']

        # assign id column
        df['stim_id'] = df['structure_number'] 


    elif domain == 'drawing':

        # read manifest data
        df = pd.read_csv(s3_bucket_path + 'manifest.csv'.format(subdomain))

        # estimated complexity
        df.loc[0:50, 'estimated_complexity'] = 'low'
        df.loc[50:, 'estimated_complexity'] = 'high'

        # assign grouping column
    #     df['group'] = df['data_split']
        df['group'] = df['estimated_complexity']

        # assign id column
        df['stim_id'] = df['stim_id'].apply(lambda x: x[-3:])
        
    df['experiment_name'] = experiment_name
    df['s3_bucket_path'] = s3_bucket_path
        
    return df

In [422]:
df = get_stim_df(domain, subdomain)

## Generate splits

In [546]:
def generate_splits(df, 
                    n_families = 2,
                    n_splits = 25,
                    id_column = 'stim_id',
                    grouping_column = 'group',
                    verbose = False):
    
    # batch size = len(df) / n_splits
    
    groups = df[[id_column,grouping_column]]
   
    if verbose:
        print(groups['group'].value_counts())
    
    X = df[id_column]
    y = df[grouping_column]

    for i in range(0, n_families):

        # set up partitioning
        skf = StratifiedKFold(n_splits=n_splits, 
                              random_state=i,  # tie random state to group num
                              shuffle=True)

        # apply partitioning and save to df
        for split_num, (train_index, test_index) in enumerate(skf.split(X, y)):
            df.loc[test_index,'family_'+str(i)] = split_num
    #         print(groups.loc[test_index])
    
    if verbose:
        print(str(n_splits) + ' splits generated in each of ' + str(n_families) + ' families')
        
    return df

In [424]:
df = generate_splits(df)

high    200
low      50
Name: group, dtype: int64
25 splits generated in each of 2 families


## To run multiple versions, upload the same metadate to separate collections, and update stimColName in configs accordingly

In [425]:
versionInd = 0

In [429]:
# connect to mongo

import pymongo as pm

# set vars 
auth = pd.read_csv('../auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## experiment server ip address

conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1')
db = conn['stimuli']
coll = db[df['experiment_name'][0]]

In [432]:
# convert to lists of stimulus names

assert subdomain in df['experiment_name'][0]
assert subdomain in coll.name

metadata = []

for f in range(0, n_families):
    for s in range(0,n_splits):
        stimIDs = list(df.groupby('family_'+str(f)).get_group(s)['stim_id'].apply(lambda x: str(x).zfill(3)))
#         print(stim_numbers)
        metadata.append(
            {
                'partitionFamily': f,
                'splitNumber': s,
                'stimIDs': stimIDs,
                'stimURLS': [s3_bucket_path + "lax-{}-{}-{}-{}.png".format(domain,
                                                              subdomain,
                                                              (stimID if domain=='structures' else 'all'),
                                                              ('all' if domain=='structures' else stimID))\
                            for stimID in stimIDs],
                'ntrials': len(stimIDs),
                'stimGroups': {n: df.groupby('family_'+str(f)).get_group(s).reset_index().loc[i,'group'] for i, n in enumerate(stimIDs)},
                'numGames': 0,
                'games': [],
                'experimentType': 'corpus_collection',
                'experimentName': df['experiment_name'][0],
                's3_bucket_url': s3_bucket_path,
                'versionInd': versionInd
            })

In [433]:
metadata

[{'partitionFamily': 0,
  'splitNumber': 0,
  'stimIDs': ['034',
   '047',
   '071',
   '090',
   '132',
   '145',
   '161',
   '199',
   '216',
   '235'],
  'stimURLS': ['https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-034-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-047-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-071-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-090-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-132-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-145-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-161-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-199-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-structures-bridge-216-all.png',
   'https://lax-drawing-dials-all.s3.amazonaws.co

In [434]:
experiment_name

'lax_drawing_dials_10'

In [296]:
# Clear metadata collection

really_run = False;

if really_run:
    db.drop_collection(experiment_name)

In [297]:
## now, iterate through each version and insert into mongo
## loop through list of records and insert each into collection
reallyRun = False
if reallyRun:
    for (i,j) in enumerate(metadata):
        coll.insert_one(j)
        print('Inserted version {} of stimDict.'.format(j['versionInd']))
        clear_output(wait=True)

else:
    print('Did not insert any new data.')

Inserted version 0 of stimDict.


In [379]:
list(coll.find())

[{'_id': ObjectId('617733fe32bed710461f583d'),
  'partitionFamily': 0,
  'splitNumber': 0,
  'stimIDs': ['034',
   '047',
   '071',
   '090',
   '132',
   '145',
   '161',
   '199',
   '216',
   '235'],
  'stimURLS': ['https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-034.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-047.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-071.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-090.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-132.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-145.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-161.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-199.png',
   'https://lax-drawing-dials-all.s3.amazonaws.com/lax-drawing-dials-all-216.png',
   'https://lax-drawing-dials-all.s3.a

## Test which versions have been run

This grabs the dataframe created by data generator, to see which records need to be run more times.

It wipes the metadata from mongo, and replaces it with individual records for each additional partition that needs to be run.

# TOP-UP BATCH 
### REFRESH ALL DOMAINS BY RUNNING FROM HERE

#### Generate dataframe from data collection

In [883]:
subdomains = {
    'structures' :  ['bridge', 'castle', 'house', 'city'],
    'drawing' :  ['nuts-bolts','wheels','furniture','dials']
}

domains = list(subdomains.keys())


iteration_names = ['corpus_prolific_test','corpus_prolific_test_3']
experiment_template = "lax-{}-{}-corpus-{}-10"
condition = 'procedural'
expected_trials = 10

df_trial = pd.DataFrame()
df_all = pd.DataFrame()

db_data = conn['lax']

for domain in domains:
    col_name = 'lax_{}_corpus'.format(domain)
    coll_data = db_data[col_name]
    
    for subdomain in subdomains[domain]:
        
        # get all data for subdomain from db
        df_subdomain_all = pd.DataFrame(coll_data.find({"$and":[ {'iterationName' : { '$in': iteration_names }},
                                          {'experimentName': experiment_template.format(domain, subdomain, condition)},
                                         ]}))
        
        if len(df_subdomain_all) > 0:

            df_subdomain_all['domain'] = domain
            df_subdomain_all['subdomain'] = subdomain


            # get metadata
            df_subdomain_meta = df_subdomain_all[(df_subdomain_all.datatype == 'stim_metadata')]\
                                        [["gameID","partitionFamily","splitNumber","stimIDs", "stimURLS", "stimGroups",
                                          "numGames","experimentType","experimentName","versionInd"]]

            # get trial data
            df_subdomain_trial = df_subdomain_all[\
                      (df_subdomain_all.trial_type == 'stimuli-contextual-language-production') &
                      (df_subdomain_all.datatype == 'trial_end') &
                      (~pd.isna(df_subdomain_all.stimId))]\
                      [['datatype', 'iterationName', 'condition', 'domain', 'subdomain',
                        'config_name', 'gameID', 'shuffle', 'trialOrder', 'rt', 'workerID', 
                        'trial_type', 'trial_index', 'time_elapsed', 'internal_node_id',
                        'view_history', 'stimId', 'stimURL', 'responses']]

            # merge metadata into trial data

            # verify stim groups in metadata are correct
            dicts = list(df_subdomain_all[df_subdomain_all.datatype=='stim_metadata']['stimGroups'])
            stim_groups = reduce(lambda dict1, dict2: {**dict1, **dict2}, dicts)
            stim_groups['demo_stim'] = 'demo_stim'
            # assign stim groups from metadata
            df_subdomain_trial['stim_group'] = df_subdomain_trial['stimId'].apply(lambda stim: stim_groups[stim])
            df_subdomain_trial = df_subdomain_trial.merge(df_subdomain_meta, how='left', on='gameID')

            # append subdomain data to main dataframe
            df_trial = df_trial.append(df_subdomain_trial, ignore_index=True)
            
        else:
            print('no data for ' + domain + '.' + subdomain)
            


In [884]:
df_trial.loc[:, 'responses'] = df_trial.responses.apply(ast.literal_eval)

#### Mark completed datasets

In [885]:
# find full datasets
did_complete = df_trial[df_trial.stim_group != 'demo_stim'].groupby(['gameID']).count()['datatype'] == expected_trials
complete_dataset_gameIDs = list(did_complete[did_complete].index)

df_trial.loc[:,'complete_dataset'] = False
df_trial.loc[(df_trial.gameID.isin(complete_dataset_gameIDs)), 'complete_dataset'] = True

# assign correct trial number
df_trial.loc[:,'trial_num'] = df_trial.trial_index - min(df_trial.trial_index.unique()[1:]) + 1
# assign practice trials to trial_num = 0
df_trial.loc[df_trial.trial_num < 0,'trial_num'] = 0

df_trial['rt_mins'] = df_trial.rt/(60*1000)


#### Find ppts for whom no trials hit the 8 step limit

In [886]:
# TODO: find datasets with no trials with 8 steps
def get_responses(response):

    whats = [key for key in response.keys() if 'what' in key]
    wheres = [key for key in response.keys() if 'where' in key]

    what_responses = [response[what] for what in whats]
    where_responses = [response[where] for where in wheres]

    return (what_responses, where_responses)

df_trial.loc[:, 'response_lists'] = df_trial.responses.apply(get_responses)
df_trial.loc[:, 'whats'] = df_trial.response_lists.apply(lambda x:x[0])
df_trial.loc[:, 'wheres'] = df_trial.response_lists.apply(lambda x:x[1])
df_trial.loc[:, 'n_steps'] = df_trial.whats.apply(len)

In [887]:
# mark those that hit 8 step limit

hit_8_step_limit = df_trial.groupby('gameID').n_steps.unique().apply(max) == 8

df_trial.loc[:, 'ppt_hit_8_step_limit'] = (df_trial.iterationName == 'corpus_prolific_test') & \
                                        (df_trial.gameID.apply(lambda id: hit_8_step_limit[id]))

In [888]:
df_trial[(df_trial.stimId!='demo_stim') &(df_trial.complete_dataset) & (~df_trial.ppt_hit_8_step_limit)]\
    .groupby(['subdomain','stimId'])['responses'].count().value_counts()



2     1580
3      314
4       54
5       39
11       6
6        2
12       2
14       2
7        1
Name: responses, dtype: int64

In [890]:
# how many complete datasets?
df_trial[(df_trial.complete_dataset) & (df_trial.trial_num > 0) & (~df_trial.ppt_hit_8_step_limit)].groupby(['domain','subdomain'])['rt'].count()/expected_trials



domain      subdomain 
drawing     dials         52.0
            furniture     53.0
            nuts-bolts    51.0
            wheels        51.0
structures  bridge        58.0
            castle        76.0
            city          68.0
            house         56.0
Name: rt, dtype: float64

#### find how many of each partition/ split

In [891]:
# based on completeness and hitting 8 step limit (USED ONLY ONCE TO REMOVE DATA)
# I.e. get complete datasets that are not from 'corpus_prolific_test' and hit the 8 step limit
complete_counts = (df_trial[(df_trial.complete_dataset) & 
                            ~(df_trial.ppt_hit_8_step_limit)]\
                   .groupby(['subdomain','partitionFamily','splitNumber'])\
                   .count()/(expected_trials+1))['datatype'].reset_index()


# # based on completeness only (i.e. )
# complete_counts = (df_trial[df_trial.complete_dataset]\
#                    .groupby(['subdomain','partitionFamily','splitNumber'])\
#                    .count()/(expected_trials+1))['datatype'].reset_index()



In [892]:
complete_counts

Unnamed: 0,subdomain,partitionFamily,splitNumber,datatype
0,bridge,0.0,0.0,1.0
1,bridge,0.0,1.0,1.0
2,bridge,0.0,2.0,1.0
3,bridge,0.0,3.0,1.0
4,bridge,0.0,4.0,1.0
...,...,...,...,...
395,wheels,1.0,20.0,1.0
396,wheels,1.0,21.0,1.0
397,wheels,1.0,22.0,1.0
398,wheels,1.0,23.0,1.0


#### top-up incomplete splits by adding individual record for each into top-up stimuli collection

In [893]:
# add a record in extra_metadata for each additional time a split needs to be run

def create_extra_metadata(complete_counts, domain, subdomain, df, n_expected = 1):

    experiment_name = df['experiment_name'][0]
    s3_bucket_path = df['s3_bucket_path'][0]
    
    assert subdomain in experiment_name
    
    extra_metadata = []

    for f in range(0, n_families):
        for s in range(0, n_splits):

            split_count = complete_counts[(complete_counts.partitionFamily == f) &
                                          (complete_counts.splitNumber == s) & 
                                          (complete_counts.subdomain == subdomain)
                                         ].reset_index()

            if len(split_count) == 0:
                n_completed = 0
            else:
                n_completed = split_count.loc[0,'datatype']

            i = n_completed

            while i < n_expected:

                i = i + 1

                stimIDs = list(df.groupby('family_'+str(f)).get_group(s)['stim_id'].apply(lambda x: str(x).zfill(3)))
        #         print(stim_numbers)
                extra_metadata.append(
                    {
                        'partitionFamily': f,
                        'splitNumber': s,
                        'stimIDs': stimIDs,
                        'stimURLS': [s3_bucket_path + "lax-{}-{}-{}-{}.png".format(domain,
                                                                      subdomain,
                                                                      (stimID if domain=='structures' else 'all'),
                                                                      ('all' if domain=='structures' else stimID))\
                                    for stimID in stimIDs],
                        'ntrials': len(stimIDs),
                        'stimGroups': {n: df.groupby('family_'+str(f)).get_group(s).reset_index().loc[i,'group'] for i, n in enumerate(stimIDs)},
                        'numGames': 0,
                        'games': [],
                        'experimentType': 'corpus_collection',
                        'experimentName': experiment_name,
                        's3_bucket_url': s3_bucket_path,
                        'versionInd': versionInd,
                        'extra_metadata_index': i
                    })
            
    return extra_metadata


In [894]:
upload_to_mongo = False

if not(upload_to_mongo):
    print('NO DATA UPLOADED')

for domain in domains:
    for subdomain in subdomains[domain]:
        
        print(domain + ', ' + subdomain)
        
        df_stim = get_stim_df(domain, subdomain)
        df_stim = generate_splits(df_stim)
        
        extra_metadata = create_extra_metadata(complete_counts,
                                               domain, 
                                               subdomain,
                                               df_stim,
                                               n_expected = 1)
        
        stim_col_name = df_stim['experiment_name'][0]
        top_up_stim_col_name = stim_col_name + '_top_up'
        print(top_up_stim_col_name)
        
        if upload_to_mongo:
            
            db = conn['stimuli']
            coll = db[top_up_stim_col_name]
            
            db.drop_collection(top_up_stim_col_name)
            print('cleared stimuli/' + top_up_stim_col_name)
            
            for (i,j) in enumerate(extra_metadata):
                coll.insert_one(j)
#                 clear_output(wait=True)
            
            print(str(len(list(coll.find()))) + ' inserted into stimuli/' + top_up_stim_col_name)
            
        else:
            print(str(len(extra_metadata)) + ' records to upload')
        
        print('')
        # TODO insert into db

NO DATA UPLOADED
structures, bridge
lax_structures_bridge_10_top_up
0 records to upload

structures, castle
lax_structures_castle_10_top_up
0 records to upload

structures, house
lax_structures_house_10_top_up
0 records to upload

structures, city
lax_structures_city_10_top_up
0 records to upload

drawing, nuts-bolts
lax_drawing_nuts-bolts_10_top_up
0 records to upload

drawing, wheels
lax_drawing_wheels_10_top_up
0 records to upload

drawing, furniture
lax_drawing_furniture_10_top_up
0 records to upload

drawing, dials
lax_drawing_dials_10_top_up
0 records to upload



# TO HERE

In [599]:
df_trial[~df_trial.ppt_hit_8_step_limit].groupby(['subdomain','complete_dataset']).count()/11

Unnamed: 0_level_0,Unnamed: 1_level_0,datatype,iterationName,condition,domain,config_name,gameID,shuffle,trialOrder,rt,workerID,...,experimentType,experimentName,versionInd,trial_num,rt_mins,response_lists,whats,wheres,n_steps,ppt_hit_8_step_limit
subdomain,complete_dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
bridge,False,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
bridge,True,8.0,8.0,8.0,8.0,8.0,8.0,0.0,0.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
castle,False,1.909091,1.909091,1.909091,1.909091,1.909091,1.909091,0.0,0.0,1.909091,1.909091,...,1.909091,1.909091,1.909091,1.909091,1.909091,1.909091,1.909091,1.909091,1.909091,1.909091
castle,True,11.0,11.0,11.0,11.0,11.0,11.0,0.0,0.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
city,False,1.272727,1.272727,1.272727,1.272727,1.272727,1.272727,0.0,0.0,1.272727,1.272727,...,1.272727,1.272727,1.272727,1.272727,1.272727,1.272727,1.272727,1.272727,1.272727,1.272727
city,True,7.0,7.0,7.0,7.0,7.0,7.0,0.0,0.0,7.0,7.0,...,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
dials,False,1.181818,1.181818,1.181818,1.181818,1.181818,1.181818,0.0,0.0,1.181818,1.181818,...,1.181818,1.181818,1.181818,1.181818,1.181818,1.181818,1.181818,1.181818,1.181818,1.181818
dials,True,11.0,11.0,11.0,11.0,11.0,11.0,0.0,0.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
furniture,False,2.636364,2.636364,2.636364,2.636364,2.636364,2.636364,0.0,0.0,2.636364,2.636364,...,2.636364,2.636364,2.636364,2.636364,2.636364,2.636364,2.636364,2.636364,2.636364,2.636364
furniture,True,22.0,22.0,22.0,22.0,22.0,22.0,0.0,0.0,22.0,22.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
