# LAX Reconstruction Experiment

## Upload descriptions to pull into reconstruction experiment

One record per description

<!-- 
**Original metadata** for each domain are uploaded individually (i.e. with a single sweep of the notebook from here to TOPUP BATCH).

**Topup metadata** are uploaded to a separate database, but done all together.
This uploads one record for each participant missing from that batch. -->

In [2]:
from __future__ import division

import numpy as np
import os, sys
from PIL import Image
import pandas as pd
import json
import pickle
import ast

from matplotlib import pylab, mlab, pyplot
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.path import Path
import matplotlib.patches as patches
%matplotlib inline

from IPython.core.pylabtools import figsize, getfigs

import seaborn as sns
from sklearn.model_selection import StratifiedKFold

import random
from functools import reduce

from scipy.stats import norm
from IPython.display import clear_output

import copy
import importlib

# import urllib library
from urllib.request import urlopen

### Add Paths

## root paths
curr_dir = os.getcwd()
proj_dir = os.path.abspath(os.path.join(curr_dir, '..'))  ## use relative paths

## add helpers to python path
import sys
# Append the project's `stimuli` directory exactly once so that re-running
# this cell does not keep growing sys.path.
_stimuli_dir = os.path.join(proj_dir, 'stimuli')
if _stimuli_dir not in sys.path:
    sys.path.append(_stimuli_dir)

### Data storage setup

(If we need to get data about stims)

In [3]:
# URL template for the S3 bucket that holds one subdomain's stimuli; the two
# placeholders are filled with (domain, subdomain) by get_stim_df.
s3_bucket_path_template = "https://lax-{}-{}-all.s3.amazonaws.com/"

In [4]:
# Stimulus subdomains available within each experimental domain.
subdomains = dict(
    structures=['bridge', 'castle', 'house', 'city'],
    drawing=['nuts-bolts', 'wheels', 'furniture', 'dials'],
)

# Domain names, in declaration order.
domains = [domain_name for domain_name in subdomains]

In [5]:
# Work with the first domain and its first subdomain by default.
domain, subdomain = domains[0], subdomains[domains[0]][0]
print(', '.join((domain, subdomain)))

structures, bridge


In [6]:
# connect to mongo

import pymongo as pm
from urllib.parse import quote_plus

# set vars
auth = pd.read_csv('../../auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## experiment server ip address

# Credentials embedded in a MongoDB URI must be percent-escaped; without this
# a password containing ':', '/', '@' or '%' produces an invalid URI.
# NOTE(review): the client connects to 127.0.0.1 rather than `host` —
# presumably through an SSH tunnel to the experiment server; confirm.
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(str(pswd)) + '@127.0.0.1')
db = conn['stimuli']

#### Get stimulus data from S3

In [7]:
def get_stim_df(domain, subdomain):
    """Load the stimulus manifest for one subdomain from its S3 bucket.

    The manifest is read from ``df_<subdomain>.csv`` for the 'structures'
    domain and from ``manifest.csv`` for the 'drawing' domain. The returned
    frame gains these columns:

    * ``estimated_complexity`` / ``group`` -- 'low' for row labels below 50,
      'high' from label 50 onwards.
    * ``stim_id`` -- ``structure_number`` for structures; the last three
      characters of the existing ``stim_id`` for drawings.
    * ``experiment_name`` -- ``lax_<domain>_<subdomain>_10``.
    * ``s3_bucket_path`` -- the bucket URL the manifest was read from.

    Args:
        domain: 'structures' or 'drawing'.
        subdomain: subdomain name used to build the bucket URL.

    Returns:
        The manifest as a pandas DataFrame with the columns above added.

    Raises:
        ValueError: if ``domain`` is not one of the supported domains
            (previously this surfaced as an UnboundLocalError on ``df``).
    """
    if domain not in ('structures', 'drawing'):
        raise ValueError(
            "unknown domain {!r}; expected 'structures' or 'drawing'".format(domain))

    experiment_name = 'lax_{}_{}_10'.format(domain, subdomain)

    # generate bucket path
    s3_bucket_path = s3_bucket_path_template.format(domain, subdomain)

    # read manifest data (file name differs between the two domains)
    if domain == 'structures':
        manifest_name = 'df_{}.csv'.format(subdomain)
    else:
        manifest_name = 'manifest.csv'
    df = pd.read_csv(s3_bucket_path + manifest_name)

    # estimated complexity: .loc slices are label-inclusive, so label 50 is
    # first marked 'low' and then overwritten with 'high' by the second line.
    df.loc[0:50, 'estimated_complexity'] = 'low'
    df.loc[50:, 'estimated_complexity'] = 'high'

    # assign grouping column
    df['group'] = df['estimated_complexity']

    # assign id column
    if domain == 'structures':
        df['stim_id'] = df['structure_number']
    else:
        df['stim_id'] = df['stim_id'].apply(lambda x: x[-3:])

    df['experiment_name'] = experiment_name
    df['s3_bucket_path'] = s3_bucket_path

    return df

In [8]:
# Load the stimulus manifest for the currently selected domain/subdomain.
df = get_stim_df(domain, subdomain)

## Load in description data

In [None]:
# read results csv

In [9]:
# Load the description corpus from the results CSV; later cells filter it by
# `subdomain` and group it by `gameID`.
df_descriptions = pd.read_csv('../../results/csv/lax_corpus_1k_cogsci22.csv')

## Split the 10 descriptions from each architect/describer's trials into 2 sets of 5

In [10]:
def split_group(df):
    """Randomly label each row of ``df`` as group 'A' or 'B'.

    Row positions are shuffled with ``np.random.shuffle``; a row whose
    shuffled value falls below ``len(df) / 2`` is labelled 'A', the rest 'B'.

    Returns:
        A pandas Series of 'A'/'B' labels with a fresh 0..n-1 index
        (not the index of ``df``).
    """
    n_rows = len(df)
    shuffled_positions = list(range(0, n_rows))
    np.random.shuffle(shuffled_positions)
    in_first_half = pd.Series(shuffled_positions) < n_rows / 2
    return in_first_half.map({True: 'A', False: 'B'})

In [11]:
def create_metadatum_for_group(df):
    """Bundle the description rows in ``df`` into one stimulus metadatum.

    Each row contributes one trial dict; its ``whats`` and ``wheres`` values
    are parsed with ``ast.literal_eval``. The record's ``domain`` and
    ``versionInd`` are read from the notebook-level ``domain`` and
    ``version_ind`` variables, which must be set before calling.

    Returns:
        A dict holding the trials, the parallel ``stimIDs`` / ``stimURLs``
        lists, and empty game-tracking fields.
    """
    trials = [
        {
            'stimID' : row.stimId,
            'stimURL': row.stimURL,
            'subdomain': row.subdomain,
            'descriptionGameID': row.gameID,
            'descriptionTrialNum': row.trial_num,
            'description' :
                {
                'whats': ast.literal_eval(row.whats),
                'wheres': ast.literal_eval(row.wheres)
                },
        }
        for _, row in df.iterrows()
    ]

    # the id/url lists mirror the trial order
    stimIDs = [trial['stimID'] for trial in trials]
    stimURLs = [trial['stimURL'] for trial in trials]

    return {
        'domain': domain,
        'stimIDs': stimIDs,
        'stimURLs': stimURLs,
        'trials' : trials,
        'numGames': 0,
        'games': [],
        'experimentType': 'reconstruction',
        'experimentName': 'lax_reconstruction_towers_dev',
        'versionInd': version_ind
    }

In [None]:
# Build two metadata records (random halves 'A' and 'B') per describer for
# every 'structures' subdomain and, when `upload` is True, replace the
# matching stimulus collection in mongo with them.
upload = False

for subdomain in subdomains['structures']:

    # get correct subdomain
    df_subdomain = df_descriptions[df_descriptions.subdomain==subdomain]

    # groupby gameID
    ppt_descriptions = df_subdomain.groupby('gameID')

    metadata = []

    # for each ppt
    for gameID, group in ppt_descriptions:
        group_labels = list(split_group(group)) #randomly split in two
        # assign() returns a new frame, so the groupby slice itself is not
        # mutated (the in-place column assignment raised SettingWithCopyWarning)
        labelled_group = group.assign(group_labels=group_labels)
        # and for each group create metadata
        metadata += list(labelled_group.groupby('group_labels').apply(create_metadatum_for_group))

    # upload to mongo
    stim_coll_name = 'lax_reconstruction_stims_' + subdomain
    print('uploading metadata to ' + stim_coll_name)

    if upload:
        coll = db[stim_coll_name]
        db.drop_collection(stim_coll_name)
        for metadatum in metadata:
            coll.insert_one(metadatum)
        # report from the records themselves rather than a leaked loop
        # variable, which was undefined when `metadata` was empty
        if metadata:
            print('Uploaded version {} of {} metadata.'.format(metadata[-1]['versionInd'], subdomain))
        else:
            print('No metadata to upload for {}.'.format(subdomain))
    else:
        print('metadata created but not uploaded')
    
    

In [None]:
stim_coll_name = 'lax_reconstruction_stims_' + 'bridge'

In [None]:
coll = db[stim_coll_name]

In [None]:
list(coll.find())

## Upload a single metadatum of 5 trials (for all participants to recreate)

In [12]:
def split_group(df):
    """Randomly label each row of ``df`` as group 'A' or 'B'.

    Row positions are shuffled with ``np.random.shuffle``; a row whose
    shuffled value falls below ``len(df) / 2`` is labelled 'A', the rest 'B'.

    Returns:
        A pandas Series of 'A'/'B' labels with a fresh 0..n-1 index
        (not the index of ``df``).
    """
    n_rows = len(df)
    shuffled_positions = list(range(0, n_rows))
    np.random.shuffle(shuffled_positions)
    in_first_half = pd.Series(shuffled_positions) < n_rows / 2
    return in_first_half.map({True: 'A', False: 'B'})

In [13]:
def create_metadatum_for_group(df):
    """Bundle the description rows in ``df`` into one stimulus metadatum.

    Each row contributes one trial dict; its ``whats`` and ``wheres`` values
    are parsed with ``ast.literal_eval``. The record's ``domain`` and
    ``versionInd`` are read from the notebook-level ``domain`` and
    ``version_ind`` variables, which must be set before calling.

    Returns:
        A dict holding the trials, the parallel ``stimIDs`` / ``stimURLs``
        lists, and empty game-tracking fields.
    """
    trials = [
        {
            'stimID' : row.stimId,
            'stimURL': row.stimURL,
            'subdomain': row.subdomain,
            'descriptionGameID': row.gameID,
            'descriptionTrialNum': row.trial_num,
            'description' :
                {
                'whats': ast.literal_eval(row.whats),
                'wheres': ast.literal_eval(row.wheres)
                },
        }
        for _, row in df.iterrows()
    ]

    # the id/url lists mirror the trial order
    stimIDs = [trial['stimID'] for trial in trials]
    stimURLs = [trial['stimURL'] for trial in trials]

    return {
        'domain': domain,
        'stimIDs': stimIDs,
        'stimURLs': stimURLs,
        'trials' : trials,
        'numGames': 0,
        'games': [],
        'experimentType': 'reconstruction',
        'experimentName': 'lax_reconstruction_towers_dev',
        'versionInd': version_ind
    }

In [15]:
version_ind = 1

In [79]:
# Build the per-describer half-split metadata for every 'structures'
# subdomain and, when `upload` is True, replace the matching stimulus
# collection with a single metadatum for all participants to recreate.
upload = True

for subdomain in subdomains['structures']:

    # get correct subdomain
    df_subdomain = df_descriptions[df_descriptions.subdomain==subdomain]

    # groupby gameID
    ppt_descriptions = df_subdomain.groupby('gameID')

    metadata = []

    # for each ppt
    for gameID, group in ppt_descriptions:
        group_labels = list(split_group(group)) #randomly split in two
        # assign() returns a new frame, so the groupby slice itself is not
        # mutated (the in-place column assignment raised SettingWithCopyWarning)
        labelled_group = group.assign(group_labels=group_labels)
        # and for each group create metadata
        metadata += list(labelled_group.groupby('group_labels').apply(create_metadatum_for_group))

    # upload to mongo
    stim_coll_name = 'lax_reconstruction_stims_' + subdomain
    print('uploading metadata to ' + stim_coll_name)

    if upload:
        coll = db[stim_coll_name]
        db.drop_collection(stim_coll_name)
        # Upload exactly one metadatum: the one at position 70, if present.
        # NOTE(review): the earlier comment said "take only last one", which
        # only holds when len(metadata) == 71 — confirm the intended record.
        selected = metadata[70:71]
        for metadatum in selected:
            coll.insert_one(metadatum)
        # report from the selected record rather than a leaked loop variable,
        # which was undefined when the slice was empty
        if selected:
            print('Uploaded version {} of {} metadata.'.format(selected[-1]['versionInd'], subdomain))
        else:
            print('No metadatum at index 70 for {}; collection left empty.'.format(subdomain))
    else:
        print('Metadata created but not uploaded')
    
    

uploading metadata to lax_reconstruction_stims_bridge
Uploaded version 1 of bridge metadata.
uploading metadata to lax_reconstruction_stims_castle
Uploaded version 1 of castle metadata.
uploading metadata to lax_reconstruction_stims_house
Uploaded version 1 of house metadata.
uploading metadata to lax_reconstruction_stims_city
Uploaded version 1 of city metadata.


In [95]:
stim_coll_name = 'lax_reconstruction_stims_' + subdomains['structures'][3]
stim_coll_name

'lax_reconstruction_stims_city'

In [96]:
coll = db[stim_coll_name]

In [97]:
list(coll.find())

[{'_id': ObjectId('6245ea26896ace31dc2aecf4'),
  'domain': 'structures',
  'stimIDs': [115, 190, 174, 139, 6],
  'stimURLs': ['https://lax-structures-city-all.s3.amazonaws.com/lax-structures-city-115-all.png',
   'https://lax-structures-city-all.s3.amazonaws.com/lax-structures-city-190-all.png',
   'https://lax-structures-city-all.s3.amazonaws.com/lax-structures-city-174-all.png',
   'https://lax-structures-city-all.s3.amazonaws.com/lax-structures-city-139-all.png',
   'https://lax-structures-city-all.s3.amazonaws.com/lax-structures-city-006-all.png'],
  'trials': [{'stimID': 115,
    'stimURL': 'https://lax-structures-city-all.s3.amazonaws.com/lax-structures-city-115-all.png',
    'subdomain': 'city',
    'descriptionGameID': '5032-b49130a3-5cda-434c-aadc-dc956e057838',
    'descriptionTrialNum': 2.0,
    'description': {'whats': ['2 blue blocks horizontally a few cms apart',
      'place 4 blue blocks horizontally creating a line of 4.',
      'Repeat the adding 4 bocks, then a furth

#### scratch

In [None]:
# groupby gameID
ppt_descriptions = df_subdomain.groupby('gameID')

In [None]:
# Build two metadata records (random halves) per describer from the current
# `ppt_descriptions` grouping.
metadata = []

# for each ppt
for gameID, group in ppt_descriptions:
    group_labels = list(split_group(group)) #randomly split in two
    # assign() returns a new frame instead of mutating the groupby slice,
    # which avoids pandas' SettingWithCopyWarning
    labelled_group = group.assign(group_labels=group_labels)
    metadata += list(labelled_group.groupby('group_labels').apply(create_metadatum_for_group)) # and for each group create metadata

In [None]:
df_subdomain = df_descriptions[df_descriptions.subdomain==subdomain]

In [None]:
# groupby gameID
ppt_descriptions = df_subdomain.groupby('gameID')

## Create record for each stim

In [None]:
version_ind = 0

In [None]:
domain = 'structures'

In [None]:
# df_domain = df_descriptions[df_descriptions.domain==domain]

In [None]:
df_subdomain = df_descriptions[df_descriptions.subdomain==subdomain]

In [None]:
stim_coll_name = 'lax_reconstruction_stims_' + subdomain
print(stim_coll_name)

In [None]:
# # TEMP: take 10 rows, keeping trials together

# metadata = []

# size = 20

# i = 0
# j = i + size

# while j < len(df_domain):
    
#     trials = []
#     stimIDs = []
#     stimURLs = []
    
#     for _, row in df_domain.iloc[i:j].iterrows():

#         trials.append({
#             'stimID' : row.stimId,
#             'stimURL': row.stimURL,
#             'subdomain': row.subdomain,
#             'descriptionGameID': row.gameID,
#             'description' :
#                 {
#                 'whats': ast.literal_eval(row.whats),
#                 'wheres': ast.literal_eval(row.wheres)
#                 },
#             })
#         stimURLs.append(row.stimURL)
#         stimIDs.append(row.stimId)

#     metadata.append({
#         'domain': domain,
#         'stimIDs': stimIDs,
#         'stimURLs': stimURLs,
#         'trials' : trials,
#         'numGames': 0,
#         'games': [],
#         'experimentType': 'reconstruction',
#         'experimentName': 'lax_reconstruction_towers_dev',
#         'versionInd': version_ind
#     })
    
#     i = j
#     j = j + size

In [None]:
# connect to mongo

import pymongo as pm
from urllib.parse import quote_plus

# set vars
auth = pd.read_csv('../../auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## experiment server ip address

# Credentials embedded in a MongoDB URI must be percent-escaped; without this
# a password containing ':', '/', '@' or '%' produces an invalid URI.
# NOTE(review): the client connects to 127.0.0.1 rather than `host` —
# presumably through an SSH tunnel to the experiment server; confirm.
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(str(pswd)) + '@127.0.0.1')
db = conn['stimuli']
# collection named by the current `stim_coll_name`
coll = db[stim_coll_name]

In [None]:
# Clear metadata collection
# Safety switch: set to False to keep the existing collection when re-running.
really_run = True

if really_run:
    # drops the whole collection named by stim_coll_name from the current db
    db.drop_collection(stim_coll_name)

In [None]:
## Insert every record in `metadata` into the current collection, one
## document at a time; `reallyRun` is a safety switch for re-runs.
reallyRun = True
if not reallyRun:
    print('Did not insert any new data.')
else:
    for record in metadata:
        coll.insert_one(record)
        print('Inserted version {} of stimDict.'.format(record['versionInd']))
        # keep only the most recent progress line in the cell output
        clear_output(wait=True)

## Generate splits

In [None]:
def generate_splits(df,
                    n_families = 2,
                    n_splits = 25,
                    id_column = 'stim_id',
                    grouping_column = 'group',
                    verbose = False):
    """Partition the rows of ``df`` into stratified splits, once per family.

    For each family ``i`` a shuffled ``StratifiedKFold`` seeded with
    ``random_state=i`` divides the rows into ``n_splits`` folds stratified on
    ``grouping_column``. The fold number of each row is written to a new
    column ``family_<i>``.

    Args:
        df: stimulus frame; modified in place.
        n_families: number of independent partitionings to generate.
        n_splits: number of folds per partitioning.
        id_column: column holding the stimulus identifiers.
        grouping_column: column whose values are stratified across folds.
        verbose: print the group sizes and a summary line.

    Returns:
        The same ``df`` object, with one ``family_<i>`` column per family.
    """
    # batch size = len(df) / n_splits

    if verbose:
        # honour the caller's grouping column (this was hard-coded to 'group')
        print(df[grouping_column].value_counts())

    X = df[id_column]
    y = df[grouping_column]

    for i in range(0, n_families):

        # set up partitioning
        skf = StratifiedKFold(n_splits=n_splits,
                              random_state=i,  # tie random state to group num
                              shuffle=True)

        family_column = 'family_' + str(i)

        # apply partitioning and save to df
        for split_num, (_, test_positions) in enumerate(skf.split(X, y)):
            # skf.split yields positional indices; translate them to index
            # labels so frames without a default 0..n-1 index are handled too
            df.loc[df.index[test_positions], family_column] = split_num

    if verbose:
        print(str(n_splits) + ' splits generated in each of ' + str(n_families) + ' families')

    return df

In [None]:
df = generate_splits(df)

## To run multiple versions, upload the same metadata to separate collections, and update stimColName in configs accordingly

In [None]:
versionInd = 0

In [None]:
# connect to mongo

import pymongo as pm
from urllib.parse import quote_plus

# set vars
# NOTE(review): this cell reads '../auth.txt' while the other connection
# cells in this notebook read '../../auth.txt' — confirm which is intended.
auth = pd.read_csv('../auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## experiment server ip address

# Credentials embedded in a MongoDB URI must be percent-escaped; without this
# a password containing ':', '/', '@' or '%' produces an invalid URI.
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(str(pswd)) + '@127.0.0.1')
db = conn['stimuli']
# the stimulus collection is named after the experiment recorded in df
coll = db[df['experiment_name'][0]]

In [None]:
# convert to lists of stimulus names
# Builds one metadata record per (partition family, split) from the
# `family_<k>` columns that generate_splits added to df.

experiment_name = df['experiment_name'][0]
# the bucket URL is stored on df by get_stim_df; this cell used it without
# binding it first
s3_bucket_path = df['s3_bucket_path'][0]

assert subdomain in experiment_name
assert subdomain in coll.name

# n_families / n_splits are not assigned at notebook level in the cells shown,
# so derive them from the family columns (split numbers run 0..n_splits-1)
family_columns = [col for col in df.columns if str(col).startswith('family_')]
n_families = len(family_columns)
n_splits = int(df[family_columns].max().max()) + 1 if family_columns else 0

metadata = []

for f in range(0, n_families):
    splits_in_family = df.groupby('family_'+str(f))
    for s in range(0,n_splits):
        # rows belonging to this split; looked up once instead of per field
        split_rows = splits_in_family.get_group(s)
        stimIDs = [str(x).zfill(3) for x in split_rows['stim_id']]
        metadata.append(
            {
                'partitionFamily': f,
                'splitNumber': s,
                'stimIDs': stimIDs,
                # structures name files <id>-all, drawings all-<id>
                'stimURLS': [s3_bucket_path + "lax-{}-{}-{}-{}.png".format(domain,
                                                              subdomain,
                                                              (stimID if domain=='structures' else 'all'),
                                                              ('all' if domain=='structures' else stimID))\
                            for stimID in stimIDs],
                'ntrials': len(stimIDs),
                'stimGroups': dict(zip(stimIDs, split_rows['group'])),
                'numGames': 0,
                'games': [],
                'experimentType': 'corpus_collection',
                'experimentName': experiment_name,
                's3_bucket_url': s3_bucket_path,
                'versionInd': versionInd
            })

In [None]:
metadata

In [None]:
experiment_name

In [None]:
list(coll.find())

## Test which versions have been run

This grabs the dataframe created by data generator, to see which records need to be run more times.

It wipes the metadata from mongo, and replaces it with individual records for each additional partition that needs to be run.

# TOP-UP BATCH 
### REFRESH ALL DOMAINS BY RUNNING FROM HERE

#### Generate dataframe from data collection

In [None]:
# Pull the corpus-collection data for every domain/subdomain out of the `lax`
# database and assemble one trial-level dataframe (`df_trial`), with each
# trial joined to the stimulus metadata of its game.
subdomains = {
    'structures' :  ['bridge', 'castle', 'house', 'city'],
    'drawing' :  ['nuts-bolts','wheels','furniture','dials']
}

domains = list(subdomains.keys())


iteration_names = ['corpus_prolific_test','corpus_prolific_test_3']
experiment_template = "lax-{}-{}-corpus-{}-10"
condition = 'procedural'
expected_trials = 10

df_all = pd.DataFrame()

db_data = conn['lax']

# per-subdomain trial frames; concatenated once after the loop because
# DataFrame.append was removed in pandas 2.0
subdomain_trial_frames = []

for domain in domains:
    col_name = 'lax_{}_corpus'.format(domain)
    coll_data = db_data[col_name]

    for subdomain in subdomains[domain]:

        # get all data for subdomain from db
        df_subdomain_all = pd.DataFrame(coll_data.find({"$and":[ {'iterationName' : { '$in': iteration_names }},
                                          {'experimentName': experiment_template.format(domain, subdomain, condition)},
                                         ]}))

        if len(df_subdomain_all) > 0:

            df_subdomain_all['domain'] = domain
            df_subdomain_all['subdomain'] = subdomain

            is_metadata = df_subdomain_all.datatype == 'stim_metadata'

            # get metadata
            df_subdomain_meta = df_subdomain_all[is_metadata]\
                                        [["gameID","partitionFamily","splitNumber","stimIDs", "stimURLS", "stimGroups",
                                          "numGames","experimentType","experimentName","versionInd"]]

            # get trial data (copied so the column added below does not
            # write into a slice of df_subdomain_all)
            df_subdomain_trial = df_subdomain_all[\
                      (df_subdomain_all.trial_type == 'stimuli-contextual-language-production') &
                      (df_subdomain_all.datatype == 'trial_end') &
                      (~pd.isna(df_subdomain_all.stimId))]\
                      [['datatype', 'iterationName', 'condition', 'domain', 'subdomain',
                        'config_name', 'gameID', 'shuffle', 'trialOrder', 'rt', 'workerID',
                        'trial_type', 'trial_index', 'time_elapsed', 'internal_node_id',
                        'view_history', 'stimId', 'stimURL', 'responses']].copy()

            # merge metadata into trial data

            # combine the stimGroups dicts of all metadata records; the {}
            # initial value keeps reduce from failing when there are none
            dicts = list(df_subdomain_all[is_metadata]['stimGroups'])
            stim_groups = reduce(lambda dict1, dict2: {**dict1, **dict2}, dicts, {})
            stim_groups['demo_stim'] = 'demo_stim'
            # assign stim groups from metadata
            df_subdomain_trial['stim_group'] = df_subdomain_trial['stimId'].apply(lambda stim: stim_groups[stim])
            df_subdomain_trial = df_subdomain_trial.merge(df_subdomain_meta, how='left', on='gameID')

            # collect subdomain data for the main dataframe
            subdomain_trial_frames.append(df_subdomain_trial)

        else:
            print('no data for ' + domain + '.' + subdomain)

if subdomain_trial_frames:
    df_trial = pd.concat(subdomain_trial_frames, ignore_index=True)
else:
    df_trial = pd.DataFrame()
            


In [None]:
# Parse each `responses` value with ast.literal_eval into a Python object;
# get_responses later calls .keys() on it, so a dict is expected.
df_trial.loc[:, 'responses'] = df_trial.responses.apply(ast.literal_eval)

#### Mark completed datasets

In [None]:
# find full datasets
# a game counts as complete when its number of non-demo rows equals expected_trials
did_complete = df_trial[df_trial.stim_group != 'demo_stim'].groupby(['gameID']).count()['datatype'] == expected_trials
complete_dataset_gameIDs = list(did_complete[did_complete].index)

# flag every row that belongs to a complete game
df_trial.loc[:,'complete_dataset'] = False
df_trial.loc[(df_trial.gameID.isin(complete_dataset_gameIDs)), 'complete_dataset'] = True

# assign correct trial number
# NOTE(review): unique()[1:] drops the first distinct trial_index (in order of
# appearance) before taking the minimum — presumably to skip the demo trial's
# index; confirm against the experiment's trial ordering.
df_trial.loc[:,'trial_num'] = df_trial.trial_index - min(df_trial.trial_index.unique()[1:]) + 1
# assign practice trials to trial_num = 0
df_trial.loc[df_trial.trial_num < 0,'trial_num'] = 0

# rt divided by 60*1000, i.e. minutes if rt is recorded in milliseconds (assumed)
df_trial['rt_mins'] = df_trial.rt/(60*1000)


#### Find ppts for whom no trials hit the 8 step limit

In [None]:
# TODO: find datasets with no trials with 8 steps
def get_responses(response):
    """Separate one trial's answers into 'what' and 'where' lists.

    Args:
        response: mapping from response field name to the answer given.

    Returns:
        A tuple ``(what_responses, where_responses)``: the values whose key
        contains 'what', and the values whose key contains 'where', each in
        the mapping's own key order.
    """
    what_responses = [answer for key, answer in response.items() if 'what' in key]
    where_responses = [answer for key, answer in response.items() if 'where' in key]

    return (what_responses, where_responses)

# Split each parsed response into its (whats, wheres) answer lists and store
# the pieces as separate columns.
df_trial.loc[:, 'response_lists'] = df_trial.responses.apply(get_responses)
df_trial.loc[:, 'whats'] = df_trial.response_lists.apply(lambda x:x[0])
df_trial.loc[:, 'wheres'] = df_trial.response_lists.apply(lambda x:x[1])
# number of steps in a trial = number of 'what' answers it contains
df_trial.loc[:, 'n_steps'] = df_trial.whats.apply(len)

In [None]:
# mark those that hit 8 step limit

# True for each gameID whose largest n_steps over all its trials is exactly 8
hit_8_step_limit = df_trial.groupby('gameID')['n_steps'].max() == 8

# a row is flagged only when it comes from the 'corpus_prolific_test'
# iteration AND its game reached the limit
df_trial.loc[:, 'ppt_hit_8_step_limit'] = (
    (df_trial.iterationName == 'corpus_prolific_test')
    & df_trial.gameID.apply(lambda game_id: hit_8_step_limit[game_id])
)

In [None]:
# For non-demo stimuli from complete, unflagged datasets: count responses per
# (subdomain, stimId), then tabulate how many stimuli share each count.
df_trial[(df_trial.stimId!='demo_stim') &(df_trial.complete_dataset) & (~df_trial.ppt_hit_8_step_limit)]\
    .groupby(['subdomain','stimId'])['responses'].count().value_counts()



In [None]:
# how many complete datasets?
# Counts rows with trial_num > 0 from complete, unflagged datasets per
# (domain, subdomain) and divides by expected_trials.
df_trial[(df_trial.complete_dataset) & (df_trial.trial_num > 0) & (~df_trial.ppt_hit_8_step_limit)].groupby(['domain','subdomain'])['rt'].count()/expected_trials



#### find how many of each partition/ split

In [None]:
# based on completeness and hitting 8 step limit (USED ONLY ONCE TO REMOVE DATA)
# Keeps rows from complete datasets that are NOT flagged ppt_hit_8_step_limit,
# then counts rows per (subdomain, partitionFamily, splitNumber) and divides
# by expected_trials + 1.
complete_counts = (
    df_trial[df_trial.complete_dataset & ~df_trial.ppt_hit_8_step_limit]
    .groupby(['subdomain', 'partitionFamily', 'splitNumber'])['datatype']
    .count()
    .div(expected_trials + 1)
    .reset_index()
)


# # based on completeness only (i.e. )
# complete_counts = (df_trial[df_trial.complete_dataset]\
#                    .groupby(['subdomain','partitionFamily','splitNumber'])\
#                    .count()/(expected_trials+1))['datatype'].reset_index()



In [None]:
complete_counts

#### top-up incomplete splits by adding individual record for each into top-up stimuli collection

In [None]:
# add a record in extra_metadata for each additional time a split needs to be run

def create_extra_metadata(complete_counts, domain, subdomain, df, n_expected = 1,
                          n_families = None, n_splits = None, version_ind = None):
    """Build one top-up record for every run a split is still missing.

    For each (partition family, split) of ``subdomain``, the number of
    completed runs is looked up in ``complete_counts``; one metadata record is
    appended for every run short of ``n_expected``.

    Args:
        complete_counts: frame with ``subdomain``, ``partitionFamily``,
            ``splitNumber`` and ``datatype`` columns, where ``datatype`` holds
            the number of completed runs of that split.
        domain: 'structures' names files ``<id>-all``; any other value names
            them ``all-<id>``.
        subdomain: subdomain to top up; must occur in the experiment name
            stored on ``df``.
        df: stimulus frame carrying ``stim_id``, ``group``,
            ``experiment_name``, ``s3_bucket_path`` and ``family_<k>`` columns.
        n_expected: required number of completed runs per split.
        n_families: number of ``family_<k>`` columns to walk; defaults to the
            number of such columns in ``df``.
        n_splits: number of splits per family; defaults to the largest split
            number found in the family columns plus one.
        version_ind: value stored as ``versionInd``; defaults to the
            notebook-level ``versionInd``.

    Returns:
        A list of metadata dicts, empty when no split needs topping up.
    """
    experiment_name = df['experiment_name'].iloc[0]
    s3_bucket_path = df['s3_bucket_path'].iloc[0]

    assert subdomain in experiment_name

    # The family/split counts used to be read from globals that were not
    # guaranteed to exist; derive them from the frame unless given explicitly.
    family_columns = [col for col in df.columns if str(col).startswith('family_')]
    if n_families is None:
        n_families = len(family_columns)
    if n_splits is None:
        n_splits = int(df[family_columns].max().max()) + 1 if family_columns else 0

    extra_metadata = []

    for f in range(0, n_families):
        family_column = 'family_' + str(f)

        for s in range(0, n_splits):

            split_count = complete_counts[(complete_counts.partitionFamily == f) &
                                          (complete_counts.splitNumber == s) &
                                          (complete_counts.subdomain == subdomain)
                                         ].reset_index()

            if len(split_count) == 0:
                n_completed = 0
            else:
                n_completed = split_count.loc[0,'datatype']

            if not (n_completed < n_expected):
                continue  # this split has been run often enough

            # look the split's stimuli up once, not once per record and field
            split_rows = df.groupby(family_column).get_group(s)
            stimIDs = [str(x).zfill(3) for x in split_rows['stim_id']]
            stimURLS = [s3_bucket_path + "lax-{}-{}-{}-{}.png".format(domain,
                                                          subdomain,
                                                          (stimID if domain=='structures' else 'all'),
                                                          ('all' if domain=='structures' else stimID))
                        for stimID in stimIDs]
            stimGroups = dict(zip(stimIDs, split_rows['group']))

            i = n_completed

            while i < n_expected:

                i = i + 1

                extra_metadata.append(
                    {
                        'partitionFamily': f,
                        'splitNumber': s,
                        # copies, so records never share mutable containers
                        'stimIDs': list(stimIDs),
                        'stimURLS': list(stimURLS),
                        'ntrials': len(stimIDs),
                        'stimGroups': dict(stimGroups),
                        'numGames': 0,
                        'games': [],
                        'experimentType': 'corpus_collection',
                        'experimentName': experiment_name,
                        's3_bucket_url': s3_bucket_path,
                        # the global is resolved lazily, only when no explicit
                        # version was passed
                        'versionInd': versionInd if version_ind is None else version_ind,
                        'extra_metadata_index': i
                    })

    return extra_metadata


In [None]:
# For every domain/subdomain: regenerate the stimulus splits, work out which
# splits still need runs, and (optionally) replace the `<experiment>_top_up`
# stimulus collection with one record per missing run.
upload_to_mongo = False

if not upload_to_mongo:
    print('NO DATA UPLOADED')

for domain in domains:
    for subdomain in subdomains[domain]:

        print(domain + ', ' + subdomain)

        df_stim = generate_splits(get_stim_df(domain, subdomain))

        extra_metadata = create_extra_metadata(
            complete_counts,
            domain,
            subdomain,
            df_stim,
            n_expected = 1,
        )

        stim_col_name = df_stim['experiment_name'][0]
        top_up_stim_col_name = stim_col_name + '_top_up'
        print(top_up_stim_col_name)

        if not upload_to_mongo:
            # dry run: only report how many records would be written
            print(str(len(extra_metadata)) + ' records to upload')
        else:
            db = conn['stimuli']
            coll = db[top_up_stim_col_name]

            # start from an empty collection so stale top-ups never linger
            db.drop_collection(top_up_stim_col_name)
            print('cleared stimuli/' + top_up_stim_col_name)

            for record in extra_metadata:
                coll.insert_one(record)

            # read the collection back to confirm what was stored
            print(str(len(list(coll.find()))) + ' inserted into stimuli/' + top_up_stim_col_name)

        print('')

# TO HERE

In [None]:
# Row counts of unflagged participants per (subdomain, complete_dataset),
# divided by 11 — presumably expected_trials + 1 (10 trials plus one demo
# trial); TODO confirm and replace the literal.
df_trial[~df_trial.ppt_hit_8_step_limit].groupby(['subdomain','complete_dataset']).count()/11

In [None]:
# connect to mongo
import pymongo as pm
from urllib.parse import quote_plus

# set vars
auth = pd.read_csv('../../auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## experiment server ip address

# Credentials embedded in a MongoDB URI must be percent-escaped; without this
# a password containing ':', '/', '@' or '%' produces an invalid URI.
# NOTE(review): the client connects to 127.0.0.1 rather than `host` —
# presumably through an SSH tunnel to the experiment server; confirm.
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(str(pswd)) + '@127.0.0.1')
# data (not stimuli) database: the reconstruction experiment's dev collection
db = conn['lax']
coll = db['lax_reconstruction_dev']

In [None]:
pd.DataFrame(coll.find())