### Generate metadata objects to be inserted into MongoDB, such that we can ensure even sampling of stimuli


Inspired by Cameron's and Holly's code:

https://github.com/cogtoolslab/curiotower/blob/master/stimuli/generate_metadata.ipynb
https://github.com/cogtoolslab/causaldraw_intervention/blob/master/intervention/stimuli/generate_metadata_intervention.ipynb

In [None]:
import os
import ast
import json
import socket
import random
import pandas as pd
import pymongo as pm
from collections import Counter
from IPython.display import clear_output

In [None]:
# Project layout, resolved relative to the current working directory:
#   <proj_dir>/analysis
#   <proj_dir>/results/plots
proj_dir = os.path.abspath(os.pardir)
analysis_dir, results_dir = (
    os.path.join(proj_dir, subdir) for subdir in ('analysis', 'results')
)
plot_dir = os.path.join(results_dir, 'plots')

#### Helper code to update metadata in mongo

In [None]:
def get_coll_from_mongo(coll_name, iter_name, auth, reallyRun = False):
    """Fetch the trial and survey records of one iteration from the 'photodraw' database.

    Parameters
    ----------
    coll_name : str
        Name of the collection inside the 'photodraw' database.
    iter_name : str
        Value of the 'iterationName' field to select.
    auth : pandas.DataFrame
        Credentials table; the password is read from its first cell
        (``auth.values[0][0]``). Not touched when ``reallyRun`` is False.
    reallyRun : bool, default False
        Safety switch. When False, mongo is not contacted and two empty
        DataFrames are returned.

    Returns
    -------
    (T, S) : tuple of pandas.DataFrame
        T holds the records whose 'eventType' is 'trial',
        S those whose 'eventType' is 'survey'.
    """
    if not reallyRun:
        # Without this early return the final `return T, S` would reference
        # names that were never bound and raise UnboundLocalError.
        return pd.DataFrame(), pd.DataFrame()

    # only needed when we actually connect
    import socket
    from urllib.parse import quote_plus

    user = 'sketchloop'
    # Percent-escape the password: reserved characters such as '@', ':', '/'
    # or '%' in a raw password would otherwise break parsing of the URI.
    pswd = quote_plus(str(auth.values[0][0]))

    # have to fix this to be able to analyze from local:
    # hosts whose name starts with 'Justin_' use local port 27020, all others 27017
    port = 27020 if socket.gethostname().split('_')[0] == 'Justin' else 27017
    conn = pm.MongoClient(f'mongodb://{user}:{pswd}@127.0.0.1:{port}')
    try:
        coll = conn['photodraw'][coll_name]

        ## how many records do we have in mongo?
        print(f'We have {coll.estimated_document_count()} records in {coll_name}: {iter_name}.')

        # Sketches
        T = pd.DataFrame(coll.find({'iterationName':iter_name, 'eventType':'trial'}))

        # Surveys
        S = pd.DataFrame(coll.find({'iterationName':iter_name, 'eventType':'survey'}))
    finally:
        # both cursors are fully materialised into DataFrames by now,
        # so the client can be released
        conn.close()
    return T, S

In [None]:
def subset_valid_drawings(data):
    """Keep only the rows that belong to complete sessions.

    A (prolificID, gameID) pair counts as complete when it has exactly 136
    distinct `trial_index` values; every row whose gameID belongs to such a
    pair is returned.
    """
    trials_per_session = data.groupby(['prolificID', 'gameID'])['trial_index'].nunique()
    is_complete = trials_per_session == 136
    complete_games = trials_per_session[is_complete].index.get_level_values('gameID')
    in_complete_game = data['gameID'].isin(complete_games)
    return data[in_complete_game]


def subset_valid_drawings_full(trial_data, survey_data):
    """Restrict trial and survey data to one complete session per participant.

    Parameters
    ----------
    trial_data : pandas.DataFrame
        Trial records; must provide 'prolificID', 'gameID' and 'trial_index'.
    survey_data : pandas.DataFrame
        Survey records; must provide 'gameID'.

    Returns
    -------
    (trial_data, survey_data) : tuple of pandas.DataFrame
        Both inputs restricted to the gameIDs of complete sessions
        (136 distinct trial indices). When a prolificID has several complete
        sessions, only the last one in groupby (sorted-key) order is kept.
    """
    # subset the sessions that were complete (all 136 trials done)
    trial_data = subset_valid_drawings(trial_data)

    # then sort the trial values comparably
    trial_data = trial_data.sort_values(by=['gameID', 'trial_index']).reset_index(drop=True)
    survey_data = survey_data.sort_values(by=['gameID']).reset_index(drop=True)

    # count the number of unique trial indices in each prolific-gameID combo
    id_counts = pd.DataFrame(trial_data.groupby(['prolificID','gameID']).trial_index.nunique()).reset_index()
    # get the gameIDs corresponding only to complete sessions, one per participant.
    # `keep` must be passed by keyword: it is keyword-only in pandas >= 2.0,
    # where the positional form raises a TypeError.
    complete = id_counts[id_counts.trial_index == 136]
    valid_gameIDs = complete.drop_duplicates('prolificID', keep='last').gameID.values

    # subset valid sessions
    trial_data = trial_data[trial_data.gameID.isin(valid_gameIDs)]
    survey_data = survey_data[survey_data.gameID.isin(valid_gameIDs)]

    return trial_data, survey_data

def get_batch_nums_used(data):
    """Count how many times each batch number has been used.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'batch_num' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct batch numbers (ascending), with a single
        integer column 'batch_num' holding the number of rows carrying that
        batch number divided by 32 and truncated to an integer.
    """
    # NOTE(review): 32 is presumably the number of rows one session contributes
    # per batch number -- confirm against the experiment design.
    counts = data['batch_num'].value_counts()
    # Build the frame explicitly instead of relying on the name pandas gives the
    # value_counts() result: it is 'batch_num' before pandas 2.0 but 'count'
    # from 2.0 on, which made `batches_used.batch_num` fail.
    batches_used = pd.DataFrame(
        {'batch_num': (counts / 32).astype(int).values},
        index=counts.index.values,
    )
    return batches_used.sort_index()

def get_remaining_inds(trial_data, survey_data, num_inds=32, num_repeats=3):
    """Return the indices that still have to be handed out to participants.

    Parameters
    ----------
    trial_data, survey_data : pandas.DataFrame
        Raw trial and survey records; they are first reduced to complete
        sessions with `subset_valid_drawings_full`.
    num_inds : int, default 32
        Size of the index pool (indices 0 .. num_inds - 1).
    num_repeats : int, default 3
        How many times each index is meant to be used in total.

    Returns
    -------
    list of int
        Every index repeated once per use that is still outstanding,
        in ascending index order.
    """
    # NOTE(review): the defaults describe a pool of 32 indices x 3 repeats;
    # confirm they match the experiment whose pool is being rebuilt.

    # first, fully subset complete sessions
    complete_trials, _ = subset_valid_drawings_full(trial_data, survey_data)

    # then, get the batch nums used and figure out which indices remain to be used
    batch_df = get_batch_nums_used(complete_trials)
    used = Counter(dict(zip(batch_df.index, batch_df['batch_num'])))
    pool = Counter({ind: num_repeats for ind in range(num_inds)})

    # Counter subtraction drops non-positive counts, so over-used indices
    # and used indices outside the pool simply vanish
    remaining = pool - used
    return list(remaining.elements())

### Step 1: Create metadata file, containing simple arrays of index numbers

Client-side metadata containing information for every cue in the experiments is already being loaded. One of the metadata parameters is an indexing variable running from 0 to 95 for `recogdraw_category` and from 0 to 47 for `recogdraw_instance`. The goal of step 1 is to create, for each experiment, a metadata file with one entry per participant we wish to run (288 participants for `recogdraw_category`, 144 participants for `recogdraw_instance`), where each entry holds one of these index values.

#### Run this cell if on the first pass of the experiments

In [None]:
# Safety switch: flip to True only on the first pass of the category experiment.
reallyRun = False
if reallyRun:
    # The real metadata is loaded client-side, so every one of the 96*3
    # participants only needs an index number.
    inds = [stim for stim in range(96) for _ in range(3)]
    print(inds, len(inds))

    # One record per participant slot; 'games' starts out as an empty list.
    M = pd.DataFrame({'sketch_ind': inds})
    M['games'] = '[]'
    M['games'] = M['games'].map(ast.literal_eval)

    J1 = M.to_dict(orient='records')

    # Save the records out to the meta.js file.
    with open('recogdraw_category_meta.js', 'w') as fout:
        fout.write(json.dumps(J1))
        
# Safety switch: flip to True only on the first pass of the instance experiment.
reallyRun = False
if reallyRun:
    # Same concept, this time with 144 participants for instance recognition
    # (each sketch gets 3 ratings).
    inds = [stim for stim in range(48) for _ in range(3)]
    print(inds, len(inds))

    # One record per participant slot; 'games' starts out as an empty list.
    M = pd.DataFrame({'sketch_ind': inds})
    M['games'] = '[]'
    M['games'] = M['games'].map(ast.literal_eval)

    J2 = M.to_dict(orient='records')

    # Save the records out to the meta.js file.
    with open('recogdraw_instance_meta.js', 'w') as fout:
        fout.write(json.dumps(J2))

### Step 2: Insert each trial as a record into a mongo database

#### establish connection to mongo
`ssh -fNL 27020:127.0.0.1:27017 jyang@cogtoolslab.org` <br>
`ssh -fNL 27017:127.0.0.1:27017 user@cogtoolslab.org`


In [None]:
# Safety switch: set to True to open the mongo connection used by the cells below.
reallyRun = False
if reallyRun:
    from urllib.parse import quote_plus

    # set vars: the password is the first cell of auth.txt
    auth = pd.read_csv('../analysis/auth.txt', header = None)
    pswd = auth.values[0][0]
    user = 'sketchloop'
    host = 'cogtoolslab.org'  # not used to build the URI; the client connects to 127.0.0.1

    # have to fix this to be able to analyze from local:
    # hosts whose name starts with 'Justin_' use local port 27020, all others 27017
    port = 27020 if socket.gethostname().split('_')[0] == 'Justin' else 27017

    # Percent-escape the credentials: reserved characters such as '@', ':', '/'
    # or '%' in a raw password would otherwise break parsing of the URI.
    conn = pm.MongoClient(f'mongodb://{quote_plus(user)}:{quote_plus(str(pswd))}@127.0.0.1:{port}')
    db = conn['photodraw']

In [None]:
# Safety switch: set to True to push the category records (J1) into mongo.
reallyRun = False
if reallyRun:
    ## actually add data now to the database (category), in random order
    coll = db['recogdraw_category_stims']
    random.shuffle(J1)
    for i, m in enumerate(J1):
        coll.insert_one(m)
        # progress line, overwritten in place on every iteration
        print(f'{i+1} of {len(J1)}| Inserting sketch index {m["sketch_ind"]}')
        clear_output(wait=True)

    print('Done inserting category records into mongo!')

    ## check collection to see what records look like
    sample_record = coll.find_one()
    record_count = coll.estimated_document_count()
    print(sample_record, record_count)

# Safety switch: set to True to push the instance records (J2) into mongo.
reallyRun = False
if reallyRun:
    ## actually add data now to the database (instances), in random order
    coll = db['recogdraw_instance_stims']
    random.shuffle(J2)
    for i, m in enumerate(J2):
        coll.insert_one(m)
        # progress line, overwritten in place on every iteration
        print(f'{i+1} of {len(J2)}| Inserting sketch index {m["sketch_ind"]}')
        clear_output(wait=True)

    print('Done inserting instance records into mongo!')

    ## check collection to see what records look like
    sample_record = coll.find_one()
    record_count = coll.estimated_document_count()
    print(sample_record, record_count)

### Step 3: update mongo database with only the remaining indices that haven't been completed yet

Important: only run this when the study is inactive. Otherwise, sessions that are still in progress (and would have been completed) would get flagged as invalid, and their index would be put back into the pool in mongo.

In [None]:
# Safety switch: set to True to rebuild the stims collections from the
# indices that have not been completed yet.
reallyRun = False
if reallyRun:
    from urllib.parse import quote_plus

    # set vars: the password is the first cell of auth.txt
    auth = pd.read_csv('../analysis/auth.txt', header = None)
    pswd = auth.values[0][0]
    user = 'sketchloop'
    host = 'cogtoolslab.org'  # not used to build the URI; the client connects to 127.0.0.1

    # have to fix this to be able to analyze from local:
    # hosts whose name starts with 'Justin_' use local port 27020, all others 27017
    port = 27020 if socket.gethostname().split('_')[0] == 'Justin' else 27017

    # Percent-escape the credentials: reserved characters such as '@', ':', '/'
    # or '%' in a raw password would otherwise break parsing of the URI.
    conn = pm.MongoClient(f'mongodb://{quote_plus(user)}:{quote_plus(str(pswd))}@127.0.0.1:{port}')
    db = conn['photodraw']

    # clear existing entries in mongo and add just the ones remaining
    for experiment in ['recogdraw_instance', 'recogdraw_category']:

        # first, get remaining (unused) sketch indices;
        # the data collections use '-' where the stims collections use '_'
        trial, survey = get_coll_from_mongo(experiment.replace('_', '-'), 'development',
                                            auth = auth, reallyRun = True)
        remainder = get_remaining_inds(trial, survey)

        # create a simple dictionary and save out to meta.js file
        M = pd.DataFrame(remainder, columns=['sketch_ind'])
        M['games'] = '[]'
        M['games'] = M['games'].apply(lambda x: ast.literal_eval(x))
        J = M.to_dict(orient='records')
        with open(f'{experiment}_meta.js', 'w') as fout:
            json.dump(J, fout)

        # then shuffle indices (after the file has been written, so the file keeps the unshuffled order)
        random.shuffle(J)

        # declare and empty collection
        coll = db[f'{experiment}_stims']
        coll.drop()

        # insert new indices back into mongo
        for (i,m) in enumerate(J):
            coll.insert_one(m)
            print(f'{i+1} of {len(J)}| Inserting sketch index {m["sketch_ind"]}')
            clear_output(wait=True)

    print('Done inserting updated records into mongo!')

### To dump mongo stims, run: 

`db['instancedraw_photo_stims'].drop()` <br>
`db['categorydraw_photo_stims'].drop()`

In [None]:
# Write the current contents of both stims collections to text files.
# Safety switch: set to True to actually write the freeze files (needs `db`
# from the connection cell). This used to be the comparison
# `reallyRun == True`, which assigns nothing and left the guard below
# depending on whatever value an earlier cell had set.
reallyRun = False
if reallyRun:
    # NOTE(review): the file names say 'categorydraw_photo' / 'instancedraw_photo'
    # while the collections read are 'recogdraw_*' -- confirm the names are intended.
    with open("categorydraw_photo_stims_freeze_2.txt", "w") as output:
        output.write(str(list(db['recogdraw_category_stims'].find({}))))

    with open("instancedraw_photo_stims_freeze_2.txt", "w") as output:
        output.write(str(list(db['recogdraw_instance_stims'].find({}))))