# LAX Corpus 1k dataframe generator

Grabs data from cogtoolslab server and creates CSVs.

In [1]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("..")
# sys.path.append("../utils")
sys.path.append("../../../stimuli")

import numpy as np
import scipy.stats as stats
import pandas as pd

import pymongo as pm
from collections import Counter
from functools import reduce
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

import spacy
nlp = spacy.load("en_core_web_lg")
from spacy.lang.en.stop_words import STOP_WORDS


from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# import drawing_utils as drawing
# import importlib
# import scoring

sys.path.append("../../stimuli/towers/block_utils/")
import blockworld_utils as utils

In [2]:
## directory & file hierarchy
# All project paths are resolved relative to the current working directory.
analysis_dir = os.path.abspath('.')
proj_dir = os.path.abspath('..')

datavol_dir = os.path.join(proj_dir, 'data')
png_dir = os.path.abspath(os.path.join(datavol_dir, 'png'))
exp_dir = os.path.abspath(os.path.join(proj_dir, 'behavioral_experiments'))

results_dir = os.path.join(proj_dir, 'results')
plot_dir = os.path.join(results_dir, 'plots')
csv_dir = os.path.join(results_dir, 'csv')
json_dir = os.path.join(results_dir, 'json')

# NOTE(review): this relative path sits one directory level above csv_dir
# ('../results/csv' resolved from the working directory) and is not created
# below -- confirm it exists before the CSVs are written.
results_csv_directory = "../../results/csv"

## add helpers to python path
stimuli_dir = os.path.join(proj_dir, 'stimuli')
if stimuli_dir not in sys.path:
    sys.path.append(stimuli_dir)

## create the output directories that do not exist yet
for _out_dir in (results_dir, plot_dir, csv_dir):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

In [3]:
# Connection settings.
# NOTE(review): `user` and `host` are defined but not used below; the URI
# hard-codes the 'sketchloop' user and connects to 127.0.0.1 (presumably
# through a tunnel to the remote host -- confirm).
user = 'sketchloop'
host = 'cogtoolslab.org'

# auth.txt contains the password for the sketchloop user; it is read as a
# header-less CSV and the first cell is taken as the password.
auth_path = os.path.join(analysis_dir, '../../auth.txt')
auth = pd.read_csv(auth_path, header=None)
pswd = auth.values[0][0]

# have to fix this to be able to analyze from local
import pymongo as pm
mongo_uri = 'mongodb://sketchloop:' + pswd + '@127.0.0.1'
conn = pm.MongoClient(mongo_uri)

# database holding the lax corpus collections
db = conn['lax']

In [4]:
# Stimulus subdomains, keyed by the domain they belong to.
subdomains = dict(
    structures=['bridge', 'castle', 'house', 'city'],
    drawing=['nuts-bolts', 'wheels', 'furniture', 'dials'],
)

# Domain names in insertion order.
domains = [*subdomains]

#### Fetch from database

In [5]:
iteration_names = ['corpus_prolific_test', 'corpus_prolific_test_3'] # 2 intentionally left out
experiment_template = "lax-{}-{}-corpus-{}-10"
condition = 'procedural'
expected_trials = 10

# columns kept from the 'stim_metadata' records (merged into trials on gameID)
meta_columns = ["gameID","partitionFamily","splitNumber","stimIDs", "stimURLS", "stimGroups",
                "numGames","experimentType","experimentName","versionInd"]

# columns kept from the trial records
trial_columns = ['datatype', 'iterationName', 'condition', 'domain', 'subdomain',
                 'config_name', 'gameID', 'shuffle', 'trialOrder', 'rt', 'workerID',
                 'trial_type', 'trial_index', 'time_elapsed', 'internal_node_id',
                 'view_history', 'stimId', 'stimURL', 'responses']

# Per-subdomain frames are collected here and concatenated once after the
# loop. DataFrame.append was removed in pandas 2.0, and calling it inside a
# loop re-copies the accumulated frame on every iteration.
trial_frames = []
all_frames = []

for domain in domains:
    col_name = 'lax_{}_corpus'.format(domain)
    coll = db[col_name]

    for subdomain in subdomains[domain]:

        # get all data for subdomain from db
        experiment_name = experiment_template.format(domain, subdomain, condition)
        df_subdomain_all = pd.DataFrame(coll.find({"$and":[  {'iterationName' : { '$in': iteration_names }},
                                          {'experimentName': experiment_name},
                                         ]}))

        if len(df_subdomain_all) == 0:
            print('no data for ' + domain + '.' + subdomain)
            continue

        df_subdomain_all['domain'] = domain
        df_subdomain_all['subdomain'] = subdomain

        # get metadata
        is_metadata = df_subdomain_all.datatype == 'stim_metadata'
        df_subdomain_meta = df_subdomain_all.loc[is_metadata, meta_columns]

        # get trial data; .copy() so the column added below is written to an
        # independent frame rather than to a slice of df_subdomain_all
        is_trial = ((df_subdomain_all.trial_type == 'stimuli-contextual-language-production') &
                    (df_subdomain_all.datatype == 'trial_end') &
                    (~pd.isna(df_subdomain_all.stimId)))
        df_subdomain_trial = df_subdomain_all.loc[is_trial, trial_columns].copy()

        # merge metadata into trial data

        # combine the stimGroups dicts of all metadata records into one
        # stimId -> group lookup (later records win on duplicate keys);
        # the initial {} keeps reduce from raising when there is no metadata
        dicts = list(df_subdomain_meta['stimGroups'])
        stim_groups = reduce(lambda dict1, dict2: {**dict1, **dict2}, dicts, {})
        stim_groups['demo_stim'] = 'demo_stim'
        # assign stim groups from metadata (KeyError if a stimId is not listed)
        df_subdomain_trial['stim_group'] = df_subdomain_trial['stimId'].apply(lambda stim: stim_groups[stim])
        df_subdomain_trial = df_subdomain_trial.merge(df_subdomain_meta, how='left', on='gameID')

        # queue subdomain data for the main dataframes
        trial_frames.append(df_subdomain_trial)
        all_frames.append(df_subdomain_all)

# Build the main dataframes; fall back to an empty frame when nothing was fetched.
df_trial = pd.concat(trial_frames, ignore_index=True) if trial_frames else pd.DataFrame()
df_all = pd.concat(all_frames, ignore_index=True) if all_frames else pd.DataFrame()
            

#### Create additional columns

In [None]:
# Mark completed datasets: a game is complete when it has exactly
# `expected_trials` non-demo trial rows.
non_demo_trials = df_trial[df_trial.stim_group != 'demo_stim']
did_complete = non_demo_trials.groupby(['gameID'])['datatype'].count() == expected_trials
complete_dataset_gameIDs = list(did_complete[did_complete].index)

df_trial.loc[:, 'complete_dataset'] = df_trial.gameID.isin(complete_dataset_gameIDs)
df_all.loc[:, 'complete_dataset'] = df_all.gameID.isin(complete_dataset_gameIDs)

# assign correct trial number
# NOTE(review): the offset is the smallest trial_index among all unique values
# except the first one encountered; presumably that first value belongs to the
# practice trial -- confirm this ordering assumption.
first_trial_index = min(df_trial.trial_index.unique()[1:])
# practice trials (negative after the shift) are assigned trial_num = 0
df_trial.loc[:, 'trial_num'] = (df_trial.trial_index - first_trial_index + 1).clip(lower=0)

# response time converted from milliseconds to minutes
df_trial['rt_mins'] = df_trial.rt / (60 * 1000)

# responses are stored as string literals; parse them into Python objects
df_trial.loc[:, 'responses'] = df_trial.responses.map(ast.literal_eval)

In [None]:
# TODO: find datasets with no trials with 8 steps
def get_responses(response):
    """Split one response dict into its 'what' and 'where' answers.

    Args:
        response: dict of answers; keys containing 'what' hold the "what"
            messages and keys containing 'where' hold the "where" messages.

    Returns:
        Tuple ``(what_responses, where_responses)`` of two lists, each in the
        dict's key order.
    """
    what_responses = [answer for key, answer in response.items() if 'what' in key]
    where_responses = [answer for key, answer in response.items() if 'where' in key]
    return (what_responses, where_responses)

# Split every response dict into its (whats, wheres) message lists.
df_trial['response_lists'] = df_trial.responses.apply(get_responses)
df_trial['whats'] = df_trial.response_lists.apply(lambda pair: pair[0])
df_trial['wheres'] = df_trial.response_lists.apply(lambda pair: pair[1])

# number of steps = number of 'what' messages in the trial
df_trial['n_steps'] = df_trial.whats.apply(len)


def _message_lengths(messages):
    # character count of each message in one trial
    return [len(message) for message in messages]


df_trial['what_messages_lengths'] = df_trial.whats.apply(_message_lengths)
df_trial['where_messages_lengths'] = df_trial.wheres.apply(_message_lengths)

# total characters per trial, separately and combined
df_trial['what_char_sum'] = df_trial.what_messages_lengths.apply(np.sum)
df_trial['where_char_sum'] = df_trial.where_messages_lengths.apply(np.sum)
df_trial['char_sum'] = df_trial.what_char_sum + df_trial.where_char_sum



In [None]:
# mark those that hit 8 step limit
# per game: True when the largest n_steps over its trials equals 8
max_steps_per_game = df_trial.groupby('gameID').n_steps.max()
hit_8_step_limit = max_steps_per_game == 8

# the flag is only raised for trials of the 'corpus_prolific_test' iteration
in_capped_iteration = df_trial.iterationName == 'corpus_prolific_test'
game_hit_limit = df_trial.gameID.apply(lambda game_id: hit_8_step_limit[game_id])
df_trial.loc[:, 'ppt_hit_8_step_limit'] = in_capped_iteration & game_hit_limit

In [None]:
# Sanity check: the non-demo trials of complete datasets must add up to a
# whole multiple of expected_trials.
trials_by_completeness = (df_trial[df_trial.stim_group != 'demo_stim']
                          .groupby('complete_dataset')['trial_num']
                          .count())
assert trials_by_completeness[True] % expected_trials == 0

In [None]:
# how many complete datasets?
# rows of complete datasets with trial_num > 0, counted per subdomain and
# divided by the number of trials expected per dataset
completed_trials = df_trial[df_trial.complete_dataset & (df_trial.trial_num > 0)]
completed_trials.groupby(['domain', 'subdomain'])['rt'].count() / expected_trials


#### Basic linguistic pre-processing

In [None]:
# Run spaCy over every message and derive token-level columns. The same
# processing is applied to the 'whats' and then to the 'wheres' messages.
for _kind in ('whats', 'wheres'):
    _processed = 'processed_' + _kind
    _filtered = 'lemmatized_filtered_' + _kind

    # one list of spaCy Docs per trial (nlp.pipe gets the trial's list of messages)
    df_trial[_processed] = [list(nlp.pipe(messages)) for messages in df_trial[_kind]]

    # lower-cased lemma of every token
    df_trial['lemmatized_' + _kind] = [[[str(tok.lemma_.lower()) for tok in doc] for doc in docs]
                                       for docs in df_trial[_processed]]
    # part-of-speech tag of every token
    df_trial[_kind + '_pos'] = [[[str(tok.pos_) for tok in doc] for doc in docs]
                                for docs in df_trial[_processed]]
    # lemmas with stop words removed
    df_trial['lemmatized_notstop_' + _kind] = [[[str(tok.lemma_.lower()) for tok in doc if not tok.is_stop]
                                                for doc in docs]
                                               for docs in df_trial[_processed]]
    # lemmas with determiners and punctuation removed
    df_trial[_filtered] = [[[str(tok.lemma_.lower()) for tok in doc if tok.pos_ not in ['DET', 'PUNCT']]
                            for doc in docs]
                           for docs in df_trial[_processed]]
    # number of remaining tokens summed over the trial's messages
    df_trial['n_' + _kind + '_filtered'] = df_trial[_filtered].apply(
        lambda docs: sum(len(tokens) for tokens in docs))

In [None]:
# Unique filtered lemmas per trial, flattened across the trial's messages and
# kept in order of first appearance.
df_trial.loc[:,'unique_whats'] = \
    df_trial.lemmatized_filtered_whats.apply(lambda xss: pd.unique([x for xs in xss for x in xs]))
# Fixed: this column was previously derived from lemmatized_filtered_whats,
# which made unique_wheres an exact copy of unique_whats.
df_trial.loc[:,'unique_wheres'] = \
    df_trial.lemmatized_filtered_wheres.apply(lambda xss: pd.unique([x for xs in xss for x in xs]))

In [None]:
# number of distinct filtered 'what' lemmas in each trial
df_trial['n_unique_whats'] = df_trial['unique_whats'].apply(len)

In [None]:
# Number of filtered lemmas per trial, summed over the trial's messages.
df_trial.loc[:, 'what_word_sum'] = df_trial['lemmatized_filtered_whats'].apply(lambda x: sum([len(sub) for sub in x]))
# Fixed: this column was previously computed from lemmatized_filtered_whats,
# so where_word_sum duplicated what_word_sum.
df_trial.loc[:, 'where_word_sum'] = df_trial['lemmatized_filtered_wheres'].apply(lambda x: sum([len(sub) for sub in x]))

#### check dataset is complete (i.e. >= 2 annotations for each structure)

In [None]:
# Trials that count as annotations: non-demo stimuli from complete datasets
# whose participant is not flagged for the 8-step limit.
usable_annotations = df_trial[(df_trial.stimId != 'demo_stim')
                              & df_trial.complete_dataset
                              & ~df_trial.ppt_hit_8_step_limit]
annotations_per_stim = usable_annotations.groupby(['subdomain', 'stimId'])['responses'].count()

# True when every (subdomain, stimId) present has at least 2 annotations
(annotations_per_stim >= 2).all()

#### Drop prolific ids

In [None]:
# Remove the participant identifier column from both dataframes before saving.
df_trial = df_trial.drop(columns=['workerID'])
df_all = df_all.drop(columns=['workerID'])
print()

In [None]:
# save whole corpus

really_save = True

# NOTE(review): results_csv_directory is not created anywhere in this
# notebook; to_csv presumably fails if it does not exist -- confirm.
if really_save:
    for _frame, _file_name in ((df_trial, '/lax_corpus_1k_trial_unfiltered.csv'),
                               (df_all, '/lax_corpus_1k_all_unfiltered.csv')):
        _frame.to_csv(results_csv_directory + _file_name)
    
# print(results_csv_directory + '/lax_corpus_' + iteration_name + '_' + condition + '_trial.csv')
# print(results_csv_directory + '/lax_corpus_' + iteration_name + '_' + condition + '_all.csv')

## Data cleaning

In [None]:
# Keep only trials that are (a) not the demo stimulus, (b) not from a
# participant flagged as hitting the 8-step limit before it was removed, and
# (c) part of a complete dataset.
keep_rows = ((df_trial.stimId != "demo_stim")
             & ~df_trial.ppt_hit_8_step_limit
             & df_trial.complete_dataset)
df_trial = df_trial[keep_rows]

#### Add flags for unusual data

In [None]:
# RT less than 5 seconds (rt is compared in milliseconds)
df_trial.loc[:, 'short_rt'] = df_trial.rt.lt(5 * 1000)

# RT greater than 10 mins
df_trial.loc[:, 'long_rt'] = df_trial.rt.gt(10 * 60 * 1000)

# Gave same response for more than one stimulus: within a game, at least one
# stringified response occurs more than once
df_trial.loc[:, 'responses_str'] = df_trial['responses'].apply(str)
games_with_repeats = df_trial.groupby(['gameID']).filter(
    lambda game: game['responses_str'].duplicated().any())
duplicate_responders = list(games_with_repeats.gameID.unique())
df_trial.loc[:, 'duplicate_responder'] = df_trial.gameID.isin(duplicate_responders)

# Unusually short descriptions
# NOTE(review): the original comment said "<2 words per cell", while the
# test is "fewer than 3 filtered 'what' words in the whole trial" -- confirm
# which threshold is intended.
df_trial.loc[:, 'length_outlier'] = df_trial.what_word_sum < 3

# Referring to pay/ money
off_task_words = ['paid', 'money', 'pay']


def _mentions_off_task(tokens):
    # True when any off-task word appears among the trial's unique tokens
    return not set(off_task_words).isdisjoint(tokens)


df_trial.loc[:, 'off_task_flag'] = (df_trial.unique_whats.apply(_mentions_off_task)
                                    | df_trial.unique_wheres.apply(_mentions_off_task))

## Join trial data with program data

### Handcoded DSLs

In [None]:
# Load the program summaries of every structures subdomain and stack them
# into one dataframe. The frames are concatenated once with pd.concat because
# DataFrame.append was removed in pandas 2.0.
structure_frames = []

for subdomain in ['bridge','castle','city', 'house']:
    df_subdomain = pd.read_csv("https://github.com/CatherineWong/drawingtasks/raw/main/data/summaries/{}_programs_all.csv".format(subdomain))
    structure_frames.append(df_subdomain)

df_structures_topdown = pd.concat(structure_frames, ignore_index=True)

# make the join keys consistent with the trial dataframe
df_structures_topdown.loc[:,'subdomain'] = df_structures_topdown.structure_type
df_structures_topdown.loc[:,'domain'] = 'structures'
# stimId is the structure number as a zero-padded 3-character string
df_structures_topdown.loc[:,'stimId'] =  df_structures_topdown.structure_number.apply(lambda x: str(x).zfill(3))
df_structures_topdown = df_structures_topdown.drop(columns=['Unnamed: 0','Unnamed: 0.1','structure_type','structure_number'])

In [None]:
# Load the program summaries of every drawing subdomain and stack them into
# one dataframe. The frames are concatenated once with pd.concat because
# DataFrame.append was removed in pandas 2.0.
drawing_frames = []

for subdomain in ['dials','furniture','nuts_bolts','wheels']:

    df_subdomain = pd.read_csv("https://github.com/CatherineWong/drawingtasks/raw/main/data/summaries/{}_programs_all.csv".format(subdomain))
    drawing_frames.append(df_subdomain)

df_drawing_topdown = pd.concat(drawing_frames, ignore_index=True)

# subdomain is the text before the first underscore of task_name
# (so 'nuts_bolts...' task names yield 'nuts')
df_drawing_topdown.loc[:,'subdomain'] = df_drawing_topdown.task_name.apply(lambda x: x.split('_')[0])
df_drawing_topdown.loc[:,'domain'] = 'drawing'
# stimId is the last '-'-separated piece of s3_stimuli, up to its first '.'
df_drawing_topdown.loc[:,'stimId'] = df_drawing_topdown.s3_stimuli.apply(lambda x: x.split('-')[-1].split('.')[0])
df_drawing_topdown = df_drawing_topdown.drop(columns=['task_name'])

In [None]:
# relabel the 'nuts' subdomain as 'nuts-bolts', the name used in `subdomains`
_is_nuts = df_drawing_topdown.subdomain.eq('nuts')
df_drawing_topdown.loc[_is_nuts, 'subdomain'] = 'nuts-bolts'

In [None]:
# Stack drawing and structures program data (drawing rows first) into one
# dataframe with a fresh index. pd.concat is used because DataFrame.append
# was removed in pandas 2.0.
df_topdown = pd.concat([df_drawing_topdown, df_structures_topdown], ignore_index=True)

In [None]:
# Attach program data to every trial; trials without a matching program row
# are kept (left join) with missing values in the program columns.
df_combined = pd.merge(df_trial, df_topdown,
                       how='left',
                       on=['stimId', 'subdomain', 'domain'])

In [None]:
# NOTE(review): these columns are added to df_structures_topdown after
# df_topdown and df_combined were built from it, so they do not appear in
# those frames -- confirm whether that is intended.
# NOTE(review): `libraries` only ever receives empty lists in this cell.
libraries = {}

for level in ['low','mid','high']: #,'tower'

    libraries[level] = []

    parts_col = level + '_level_parts'

    # the parts are stored as string literals; parse them into Python objects
    parsed_parts = df_structures_topdown.loc[:, parts_col].apply(ast.literal_eval)
    df_structures_topdown.loc[:, parts_col] = parsed_parts

    # program length and number of distinct tokens at this level
    df_structures_topdown[level + '_level_prog_length'] = df_structures_topdown[parts_col].apply(len)
    df_structures_topdown[level + '_level_prog_unique_tokens'] = df_structures_topdown[parts_col].apply(
        lambda parts: len(np.unique(parts)))

In [None]:
# Still waiting on handcoded DSLs from Cathy

In [None]:
# save program data and the trial/program join

really_save = True

# NOTE(review): results_csv_directory is not created anywhere in this
# notebook; to_csv presumably fails if it does not exist -- confirm.
if really_save:
    for _frame, _file_name in ((df_topdown, '/lax_corpus_1k_programs_cogsci22.csv'),
                               (df_combined, '/lax_corpus_1k_trials_cogsci22.csv')):
        _frame.to_csv(results_csv_directory + _file_name)

In [None]:
# distinct domain labels present in the drawing program data
df_drawing_topdown.domain.unique()

### Check exit survey responses

In [None]:
# list exit survey comments: the 'Q0' answer of every 'survey-text' record
# from the corpus_prolific_test_3 iteration

_is_exit_survey = (df_all.trial_type == 'survey-text') & (df_all.iterationName == 'corpus_prolific_test_3')
[answer['Q0'] for answer in df_all.loc[_is_exit_survey, 'response']]

#### Load base program data (Not used for cogsci22, but links to s3 stimulus dataframes) 

In [None]:
# # gadgets
# df_drawing_programs = pd.DataFrame()

# for drawing_subdomain in subdomains['drawing']:
#     summary_domain = 'nuts_bolts' if drawing_subdomain == 'nuts-bolts' else drawing_subdomain
    
#     df_sub = pd.read_csv('./gadget_programs_tmp/{}_all.csv'.format(summary_domain))
#     df_sub.loc[:,'domain'] = 'drawing'
#     df_sub.loc[:,'subdomain'] = drawing_subdomain
#     df_drawing_programs = df_drawing_programs.append(df_sub, ignore_index=True)
    
    
# # df_drawing_programs.dreamcoder_program_dsl_0_tokens = df_drawing_programs.dreamcoder_program_dsl_0_tokens.apply(ast.literal_eval)
# df_drawing_programs.loc[:,'stimId'] = df_drawing_programs.s3_stimuli.apply(lambda x: x.split('-')[-1].split('.')[0])

# # will be correct later when we have all dsls. Currently this only works for nuts-bolts and dials
# # df_drawing_programs.loc[:,'n_tokens'] = df_drawing_programs.dreamcoder_program_dsl_0_tokens.apply(lambda x: len(ast.literal_eval(x)))


In [None]:
# # structures
# df_structures_programs = pd.DataFrame()

# for structure_subdomain in subdomains['structures']:
#     df_sub = pd.read_csv('https://lax-structures-{}-all.s3.amazonaws.com/df_{}.csv'.format(structure_subdomain,
#                                                                                            structure_subdomain))
#     df_structures_programs = df_structures_programs.append(df_sub, ignore_index=True)
    
# # make columns consistent with trial dataframe
# df_structures_programs.loc[:,'stimId'] = df_structures_programs.structure_number.apply(lambda x: str(x).zfill(3))
# df_structures_programs.loc[:,'subdomain'] = df_structures_programs.structure_type
# df_structures_programs.loc[:,'domain'] = 'structures'

# df_structures_programs = df_structures_programs.rename(columns={'dreamcoder_program':'dreamcoder_program_dsl_auto_generated'})

In [None]:
# # merge drawing program data into df_trial
# df_combined = df_trial.merge(df_drawing_programs, how='left', on=['stimId','subdomain','domain'])

# # then merge structure program data into result 
# df_combined = df_combined.merge(df_structures_programs, how='left', on=['stimId','subdomain','domain'])