# Analyzing annotations of linguistic data from paired building experiment

We ran two separate annotation studies, each with two naive participants from the lab.

Here we collect data from the second version into a single dataframe.

In [3]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("../../")
sys.path.append("../../utils")
sys.path.append("../../analysis/utils")


import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# import drawing_utils as drawing
import importlib
import scoring

In [9]:
## directory & file hierarchy
# Every location is resolved relative to the directory the notebook is run from.
proj_dir = os.path.abspath('../..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir = proj_dir  # resolves to the same path as proj_dir
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')  # defined only; this cell does not create it
exp_dir = os.path.abspath(os.path.join(proj_dir,'behavioral_experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))

## add helpers to python path
stimuli_dir = os.path.join(proj_dir,'stimuli')
if stimuli_dir not in sys.path:
    sys.path.append(stimuli_dir)

## make sure the output directories exist
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
for out_dir in (results_dir, plot_dir, csv_dir):
    os.makedirs(out_dir, exist_ok=True)

In [10]:
# set vars
# auth.txt contains the password for the sketchloop user. Read it as text so that an
# all-digit password is not parsed as a number (which would break the URI below).
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None, dtype = str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'  # not used below: the client connects to 127.0.0.1

# have to fix this to be able to analyze from local
from urllib.parse import quote_plus
import pymongo as pm
# Credentials embedded in a MongoDB URI must be percent-escaped, otherwise characters such
# as '@', ':' or '/' in the password corrupt the URI.
# NOTE(review): assumes auth.txt stores the raw (not already escaped) password - confirm.
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(pswd) + '@127.0.0.1')
db = conn['compositional_abstractions']
coll = db['referring_expressions']

# which iteration name should we use?
iterationName = 'ca_ref_exp_prolific_block_tower_dev'

## look up number of trials (including paired-practice)
# numTrials = 13

In [11]:
# Fetch every labelling trial stored under the iteration selected above.
main_filter = {"$and": [{'trial_type': 'ca-label-ref-exp'},
                        {'iterationName': iterationName}]}
query = coll.find(main_filter)
df_trial_all = pd.DataFrame(query)

In [19]:
# data cleaning: keep the two annotators' trials and drop leftovers from development
from_annotator = df_trial_all.workerID.isin(['charles','zoe'])
has_batch = ~pd.isna(df_trial_all.batch)  # clean straggling data from development
after_cutoff = df_trial_all.trialStartTime > 1.680631e+12  # clean straggling data from development
df_trial_initial = df_trial_all[from_annotator & has_batch & after_cutoff]

In [20]:
# Sanity check: each annotator is expected to have 12*49 = 588 trials.
df_trial_initial['workerID'].value_counts()

charles    588
zoe        579
Name: workerID, dtype: int64

#### Missing data topup

In [21]:
# Fetch the labelling trials stored under the separate "topup" iteration.
topup_filter = {"$and": [{'trial_type': 'ca-label-ref-exp'},
                         {'iterationName': 'ca_ref_exp_prolific_block_tower_topup'}]}
query = coll.find(topup_filter)
df_trial_topup = pd.DataFrame(query)

In [22]:
# Remove incomplete data: drop zoe's rows for two specific (dyad game, batch) combinations.
# NOTE(review): presumably these are the sets re-collected in the topup iteration - confirm.
by_zoe = df_trial_initial.workerID == "zoe"
incomplete_batch_2 = ((df_trial_initial.dyad_gameid == "5895-de8412be-4cb1-49e4-9c61-ca6a11481af8") &
                      (df_trial_initial.batch == 2.0))
incomplete_batch_3 = ((df_trial_initial.dyad_gameid == "0208-b497c845-b076-45c9-b958-a62a3ae6e65f") &
                      (df_trial_initial.batch == 3.0))
to_drop = by_zoe & (incomplete_batch_2 | incomplete_batch_3)
df_trial_pruned = df_trial_initial[~to_drop]

In [23]:
# Append topup
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is the
# replacement. Row labels are kept unchanged, exactly as append did.
df_trial = pd.concat([df_trial_pruned, df_trial_topup])

  df_trial = df_trial_pruned.append(df_trial_topup)


In [25]:
# should be 60 trials in each batch (12 trials * 5 ppts) except final batch which should have 48.
# .size() is the built-in row count per group (replaces the slower .apply(len)).
df_trial.groupby(['workerID','batch']).size()

workerID  batch
charles   0.0      60
          1.0      60
          2.0      60
          3.0      60
          4.0      60
          5.0      60
          6.0      60
          7.0      60
          8.0      60
          9.0      48
zoe       0.0      60
          1.0      60
          2.0      60
          3.0      60
          4.0      60
          5.0      60
          6.0      60
          7.0      60
          8.0      60
          9.0      48
dtype: int64

## Wrangle dataframe into message-by-message format

In [26]:
def counts_to_series(df):
    '''
    Expand the `counts` column of `df` into a dataframe with one column per key.

    Despite the name, the result is a DataFrame, not a Series.

    Parameters:
        df: dataframe with a `counts` column whose entries are dict-like records.

    Returns:
        A dataframe with one row per row of `df`, carrying the same index as `df`
        so that it lines up when concatenated column-wise with `df`.
    '''
    # Materialise the records as a list: passing the Series itself makes pandas look
    # up the label 0, which fails when the index of `df` does not contain 0.
    expanded = pd.DataFrame.from_records(list(df['counts']))
    # from_records numbers rows 0..n-1; restore the caller's row labels.
    expanded.index = df.index
    return expanded


def extract_counts(row):
    '''
    Turn the `responses` field of one trial row into a dataframe.

    Every resulting row is tagged with the trial's `_id`, and the entries of the
    `counts` column are spread into columns of their own, placed to the right of
    the existing columns.
    '''
    responses = pd.DataFrame.from_dict(row.responses)
    responses.loc[:,'_id'] = row['_id']
    count_columns = counts_to_series(responses)

    return pd.concat([responses, count_columns], axis = 1)
    

In [27]:
# Build one response dataframe per trial, then stack them with a fresh 0..n-1 index.
per_trial_frames = list(df_trial.apply(extract_counts, axis=1))
df_ref_exp = pd.concat(per_trial_frames, ignore_index=True)
# Attach the trial-level columns to every response row through the trial's _id.
df_ref_exp_full = pd.merge(df_ref_exp, df_trial, how='left', on='_id')

In [28]:
# Number of rows per message_id; .size() is the built-in replacement for .apply(len).
df_ref_exp.groupby('message_id').size()

message_id
00183cfe-ceb0-9220-7984-f33f61c61ae4    2
0021b632-3246-b7d9-bb29-66398e4a295d    2
002b2f92-8dc5-7bd9-6689-ef79f8c3c461    2
002f9cc4-096b-faff-f5b7-751f497e28aa    2
004cdaf0-0ed9-1a32-4f0f-a9db4b6a3fea    2
                                       ..
ff52f3ab-afe3-e7ea-ede8-79c21541b74a    2
ff9e16ef-3a0b-9373-567f-fb07cbc96874    2
ffb59512-fa04-a706-ef8d-ff9bfae3fcee    2
ffbdfd47-dc79-72f0-1338-94b3854ca896    2
ffc573d5-fd0b-a70e-385a-f4635e4af862    2
Length: 2143, dtype: int64

In [29]:
# NOTE(review): presumably set so that long message text is not cut off in the tables below - confirm.
pd.options.display.max_colwidth = 0

In [30]:
# Starting row offset for paging through df_ref_exp in the cell below.
n = 0

In [31]:
# Move the offset on by one page (30 rows) and show that page of messages.
# Not idempotent: every re-run of this cell advances to the following 30 rows.
n = n + 30
df_ref_exp[['message','block','tower']].iloc[n:n+30]

Unnamed: 0,message,block,tower
30,Tall L-shape starting in the second grid from the right,0,1
31,upsidedown U-shape starting in the second grid from the left,0,1
32,tall C-shape starting in the third grid from the right,0,1
33,upsidedown U-shape starting in the second grid from the left,0,1
34,tall L-shape starting in the second grid from the right,0,1
35,place blue 1 to the right and flat,1,0
36,place blue next to the previous one and a red on top of the first,2,0
37,place another red on top of the previous red,1,0
38,place one red 1 to the left of the far right,1,0
39,place another red 2 to the left of the first red and then connect the two with blues,4,0


In [32]:
# Display the merged dataframe (response rows joined with their trial-level columns).
df_ref_exp_full

Unnamed: 0,msgNum,message,message_id,counts,refExps,_id,block,tower,trialStartTime,trialEndTime,...,time_elapsed,internal_node_id,experimentName,iterationName,workerID,gameID,studyLocation,datatype,dyad_gameid,batch
0,0,put a blue block three from the left.,e3e70682-c209-4cac-629f-6fbed82c07cd,"{'block': '1', 'tower': '0'}","blue block,",642e51f121425443943feaf9,1,0,1.680757e+12,1.680757e+12,...,110458,0.0-0.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_dev,zoe,0509-495c9837-2fda-4f87-9b77-143faf5a8d30,,trial_end,1494-029e2297-bd3f-4cfe-be00-d06d36724e4e,0.0
1,1,place a red block on top of the left side of the blue block and then another red on top of that red,f728b4fa-4248-5e3a-0a5d-2f346baa9455,"{'block': '2', 'tower': '0'}","red block, red,",642e51f121425443943feaf9,2,0,1.680757e+12,1.680757e+12,...,110458,0.0-0.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_dev,zoe,0509-495c9837-2fda-4f87-9b77-143faf5a8d30,,trial_end,1494-029e2297-bd3f-4cfe-be00-d06d36724e4e,0.0
2,2,Now place a blue block on top of that that lines up with the bottom blue block,eb1167b3-67a9-c378-7c65-c1e582e2e662,"{'block': '1', 'tower': '0'}","blue block,",642e51f121425443943feaf9,1,0,1.680757e+12,1.680757e+12,...,110458,0.0-0.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_dev,zoe,0509-495c9837-2fda-4f87-9b77-143faf5a8d30,,trial_end,1494-029e2297-bd3f-4cfe-be00-d06d36724e4e,0.0
3,3,"remove that last blue block, place it on top of the red blocks",f7c1bd87-4da5-e709-d471-3d60c8a70639,"{'block': '1', 'tower': '0'}","blue block,",642e51f121425443943feaf9,1,0,1.680757e+12,1.680757e+12,...,110458,0.0-0.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_dev,zoe,0509-495c9837-2fda-4f87-9b77-143faf5a8d30,,trial_end,1494-029e2297-bd3f-4cfe-be00-d06d36724e4e,0.0
4,4,place a red block on the second grid from the right of the screen\n,e443df78-9558-867f-5ba9-1faf7a024204,"{'block': '1', 'tower': '0'}","red block,",642e51f121425443943feaf9,1,0,1.680757e+12,1.680757e+12,...,110458,0.0-0.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_dev,zoe,0509-495c9837-2fda-4f87-9b77-143faf5a8d30,,trial_end,1494-029e2297-bd3f-4cfe-be00-d06d36724e4e,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4281,1,"Big L : Blue 8,1",f3f05323-3bd5-f8a2-073c-36ba5d393bf9,"{'block': '1', 'tower': '1'}","Big L, Blue,",644aee1421425443943fef9f,1,1,1.682632e+12,1.682632e+12,...,168689,0.0-9.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_topup,zoe,2537-81644a48-c1c9-4b72-b4be-111ae2fd4c6e,,trial_end,0208-b497c845-b076-45c9-b958-a62a3ae6e65f,3.0
4282,0,"Tall C : Blue 3, 1",88da091b-61f6-c877-0212-585b0b2f8580,"{'block': '1', 'tower': '1'}","Tall C, Blue,",644aee2021425443943fefa0,1,1,1.682632e+12,1.682632e+12,...,181008,0.0-10.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_topup,zoe,2537-81644a48-c1c9-4b72-b4be-111ae2fd4c6e,,trial_end,0208-b497c845-b076-45c9-b958-a62a3ae6e65f,3.0
4283,1,"Big L : blue 8, 1",d24e3104-1dd2-6234-2131-24bd01275a63,"{'block': '1', 'tower': '1'}","Big L, blue,",644aee2021425443943fefa0,1,1,1.682632e+12,1.682632e+12,...,181008,0.0-10.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_topup,zoe,2537-81644a48-c1c9-4b72-b4be-111ae2fd4c6e,,trial_end,0208-b497c845-b076-45c9-b958-a62a3ae6e65f,3.0
4284,0,"House : Red 2, 1",e998952c-ef58-951f-9b1a-e0199aca0c4a,"{'block': '1', 'tower': '1'}","House, Red,",644aee2821425443943fefa1,1,1,1.682632e+12,1.682632e+12,...,189539,0.0-11.0,ca_referring_expressions,ca_ref_exp_prolific_block_tower_topup,zoe,2537-81644a48-c1c9-4b72-b4be-111ae2fd4c6e,,trial_end,0208-b497c845-b076-45c9-b958-a62a3ae6e65f,3.0


In [35]:
# Write the merged annotations into csv_dir (the results/csv directory set up, and created
# if missing, in the directory cell) instead of repeating a hard-coded relative path.
df_ref_exp_full.to_csv(os.path.join(csv_dir, 'ref_exp_annotations_2023.csv'))