# Analyzing annotations of linguistic data from paired building experiment

We ran two separate annotation studies, each with two naive participants from the lab.

In [None]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("../../")
sys.path.append("../../utils")
sys.path.append("../../analysis/utils")


import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# import drawing_utils as drawing
import importlib
import scoring

In [None]:
## directory & file hierarchy
# All paths are resolved relative to the notebook's working directory,
# which is expected to sit two levels below the project root.
proj_dir = os.path.abspath('../..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir =  os.path.abspath('../../')  # resolves to the same location as proj_dir
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'behavioral_experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))

## add helpers to python path
stimuli_dir = os.path.join(proj_dir,'stimuli')
if stimuli_dir not in sys.path:
    sys.path.append(stimuli_dir)

## make sure the output directories exist
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
# (json_dir, exp_dir and png_dir are only defined here, not created.)
for output_dir in (results_dir, plot_dir, csv_dir):
    os.makedirs(output_dir, exist_ok=True)

### load annotations and wrangle to match

In [3]:
# first set of annotations (cogsci 2021)
# wide format: the annotation columns used below carry a per-annotator
# suffix (`_justin` / `_julia`), with a few misspelled headers that are
# corrected when df_jj_small is built
df_jj = pd.read_csv('{}/results/csv/JJ_content.csv'.format(analysis_dir))

# second set of annotations (2023)
# annotator is identified by a `workerID` column rather than by column suffix
df_zc = pd.read_csv('{}/results/csv/ref_exp_annotations_2023.csv'.format(analysis_dir))

# chat-message table; used below to look up `message_id` and `trialNum`
# for the annotated messages
df_chat = pd.read_csv('{}/results/csv/df_chat_ids_cogsci21.csv'.format(analysis_dir))

In [4]:
# Mirror df_chat's camelCase key columns under the snake_case names that the
# annotation tables are merged on further down.
for snake_name, camel_name in [('dyad_gameid', 'gameid'),
                               ('turn_num', 'turnNum'),
                               ('trial_num', 'trialNum')]:
    df_chat.loc[:, snake_name] = df_chat[camel_name]

In [5]:
# message index within a trial: half the turn number
# (in the displayed rows turn numbers are even: 0, 2, 4, ...)
df_jj.loc[:, 'message_num'] = df_jj['turnNum'].div(2).astype(int)

# columns to keep from the raw 2021 annotation sheet (original, partly misspelled, headers)
jj_source_cols = [
    'gameid', 'trialNum', 'message_num', 'turnNum', 'message',
    'block_justin', 'toer_justin', 'scene_justin', 'Flagged', 'phrases_justin',
    'block_julia', 'tower_juli', 'scene_juli', 'phrases_julia',
]

# harmonise key names with df_chat and repair the misspelled annotator columns
jj_renames = {
    'gameid': 'dyad_gameid',
    'trialNum': 'trial_num',
    'turnNum': 'turn_num',
    'toer_justin': 'tower_justin',
    'tower_juli': 'tower_julia',
    'scene_juli': 'scene_julia',
}

# keys shared with df_chat, used to attach each message's message_id
chat_merge_keys = ['dyad_gameid', 'trial_num', 'turn_num']

df_jj_small = (
    df_jj[jj_source_cols]
    .rename(columns=jj_renames)
    .merge(df_chat[chat_merge_keys + ['message_id']], on=chat_merge_keys, how='left')
    .sort_values(['dyad_gameid', 'trial_num', 'message_num'])
    .reset_index(drop=True)
)

df_jj_small

Unnamed: 0,dyad_gameid,trial_num,message_num,turn_num,message,block_justin,tower_justin,scene_justin,Flagged,phrases_justin,block_julia,tower_julia,scene_julia,phrases_julia,message_id
0,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,0,0,two blue blocks on left and right side of left...,2,0,0,,two blue blocks,2.0,0,0.0,two blue,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3
1,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,1,2,two red block on top of left blue block just p...,2,0,0,,two red block,2.0,0,0.0,two red,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241
2,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,2,4,two red block one space apart on right line,2,0,0,,two red block,2.0,0,0.0,two red,d39dd7ca-1fde-56c1-9d41-a10b16b3d17d
3,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,3,6,one blue block on top red block,1,0,0,,one blue,1.0,0,0.0,blue,9f118a42-9cd3-a795-5edd-9aae7f104dcf
4,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,4,8,"on left red line, on more red block on top of ...",1,0,0,,red block,1.0,0,0.0,red,dc786e74-3ae7-ab54-169e-98cea7713b3c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2138,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,1,2,same on the opposite side,2,0,0,1.0,same,0.0,1,0.0,same,ae9e8d03-8498-3111-371a-239070c742d0
2139,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,2,4,close it up with two blues,2,0,0,,two blues,2.0,0,0.0,two blues,24b27e83-01cb-d9c2-d35f-581e3648431f
2140,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,3,6,take a blue and place it to the right of right...,1,0,0,,a blue,1.0,0,0.0,blue,17668e8a-78ca-810e-7428-63563c91e458
2141,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,4,8,another blue placed next to it on the left side,1,0,0,,another blue,1.0,0,0.0,blue,a622053d-d5ff-7894-46ac-32019ab1874e


In [74]:
# melt and pivot: go from one row per message with per-annotator columns
# (e.g. block_justin, block_julia) to one row per message *per annotator*

annotation_id_cols = ['dyad_gameid', 'trial_num', 'turn_num', 'message_id']

# annotation columns are the ones tagged with an annotator suffix
suffix_columns = [col for col in df_jj_small.columns if col.endswith(('_justin', '_julia'))]

suffix_df = df_jj_small[annotation_id_cols + suffix_columns]

# long format: one row per (message, annotation column)
long_df = suffix_df.melt(id_vars=annotation_id_cols, var_name='Type', value_name='Value')

# split e.g. 'block_justin' into Category='block' and Suffix='justin';
# assigning exactly two column labels fails loudly if a name has more than one '_'
type_parts = long_df['Type'].str.split('_', expand=True)
type_parts.columns = ['Category', 'Suffix']

# 'Type' is redundant once it has been split
melted_df = long_df.drop(columns=['Type']).join(type_parts)

# wide again, but only over annotation category; the annotator stays in the rows
pivoted_df = (
    melted_df
    .pivot(index=annotation_id_cols + ['Suffix'], columns='Category', values='Value')
    .reset_index()
    .rename(columns={'Suffix': 'workerID'})
)


In [75]:
# adjust a bad annotation: a tower entry recorded as the letter 'L' is recoded as 1
tower_is_letter_l = pivoted_df['tower'] == 'L'
pivoted_df.loc[tower_is_letter_l, 'tower'] = 1

In [87]:
# convert to ints
# Missing annotations are treated as a count of 0.
# Assign with df[col] = ... rather than df.loc[:, col] = ...: since pandas 2.0
# the .loc form writes into the existing (object-dtype) column in place, so the
# column would keep dtype=object and the integer cast would not take effect.
for count_col in ['block', 'tower', 'scene']:
    pivoted_df[count_col] = pivoted_df[count_col].fillna(0).astype(int)

In [88]:
# inspect the reshaped table: one row per message per annotator (workerID)
pivoted_df

Category,dyad_gameid,trial_num,turn_num,message_id,workerID,block,phrases,scene,tower
0,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,0,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3,julia,2,two blue,0,0
1,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,0,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3,justin,2,two blue blocks,0,0
2,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,2,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241,julia,2,two red,0,0
3,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,2,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241,justin,2,two red block,0,0
4,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,4,d39dd7ca-1fde-56c1-9d41-a10b16b3d17d,julia,2,two red,0,0
...,...,...,...,...,...,...,...,...,...
4281,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,6,17668e8a-78ca-810e-7428-63563c91e458,justin,1,a blue,0,0
4282,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,8,a622053d-d5ff-7894-46ac-32019ab1874e,julia,1,blue,0,0
4283,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,8,a622053d-d5ff-7894-46ac-32019ab1874e,justin,1,another blue,0,0
4284,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,10,bcee7fb1-543c-41b5-6466-18914a3eacf0,julia,2,two reds,0,0


In [89]:
# merge in metadata: the pivot kept only ids and annotations, so re-attach
# each message's text and its index within the trial
message_metadata = df_jj_small[['dyad_gameid', 'message_id', 'message_num', 'message']]

pivoted_df_merged = pd.merge(
    pivoted_df,
    message_metadata,
    how='left',
    on=['message_id', 'dyad_gameid'],
)

pivoted_df_merged

Unnamed: 0,dyad_gameid,trial_num,turn_num,message_id,workerID,block,phrases,scene,tower,message_num,message
0,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,0,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3,julia,2,two blue,0,0,0,two blue blocks on left and right side of left...
1,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,0,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3,justin,2,two blue blocks,0,0,0,two blue blocks on left and right side of left...
2,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,2,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241,julia,2,two red,0,0,1,two red block on top of left blue block just p...
3,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,2,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241,justin,2,two red block,0,0,1,two red block on top of left blue block just p...
4,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,4,d39dd7ca-1fde-56c1-9d41-a10b16b3d17d,julia,2,two red,0,0,2,two red block one space apart on right line
...,...,...,...,...,...,...,...,...,...,...,...
4281,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,6,17668e8a-78ca-810e-7428-63563c91e458,justin,1,a blue,0,0,3,take a blue and place it to the right of right...
4282,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,8,a622053d-d5ff-7894-46ac-32019ab1874e,julia,1,blue,0,0,4,another blue placed next to it on the left side
4283,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,8,a622053d-d5ff-7894-46ac-32019ab1874e,justin,1,another blue,0,0,4,another blue placed next to it on the left side
4284,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11,10,bcee7fb1-543c-41b5-6466-18914a3eacf0,julia,2,two reds,0,0,5,stack two reds on the left edge


In [90]:
# columns to keep from the 2023 annotation sheet
zc_source_cols = ['workerID', 'message_id', 'dyad_gameid', 'msgNum',
                  'message', 'block', 'tower', 'refExps']

# Bring the 2023 annotations into the same key layout as the 2021 ones:
# look up the trial number via message_id, use snake_case names, and derive
# turn_num as twice the message number (inverse of the 2021 message_num step).
# NOTE(review): trial_num displays as float (0.0, 11.0) after this step,
# presumably because the left merge leaves NaN for unmatched message_ids — TODO confirm.
df_zc_small = (
    df_zc[zc_source_cols]
    .merge(df_chat[['message_id', 'trialNum']], how='left', on='message_id')
    .rename(columns={'trialNum': 'trial_num', 'msgNum': 'message_num'})
    .assign(turn_num=lambda frame: (frame.message_num*2).astype(int))
    .sort_values(['workerID', 'dyad_gameid', 'trial_num', 'message_num'])
    .reset_index(drop=True)
)


df_zc_small

Unnamed: 0,workerID,message_id,dyad_gameid,message_num,message,block,tower,refExps,trial_num,turn_num
0,charles,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0,two blue blocks on left and right side of left...,2,0,"two blue block,",0.0,0
1,charles,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241,0110-5784fec9-109a-4d7a-a343-4820f4d42144,1,two red block on top of left blue block just p...,2,0,"two red block,",0.0,2
2,charles,d39dd7ca-1fde-56c1-9d41-a10b16b3d17d,0110-5784fec9-109a-4d7a-a343-4820f4d42144,2,two red block one space apart on right line,2,0,"two red block,",0.0,4
3,charles,9f118a42-9cd3-a795-5edd-9aae7f104dcf,0110-5784fec9-109a-4d7a-a343-4820f4d42144,3,one blue block on top red block,1,0,"one blue block,",0.0,6
4,charles,dc786e74-3ae7-ab54-169e-98cea7713b3c,0110-5784fec9-109a-4d7a-a343-4820f4d42144,4,"on left red line, on more red block on top of ...",1,0,"red block,",0.0,8
...,...,...,...,...,...,...,...,...,...,...
4281,zoe,ae9e8d03-8498-3111-371a-239070c742d0,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,1,same on the opposite side,1,0,"same,",11.0,2
4282,zoe,24b27e83-01cb-d9c2-d35f-581e3648431f,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,2,close it up with two blues,2,0,"two blues,",11.0,4
4283,zoe,17668e8a-78ca-810e-7428-63563c91e458,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,3,take a blue and place it to the right of right...,1,0,"blue,",11.0,6
4284,zoe,a622053d-d5ff-7894-46ac-32019ab1874e,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,4,another blue placed next to it on the left side,1,0,"blue,",11.0,8


In [91]:
# Stack the 2021 and 2023 annotations into one long table with a fresh 0..n-1 index.
# Columns present in only one source are NaN for the other source's rows:
# `phrases` and `scene` come from the 2021 set only, `refExps` from the 2023 set only.
# NOTE(review): `phrases` and `refExps` look like the same kind of field under
# two names — confirm whether they should be merged into one column.
df_ref_exps = pd.concat([pivoted_df_merged, df_zc_small], ignore_index=True)

In [92]:
# Save the combined annotations alongside the input CSVs.
# The row index is written too (to_csv default), so it comes back as an
# unnamed first column when the file is re-read with pd.read_csv.
df_ref_exps.to_csv('{}/results/csv/df_ref_exps.csv'.format(analysis_dir))

In [93]:
# inspect the combined table (all four annotators, both annotation rounds)
df_ref_exps

Unnamed: 0,dyad_gameid,trial_num,turn_num,message_id,workerID,block,phrases,scene,tower,message_num,message,refExps
0,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0.0,0,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3,julia,2,two blue,0.0,0,0,two blue blocks on left and right side of left...,
1,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0.0,0,e5b3a67b-aa0e-563e-efe1-633cb87c6dd3,justin,2,two blue blocks,0.0,0,0,two blue blocks on left and right side of left...,
2,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0.0,2,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241,julia,2,two red,0.0,0,1,two red block on top of left blue block just p...,
3,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0.0,2,abbd2fe0-e095-bbfd-4c26-52d8b6eaf241,justin,2,two red block,0.0,0,1,two red block on top of left blue block just p...,
4,0110-5784fec9-109a-4d7a-a343-4820f4d42144,0.0,4,d39dd7ca-1fde-56c1-9d41-a10b16b3d17d,julia,2,two red,0.0,0,2,two red block one space apart on right line,
...,...,...,...,...,...,...,...,...,...,...,...,...
8567,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11.0,2,ae9e8d03-8498-3111-371a-239070c742d0,zoe,1,,,0,1,same on the opposite side,"same,"
8568,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11.0,4,24b27e83-01cb-d9c2-d35f-581e3648431f,zoe,2,,,0,2,close it up with two blues,"two blues,"
8569,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11.0,6,17668e8a-78ca-810e-7428-63563c91e458,zoe,1,,,0,3,take a blue and place it to the right of right...,"blue,"
8570,9961-1bd92164-cbe7-4841-8225-7de8486bf84a,11.0,8,a622053d-d5ff-7894-46ac-32019ab1874e,zoe,1,,,0,4,another blue placed next to it on the left side,"blue,"


In [None]:
# NOTE(review): this looks like an unfinished exploratory cell (no execution count or output).
# Without an `index=` argument, pivot uses the existing row index; df_ref_exps was
# built with ignore_index=True, so every row label is unique and each output row
# would presumably be populated for a single worker only. A side-by-side comparison
# of annotators likely needs an explicit index (e.g. message_id) — confirm intent.
df_ref_exps.pivot(columns='workerID',)