In [1]:
from __future__ import division
import os, sys

import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
from io import BytesIO
from PIL import Image
from skimage import io, img_as_float
import base64
from collections import Counter

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

from IPython.display import clear_output
import importlib

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

### set up paths, etc.

In [2]:
# directory & file hierarchy
proj_dir = os.path.abspath('..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))

## add helpers to python path
if os.path.join(proj_dir,'utils') not in sys.path:
    sys.path.append(os.path.join(proj_dir,'utils'))

### load in raw causaldraw annotation group dataframe

In [3]:
## load in annotated stroke data
A1 = pd.read_csv(os.path.join(csv_dir,'causaldraw_annotation_main.csv'))
A2 = pd.read_csv(os.path.join(csv_dir,'causaldraw_annotation_patch2.csv'))
A3 = pd.read_csv(os.path.join(csv_dir,'causaldraw_annotation_patch3.csv'))

A = pd.concat([A1, A2, A3])

## deborkify
if 'Unnamed: 0' in A.columns:
    A.drop(labels=['Unnamed: 0'], inplace=True, axis=1)
    
## add special column that mashes up label and stroke type
A = A.assign(label_type = A.apply(lambda x: '{}_{}_{}'.format(x['roi_labelName'], x['strokeType'], x['roi_labelNum']), axis=1))

**Notes on metadata:** 
1. "gameID" here refers to the annotator. 
2. We collected 3 annotations per sketch. So there should be exactly 3 unique gameIDs for every sketchID

**Types of stroke**
1. causal
2. functional
3. background
4. symbol
5. short
6. unintelligible
7. light 
**Important**: Types 5, 6, and 7 are NOT contained in the CSV "causaldraw_annotation_raw.csv." We are therefore just ignoring these strokes' semantics as not worthwhile to analyze right now.

**What we want:**
A dataframe where each row is a unique stroke, where each stroke has the most commonly assigned roi_labelname 

In [4]:
### check how many annotations we have for each sketch -- which have not been annotated 3 times?
exceptions1 = []
exceptions2 = []
for name, group in A.groupby('sketchID'):
    try:
        assert len(group['gameID'].unique()) >= 3
    except:
        print('Exception: We have {} annotations for sketch {}'.format(len(group['gameID'].unique()), name))

Exception: We have 2 annotations for sketch gears_1.1073-31f69960-6dfc-4e45-9a3c-6d7eaa33103e
Exception: We have 2 annotations for sketch gears_1.1887-0675d919-20c4-4940-9610-d294d43c9bc1
Exception: We have 2 annotations for sketch gears_1.2659-dd1b5c39-13c7-4a69-836f-03279f53939d
Exception: We have 2 annotations for sketch gears_1.2803-ce4e1ad2-c29d-4097-9d03-c5097c1e103e
Exception: We have 2 annotations for sketch gears_1.4439-0eca0566-7cd9-4da5-b710-085e65a285e5
Exception: We have 2 annotations for sketch gears_1.5369-de50eae9-8b3d-44d0-89c1-b279c4fea591
Exception: We have 2 annotations for sketch gears_2.5369-de50eae9-8b3d-44d0-89c1-b279c4fea591
Exception: We have 2 annotations for sketch levers_1.0366-5aa209bd-fda5-4afd-b896-9371d73ab1df
Exception: We have 2 annotations for sketch levers_1.0524-70721cbe-8f3d-4e95-ad78-7b70a6f7c538
Exception: We have 2 annotations for sketch levers_1.0641-f65a5813-73f1-4a74-8400-1df917c1e4d7
Exception: We have 2 annotations for sketch levers_1.1073

In [5]:
print("Note to self: pulleys_2.7376-6f5f07d7-86c3-4c4c-a052-3634d7755b9d only consists of 1 stroke, so we'd exclude this sketch anyways")

Note to self: pulleys_2.7376-6f5f07d7-86c3-4c4c-a052-3634d7755b9d only consists of 1 stroke, so we'd exclude this sketch anyways


In [6]:
## for each stroke in each sketch, compare annotations provided

## check inter-rater reliability
num_unique_labels = []
for name, group in A.groupby(['sketchID','strokeIndex','condition', 'orig_gameID']):
    num_unique_labels.append(len(group['label_type'].unique()))
print('How many unique labels for each stroke?')
print(Counter(num_unique_labels))

How many unique labels for each stroke?
Counter({1: 3486, 2: 1791, 3: 460, 4: 165, 5: 17, 6: 1, 7: 1})


### Stroke level

In [7]:
## get label counts for each stroke
B = A.groupby(['sketchID','strokeIndex','condition', 'orig_gameID'])['label_type'].value_counts().reset_index(name='label_counts')

## get the most commonly assigned roi_labelname 
C = B.groupby(['sketchID','strokeIndex','condition', 'orig_gameID']).apply(lambda x: x[x['label_counts']==x.label_counts.max()]['label_type'])

## just pull out these most common label names
D = C.reset_index(drop=False).drop(labels='level_4',axis=1)

## separate out label_type information into two columns
D = D.assign(strokeLabel=D['label_type'].apply(lambda x: x.split('_')[0]))
D = D.assign(strokeType=D['label_type'].apply(lambda x: x.split('_')[1]))
D = D.assign(strokeRoiNum=D['label_type'].apply(lambda x: x.split('_')[2]))
D.drop(labels='label_type',inplace=True, axis=1)
# D

In [9]:
np.random.seed(0)
E = D.iloc[np.random.permutation(np.arange(len(D)))]
Z = E.duplicated(subset=["sketchID", "strokeIndex"], keep="first")

Z = E[Z]
Z = Z.sort_values(["sketchID", "strokeIndex"])

# E = E.drop_duplicates(subset=["sketchID", "strokeIndex"], keep="first", ignore_index=True)
E = E.drop_duplicates(subset=["sketchID", "strokeIndex"], keep=False, ignore_index=True)

E = E.sort_values(["sketchID", "strokeIndex"])
F = E[(E["strokeLabel"] != "unintelligible") & (E["strokeLabel"] != "light") & (E["strokeLabel"] != "symbols")]
F = F.reset_index(drop=True)
F

Unnamed: 0,sketchID,strokeIndex,condition,orig_gameID,strokeLabel,strokeType,strokeRoiNum
0,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0,explanatory,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gear,causal,4.0
1,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,1,explanatory,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gear,causal,3.0
2,gears_1.0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,0,depictive,0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,background,background,5.0
3,gears_1.0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,1,depictive,0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,gear,functional,1.0
4,gears_1.0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,2,depictive,0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,gear,functional,1.0
...,...,...,...,...,...,...,...
4436,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,8,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,wheel,functional,3.0
4437,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,9,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,string,functional,6.0
4438,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,10,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,wheel,causal,1.0
4439,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,11,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,wheel,causal,2.0


In [9]:
Y = E[(E["strokeLabel"] != "unintelligible") & (E["strokeLabel"] != "light")]
Y = Y.reset_index(drop=True)
Y["stimuli"] = Y["sketchID"].apply(lambda x: x.split(".")[0])
Y = Y[["sketchID", "orig_gameID", "stimuli", "condition", "strokeIndex", "strokeLabel", "strokeType", "strokeRoiNum"]]
Y.to_csv(os.path.join(csv_dir,'causaldraw_annotation_final_stoke_analysis_drop.csv'))

## note about svg data
for unknown reasons, the causaldraw_annotation dataframes contained inconsistent svg data

problems: 
1. some dataframes had columns containing the entire "svgArray" which were all the svg's in the sketch, but others did not; while some dataframes had columns containing a single "svg" 
2. however, the "svg" column did not correspond to the correct stroke index (e.g., the first svg was listed for the first, second, third, etc. stroke index)

therefore, the compiled causaldraw_annotation dataframe was <b><u>missing</u></b> svg data

#### to remedy this, Holly: 
1. checked that the strokeIndex of causaldraw_annotation corresponded to the strokeIndex (also called "currStrokenNum") in the original causaldraw experiment. In other words, Holly opened the causaldraw_annotation experiment and checked that the strokes were displayed in the order in which they were originally drawn by checking whether the arcLength matched the strokeIndex between the two experiments
2. merged the svg from causaldraw on the columns: sketchID and strokeIndex/currStrokeNum and arcLength (arcLength is probably not necessary but we're doing this to be extra safe)

In [10]:
## load in original causaldraw data
# orig_svgs = pd.read_csv('/Users/hollyhuey/causaldraw/original/results/csv/run1/causaldraw_processed_stroke_data.csv')
orig_svgs = pd.read_csv('/home/xul076/causaldraw/original/results/csv/run1/causaldraw_processed_stroke_data.csv')
orig_svgs.head()

Unnamed: 0,_id,aID,arcLength,condition,currStrokeNum,demo_dur,demo_seq,dev_mode,endStrokeTime,eventType,...,startStrokeTime,stim_url,svg,time,toy_type,toy_variant,trialNum,type,version,workerId
0,5e8fbab434c2a86d15ef03e1,3O6CYIULEH1LVF5ZZSVQQ69KA4EUW4,60.943575,explanatory,1,30,BAAB,False,1586477747178,stroke,...,1586477742567,https://causaldraw.s3.amazonaws.com/levers_1_B...,"M294,466c4.54694,0 26.69168,-2.30832 17,-12c-1...",1586477747189,levers,1,0,jspsych-cued-drawing,12,A36GTTPW17HUL1
1,5e8fbabc34c2a86d15ef03e3,3O6CYIULEH1LVF5ZZSVQQ69KA4EUW4,122.379761,explanatory,2,30,BAAB,False,1586477755346,stroke,...,1586477748527,https://causaldraw.s3.amazonaws.com/levers_1_B...,"M310,449c-3.15764,0 -1.59033,-5.59033 -3,-7c-1...",1586477755374,levers,1,0,jspsych-cued-drawing,12,A36GTTPW17HUL1
2,5e8fbabe34c2a86d15ef03e5,3O6CYIULEH1LVF5ZZSVQQ69KA4EUW4,14.605667,explanatory,3,30,BAAB,False,1586477757348,stroke,...,1586477756379,https://causaldraw.s3.amazonaws.com/levers_1_B...,"M292,457c0,-2.29735 11.09131,-2 14,-2",1586477757349,levers,1,0,jspsych-cued-drawing,12,A36GTTPW17HUL1
3,5e8fbabf34c2a86d15ef03e6,3O6CYIULEH1LVF5ZZSVQQ69KA4EUW4,9.386271,explanatory,4,30,BAAB,False,1586477758465,stroke,...,1586477758087,https://causaldraw.s3.amazonaws.com/levers_1_B...,"M299,458c2.85648,0 9.79223,-2 9,-2",1586477758466,levers,1,0,jspsych-cued-drawing,12,A36GTTPW17HUL1
4,5e8fbac434c2a86d15ef03e9,3O6CYIULEH1LVF5ZZSVQQ69KA4EUW4,58.124709,explanatory,5,30,BAAB,False,1586477763044,stroke,...,1586477760797,https://causaldraw.s3.amazonaws.com/levers_1_B...,"M302,446c0,-4.29851 -5.36524,-19.63476 -2,-23c...",1586477763050,levers,1,0,jspsych-cued-drawing,12,A36GTTPW17HUL1


In [11]:
## preprocessing of col names to match annotation data

# make sketchID
orig_svgs['toy_id'] = orig_svgs['toy_type'] + '_' + orig_svgs['toy_variant'].astype(str)
orig_svgs['sketchID'] = orig_svgs['toy_id'] + '.' + orig_svgs['gameID']

# make strokeIndex (since causaldraw_annotation has a strokeIndex with "0 indexing")
orig_svgs['strokeIndex'] = orig_svgs['currStrokeNum'] - 1

In [12]:
## merge svg data from causaldraw with annotation data from causaldraw_annotations
combo = pd.merge(F, orig_svgs[['svg', 'arcLength', 'sketchID', 'strokeIndex']], on=['sketchID', 'strokeIndex'], how='left')
combo

Unnamed: 0,sketchID,strokeIndex,condition,orig_gameID,strokeLabel,strokeType,strokeRoiNum,svg,arcLength
0,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0,explanatory,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gear,causal,4.0,"M315,266c0,3.66667 0,7.33333 0,11c0,3 0,6 0,9c...",937.999262
1,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,1,explanatory,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gear,causal,3.0,"M253,190c16.21414,0 17.30774,1.30774 28,12c2.5...",1450.279985
2,gears_1.0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,0,depictive,0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,background,background,5.0,"M176,245c0,-11.78835 -0.1136,-106.8864 0,-107c...",671.450565
3,gears_1.0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,1,depictive,0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,gear,functional,1.0,"M209,167c-11.32421,0 -26.73795,-4.78615 -31,8c...",252.747348
4,gears_1.0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,2,depictive,0366-5aa209bd-fda5-4afd-b896-9371d73ab1df,gear,functional,1.0,"M201,198c0,0.23509 -3.72683,25.72683 8,14c2.31...",55.661950
...,...,...,...,...,...,...,...,...,...
4492,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,8,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,wheel,functional,3.0,"M386,81c-0.66667,0.33333 -1.4274,0.52283 -2,1c...",120.559612
4493,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,9,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,string,functional,6.0,"M386,111c0,-5 0,10 0,15c0,10 0,20 0,30c0,19 0,...",152.076427
4494,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,10,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,wheel,causal,1.0,"M258,22c-5.3108,-5.3108 -13.60541,31.19729 -8,...",153.588471
4495,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,11,depictive,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,wheel,causal,2.0,"M263,38c4.67856,0 1.75173,9.38223 1,14c-1.7544...",376.025906


In [13]:
combo = combo.dropna(subset=["svg"])

In [14]:
## check that all sketches are still in this preprocessed dataframe
print('preprocessed: {}'.format(combo['sketchID'].nunique()))
print('original: {}'.format(E['sketchID'].nunique()))

X = F[~F.sketchID.isin(combo.sketchID)]
print('any differences? {}'.format(X['sketchID'].unique()))

preprocessed: 289
original: 300
any differences? []


In [15]:
## save out CSV
# combo.to_csv(os.path.join(csv_dir,'causaldraw_annotation_preprocessed_svg_data.csv'))
combo.to_csv(os.path.join(csv_dir,'causaldraw_annotation_preprocessed_final_svg_data.csv'))

In [16]:
# duplicates = pd.merge(Z, orig_svgs[['svg', 'arcLength', 'sketchID', 'strokeIndex']], on=['sketchID', 'strokeIndex'], how='left')
# duplicates = duplicates.dropna(subset=["svg"])

In [17]:
# duplicates.to_csv(os.path.join(csv_dir,'causaldraw_annotation_preprocessed_duplicate_svg_data.csv'))