In [1]:
from __future__ import division
import os, sys

import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
from io import BytesIO
from PIL import Image
from skimage import io, img_as_float
import base64
from collections import Counter

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

from IPython.display import clear_output
import importlib

import utils

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

### set up paths, etc.

In [2]:
# project layout: every location below hangs off the parent of the current working directory
analysis_dir = os.getcwd()
proj_dir = os.path.abspath('..')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))
results_dir = os.path.join(proj_dir,'results')
csv_dir = os.path.join(results_dir,'csv')
plot_dir = os.path.join(results_dir,'plots')

## make the project's helper directory importable (appended at most once)
helpers_dir = os.path.join(proj_dir,'utils')
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

def make_dir_if_not_exists(dir_name):
    """Ensure that the directory `dir_name` exists and return its path.

    Missing parent directories are created as well. Calling this on a
    directory that already exists is a no-op.

    Parameters
    ----------
    dir_name : str
        Path of the directory to create.

    Returns
    -------
    str
        `dir_name`, unchanged, so the call can be used inline.

    Raises
    ------
    FileExistsError
        If `dir_name` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between check and create
    os.makedirs(dir_name, exist_ok=True)
    return dir_name

## make sure every output directory exists; `result` keeps the paths that were checked
result = list(map(make_dir_if_not_exists, (results_dir, plot_dir, csv_dir)))

### load in raw causaldraw annotation group dataframe

In [5]:
## location of the raw annotation CSV.
## The default is the original absolute path; set the CAUSALDRAW_ANNOTATION_CSV_DIR
## environment variable to read the file from a different directory.
## NOTE(review): this could probably be derived from proj_dir -- confirm the repo layout first
annotation_csv_dir = os.environ.get(
    'CAUSALDRAW_ANNOTATION_CSV_DIR',
    '/Users/hollyhuey/causaldraw_public2021/experiments/semantic_annotations/results/csv')
annotation_csv_name = 'causaldraw_annotation_final_stoke_analysis_random_sample.csv'
A = pd.read_csv(os.path.join(annotation_csv_dir, annotation_csv_name))

## deborkify: drop the stray 'Unnamed: 0' column if the CSV carries one
## (non-mutating, so re-running the cell is safe)
A = A.drop(columns=['Unnamed: 0'], errors='ignore')

## add special column that mashes up label and stroke type
## (built column-wise; produces the same '<label>_<type>' strings as a row-wise apply)
A = A.assign(label_type = ['{}_{}'.format(label, stroke_type)
                           for label, stroke_type in zip(A['strokeLabel'], A['strokeType'])])

## add special column that mashes up ROI label and ROI number
## (a missing ROI number is rendered as 'nan', e.g. 'symbols_nan')
A = A.assign(roi_labelName = ['{}_{}'.format(label, roi_num)
                              for label, roi_num in zip(A['strokeLabel'], A['strokeRoiNum'])])

### stroke level

In [6]:
# display the annotation dataframe, including the derived label_type and roi_labelName columns
A

Unnamed: 0,sketchID,orig_gameID,stimuli,condition,strokeIndex,strokeLabel,strokeType,strokeRoiNum,label_type,roi_labelName
0,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gears_1,explanatory,0,gear,causal,4.0,gear_causal,gear_4.0
1,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gears_1,explanatory,1,gear,causal,3.0,gear_causal,gear_3.0
2,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gears_1,explanatory,2,symbols,symbol,,symbols_symbol,symbols_nan
3,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gears_1,explanatory,3,symbols,symbol,,symbols_symbol,symbols_nan
4,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,gears_1,explanatory,4,symbols,symbol,,symbols_symbol,symbols_nan
...,...,...,...,...,...,...,...,...,...,...
5393,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,pulleys_2,depictive,8,wheel,functional,3.0,wheel_functional,wheel_3.0
5394,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,pulleys_2,depictive,9,string,functional,6.0,string_functional,string_6.0
5395,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,pulleys_2,depictive,10,wheel,causal,1.0,wheel_causal,wheel_1.0
5396,pulleys_2.9810-988306f1-19e0-4158-a007-261fc2d...,9810-988306f1-19e0-4158-a007-261fc2d3ae0b,pulleys_2,depictive,11,wheel,causal,2.0,wheel_causal,wheel_2.0


In [18]:
## columns that identify one stroke / ROI-label combination
stroke_keys = ['sketchID','strokeIndex','roi_labelName']

## get label counts for each stroke
B = A.groupby(stroke_keys)['label_type'].value_counts().reset_index(name='label_counts')

## keep the most commonly assigned label_type within each group (ties are all kept).
## A boolean mask against the per-group maximum avoids relying on the auto-generated
## 'level_3' index column that groupby.apply would produce.
is_most_common = B['label_counts'] == B.groupby(stroke_keys)['label_counts'].transform('max')
C = B.loc[is_most_common, stroke_keys + ['label_type']]

## just pull out these most common label names
D = C.reset_index(drop=True)

## separate out label_type information into two columns.
## Split on the LAST underscore only, so a stroke label that itself contains
## an underscore is kept intact instead of being truncated.
label_parts = D['label_type'].str.rsplit('_', n=1)
D = D.assign(strokeLabel=label_parts.str[0], strokeType=label_parts.str[-1])
D = D.drop(columns='label_type')

In [20]:
# preview the first rows of D: one most-common strokeLabel / strokeType per stroke and ROI label
D.head()

Unnamed: 0,sketchID,strokeIndex,roi_labelName,strokeLabel,strokeType
0,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,0,gear_4.0,gear,causal
1,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,1,gear_3.0,gear,causal
2,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,2,symbols_nan,symbols,symbol
3,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,3,symbols_nan,symbols,symbol
4,gears_1.0219-e77f751a-a934-4602-97a0-f2c0bd8bd638,4,symbols_nan,symbols,symbol


### sketch level

In [36]:
## stroke types that get their own count / proportion columns
stroke_types = ['background','causal','functional','symbol']
sketch_keys = ['sketchID', 'roi_labelName']

## get counts of each type of stroke
E = D.groupby(sketch_keys)['strokeType'].value_counts().reset_index(name='type_counts')

# pivot long to wide, so a column for each stroke type
F = E.pivot(index=sketch_keys, columns='strokeType', values='type_counts').reset_index()

## make sure every expected stroke type has a column, even if it never occurs in
## this sample (the pivot only creates columns for types that are present, and the
## totals below would otherwise fail with a KeyError)
for stroke_type in stroke_types:
    if stroke_type not in F.columns:
        F[stroke_type] = np.nan

## add total strokes column (missing counts are skipped, i.e. treated as zero)
F = F.assign(totalStrokes = F[stroke_types].sum(axis=1))

## replace NaNs with zeros
F = F.fillna(value=0)

## rename columns
F = F.rename({'background':'numBackground','causal':'numCausal','functional':'numFunctional',
          'symbol':'numSymbol'}, axis=1)

## add proportion columns (column-wise division instead of a row-wise apply)
F = F.assign(propCausal = F['numCausal'] / F['totalStrokes'],
             propFunctional = F['numFunctional'] / F['totalStrokes'],
             propSymbol = F['numSymbol'] / F['totalStrokes'],
             propBackground = F['numBackground'] / F['totalStrokes'])

## save out CSV (the row index is written too, as the first column)
F.to_csv(os.path.join(csv_dir,'causaldraw_annotation_data_random_sample.csv'))