In [1]:
import os, sys

import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
from io import BytesIO
from PIL import Image
from skimage import io, img_as_float
import base64

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

from IPython.display import clear_output
import importlib

import utils

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
# project layout, resolved relative to the directory this notebook is run from
analysis_dir = os.getcwd()                      # current working directory
proj_dir = os.path.abspath('..')                # parent of the working directory
results_dir = os.path.join(proj_dir, 'results')
csv_dir = os.path.join(results_dir, 'csv')
plot_dir = os.path.join(results_dir, 'plots')
exp_dir = os.path.abspath(os.path.join(proj_dir, 'experiments'))
# sketch_dir = os.path.abspath(os.path.join(proj_dir,analysis_dir,'sketches'))
# gallery_dir = os.path.abspath(os.path.join(proj_dir,analysis_dir,'gallery'))

## make the project's helper folder importable (appended only once)
if not any(p == os.path.join(proj_dir, 'utils') for p in sys.path):
    sys.path.append(os.path.join(proj_dir, 'utils'))

def make_dir_if_not_exists(dir_name):
    """Create `dir_name` (and any missing parent directories) and return it.

    Parameters
    ----------
    dir_name : str
        Path of the directory that should exist after the call.

    Returns
    -------
    str
        `dir_name`, unchanged, so the call can be used inline.

    Raises
    ------
    FileExistsError
        If `dir_name` exists but is not a directory.
    """
    # exist_ok=True avoids the race between checking for the directory and
    # creating it (e.g. two notebooks starting at the same time)
    os.makedirs(dir_name, exist_ok=True)
    return dir_name

## ensure the output folders (results, plots, csv) are present; keep the returned paths
result = list(map(make_dir_if_not_exists, [results_dir, plot_dir, csv_dir]))

## establish connection to mongo
The first thing you need to do is establish an SSH tunnel (local port forwarding) to the server, so that requests to MongoDB can be made "as if" the MongoDB server were running on your local computer. Run this from the command line before you begin data analysis if you plan to fetch data from mongo:

`ssh -fNL 27017:127.0.0.1:27017 hhuey@cogtoolslab.org`

In [17]:
# set vars
# auth.txt (next to this notebook) holds the password for the sketchloop mongo user.
# dtype=str keeps an all-digit password from being parsed as a number.
# NOTE(review): assumes auth.txt stores the raw password, not an already
# percent-escaped one — confirm before relying on the escaping below.
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None, dtype = str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'  # not used for the connection below, which targets 127.0.0.1

import pymongo as pm
import socket
from urllib.parse import quote_plus

# The connection targets 127.0.0.1:27017 regardless of which machine runs the
# notebook (the original hostname check had two identical branches).
# Credentials are percent-escaped so characters such as '@', ':' or '/' in the
# password cannot corrupt the connection URI.
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1:27017'.format(quote_plus(user), quote_plus(pswd)))
db = conn['causaldraw']
coll = db['annotations']

# which iteration name should we use?
iterationName = 'debugging'

In [30]:
## report the collection's (estimated) document count -- this covers every record, all iterations
print(
    'We have {} records in mongo.'.format(
        coll.estimated_document_count()
    )
)

We have 55519 records in mongo.


In [31]:
## grab me one of these records, plz
## this find one record from current iteration
# coll.find_one({'iterationName':iterationName, 'eventType': 'labels'})

In [32]:
## list every distinct iterationName stored in the collection (no filter applied)
coll.distinct('iterationName')

['testing',
 'testing1',
 'testing2',
 'testing3',
 'testing4',
 'testing5',
 'testing6',
 'testing7',
 'testing8',
 'testing9',
 'testing10',
 'testing11',
 'testing12',
 'pilot1',
 'testing13',
 'pilot2',
 'reference1',
 'reference2',
 'reference3',
 'reference4',
 'reference5',
 'reference6',
 'pilot3',
 'testing14',
 'pilot4',
 'testing15',
 'pilot5',
 'testing16',
 'pilot6',
 'pilot7',
 'pilot8',
 'pilot9',
 'pilot10',
 'debugging',
 'pilot11']

### Generate dataframes

In [7]:
# coll.find_one({'iterationName':iterationName})

In [41]:
## fetch the 'labels' events of the current iteration and load them into a dataframe
l = coll.find({'iterationName':iterationName, 'eventType':'labels'})
L = pd.DataFrame(list(l))

print(
    'We have {} unique label records in all {} of our games.'.format(
        len(L), L['gameID'].nunique()
    )
)

We have 1693 unique label records in all 13 of our games.


In [9]:
# L['rois']

In [34]:
## cursor over every record of the current iteration; intended for checking
## whether a given workerID has already done the task
allW = coll.find(dict(iterationName=iterationName))  # .distinct('wID')
# '<workerID>' in allW

In [35]:
# distinct game IDs present in the label records
L.gameID.unique()

array(['9588-613e2c7f-7a26-43cd-8b02-63022cc33942',
       '1595-53cbcd74-13da-4b1a-a561-161c307db510',
       '5332-2f8d9be6-4ae2-4f76-887c-4cc7380854f6',
       '8918-431dec0e-6639-4d0b-9028-9ec01d702a79',
       '6148-31eeb1cb-c788-4169-a2cb-b4aa8436e1d1',
       '1057-99fa7624-06c8-42ef-bebe-cf3e8dac6992',
       '4561-00e5866a-d8aa-4449-a4de-f6b4b2745f7e',
       '0756-532c5d7a-01c6-49a4-81a2-8b2d08ef046b',
       '1545-5845eab1-b6b7-462b-9a10-a033b33f0528',
       '8026-7241b0e6-5457-4778-becd-1d3b4e955cbb',
       '6059-e3a2e7a9-d06d-4ddb-8976-ceff441bf3af',
       '5688-6f0e6853-5aba-4463-9e29-52de39719287'], dtype=object)

In [42]:
## Copy the causal / functional flags of the matching ROI onto each label record.
## A ROI matches when both its name and its number equal the record's
## roi_labelName / roi_labelNum. Records without a match keep NaN in both columns.
failed_records = {}  # (gameID, trialNum) -> number of lookups that failed
for i, d in L.iterrows():
    record_key = (d['gameID'], d['trialNum'])

    ## parse the label number once per record instead of once per ROI
    try:
        label_num = int(d['roi_labelNum'])
    except (KeyError, TypeError, ValueError):
        ## e.g. a missing or non-numeric roi_labelNum
        failed_records[record_key] = failed_records.get(record_key, 0) + 1
        continue

    ## looping over the ROIs for this machine & populating the causal and functional columns
    for roi in d['rois']:
        try:
            if roi['roi_name'] == d['roi_labelName'] and roi['roi_num'] == label_num:
                L.loc[i, 'causal'] = roi['causal']
                L.loc[i, 'functional'] = roi['functional']
        except (KeyError, TypeError):
            ## malformed ROI entry: note it and keep going with the remaining ROIs
            failed_records[record_key] = failed_records.get(record_key, 0) + 1

## report problems once at the end so they are not wiped from the cell output
if failed_records:
    print('Something went wrong with {} record(s); first few (gameID, trialNum): {}'.format(
        len(failed_records), list(failed_records)[:5]))
print('Done!')

Done!


In [43]:
## labels that mark an annotation as a non-ROI
non_roi_labels = ['symbols', 'short', 'unintelligible']
is_non_roi = L['roi_labelName'].isin(non_roi_labels)

## non-ROI annotations are recorded as neither causal nor functional
## (assigned one column at a time so a column that does not exist yet is created)
for flag in ['causal', 'functional']:
    L.loc[is_non_roi, flag] = False

## symbol column: True only for annotations labeled 'symbols', False for everything else.
## (The original built this with a `!= a | != b | != c` mask, which is always True;
## deriving the column directly gives the same final values without that mask.)
L['symbol'] = L['roi_labelName'] == 'symbols'

In [44]:
# L = L.drop(columns=['strokes'])
# list(L.keys())

In [45]:
## fetch the 'survey' events of the current iteration and load them into a dataframe
w = coll.find({'iterationName':iterationName, 'eventType':'survey'})
W = pd.DataFrame(list(w))

In [46]:
# raw survey response strings, one per survey record
W.loc[:, 'responses']

0                               {"Q0":"testing@hh.edu"}
1                               {"Q0":"testing@hh.edu"}
2                                     {"Q0":"testing2"}
3                                     {"Q0":"testing3"}
4     {"participantSex":"Neither/Other/Do Not Wish T...
5     {"TechnicalDifficultiesFreeResp":"sfsfs","part...
6                                    {"Q0":"testingHH"}
7     {"participantSex":"Neither/Other/Do Not Wish T...
8     {"TechnicalDifficultiesFreeResp":"nope","parti...
9                                     {"Q0":"checking"}
10    {"participantSex":"Female","judgedDifficulty":...
11    {"TechnicalDifficultiesFreeResp":"nada","parti...
12                                         {"Q0":"loz"}
13    {"participantSex":"Male","judgedDifficulty":"2...
14    {"TechnicalDifficultiesFreeResp":"heya","parti...
15                                       {"Q0":"hello"}
16    {"participantSex":"Neither/Other/Do Not Wish T...
17    {"TechnicalDifficultiesFreeResp":"hello","

In [47]:
## write the label and survey dataframes to csv_dir, tagged with the iteration name
label_csv_name = 'causaldraw_annotations_label_data_{}.csv'.format(iterationName)
survey_csv_name = 'causaldraw_annotations_survey_data_{}.csv'.format(iterationName)
L.to_csv(os.path.join(csv_dir, label_csv_name), index=False)
W.to_csv(os.path.join(csv_dir, survey_csv_name), index=False)

## Basic Analyses

### What proportion of causal/non-causal and functional/non-functional elements are drawn?
Do sketchers selectively attend to more causally relevant information and thus, draw a higher proportion of causal elements? 

In [None]:
## placeholder columns on L: they hold '' in every row, so the groupby counts
## below count every row of each group
L['causal_count'] = ''
L['functional_count'] = ''

# number of label records per (condition, functional) and per (condition, causal)
df2 = (L.groupby(['condition','functional'])['functional_count']
        .count()
        .reset_index())
# NOTE(review): the functional columns are attached by row index, so they line up
# with the causal rows only if both groupbys yield the same number of rows in the
# same order -- confirm this holds for the data at hand.
df = (L.groupby(['condition','causal'])['causal_count']
       .count()
       .reset_index()
       .assign(functional_count=df2['functional_count'],
               functional=df2['functional']))

df

In [None]:
# one row per condition, with the causal counts spread into False / True columns
c = (df.pivot(index='condition', columns='causal', values='causal_count')
       .reset_index())
c['sum'] = c[True] + c[False]
# share of records in each condition whose causal flag is True
c['propTrue'] = c[True].div(c['sum'])
c

In [None]:
## bar chart of propTrue (share of causal-labeled strokes) for each condition
g = sns.barplot(x='condition',
                y='propTrue',
                data=c,
                palette=sns.color_palette(['#9B9B9B', '#FF5720']))
g.set_xticklabels(["depiction", "explanation"])

# results are assigned to l / m so the returned Text objects are not echoed
l = g.set_ylabel('Prop')
m = g.set_title('Proportion of strokes labeled as causal')

plt.tight_layout()

In [None]:
## bar chart of the raw number of causal-labeled strokes (the True column of c) per condition
g = sns.barplot(x='condition',
                y=True,
                data=c,
                palette=sns.color_palette(['#9B9B9B', '#FF5720']))
g.set_xticklabels(["depiction", "explanation"])

# results are assigned to l / m so the returned Text objects are not echoed
l = g.set_ylabel('N Strokes')
m = g.set_title('Strokes labeled as causal (raw count)')

plt.tight_layout()

In [None]:
## plot proportion of functional to non-functional annotations by condition

In [None]:
# one row per condition, with the functional counts spread into False / True columns
f = (df.pivot(index='condition', columns='functional', values='functional_count')
       .reset_index())
f['sum'] = f[True] + f[False]
# share of records in each condition whose functional flag is True
f['propTrue'] = f[True].div(f['sum'])
f

In [None]:
## bar chart of propTrue (share of functional-labeled strokes) for each condition
g = sns.barplot(x='condition',
                y='propTrue',
                data=f,
                palette=sns.color_palette(['#9B9B9B', '#FF5720']))
g.set_xticklabels(["depiction", "explanation"])

# results are assigned to l / m so the returned Text objects are not echoed
l = g.set_ylabel('Prop')
m = g.set_title('Proportion of strokes labeled as functional')

plt.tight_layout()

In [None]:
## raw count of functional to non-functional annotations by condition
## (reads from f, the functional pivot table; the original plotted c, the causal
## table, under a "functional" title -- a copy-paste slip)
g = sns.barplot(data=f,x='condition',
            y=True, 
            palette=sns.color_palette(['#9B9B9B', '#FF5720'])
            )
g.set(xticklabels=["depiction", "explanation"])

# results are assigned to l / m so the returned Text objects are not echoed
l = plt.ylabel('N Strokes')
m = plt.title('Strokes labeled as functional (raw count)')

plt.tight_layout()

### Do sketchers allocate more ink to causal elements?

In [None]:
## bar chart of arcLength per condition, with separate bars for causal vs. non-causal strokes
g = sns.barplot(x='condition',
                y='arcLength',
                hue='causal',
                data=L,
                palette=sns.color_palette(['#9B9B9B', '#FF5720']))
g.set_xticklabels(["depictive", "explanation"])
# anchor the legend's upper-left corner at the axes' upper-right corner
g.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5)

# results are assigned to l / m so the returned Text objects are not echoed
l = g.set_ylabel('arcLength')
m = g.set_title('length of strokes by condition')

plt.tight_layout()

### Do sketchers use more symbols in the explanatory condition?

In [None]:
# placeholder column on L ('' in every row) so the groupby count below counts every row
L['count'] = ''
# number of label records per game, sketch, condition and label name
sym = (L.groupby(['gameID','sketchID','condition','roi_labelName'])['count']
        .count()
        .reset_index())
sym

In [None]:
# per-sketch label counts by condition, one bar colour per roi_labelName
g = sns.barplot(x='condition',
                y='count',
                hue='roi_labelName',
                data=sym)

# anchor the legend's upper-left corner at the axes' upper-right corner
g.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5)


## Saving messy code below

In [None]:
# column names of the label dataframe
list(L.columns)

In [None]:
# open question: count within gameID and sketchID, or only within condition?
# this variant counts True / False flags per game, sketch and condition;
# it relies on the causal_count / functional_count columns already being present on L
df2 = (L.groupby(['gameID','sketchID','condition','functional'])['functional_count']
        .count()
        .reset_index())
# NOTE(review): functional_count is attached by row index, so it pairs with the
# causal rows only if both groupbys produce matching rows -- confirm.
df = (L.groupby(['gameID','sketchID','condition','causal'])['causal_count']
       .count()
       .reset_index()
       .assign(functional_count=df2['functional_count']))

df

In [None]:
# import seaborn as sns

# scratch example: rename the legend title and entries of a seaborn lmplot
tips = sns.load_dataset("tips")
g = sns.lmplot(data=tips, x="total_bill", y="tip",
               hue="smoker", markers=["o", "x"])

# replacement legend text
new_title = 'My title'
new_labels = ['label 1', 'label 2']

# title
g._legend.set_title(new_title)
# replace labels, pairing each existing legend text with its new label
for t, l in zip(g._legend.texts, new_labels):
    t.set_text(l)

# sns.plt.show()