## Loading libraries and setting up directories

In [None]:
from __future__ import division

import os
import urllib, cStringIO

import pymongo as pm
from collections import Counter
import matplotlib
import matplotlib.patheffects as PathEffects
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('poster')
sns.set_style('white')

import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re

from PIL import Image
import base64
import sys

from svgpathtools import parse_path



import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")




In [None]:
# directory & file hierarchy
proj_dir = os.path.abspath('../..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir, 'results')
plot_dir = os.path.join(results_dir, 'plots')
csv_dir = os.path.join(results_dir, 'csv')
features_dir = os.path.join(results_dir, 'features')
exp_dir = os.path.abspath(os.path.join(proj_dir, 'experiments'))
sketch_dir = os.path.abspath(os.path.join(proj_dir, 'sketches'))

## make the helper modules that live in <proj_dir>/analysis importable
helpers_dir = os.path.join(proj_dir, 'analysis')
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

## create whichever output directories do not exist yet
for out_dir in (results_dir, plot_dir, csv_dir, features_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Import the analysis helpers and reload them so that edits made to the
# module since the first import are picked up
import analysis_helpers as h
if sys.version_info[0] >= 3:
    from importlib import reload
reload(h)

In [None]:
def cleanup_df(X):
    """Return X without its 'Unnamed: 0' column.

    If X has no such column it is returned unchanged (the very same object).
    Presumably the column is the index left over from a csv round trip.
    """
    leftover_col = 'Unnamed: 0'
    if leftover_col not in X.columns:
        return X
    return X.drop(columns=[leftover_col])

In [None]:
##Read in raw dataframe with subsetted data for 3 annotations per sketch

D=cleanup_df(pd.read_csv(os.path.join(csv_dir,'raw_df.csv')))

In [None]:
D.head()

## Creating spline and stroke level dataframes for further analysis

In [None]:
## get the list of unique labels applied to sketches
unique_labels = np.unique(D.label.values)

## Drop Nones and obviously wrong, super long labels (900 characters or more)
unique_labels = [lab for lab in unique_labels if lab is not None and len(lab) < 900]

print('we have {} unique labels'.format(len(unique_labels)))

In [None]:

##Create a dictionary with one (initially empty) entry per category.
##We will use this to store part occurrence data for our categories.
## NOTE(review): unique_cats is not assigned in any of the visible cells --
## confirm it is defined before this cell runs.
label_vect_dict = {category: None for category in unique_cats}

In [None]:
##Create vectors that contain the number of part instances in each sketch
## Result: label_vect_dict[category] is a (sketches x labels) integer matrix whose
## rows follow np.unique(sketch_id) and whose columns follow np.unique(label)
## within that category.

for category in unique_cats:
    DS= D[D['category']==category]
    unique_sketches_in_cat = np.unique(DS['sketch_id'])
    unique_labels_in_cat = np.unique(DS['label'])
    ## initialize matrix that has the correct dimensions
    Label_Vec = np.zeros((len(unique_sketches_in_cat),len(unique_labels_in_cat)), dtype=int)
    unique_labels_in_cat= np.array(unique_labels_in_cat)
    for s,this_sketch in enumerate(unique_sketches_in_cat):
        # per-sketch tally: one counter per label of this category
        label_vec = np.zeros(len(unique_labels_in_cat),dtype=int)
        DSS = DS[DS['sketch_id']==this_sketch]
        annotation_ids = np.unique(DSS['annotation_id'].values)    
        # count every label occurrence, pooled over all annotations of the sketch
        for this_annotation in annotation_ids:
            DSA = DSS[DSS['annotation_id']==this_annotation]
            label_list = DSA.label.values
            for this_label in label_list:
                # boolean mask that picks the column of this label
                label_ind = unique_labels_in_cat==this_label
                label_vec[label_ind] += 1
            
        # NOTE(review): num_annots is not assigned in the visible cells -- presumably the
        # number of annotations per sketch; confirm. The quotient is a float (true division
        # is enabled at the top of the file) and is truncated when stored in the int matrix.
        Label_Vec[s,:]=label_vec/num_annots
    label_vect_dict[category]= Label_Vec

In [None]:
## For each category: keep the labels whose summed per-sketch count exceeds `thresh`,
## then plot the proportion of the category's annotations that use each kept label.
thresh = 50
valid_labels = []
valid_labels_dict = {}
for category in unique_cats:
    vect = label_vect_dict[category]
    # subset once per category instead of once per plotted label
    DS = D[D['category'] == category]
    # np.unique returns the labels sorted, the same order used for the columns of `vect`
    unique_labels_in_cat = np.unique(DS['label'])
    plot_labels = unique_labels_in_cat[np.sum(vect, 0) > thresh]
    valid_labels_dict[category] = plot_labels
    valid_labels.append(plot_labels)

    # proportion of this category's annotations in which each kept label occurs
    num_annotations_in_cat = DS['annotation_id'].nunique()
    prop_labels = [DS[DS['label'] == part]['annotation_id'].nunique() / num_annotations_in_cat
                   for part in plot_labels]

    sns.set_context('talk')
    plt.figure(figsize=(12, 7))
    plt.ylim(0, 1)
    # The bar container is deliberately not bound to `h`: that name is the alias
    # of the imported analysis_helpers module and should not be overwritten here.
    plt.bar(plot_labels, prop_labels)
    plt.title('Proportion of {} annotations with labels'.format(category))
    plt.ylabel('proportion of annotations')
    plt.xlabel('Part')

##flattening valid labels
valid_labels = [item for sublist in valid_labels for item in sublist]


In [None]:
len(np.unique(valid_labels))

In [None]:
#Creating a spline-level df where the modal label is set as the 'true' label for any given spline
# (every column, not only the label, takes its most common value within the spline)
modal_value = lambda col: Counter(col).most_common(1)[0][0]
spline_df = D.groupby('spline_id').agg(modal_value).reset_index(level=0)

In [None]:
##Creating a stroke-level dataframe that takes the mode value of annotation for its children splines to set as its
##label value
## Result: stroke_df has one row per stroke_id with its svg entry and the modal
## sketch_id / label / stroke_num / condition / target / category / outcome.

from collections import Counter


from collections import OrderedDict
# stroke_id -> svg entry of that stroke, filled in category / sketch / stroke_num order
stroke_svgs=OrderedDict()
for category in unique_cats:
    DS=D[D['category']==category]
    for sketch in np.unique(DS['sketch_id']):
        DSS=DS[DS['sketch_id']==sketch]
        for stroke in np.unique(DSS['stroke_num']):
            DSA=DSS[DSS['stroke_num']==stroke]
            # renumber rows from 0 so that [0] below picks the first row of this stroke
            DSA=DSA.reset_index()
            # NOTE(review): indexes sketch_svg_string by stroke number, which assumes each
            # value is a sequence of per-stroke svg strings -- confirm it is not a plain
            # string as read from the csv (that would yield a single character).
            stroke_svgs[DSA['stroke_id'][0]] = DSA['sketch_svg_string'][0][stroke]

            
            
# one row per stroke_id; the svg ends up in a column named 0
stroke_svg_df= pd.DataFrame.from_dict(stroke_svgs, orient='index')    
# modal value of every column within each stroke
stroke_group_data= D.groupby('stroke_id').agg(lambda x: Counter(x).most_common(1)[0][0])
labels= pd.DataFrame(stroke_group_data[['sketch_id','label','stroke_num','condition','target','category','outcome']])
# both frames are indexed by stroke_id, so join on the index
stroke_df=pd.merge(stroke_svg_df,labels,left_index=True, right_index =True)
stroke_df.reset_index(level=0, inplace=True)
# the former index becomes the 'stroke_id' column, column 0 becomes 'svg';
# index=str turns the new row labels into strings
stroke_df=stroke_df.rename(index=str, columns={"index": "stroke_id", 0: "svg"})


In [None]:
##Adding total arclength information to stroke dataframe
from svgpathtools import parse_path
import svgpathtools

def calculate_arclength(svg):
    """Return the total arc length of the svg path string `svg`.

    The string is parsed with svgpathtools' parse_path. If computing the
    length raises ZeroDivisionError, a message is printed and 0 is returned.
    """
    try:
        return parse_path(svg).length()
    except ZeroDivisionError:
        print('zero div error')
        return 0
        
                    

In [None]:
stroke_df['arc_length'] = stroke_df['svg'].apply(calculate_arclength)

## Creating feature vectors and normalizing 

In [None]:
### Build the sketch-level feature matrix: one row per sketch and, for every entry of
### valid_labels, one stroke-count column followed (len(valid_labels) columns later) by
### one total-arc-length column. feature_vec_meta holds one metadata row per sketch,
### in the same row order as feature_vec.

num_valid_labels = len(valid_labels)
# NOTE(review): the matrix is integer-typed, so summed arc lengths are truncated to ints
feature_vec = np.zeros((len(stroke_df.sketch_id.unique()), num_valid_labels*2), dtype=int)
ind = 0
start_pos = 0
end_pos = 0
meta_list = []
cols = ['sketch_id','target','condition','category','outcome']

for cat in unique_cats:

    DS = stroke_df[stroke_df['category']==cat]
    unique_labels_in_cat = valid_labels_dict[cat]
    unique_sketches_in_cat = DS['sketch_id'].unique()
    # this category owns columns [start_pos, end_pos) within each half of the matrix
    start_pos = end_pos
    end_pos += len(unique_labels_in_cat)
    print('{} {}'.format(start_pos, end_pos))
    for sketch in unique_sketches_in_cat:

        label_vec = np.zeros(len(unique_labels_in_cat), dtype=int)
        arc_vec = np.zeros(len(unique_labels_in_cat), dtype=int)
        DSA = DS[DS['sketch_id']==sketch]

        # Store the sketch id itself. The original stored the whole DSA['sketch_id']
        # Series in this cell, unlike the other fields, which are one-element arrays
        # of the unique value.
        meta_list.append(pd.Series([sketch,
                                    DSA['target'].unique(),
                                    DSA['condition'].unique(),
                                    DSA['category'].unique(),
                                    DSA['outcome'].unique()], index=cols))

        # number of strokes carrying each valid label
        for label in DSA.label.values:
            if label in unique_labels_in_cat:
                label_vec[unique_labels_in_cat==label] += 1
        # total arc length of the strokes carrying each valid label
        for label in unique_labels_in_cat:
            arc_vec[unique_labels_in_cat==label] = DSA[DSA['label']==label]['arc_length'].sum()

        feature_vec[ind, start_pos:end_pos] = label_vec
        feature_vec[ind, start_pos+num_valid_labels:end_pos+num_valid_labels] = arc_vec
        ind += 1
feature_vec_meta = pd.DataFrame(meta_list, columns=cols)

    


    

In [None]:
##Changing column values from np arrays to strings/boolean

def arr_to_str(arr):
    """Return the first element of `arr` (unwraps a one-element array)."""
    first = arr[0]
    return first
# replace each one-element array in these columns by the element it holds
for meta_col in ('target', 'condition', 'category', 'outcome'):
    feature_vec_meta[meta_col] = feature_vec_meta[meta_col].apply(arr_to_str)



In [None]:
feature_df= pd.DataFrame(feature_vec, columns=[s + '_numstrokes' for s in valid_labels]+[s + '_total_arclength' for s in valid_labels])

In [None]:
##creating a compressed version of the feature df with no duplicates for parts
## (a label that is valid in several categories has one column per category in
##  feature_df; those columns are summed into a single column here)

labs_numstrokes = [lab + '_numstrokes' for lab in np.unique(valid_labels)]
labs_total_arclength = [lab + '_total_arclength' for lab in np.unique(valid_labels)]
feature_df_labs = labs_numstrokes + labs_total_arclength
feature_df_final = pd.DataFrame(columns=feature_df_labs)


for this_lab in feature_df_labs:
    # Select with a boolean mask so that every matching column is taken exactly once.
    # Selecting by a list of repeated names (feature_df[[name, name]]) returns each
    # duplicated column once per list entry, which inflates the sum for shared labels.
    is_match = np.array([col.startswith(this_lab) for col in feature_df.columns], dtype=bool)
    feature_df_final[this_lab] = feature_df.loc[:, is_match].sum(axis=1)
feature_df = feature_df_final

In [None]:
##Check to make sure the df looks okay
assert len(feature_df.columns)==len(np.unique(feature_df.columns))
feature_df.head()

In [None]:
#Normalizing within row within measure (numstrokes/arclength)
# The first half of the columns holds the numstrokes features, the second half the
# arclength features; each half of a row is divided by the sum of that half.

half = int(len(feature_df.columns)/2)
numstrokes_block = feature_df.iloc[:, 0:half]
feature_df.iloc[:, 0:half] = numstrokes_block.div(numstrokes_block.sum(axis=1), axis=0)

half = int(len(feature_df.columns)/2)
total = int(len(feature_df.columns))
arclength_block = feature_df.iloc[:, half:total]
feature_df.iloc[:, half:total] = arclength_block.div(arclength_block.sum(axis=1), axis=0)



In [None]:
###Execute this if we want to save a non-zscore matrix
run = True
if run:
    non_whitened_path = os.path.join(features_dir, 'semantic_parts_sketch_features_compressed_non-whitened.csv')
    feature_df.to_csv(non_whitened_path)
run = False

In [None]:
#z-scoring within columns (population standard deviation, ddof=0)

columns = list(feature_df.columns)
for this_col in columns:
    col_values = feature_df[this_col]
    centered = col_values - col_values.mean()
    feature_df[this_col] = centered/col_values.std(ddof=0)


In [None]:
feature_df.columns

### Saving out files as needed

In [None]:
feature_df.to_csv(os.path.join(features_dir,'semantic_parts_sketch_features_compressed.csv'))

In [None]:
np.save(os.path.join(features_dir, 'semantic_parts_sketch_features'),feature_vec)

In [None]:
feature_vec_meta.to_csv(os.path.join(features_dir,'semantic_parts_sketch_meta.csv'))

## Results : "Summer Analysis"

### Inter-annotator reliability

In [None]:
## Getting the number of unique labels assigned to a given spline across annotations
## (stored as 4 - <number of distinct labels>; a result of 0 is recorded as 1)
num_diff_annots = []
for this_cat in unique_cats:
    DS = D[D['category']==this_cat]
    labels = valid_labels_dict[this_cat]
    for this_sketch_id in np.unique(DS['sketch_id']):
        DSA = DS[DS['sketch_id']==this_sketch_id]
        for this_spline in np.unique(DSA['cumulative_spline_num']):
            DSB = DSA[DSA['cumulative_spline_num']==this_spline]
            numannots = 4 - len(np.unique(DSB['label']))
            num_diff_annots.append(numannots if numannots != 0 else 1)

In [None]:
#plotting variability in spline annots
# density is a boolean flag; the string 'True' only worked because any non-empty
# string is truthy (the string 'False' would have normalised the histogram as well)
h = plt.hist(num_diff_annots, bins=range(1,5), align='left', density=True)
plt.title('Inter-annotator reliability')
plt.ylabel('proportion of splines')
plt.xlabel('Annotator agreement on label')
plt.xticks([1,2,3],['0/3','2/3','3/3'])

In [None]:
print h

### Stroke-part relationships

In [None]:
## Count, for every stroke, how many different (modal) spline labels it contains.
# The accumulator is created once, before the category loop, so that strokes of
# every category are kept; re-creating it inside the loop left only the strokes of
# the last category.
spline_annots_per_stroke = []
for this_cat in unique_cats:
    labels = valid_labels_dict[this_cat]
    DS = spline_df[spline_df['category']==this_cat]
    unique_sketches_in_cat = np.unique(DS['sketch_id'])
    for this_sketch_id in unique_sketches_in_cat:
        DSA = DS[DS['sketch_id']==this_sketch_id]
        unique_strokes = np.unique(DSA['stroke_num'])
        for this_stroke in unique_strokes:
            DSB = DSA[DSA['stroke_num']==this_stroke]
            numlabels = DSB['label'].nunique()
            spline_annots_per_stroke.append(numlabels)

In [None]:
# density is a boolean flag; the string "True" only worked because any non-empty
# string is truthy
h = plt.hist(spline_annots_per_stroke, bins=range(1,8), align='left', density=True)
plt.title('Within-stroke label agreement')
plt.ylabel('proportion of strokes')
plt.xlabel('number of different labels within stroke')

In [None]:
print h

In [None]:
# For every category, draw a bar plot of the number of strokes per part label
# (one observation per sketch x label pair).
for this_cat in unique_cats:
    DS=stroke_df[stroke_df['category']==this_cat]
    labels= valid_labels_dict[this_cat]
    # one row per (sketch, label) pair with columns sketch_id / part / num_strokes;
    # everything is stored as fixed-width byte strings of up to 1000 characters
    strokes_in_part_vect = np.zeros((len(np.unique(DS['sketch_id']))*len(labels),3), dtype='|a1000')
    ind=0
    for this_sketch in np.unique(DS['sketch_id']):    
        DSA= DS[DS['sketch_id']==this_sketch]
        for this_label in labels:
            DSB=DSA[DSA['label']==this_label]
            # number of distinct strokes of this sketch that carry this label
            strokes_in_part_vect[ind,]=[this_sketch, this_label,len(np.unique(DSB['stroke_num']))]
            ind+=1
    # drop rows whose three fields are all empty
    strokes_in_part_vect=strokes_in_part_vect[~np.all(strokes_in_part_vect == '', axis=1)]
    strokes_in_part_df= pd.DataFrame(strokes_in_part_vect, columns=['sketch_id','part','num_strokes'])
    # the counts were stored as strings in the array; convert them back to numbers
    strokes_in_part_df['num_strokes']=pd.to_numeric(strokes_in_part_df['num_strokes'])
    plt.figure()
    b=sns.barplot(x='part',y='num_strokes',data=strokes_in_part_df,ci=95,capsize=0.3, errwidth= 3)
    # tilt the part names on the x axis by 45 degrees
    for item in b.get_xticklabels():
        item.set_rotation(45)

### Part-streak analysis

In [None]:
##Creating a dictionary of sketch_id with associated part sequences
## (the labels of a sketch are listed in the row order of stroke_df)
seq_dict = {}
for this_sketch in np.unique(stroke_df['sketch_id']):
    in_sketch = stroke_df['sketch_id'] == this_sketch
    seq_dict[this_sketch] = [stroke_df['label'][row_label]
                             for row_label in stroke_df.index[in_sketch]]

In [None]:
##functions for getting 'mean streak_length' from a particular sketch for ground truth and scrambled part orders

import random


def _mean_streak_length(parts):
    """Return the mean length of the runs of equal consecutive items in `parts`.

    Example: ['a', 'a', 'b'] has the runs [2, 1], so the result is 1.5.
    An empty sequence is counted as a single run of length 1.
    """
    streak_counter = 1
    list_of_streaks = []
    for current, following in zip(parts, parts[1:]):
        if current == following:
            streak_counter += 1
        else:
            # the run ends here; record it and start a new one
            list_of_streaks.append(streak_counter)
            streak_counter = 1
    # record the run that was still open when the sequence ended
    list_of_streaks.append(streak_counter)
    return np.mean(list_of_streaks)


def get_mean_streak(sketch_id):
    """Return the mean streak length of the part sequence seq_dict[sketch_id]."""
    return _mean_streak_length(seq_dict[sketch_id])


def get_scramble_mean_streak(sketch_id):
    """Return the mean streak length of a random permutation of seq_dict[sketch_id].

    The stored sequence itself is left untouched; random.sample draws a shuffled copy.
    """
    parts = seq_dict[sketch_id]
    scram_parts = random.sample(parts, len(parts))
    return _mean_streak_length(scram_parts)

In [None]:
#Iterating over all sketches to get mean streakiness for each sketch_id
# (stored per category as the mean over that category's sketches)

gt_streak_mean = {}
for this_cat in unique_cats:
    DS = stroke_df[stroke_df['category']==this_cat]
    streak_mean_list = [get_mean_streak(this_sketch)
                        for this_sketch in np.unique(DS['sketch_id'])]
    gt_streak_mean[this_cat] = np.mean(streak_mean_list)

In [None]:
## For every category: 1000 times, scramble the part order within each sketch and record
## (mean real streak length) - (mean scrambled streak length) over the category's sketches
streak_diff_dict = {}
for this_cat in unique_cats:
    DS = stroke_df[stroke_df['category']==this_cat]
    sketches_in_cat = np.unique(DS['sketch_id'])
    # the real streak lengths do not depend on the scramble, so compute them once
    real_streaks = [get_mean_streak(sketch) for sketch in sketches_in_cat]
    mean_real_streak = np.mean(real_streaks)
    mean_streak_diff_list = []
    for i in range(1000):
        scrambled_streaks = [get_scramble_mean_streak(sketch) for sketch in sketches_in_cat]
        mean_streak_diff_list.append(mean_real_streak - np.mean(scrambled_streaks))
    streak_diff_dict[this_cat] = mean_streak_diff_list

In [None]:
def _streak_diff_distribution(sketch_ids, n_iter=1000):
    """Return n_iter values of (mean real streak) - (mean scrambled streak).

    The real mean streak length over `sketch_ids` is fixed, so it is computed once;
    each iteration re-scrambles every sketch and averages the scrambled streak lengths.
    """
    mean_real_streak = np.mean([get_mean_streak(sketch) for sketch in sketch_ids])
    diffs = []
    for _ in range(n_iter):
        scrambled = [get_scramble_mean_streak(sketch) for sketch in sketch_ids]
        diffs.append(mean_real_streak - np.mean(scrambled))
    return diffs


def _plot_streak_diff_hist(diffs, title, filename):
    """Plot a histogram of `diffs` with its mean and 2.5/97.5 percentiles marked.

    The figure is saved as `filename` inside plot_dir and then shown.
    Returns (mean, lower percentile, upper percentile).
    """
    mean_diff = np.mean(diffs)
    lb = np.percentile(diffs, 2.5)
    ub = np.percentile(diffs, 97.5)
    plt.figure(figsize=(10,8))
    sns.distplot(diffs, kde=False, hist=True, norm_hist=False)
    plt.axvline(mean_diff, color='yellow', linestyle='solid', linewidth=2)
    plt.axvline(lb, color='orange', linestyle='solid', linewidth=2)
    plt.axvline(ub, color='orange', linestyle='solid', linewidth=2)
    plt.title(title)
    plt.ylabel('count')
    plt.xlabel('streak length difference')
    plt.legend(['mean','95% CI'], ncol=2, bbox_to_anchor=(1, 1.05))

    plt.savefig(os.path.join(plot_dir, filename), edgecolor='w', bbox_inches='tight')
    plt.show()
    return mean_diff, lb, ub


def CIPlot(category):
    """Plot the real-minus-scrambled streak difference distribution of one category.

    Uses all sketches of `category` in stroke_df and 1000 scrambles.
    Returns (mean, standard deviation) of the 1000 differences.
    """
    stroke_df_lite_ss = stroke_df[stroke_df['category']==category]
    mean_streak_diff_list = _streak_diff_distribution(np.unique(stroke_df_lite_ss['sketch_id']))
    # The category is part of the file name so that plotting several categories
    # does not overwrite the same 'Streakiness Diff' file each time.
    _plot_streak_diff_hist(mean_streak_diff_list, category,
                           'Streakiness Diff_{}'.format(category))
    return np.mean(mean_streak_diff_list), np.std(mean_streak_diff_list)


def CIPlotCatCond(category,condition):
    """Same as CIPlot, restricted to the sketches of `category` drawn in `condition`.

    Returns (mean, 2.5th percentile, 97.5th percentile) of the 1000 differences.
    """
    stroke_df_lite_ss = stroke_df[(stroke_df['category']==category)&(stroke_df['condition']==condition)]
    mean_streak_diff_list = _streak_diff_distribution(np.unique(stroke_df_lite_ss['sketch_id']))
    return _plot_streak_diff_hist(mean_streak_diff_list,
                                  '{}_{}'.format(category,condition),
                                  'mean_streak_difference_{}_{}'.format(category, condition))
    

In [None]:
for this_cat in unique_cats:
    CIPlot(this_cat)

In [None]:
for this_condition in np.unique(stroke_df['condition']):
    for this_category in np.unique(stroke_df['category']):
        CIPlotCatCond(this_category, this_condition)

In [None]:
## Bar plot of the mean real-vs-scrambled streak difference of each category,
## with an error bar of +/- 2 standard deviations
for this_cat in unique_cats:
    # CIPlot returns (mean, standard deviation) of the permutation differences
    plot_data = CIPlot(this_cat)
    # open a fresh figure (plt.figure without parentheses never created one)
    plt.figure()
    # Half-width of the error bar. The original passed its two bounds to np.array as
    # (object, dtype), which raises instead of building a two-element array.
    CI_data = 2*plot_data[1]
    h = plt.bar([0,1,2],[0,plot_data[0],0],yerr=[0,CI_data,0],capsize=15)
    plt.xlabel('')
    plt.ylabel('Mean streakiness difference')
    plt.xticks([0,1,2],['','',''])
    plt.savefig(os.path.join(plot_dir,'{}_streak_diff'.format(this_cat)),edgecolor='w',bbox_inches='tight')
    plt.show()
    print(plot_data)

## Results : Sketch Feature Analysis