In [1]:
from __future__ import division

import os
import urllib, cStringIO

import pymongo as pm
from collections import Counter
import matplotlib
import matplotlib.patheffects as PathEffects
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('poster')
sns.set_style('white')

import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
from svgpathtools import parse_path
import svgpathtools

from PIL import Image
import base64
import sys

from svgpathtools import parse_path
from IPython.display import clear_output



import warnings
## silence every DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
## silence the two numpy "size changed" warnings; `message` is a regular
## expression matched against the start of the warning text
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")



In [2]:
# directory & file hierarchy
proj_dir = os.path.abspath('../..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
features_dir = os.path.join(results_dir,'features')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))
sketch_dir = os.path.abspath(os.path.join(proj_dir,'sketches'))

## create the output directories that do not exist yet
## (results_dir comes first because the other three live inside it)
for _out_dir in (results_dir, plot_dir, csv_dir, features_dir):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

## add helpers to python path BEFORE importing any helper module, so that
## the imports below can resolve modules that live in <proj_dir>/analysis
helpers_dir = os.path.join(proj_dir,'analysis')
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

## `reload` is a builtin on python 2 but has to be imported on python 3
if sys.version_info[0]>=3:
    from importlib import reload

## (re)load the helper modules so edits made to them are picked up
import svg_rendering_helpers as srh
reload(srh)

# Assign variables within imported analysis helpers
import analysis_helpers as h
reload(h)

<module 'analysis_helpers' from '/Users/kushin/Documents/GitHub/semantic_parts/analysis/analysis_helpers.pyc'>

In [3]:
## helper dictionaries
## OBJECT_TO_CATEGORY maps each object name to the category it belongs to.
OBJECT_TO_CATEGORY = {
    'basset': 'dog',
    'beetle': 'car',
    'bloodhound': 'dog',
    'bluejay': 'bird',
    'bluesedan': 'car',
    'bluesport': 'car',
    'brown': 'car',
    'bullmastiff': 'dog',
    'chihuahua': 'dog',
    'crow': 'bird',
    'cuckoo': 'bird',
    'doberman': 'dog',
    'goldenretriever': 'dog',
    'hatchback': 'car',
    'inlay': 'chair',
    'knob': 'chair',
    'leather': 'chair',
    'nightingale': 'bird',
    'pigeon': 'bird',
    'pug': 'dog',
    'redantique': 'car',
    'redsport': 'car',
    'robin': 'bird',
    'sling': 'chair',
    'sparrow': 'bird',
    'squat': 'chair',
    'straight': 'chair',
    'tomtit': 'bird',
    'waiting': 'chair',
    'weimaraner': 'dog',
    'white': 'car',
    'woven': 'chair',
}

## CATEGORY_TO_OBJECT is the inverse lookup: category -> alphabetically sorted
## list of its object names. It is derived from OBJECT_TO_CATEGORY so the two
## dictionaries cannot drift apart.
CATEGORY_TO_OBJECT = {
    this_cat: [obj for obj in sorted(OBJECT_TO_CATEGORY)
               if OBJECT_TO_CATEGORY[obj] == this_cat]
    for this_cat in ('dog', 'car', 'bird', 'chair')
}

In [4]:
##Helpers



def subset_dataframe_by_condition(F,to_inspect='all',this_category='bird',this_object='bluejay'):
    '''
    Split a feature dataframe into its 'closer' and 'further' condition rows,
    optionally restricted to a single category or a single object.

    input: F: dataframe (num_sketches x num_features); must contain the columns
              'category', 'target' and 'condition'. Feature columns are those whose
              name ends in '_numstrokes' or '_arclength'.
           to_inspect: a string indicating whether to subset by ['object','category','all']
           this_category: IF to_inspect == 'category', then we define this to subset by that category only
           this_object: IF to_inspect == 'object', then we define this to subset by that object only

    returns: c, f, obj_listc, obj_listf
           c, f: the close and far subsetted feature matrices (numstrokes columns
                 first, then arclength columns), each with a fresh 0..n-1 index
           obj_listc, obj_listf: arrays of the 'target' value of each row of c and f

    raises: ValueError if to_inspect is not one of 'object', 'category', 'all'

    '''
    ## fail early with a clear message; previously an unknown value fell through
    ## every branch and surfaced as an UnboundLocalError at the return statement
    if to_inspect not in ('object','category','all'):
        raise ValueError("to_inspect must be one of 'object', 'category', 'all'; got {!r}".format(to_inspect))

    ## sort_values returns a new frame, so the caller's dataframe is left untouched
    F = F.sort_values(by=['category','target'])

    ## get context condition inds for subsetting dataframe
    close_inds = F['condition'] == 'closer'
    far_inds = F['condition'] == 'further'

    ## restrict both conditions to the requested object / category (if any)
    if to_inspect == 'object':
        subset_inds = F['target'] == this_object
    elif to_inspect == 'category':
        subset_inds = F['category'] == this_category
    else:
        subset_inds = None

    if subset_inds is not None:
        close_inds = close_inds & subset_inds
        far_inds = far_inds & subset_inds

    ## get names of columns that contain stroke-count & arclength information
    numstrokes_cols = [i for i in F.columns if i.split('_')[-1]=='numstrokes']
    arclength_cols = [i for i in F.columns if i.split('_')[-1]=='arclength']
    feat_cols = numstrokes_cols + arclength_cols

    ## filter each condition once and read both the features and the targets from it
    close_rows = F[close_inds]
    far_rows = F[far_inds]

    c = close_rows[feat_cols].reset_index(drop=True)
    f = far_rows[feat_cols].reset_index(drop=True)
    obj_listc = close_rows['target'].values
    obj_listf = far_rows['target'].values

    return c, f, obj_listc, obj_listf

In [5]:
#Load in annotated sketch dataset| subsetted for sketches with 3 annotations
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load this file from a trusted source.
D = pd.read_pickle(os.path.join(csv_dir, 'semantic_parts_annotated_data_pckl'))
# display (number of rows, number of columns) as a quick sanity check
D.shape

(58947, 28)

In [6]:
## load the png-string table from csv_dir; it is joined to the stroke data on
## its 'sketch_id' column further down (presumably one png string per sketch -- TODO confirm)
png= pd.read_csv(os.path.join(csv_dir, 'semantic_parts_annotated_pngstring.csv'))

In [7]:
## display (number of rows, number of columns) of the png table
png.shape

(864, 2)

In [8]:
## get the list of unique labels applied to sketches, removing Nones and
## obviously wrong super long labels (900 characters or more) in a single pass
unique_labels = [i for i in np.unique(D.label.values)
                 if i is not None and len(i)<900]

## single-argument print(...) emits the same text on python 2 and python 3
print('we have {} unique labels'.format(len(unique_labels)))

we have 24 unique labels


In [9]:
unique_cats= np.unique(D['category'])
##Create dictionary with categories as keys and None as placeholder values. We will use this to store part occurrence data for our categories
##Built from unique_cats itself (rather than hard-coded indices 0..3) so it works for any number of categories
label_vect_dict = dict.fromkeys(unique_cats)

In [10]:
##Create vectors that contain the number of part instances in each sketch
## For every category this builds a (num_sketches x num_labels) integer matrix and
## stores it in label_vect_dict[category]. Rows follow np.unique(sketch_id) order and
## columns follow np.unique(label) order within that category.
## num_annots is the divisor used to average label counts over annotations
## (presumably 3 because the dataset was subsetted to sketches with 3 annotations -- TODO confirm)
num_annots=3

for category in unique_cats:
    DS= D[D['category']==category]
    unique_sketches_in_cat = np.unique(DS['sketch_id'])
    unique_labels_in_cat = np.unique(DS['label'])
    ## initialize matrix that has the correct dimensions
    Label_Vec = np.zeros((len(unique_sketches_in_cat),len(unique_labels_in_cat)), dtype=int)
    unique_labels_in_cat= np.array(unique_labels_in_cat)
    for s,this_sketch in enumerate(unique_sketches_in_cat):
        ## per-sketch counter: one slot per label of this category
        label_vec = np.zeros(len(unique_labels_in_cat),dtype=int)
        DSS = DS[DS['sketch_id']==this_sketch]
        annotation_ids = np.unique(DSS['annotation_id'].values)    
        for this_annotation in annotation_ids:
            DSA = DSS[DSS['annotation_id']==this_annotation]
            label_list = DSA.label.values
            for this_label in label_list:
                ## boolean mask selecting the column(s) whose label equals this_label
                label_ind = unique_labels_in_cat==this_label
                label_vec[label_ind] += 1
            
        ## NOTE(review): with `from __future__ import division` this is true division,
        ## and Label_Vec has an integer dtype, so the fractional part is dropped on
        ## assignment (e.g. 4/3 is stored as 1) -- confirm this truncation is intended
        Label_Vec[s,:]=label_vec/num_annots
    label_vect_dict[category]= Label_Vec

In [11]:
## For each category, keep the labels whose summed count across sketches
## (column sums of label_vect_dict[category]) exceeds `thresh`.
##   valid_labels_dict: category -> array of kept labels
##   valid_labels:      flat list of kept labels over all categories
valid_labels=[]
valid_labels_dict={}
## minimum column sum a label needs in order to be kept
thresh = 50
for category in unique_cats:
    vect = label_vect_dict[category]
    ## filter D once per category; this subset is reused for everything below
    DS = D[D['category']==category]
    ## np.unique gives the same label ordering that was used for the columns of vect
    unique_labels_in_cat = np.unique(DS['label'])
    plot_labels= unique_labels_in_cat[np.sum(vect,0)>thresh]
    valid_labels_dict[category]=plot_labels
    valid_labels.append(plot_labels)

    ## proportion of this category's annotations in which each kept label appears
    ## NOTE(review): prop_labels is overwritten on every iteration and is not
    ## stored anywhere in this cell -- confirm whether it is still needed
    num_annotations_in_cat = DS['annotation_id'].nunique()
    prop_labels = [DS[DS['label']==part]['annotation_id'].nunique()/num_annotations_in_cat
                   for part in plot_labels]

##flattening valid labels
valid_labels = [item for sublist in valid_labels for item in sublist]


In [12]:
#Creating a spline-level df where the modal label is set as the 'true' label for any given spline
## Every column of a spline's rows is collapsed to its most frequent value,
## then spline_id is moved from the index back into a regular column.
spline_df = (
    D.groupby('spline_id')
     .agg(lambda x: Counter(x).most_common(1)[0][0])
     .reset_index(level=0)
)

In [13]:
##Creating a stroke-level dataframe that takes the mode value of annotation for its children splines to set as its
##label value

from collections import Counter, OrderedDict

## stroke_id -> svg string of that stroke, filled in category / sketch / stroke order
stroke_svgs=OrderedDict()
for category in unique_cats:
    DS=D[D['category']==category]
    for sketch in np.unique(DS['sketch_id']):
        DSS=DS[DS['sketch_id']==sketch]
        for stroke in np.unique(DSS['stroke_num']):
            ## all rows left here share one sketch and one stroke number, so the
            ## first row is enough; sketch_svg_string is indexed by stroke number
            DSA=DSS[DSS['stroke_num']==stroke].reset_index()
            stroke_svgs[DSA['stroke_id'][0]] = DSA['sketch_svg_string'][0][stroke]

## one row per stroke_id, with the svg string in a single column named 0
stroke_svg_df= pd.DataFrame.from_dict(stroke_svgs, orient='index')

## collapse every column to its most frequent value within each stroke
stroke_group_data= D.groupby('stroke_id').agg(lambda x: Counter(x).most_common(1)[0][0])
labels= pd.DataFrame(stroke_group_data[['sketch_id','label','stroke_num','condition','target','category','outcome']])

## join svg strings with the per-stroke modal metadata on the stroke_id index,
## then expose stroke_id as a column and give the svg column a readable name
stroke_df = (
    pd.merge(stroke_svg_df, labels, left_index=True, right_index=True)
      .reset_index(level=0)
      .rename(index=str, columns={"index": "stroke_id", 0: "svg"})
)


In [14]:
##Adding total arclength information to stroke dataframe

def calculate_arclength(svg):
    '''
    input: svg: an svg path string
    returns: the length of the parsed path, or 0 (after printing a notice)
             if computing the length raises a ZeroDivisionError
    '''
    try:
        return parse_path(svg).length()
    except ZeroDivisionError:
        ## single-argument print(...) emits the same text on python 2 and python 3
        print('zero div error')
        return 0

## one arc length per stroke, computed from that stroke's svg string
stroke_df['arc_length'] = stroke_df['svg'].apply(calculate_arclength)

In [16]:
## preview the first rows only, rather than dumping the whole dataframe into the notebook output
stroke_df.head(10)

Unnamed: 0,stroke_id,svg,sketch_id,label,stroke_num,condition,target,category,outcome,arc_length
0,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_0,"M46,83c0,-28.69608 14.13771,-52.14372 45,-47c4...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,body,0,closer,pigeon,bird,True,344.697229
1,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_1,"M50,61c0,11.48666 -21.38423,38.42366 -19,42c3....",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,beak,1,closer,pigeon,bird,True,84.519100
2,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_2,"M54,69c0,13.48032 0.39329,18.49161 8,28c5.7833...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,body,2,closer,pigeon,bird,True,271.640270
3,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_3,"M154,223c0,10.56125 -3.89383,31.41893 -8,41c-3...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,feet,3,closer,pigeon,bird,True,139.423205
4,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_4,"M170,243c17.68619,0 98.44573,-6.11432 104,-20c...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,body,4,closer,pigeon,bird,True,125.204402
5,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_5,"M238,209c16.06228,0 32.86896,-5.96505 48,-10c1...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,tail,5,closer,pigeon,bird,True,94.369632
6,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_6,"M192,229c0,8.444 -2.19794,14.39588 -6,22c-4.12...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,feet,6,closer,pigeon,bird,True,51.867589
7,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_7,"M182,223c4.97474,4.97474 15.70538,29 10,29",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,leg,7,closer,pigeon,bird,True,32.651291
8,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12_8,"M94,47c-5.31586,2.65793 -4.91055,27.46331 -2,1...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12,eye,8,closer,pigeon,bird,True,152.967740
9,0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_19_0,"M74,95c0,-28.53126 42.78856,-35.32929 65,-27c3...",0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_19,head,0,closer,tomtit,bird,True,113.614826


In [19]:
## right join on sketch_id: every row of png is kept and matched to the strokes of
## that sketch; png sketches with no rows in stroke_df get NaN in the stroke columns
stroke_df_png = pd.merge(stroke_df, png, how='right', on='sketch_id')