In [None]:
from __future__ import division

import os
import urllib, cStringIO

import pymongo as pm
from collections import Counter
import matplotlib
import matplotlib.patheffects as PathEffects
from matplotlib import pylab, mlab, pyplot, colors
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('poster')
sns.set_style('white')

import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
from svgpathtools import parse_path, concatpaths
import svgpathtools

from PIL import Image
import base64
import sys
from IPython.display import clear_output
from random import shuffle



import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category = ConvergenceWarning )


In [None]:
# directory & file hierarchy
proj_dir = os.path.abspath('../..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
features_dir= os.path.join(results_dir,'features')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))
sketch_dir = os.path.abspath(os.path.join(proj_dir,'sketches'))

## Directory holding the extracted VGG features. The default is machine specific;
## set the SEMANTIC_PARTS_VGG_DIR environment variable to point somewhere else
## instead of editing this cell.
vgg_dir = os.environ.get('SEMANTIC_PARTS_VGG_DIR',
                         '/Users/kushin/Documents/Github/semantic_parts/features')

## directories used for the chair-only lesion sketches
chairs_dir = os.path.join(sketch_dir, 'chairs_only')

test_dir = os.path.join(chairs_dir, 'test')
control_dir = os.path.join(chairs_dir, 'control')
intact_dir = os.path.join(chairs_dir, 'intact')
lesion_dir = os.path.join(chairs_dir,'lesioned')

## add helpers to python path; this has to happen BEFORE the helper modules are imported
helpers_dir = os.path.join(proj_dir,'analysis')
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

## make sure the output directories exist
for this_dir in [results_dir, plot_dir, csv_dir, features_dir]:
    if not os.path.exists(this_dir):
        os.makedirs(this_dir)

## import the project helpers, then reload them so that edits made to the helper
## files since the kernel started are picked up
import svg_rendering_helpers as srh
import analysis_helpers as h
if sys.version_info[0]>=3:
    from importlib import reload
reload(srh)
reload(h)

In [None]:
## helper dictionaries
## CATEGORY_TO_OBJECT: category name -> list of the target objects in that category
CATEGORY_TO_OBJECT = {
    'dog': ['basset', 'bloodhound', 'bullmastiff', 'chihuahua', 'doberman', 'goldenretriever', 'pug', 'weimaraner'],
    'car': ['beetle', 'bluesedan', 'bluesport', 'brown', 'hatchback', 'redantique', 'redsport', 'white'],
    'bird': ['bluejay', 'crow', 'cuckoo', 'nightingale', 'pigeon', 'robin', 'sparrow', 'tomtit'],
    'chair': ['inlay', 'knob', 'leather', 'sling', 'squat', 'straight', 'waiting', 'woven'],
}
## OBJECT_TO_CATEGORY: object name -> category name, i.e. the inverse of the table above.
## The pairs are sorted by object name so the entries are inserted alphabetically.
OBJECT_TO_CATEGORY = {
    obj_name: cat_name
    for obj_name, cat_name in sorted(
        (member, cat) for cat, members in CATEGORY_TO_OBJECT.items() for member in members
    )
}

In [None]:
##Helpers



def subset_dataframe_by_condition(F,to_inspect='all',this_category='bird',this_object='bluejay'):
    '''
    Split the stroke-count / arc-length feature columns of F by context condition.

    input: F: dataframe (num_sketches x num_features); must contain the columns
              'category', 'target' and 'condition'
           to_inspect: a string indicating whether to subset by ['object','category','all']
           this_category: IF to_inspect == 'category', keep only rows of this category
           this_object: IF to_inspect == 'object', keep only rows whose target is this object

    returns: c, f: dataframes holding the columns whose name ends in '_numstrokes'
                   followed by those ending in '_arclength', for the rows whose
                   condition is 'closer' (c) and 'further' (f), index reset to 0..n-1
             obj_listc, obj_listf: arrays with the 'target' value of each row of c and f

    raises: ValueError if to_inspect is not one of 'object', 'category', 'all'
    '''
    ## fail early and explicitly instead of hitting an UnboundLocalError at the return
    if to_inspect not in ('object', 'category', 'all'):
        raise ValueError(
            "to_inspect must be one of 'object', 'category', 'all'; got {!r}".format(to_inspect))

    ## rows come back ordered by category, then target
    F = F.sort_values(by=['category','target'])

    ## which rows survive the object / category restriction
    if to_inspect == 'object':
        keep = F['target'] == this_object
    elif to_inspect == 'category':
        keep = F['category'] == this_category
    else:
        keep = pd.Series(True, index=F.index)

    ## get names of columns that contain stroke-count & arclength information
    numstrokes_cols = [col for col in F.columns if str(col).split('_')[-1]=='numstrokes']
    arclength_cols = [col for col in F.columns if str(col).split('_')[-1]=='arclength']
    feat_cols = numstrokes_cols + arclength_cols

    ## split the surviving rows by context condition
    close_rows = F[keep & (F['condition'] == 'closer')]
    far_rows = F[keep & (F['condition'] == 'further')]

    c = close_rows[feat_cols].reset_index(drop=True)
    f = far_rows[feat_cols].reset_index(drop=True)
    obj_listc = close_rows['target'].values
    obj_listf = far_rows['target'].values

    return c, f, obj_listc, obj_listf

In [None]:
## Load the annotated sketch dataset (subsetted for sketches with 3 annotations) from
## csv_dir into D, then display its (rows, columns) shape.
## NOTE(review): unpickling can execute arbitrary code, so only load this file if it
## was produced by a trusted source.
D = pd.read_pickle(os.path.join(csv_dir, 'semantic_parts_annotated_data_pckl'))
D.shape

In [None]:
png= pd.read_csv(os.path.join(csv_dir, 'semantic_parts_annotated_pngstring.csv'))

In [None]:
png.shape

In [None]:
## get the list of unique labels applied to sketches
## Missing labels are dropped BEFORE np.unique is called: np.unique sorts its input, and
## sorting a mix of None and strings raises a TypeError on Python 3.
MAX_LABEL_LENGTH = 900   ## labels this long or longer are treated as obviously wrong
unique_labels = np.unique(D['label'].dropna().values)

## Removing obviously wrong super long labels
unique_labels = [i for i in unique_labels if len(i)<MAX_LABEL_LENGTH]

## parenthesised form prints the same text on Python 2 and Python 3
print('we have {} unique labels'.format(len(unique_labels)))

In [None]:
unique_cats= np.unique(D['category'])
##Create empty dictionary with categories as keys. We will use this to store part occurrence data for our categories
## One entry per category actually present in D, rather than four hard-coded positions
## (which raised IndexError with fewer categories and silently ignored any extra ones).
label_vect_dict = {this_cat: None for this_cat in unique_cats}

In [None]:
##Create vectors that contain the number of part instances in each sketch
## For every category this fills label_vect_dict[category] with a matrix that has one row
## per unique sketch_id and one column per unique label of that category (both in
## np.unique order). Each cell counts the rows of D carrying that label in that sketch,
## summed over all of the sketch's annotations and then divided by num_annots.
## num_annots is the divisor used below; the dataset is described at load time as
## subsetted to sketches with 3 annotations.
num_annots=3

for category in unique_cats:
    DS= D[D['category']==category]
    unique_sketches_in_cat = np.unique(DS['sketch_id'])
    unique_labels_in_cat = np.unique(DS['label'])
    ## initialize matrix that has the correct dimensions
    Label_Vec = np.zeros((len(unique_sketches_in_cat),len(unique_labels_in_cat)), dtype=int)
    unique_labels_in_cat= np.array(unique_labels_in_cat)
    for s,this_sketch in enumerate(unique_sketches_in_cat):
        ## running label counts for this sketch, accumulated across its annotations
        label_vec = np.zeros(len(unique_labels_in_cat),dtype=int)
        DSS = DS[DS['sketch_id']==this_sketch]
        annotation_ids = np.unique(DSS['annotation_id'].values)    
        for this_annotation in annotation_ids:
            DSA = DSS[DSS['annotation_id']==this_annotation]
            label_list = DSA.label.values
            for this_label in label_list:
                ## boolean mask selecting the column of this label
                label_ind = unique_labels_in_cat==this_label
                label_vec[label_ind] += 1
            
        ## NOTE(review): Label_Vec has dtype=int while label_vec/num_annots is a float
        ## (true division is enabled by `from __future__ import division`), so the
        ## fractional part is discarded on assignment (e.g. 2/3 is stored as 0).
        ## Confirm this truncation is intended before relying on the sums.
        Label_Vec[s,:]=label_vec/num_annots
    label_vect_dict[category]= Label_Vec

In [None]:
## A label is kept for a category when its column sum in that category's label-count
## matrix exceeds thresh.
thresh = 50

valid_labels=[]
valid_labels_dict={}     ## category -> array of labels that pass the threshold
prop_labels_dict={}      ## category -> proportion of annotations using each kept label
for category in unique_cats:
    vect = label_vect_dict[category]
    ## rows of this category; filtered once per category instead of once per label
    DS = D[D['category']==category]
    ## same np.unique ordering that was used for the columns of vect
    unique_labels_in_cat = np.unique(DS['label'])
    plot_labels= unique_labels_in_cat[np.sum(vect,0)>thresh]
    valid_labels_dict[category]=plot_labels
    valid_labels.append(plot_labels)

    ## fraction of this category's annotations in which each kept label occurs;
    ## stored per category so the values of earlier categories are not overwritten
    num_annotations_in_cat = DS['annotation_id'].nunique()
    prop_labels = [DS[DS['label']==part]['annotation_id'].nunique()/num_annotations_in_cat
                   for part in plot_labels]
    prop_labels_dict[category] = prop_labels

##flattening valid labels
valid_labels = [item for sublist in valid_labels for item in sublist]


In [None]:
## Spline-level dataframe: one row per spline_id. Every aggregated column (including
## 'label') holds the value that occurs most often among the rows of that spline, so the
## modal label is treated as the 'true' label of the spline. spline_id is moved back
## from the index into an ordinary column.
spline_df = (
    D.groupby('spline_id')
     .agg(lambda col: Counter(col).most_common(1)[0][0])
     .reset_index(level=0)
)

In [None]:
##Creating a stroke-level dataframe that takes the mode value of annotation for its children splines to set as its
##label value
## Step 1 collects one svg entry per stroke; step 2 computes the modal value of every
## column per stroke_id; step 3 joins the two on stroke_id.

from collections import Counter


from collections import OrderedDict
## stroke_id -> svg of that stroke, filled in category / sketch / stroke_num order
stroke_svgs=OrderedDict()
for category in unique_cats:
    DS=D[D['category']==category]
    for sketch in np.unique(DS['sketch_id']):
        DSS=DS[DS['sketch_id']==sketch]
        for stroke in np.unique(DSS['stroke_num']):
            DSA=DSS[DSS['stroke_num']==stroke]
            ## reset so that [0] below addresses the first row of this stroke
            DSA=DSA.reset_index()
            ## the stroke's svg is the stroke_num-th element of the sketch's sketch_svg_string
            stroke_svgs[DSA['stroke_id'][0]] = DSA['sketch_svg_string'][0][stroke]

            
            
## one row per stroke_id (index), single column 0 holding the svg
stroke_svg_df= pd.DataFrame.from_dict(stroke_svgs, orient='index')    
## most common value of every column among the rows of each stroke_id
stroke_group_data= D.groupby('stroke_id').agg(lambda x: Counter(x).most_common(1)[0][0])
labels= pd.DataFrame(stroke_group_data[['sketch_id','label','stroke_num','condition','target','category','outcome']])
## both frames are indexed by stroke_id, so merge index-on-index
stroke_df=pd.merge(stroke_svg_df,labels,left_index=True, right_index =True)
stroke_df.reset_index(level=0, inplace=True)
## name the former index column 'stroke_id' and the svg column 'svg'; index=str also
## converts the row labels to strings
stroke_df=stroke_df.rename(index=str, columns={"index": "stroke_id", 0: "svg"})


In [None]:
##Adding total arclength information to stroke dataframe

def calculate_arclength(svg):
    '''
    Return the total arc length of a single stroke.

    input: svg: path string of the stroke, in the form accepted by svgpathtools.parse_path
    returns: the length reported by the parsed path, or 0 when computing the length
             raises a ZeroDivisionError (a message is printed in that case)
    '''
    try:
        arclength= parse_path(svg).length()
    except ZeroDivisionError:
        ## parenthesised so the line is valid on both Python 2 and Python 3
        print('zero div error')
        arclength = 0
    return arclength
stroke_df['arc_length'] = stroke_df['svg'].apply(calculate_arclength)

### Lesioning sketches test work

In [None]:
# stroke_df_png = stroke_df.merge(png,how='right', on='sketch_id')

# test_sketch = stroke_df_png[stroke_df_png['sketch_id']=='0647-bfcd78e5-085c-4631-a47c-0f3dadf71345_12']

# parsed= test_sketch.svg

# parsed= list(parsed)

# for part in test_sketch.label.unique():
#     test_sketch_l = test_sketch[test_sketch['label']!= part]
#     parsed= list(test_sketch_l.svg)
#     srh.render_svg(parsed,base_dir=sketch_dir,out_fname='without_{}.svg'.format(part))

# svg_paths= srh.generate_svg_path_list(os.path.join(sketch_dir,'svg'))

# srh.svg_to_png(svg_paths,base_dir=sketch_dir)

# unique_sketches = stroke_df_png.sketch_id.unique()


# rgbcols = sns.color_palette("husl", len(chair_parts))
# sns.palplot(rgbcols)

# hexcols=[]
# for this_col in rgbcols:
#     hexcols.append(colors.to_hex(this_col))
# hexcols= np.array(hexcols)

# target_part= 'body'

In [None]:
# for this_chair in chair_df.sketch_id.unique():
#     collist=[]
#     chair_df_s = chair_df[chair_df['sketch_id']==this_chair]
#     parts = chair_df_s.label
#     for this_part in parts:
#         if this_part == target_part:
#             collist.append(hexcols[this_part==bird_parts][0])
#         else:
#             collist.append('#000000')
#     svgstring = list(bird_dfs.svg)
#     srh.render_svg_color(svgstring,base_dir=sketch_dir,stroke_colors=collist,out_fname='{}_{}_highlight.svg'.format(this_bird,target_part))
        

In [None]:
# chair_df = stroke_df[stroke_df['category']=='chair']
# chair_df = chair_df.reset_index()
# chair_parts= stroke_df[stroke_df['category']=='chair'].label.unique()
# chair_parts

In [None]:
# sample_chairs = chair_df['sketch_id'].unique()[0:5]

In [None]:
# for this_chair in sample_chairs:
#     curr_chair = chair_df[chair_df['sketch_id']==this_chair]
#     curr_chair_parts = curr_chair.label.unique()
#     for this_part in chair_parts:
#         if this_part in curr_chair_parts:
#             les_sketch = curr_chair[curr_chair['label']!=this_part]
#             paths= list(les_sketch.svg)
#             srh.render_svg(paths,base_dir=sketch_dir,out_fname='{}_without_{}.svg'.format(this_chair,this_part))
#         else:
#             continue
            
            
        
    
    

In [None]:
# svg_paths= srh.generate_svg_path_list(os.path.join(sketch_dir,'svg'))
# srh.svg_to_png(svg_paths,base_dir=sketch_dir)

In [None]:
# for this_sketch in sample_chairs:
#     this_chair = chair_df[chair_df['sketch_id']==this_sketch]
#     path = list(this_chair.svg)
#     srh.render_svg(path,base_dir=sketch_dir,out_fname='{}_full.svg'.format(this_sketch))

In [None]:
# svg_paths= srh.generate_svg_path_list(os.path.join(sketch_dir,'svg'))
# srh.svg_to_png(svg_paths,base_dir=sketch_dir)

In [None]:
# samples = chair_df[chair_df['sketch_id'].isin(sample_chairs)]

### Create diagnostic lesions

Right now we're lesioning all instances of the part with the overall highest arc length

In [None]:
# problem_chairs=[]
# test_lesion_parts=[]
# test_sketches=[]
# for this_chair in chair_df.sketch_id.unique():
#     this_sketch = chair_df.query('sketch_id == @this_chair')
#     summed_al = pd.DataFrame(this_sketch.groupby('label').arc_length.agg(np.sum)).reset_index() ## df of parts with summed arclengths
#     dpart = summed_al[summed_al.arc_length == summed_al.arc_length.max()].label ## part with highest overall arclength
#     dpart = list(dpart)[0] ## dumb fix for df indexing issue
#     les_sketch = this_sketch.query('label != @dpart')
#     paths = list(les_sketch.svg) ## paths to use to render lesioned sketch
#     if len(paths)==0:
#         print("Lesion removes all paths in {}".format(this_chair))
#         problem_chairs.append(this_chair)
#     else:
#         test_lesion_parts.append(dpart)
#         test_sketches.append(this_chair)
#         srh.render_svg(paths,base_dir = test_dir,out_fname='{}.svg'.format(this_chair))
    
    
    
# svg_paths= srh.generate_svg_path_list(os.path.join(test_dir,'svg'))
# srh.svg_to_png(svg_paths,base_dir=test_dir)
# lesion_parts_meta = {'sketch_id':test_sketches, 'label':test_lesion_parts}
# lesion_parts_meta = pd.DataFrame(data =lesion_parts_meta)

# run=True
# if run == True:
#     lesion_parts_meta.to_csv(os.path.join(csv_dir,'test_lesion_meta.csv'))

    
    


In [None]:
# problem_chairs

### Create control lesions

In [None]:
# for this_chair in chair_df.sketch_id.unique():
#     if this_chair not in problem_chairs:
#         this_sketch = chair_df.query('sketch_id == @this_chair')
#         dpart = lesion_parts_meta.query('sketch_id==@this_chair').label
#         dpart = list(dpart)[0]
#         les_part = this_sketch.query('label==@dpart')
#         amt_lesioned = np.sum(les_part.arc_length)
#         rem_sketch = this_sketch.query('label!=@dpart')
#         rem_paths = list(rem_sketch.svg)
#         parsed_rem_paths = [parse_path(x) for x in rem_paths]
#         shuffle(parsed_rem_paths)
#         cc_path = concatpaths(parsed_rem_paths)
#         if cc_path.length()>= amt_lesioned:
#             rem_al = cc_path.ilength(cc_path.length()-amt_lesioned) ## keep only cc_path.length()-amt_lesioned worth of arclength
#             rem_les = cc_path.cropped(0, rem_al) ##lesioning out the lesioned amount from remaining arc length
#             les_part_paths =list(les_part.svg) ##paths for lesioned part
#             srh.render_svg([rem_les,les_part_paths],base_dir = control_dir,out_fname='{}.svg'.format(this_chair)) ##add back lesioned part and render
#         else:
#             print ("Lesion part too large to control")
            
            

In [None]:
# svg_paths= srh.generate_svg_path_list(os.path.join(control_dir,'svg'))
# srh.svg_to_png(svg_paths,base_dir=control_dir)
# lesion_parts_meta = {'sketch_id':test_sketches, 'label':test_lesion_parts}
# lesion_parts_meta = pd.DataFrame(data =lesion_parts_meta)

# run=True
# if run == True:
#     lesion_parts_meta.to_csv(os.path.join(csv_dir,'test_lesion_meta.csv'))


## Inferring importance of stroke label on classifiability of lesioned sketches

In [None]:
## Stroke rows belonging to chair sketches; reset_index() keeps the previous row labels
## in an 'index' column. chair_parts lists the distinct labels in order of appearance.
chair_df = stroke_df[stroke_df['category']=='chair'].reset_index()
chair_parts = chair_df.label.unique()
chair_parts

### Generate pngs of intact sketches for feature extraction

In [None]:
## Flip to True to (re)write one svg per intact chair sketch under intact_dir
really_run = False

if really_run:
    for this_chair in chair_df.sketch_id.unique():
        ## all stroke rows of this sketch, in dataframe order
        curr_chair = chair_df[chair_df['sketch_id'] == this_chair]
        svgs = curr_chair.svg.tolist()
        srh.render_svg(svgs,base_dir=intact_dir,out_fname='{}.svg'.format(this_chair))


In [None]:
### Collect the paths of the INTACT svgs (intact_dir/svg) and convert them to png for feature extraction
### Only runs when really_run is set to True.
really_run = False

if really_run==True:
    svg_paths= srh.generate_svg_path_list(os.path.join(intact_dir,'svg'))
    srh.svg_to_png(svg_paths,base_dir=intact_dir)


### Load in features

In [None]:
## Features of the intact sketches plus their metadata table, both read from vgg_dir.
## The two must have the same number of rows because they are glued side by side.
intact_feats = pd.DataFrame(np.load(os.path.join(vgg_dir,'FEATURES_FC6_sketch_no-channel-norm.npy')))
intact_meta = pd.read_csv(os.path.join(vgg_dir,'METADATA_sketch.csv'))
assert len(intact_feats) == len(intact_meta)
## metadata columns first, then the feature columns; 'feature_ind' is not needed afterwards
intact_df = pd.concat([intact_meta, intact_feats], axis=1).drop(columns='feature_ind')



In [None]:
### Create dataframe of additional sketch level metadata
### One row per chair sketch_id holding the mode of 'condition', 'category' and 'target'
### over that sketch's strokes; the feature table is then joined onto it by sketch_id.
chair_trial_meta = chair_df.groupby('sketch_id')[['condition','category','target']].agg(pd.Series.mode)
chair_trial_meta = chair_trial_meta.reset_index()
## NOTE(review): this rebinds intact_df, so re-running this cell without first re-running
## the feature-loading cell operates on the already-joined frame.
intact_df = chair_trial_meta.join(intact_df.set_index('sketch_id'), on='sketch_id')


In [None]:
### Create new column for train test split stratification
### 'strata' is the string "<condition>_<target>", so a stratified split keeps every
### condition/target combination represented in both train and test.
### NOTE(review): the train/test split cell below derives the same labels itself.

intact_df['strata'] = intact_df['condition'].astype(str) + '_' +intact_df['target'].astype(str)


In [None]:
### create num_splits number of train test splits and get test accuracy for each split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score

num_splits = 20
meta_cols = ['sketch_id','condition','category','target']

## Stratification labels ("<condition>_<target>") are kept in a separate Series so that
## intact_df does not need a helper column added and dropped again for every split.
strata = intact_df['condition'].astype(str) + '_' +intact_df['target'].astype(str)
if 'strata' in intact_df.columns:
    intact_df = intact_df.drop(columns='strata')

## everything that is not metadata is a feature column (same for every split)
all_cols = list(intact_df.columns)
feat_cols = [x for x in all_cols if x not in meta_cols]

lc_list = []
acc_list =[]
for i in range(num_splits):
    ## seeding each split with its index makes the splits, and therefore the reported
    ## losses and accuracies, reproducible across kernel restarts
    train, test = train_test_split(intact_df, test_size=0.2, stratify=strata, random_state=i)
    assert test.sketch_id.nunique()+train.sketch_id.nunique() == intact_df.sketch_id.nunique()

    X_train = np.array(train[feat_cols])
    Y_train = np.array(train['target'])

    X_test = np.array(test[feat_cols])
    Y_test = np.array(test['target'])

    LC = LogisticRegression(solver='lbfgs', multi_class='multinomial',max_iter=100).fit(X_train, Y_train)
    lc_list.append(LC)

    ## passing the classifier's classes ties the probability columns to their labels
    Y_pred = LC.predict_proba(X_test)
    print('Model log loss is : {}'.format(round(log_loss(Y_test,Y_pred,labels=LC.classes_),3)))

    ## fraction of test sketches whose predicted target equals the true target
    Y_class_pred =  LC.predict(X_test)
    acc = accuracy_score(Y_test, Y_class_pred)
    print('Accuracy is:{}'.format(acc))
    acc_list.append(acc)

## NOTE: after the loop, LC / train / test refer to the LAST split, not the best one.

### Zipping together the different logistic classifiers with their associated test accuracy
## materialised as a list so it can be iterated more than once and subscripted on Python 3
val_list = list(zip(lc_list, acc_list))




In [None]:
###extract best performing classifier
## Works from lc_list / acc_list directly, so it does not depend on val_list being
## subscriptable (a bare zip() is not on Python 3).
best_split = int(np.argmax(acc_list))   ## first split that reaches the highest test accuracy
best_LC = lc_list[best_split]           ## classifier fit on that split
best_acc = acc_list[best_split]         ## its test accuracy
best_acc

In [None]:
import math
## Log loss of a guesser that assigns probability 1/8 to every class, i.e. -ln(1/8).
## 1/8 is a true division because of `from __future__ import division` at the top.
## NOTE(review): 8 is presumably the number of chair targets — confirm it matches
## the number of classes of the fitted classifier.
print('Naive log loss would be : {}'.format(round(-math.log(1/8),3)))

In [None]:
train.groupby('target').agg('nunique')

In [None]:
test.groupby('target').agg('nunique')

### Create k lesion-sketches per intact sketch

In [None]:
## For each sketch create k lesioned sketches where k is the number of strokes in the sketch; each lesioned sketch
## has one of the k strokes removed. Total number of sketches should be equal to total number of strokes in dataset
## Sketches with fewer than two distinct strokes are skipped (and reported), since
## removing their only stroke would leave nothing to render.

really_run = False   ## set to True to (re)render the lesioned svgs into lesion_dir

## per-lesion metadata, one entry appended per lesioned sketch
meta_labels = []
meta_arclength = []
meta_conds = []
meta_target = []
meta_objects = []      ## mirrors meta_target
meta_categories = []
meta_les_ids = []
meta_sketch_ids = []
for this_chair in chair_df['sketch_id'].unique():
    this_sketch = chair_df.query('sketch_id == @this_chair')
    if this_sketch.stroke_num.nunique()<2:
        print ('single stroke sketch- {}'.format(this_chair))
        continue
    for this_stroke in this_sketch.stroke_num:
        ## rows of the stroke being removed vs. the rows that stay
        is_lesioned = this_sketch.stroke_num == this_stroke
        this_lesion = this_sketch[is_lesioned]
        lesioned_row = this_lesion.iloc[0]

        meta_labels.append(lesioned_row['label'])
        ## same ZeroDivisionError-guarded computation used for stroke_df['arc_length']
        meta_arclength.append(calculate_arclength(lesioned_row['svg']))
        meta_conds.append(lesioned_row['condition'])
        meta_target.append(lesioned_row['target'])
        meta_objects.append(lesioned_row['target'])
        meta_categories.append(lesioned_row['category'])

        ## lesion id: <sketch_id>_<label of removed stroke>_<stroke_num of removed stroke>
        les_id = str(lesioned_row['sketch_id'])+'_'+str(lesioned_row['label'])+'_'+str(lesioned_row['stroke_num'])
        meta_les_ids.append(les_id)
        meta_sketch_ids.append(lesioned_row['sketch_id'])

        ## Remove the stroke by its stroke number. Filtering on the svg string instead
        ## would also remove every other stroke that happens to have an identical path.
        les_paths = list(this_sketch[~is_lesioned].svg)
        if really_run:
            srh.render_svg(les_paths,base_dir = lesion_dir,out_fname='{}.svg'.format(les_id))



In [None]:
### Create path to lesioned svgs and convert to png for feature extraction
### Collects the svg paths under lesion_dir/svg and converts them with lesion_dir as the
### base directory. Only runs when really_run is set to True.
really_run = False

if really_run==True:
    svg_paths= srh.generate_svg_path_list(os.path.join(lesion_dir,'svg'))
    srh.svg_to_png(svg_paths,base_dir=lesion_dir)

In [None]:
### create meta dataframe for the lesioned sketches
### One row per lesioned sketch, built from the parallel lists filled by the lesion loop.
### 'sketch_id' is the intact sketch the lesion was made from; 'label' and 'arc_length'
### describe the stroke that was removed.
les_chair_meta = {'lesion_id':meta_les_ids, 'label':meta_labels,'target':meta_target, 'condition':meta_conds, 'category':meta_categories,\
                 'sketch_id':meta_sketch_ids, 'arc_length':meta_arclength}
lesion_parts_meta = pd.DataFrame(data =les_chair_meta)


### Load in lesion features

In [None]:
## Features of the lesioned sketches and their metadata table, both read from vgg_dir.
## NOTE(review): this file name says 'channel-norm', whereas the intact features the
## classifiers are fit on are loaded from 'FEATURES_FC6_sketch_no-channel-norm.npy'.
## Confirm both feature sets were extracted with the same normalization.
lesioned_feats = pd.DataFrame(np.load(os.path.join(vgg_dir,'FEATURES_FC6_sketch_channel-norm_lesioned.npy')))
lesioned_meta = pd.DataFrame(pd.read_csv(os.path.join(vgg_dir,'METADATA_sketch_lesioned.csv')))
## only the row COUNTS of the three tables are checked here
assert lesioned_feats.shape[0]==lesion_parts_meta.shape[0]==lesioned_meta.shape[0]
### Concatenate feature columns with 'lesion_id' column (labeled as sketch_id in lesioned_meta)
lesioned_df = pd.concat((lesioned_meta,lesioned_feats),axis=1).drop(columns = ['feature_ind'])

In [None]:
### Attach the per-lesion metadata to the feature rows.
### lesioned_df's 'sketch_id' column actually holds the lesion id, so it is matched
### against lesion_parts_meta's 'lesion_id'; the shared key is then named 'lesion_id'
### and turned back into an ordinary column.
lesioned_df = (
    lesioned_df
    .set_index('sketch_id')
    .join(lesion_parts_meta.set_index('lesion_id'))
    .rename_axis('lesion_id')
    .reset_index()
)

In [None]:
### Helper function to calculate the classifiability score of a lesion, a.k.a. the class probability assigned to
### the true class label for that sketch by the classifier
lc_classes = LC.classes_
def calc_class_score(df, clf=None):
    '''
    Return the probability the classifier assigns to the true target of one sketch.

    input: df: one row of the lesion dataframe (a Series) containing the feat_cols
               feature columns and a 'target' entry
           clf: fitted classifier exposing predict_proba and classes_; defaults to the
                notebook-level LC
    returns: the predicted probability of the class equal to df['target']
    raises: IndexError if df['target'] is not one of the classifier's classes

    NOTE(review): the train/test loop leaves LC bound to the classifier of its last
    split, not the most accurate one — confirm that this is the intended model.
    '''
    if clf is None:
        clf = LC
    ## a single sample, shaped (1, num_features) as predict_proba expects
    features = np.asarray(df[feat_cols], dtype=float).reshape(1, -1)
    ## first (only) row of the output; works for any number of classes
    class_probs = clf.predict_proba(features)[0]

    correct_pos = class_probs[clf.classes_ == df['target']]

    return(correct_pos[0])

In [None]:
### add classifiability score to the dataframe
### calc_class_score is applied once per row (axis=1); 'c_score' is the probability the
### classifier assigns to that row's true target.

lesioned_df['c_score'] = lesioned_df.apply(calc_class_score,axis=1)

In [None]:
### Save out csv. TODO : Drop feature columns
### Writes the full lesioned_df (metadata, c_score and, for now, every feature column)
### to csv_dir/lesion_sketch_data.csv.

lesioned_df.to_csv(os.path.join(csv_dir,'lesion_sketch_data.csv'))