# Exploratory analyses for photodraw2x2

### Import data and set up paths

In [None]:
import os
import sys
import math
import utils
import socket
import shutil
import numpy as np
import pandas as pd
from itertools import combinations 

from scipy import stats
from scipy.stats import f
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from scipy.spatial.distance import pdist, squareform

from utils import generate_acc_probs, generate_acc_probs_2x2, generate_2x2_plots, \
perform_cross_validation, perform_cross_validation_twice, adjacent_plots, cat_cond_diffplots


import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline


sns.set(style="whitegrid")


In [None]:
# Project directory layout: every path is resolved from the parent of the
# current working directory.
proj_dir = os.path.abspath('..')
results_dir = os.path.join(proj_dir, 'results')
plot_dir, csv_dir = (os.path.join(results_dir, leaf) for leaf in ('plots', 'csv'))

# On the host named 'nightingale' the features come from an absolute mount
# path; on any other host they are expected inside the project tree.
feature_dir = os.path.abspath(
    '/mnt/pentagon/photodraw/features/'
    if socket.gethostname() == 'nightingale'
    else os.path.join(proj_dir, 'features')
)

def make_dir_if_not_exists(dir_name):
    """Create ``dir_name`` (including missing parents) if needed and return it.

    ``exist_ok=True`` replaces the former exists-then-create check, so the
    call stays safe if the directory appears between the check and the
    creation. If ``dir_name`` exists but is not a directory, ``os.makedirs``
    raises ``FileExistsError`` instead of the path being returned silently.

    Parameters
    ----------
    dir_name : str or path-like
        Directory to create.

    Returns
    -------
    The ``dir_name`` argument, unchanged.
    """
    os.makedirs(dir_name, exist_ok=True)
    return dir_name

## make sure every directory used below exists; the paths are kept in `result`
result = list(map(make_dir_if_not_exists, (results_dir, plot_dir, csv_dir, feature_dir)))

In [None]:
# Load the experiment tables and attach typicality-rating information to K.

K = pd.read_csv(os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv'))
T = pd.read_csv(os.path.join(csv_dir, 'photodraw2x2_stroke_data.csv'))
S = pd.read_csv(os.path.join(csv_dir, 'photodraw2x2_survey_data.csv'))
R = pd.read_csv(os.path.join(csv_dir, 'photodraw_sketchy32_typicality_ratings.csv'))

# Drop flagged rating rows. `.copy()` makes R an independent frame, so the
# column added below is not written into a slice of the raw table.
R = R[(R.repeat_offender == False) &
      (R.tooDissimilar == False)   &
      (R.failed_catches == False)  &
      (R.catch_trial == False)].copy()
# imageURL = first two '_'-separated tokens of the last '/'-separated part of img_id
R['imageURL'] = R.img_id.str.split('/', expand=True).iloc[:,-1].str.split('_', expand=True)\
                                                               .iloc[:,:2].agg('_'.join, axis=1)

# mean rating per (image, category)
rara = R.groupby(['imageURL', 'category']).enumerated_ratings.mean().reset_index()

# An image counts as typical when its mean rating is at least the median of
# the image means in its category. One grouped transform replaces the
# per-row recomputation of the category median.
category_median = rara.groupby('category').enumerated_ratings.transform('median')
rara['isTypical'] = rara.enumerated_ratings >= category_median

# A small jitter breaks ties so that qcut can form equal-sized bins; the seed
# is fixed so the bin assignment is the same on every run.
jitter = np.random.RandomState(0).rand(len(rara)) - 0.5
rara['enumerated_ratings'] = rara['enumerated_ratings'] + 0.001 * jitter
# NOTE(review): the column is called 'decile' but qcut is asked for 8 bins.
rara['decile'] =  rara.groupby(['category'])['enumerated_ratings'].transform(
                     lambda x: pd.qcut(x, 8, labels=range(8)))


K['decile'] = K.imageURL.map(dict(zip(rara.imageURL, rara.decile)))
K['isTypical'] = K.imageURL.map(dict(zip(rara.imageURL, rara.isTypical)))

# mean ratings per category and per image, taken from the filtered ratings
cat_ratings_map = R.groupby('category').enumerated_ratings.mean().to_dict()
inst_ratings_map = R.groupby('imageURL').enumerated_ratings.mean().to_dict()
K['cat_typicality'] = K.category.map(cat_ratings_map)
K['inst_typicality'] = K.imageURL.map(inst_ratings_map)

## Basic barplots

### Is there more effort being spent in one goal over another? Is there more effort being spent in one condition over another?

It appears so: participants put more effort into drawing object instances than into drawing object categories.

In [None]:
# paired barplots with condition on the x-axis: one figure per measure,
# bars split by goal, rows flagged isOutlier excluded

for measure, axis_label, plot_title in [
    ("activeSketchTime", "active sketch time (ms)", 'Active sketching time per sketch'),
    ("totalInk", "Total ink used", 'Total ink used per sketch'),
    ("numStrokes", "Number of strokes", 'Number of strokes per sketch'),
    ("prob_true_predict_fc6", "probability", 'Probability of correct classification'),
]:
    g = sns.catplot(
        data=K[K['isOutlier'] == False], kind="bar",
        x="condition", y=measure, hue="goal", palette="dark", alpha=.7, height=5
    )
    g.despine(left=True)
    g.set_axis_labels("", axis_label)
    plt.title(plot_title)

#### You can also view goal on the x-axis instead

In [None]:
# same four measures with goal on the x-axis and bars split by condition

for measure, axis_label, plot_title in [
    ("activeSketchTime", "active sketch time (ms)", 'Active sketching time per sketch'),
    ("totalInk", "Total ink used", 'Total ink used per sketch'),
    ("numStrokes", "Number of strokes", 'Number of strokes per sketch'),
    ("prob_true_predict_fc6", "probability", 'Probability of correct classification'),
]:
    g = sns.catplot(
        data=K[K['isOutlier'] == False], kind="bar",
        x="goal", y=measure, hue="condition", palette="dark", alpha=.7, height=5
    )
    g.despine(left=True)
    g.set_axis_labels("", axis_label)
    plt.title(plot_title)

In [None]:
sns.set(style="whitegrid")

# Number of strokes per experiment, split by `true_predict_fc6`.
# Only `height` is passed: `size` is the deprecated former name of the same
# catplot parameter and was redundantly given next to `height=5`.
g = sns.catplot(
    data=K[K['isOutlier'] == False], kind="bar",
    x="experiment", y="numStrokes", hue="true_predict_fc6",
    palette="dark", alpha=.7, height=5, aspect=1.3
)
g.despine(left=True)
g.set_axis_labels("", "number of strokes")
g._legend.set_title('Correct classification')
plt.title('Do more strokes yield higher classification accuracy?');

In [None]:
sns.set(style="whitegrid")

# Active sketch time per experiment, split by `true_predict_fc6`.
# Only `height` is passed: `size` is the deprecated former name of the same
# catplot parameter and was redundantly given next to `height=5`.
g = sns.catplot(
    data=K[K['isOutlier'] == False], kind="bar",
    x="experiment", y="activeSketchTime", hue="true_predict_fc6",
    palette="dark", alpha=.7, height=5, aspect=1.3
)
g.despine(left=True)
g.set_axis_labels("", "active sketch time (ms)")
g._legend.set_title('Correct prediction')

In [None]:
sns.set(style="whitegrid")

# Total ink per experiment, split by `true_predict_fc6`.
# Only `height` is passed: `size` is the deprecated former name of the same
# catplot parameter and was redundantly given next to `height=5`.
g = sns.catplot(
    data=K[K['isOutlier'] == False], kind="bar",
    x="experiment", y="totalInk", hue="true_predict_fc6",
    palette="dark", alpha=.7, height=5, aspect=1.3
)
g.despine(left=True)
g.set_axis_labels("", "total ink")
g._legend.set_title('Correct prediction')

### Does the greater detail in instancedraw-text facilitate discrimination at the category level?

#### It appears that categorydraw-text is more discriminable at the category level than instancedraw-text

In [None]:
# bar plot of `prob_true_predict` per experiment, rows flagged isOutlier excluded
sns.barplot(
    x='experiment',
    y='prob_true_predict',
    data=K.loc[K['isOutlier'] == False],
)
plt.title('Probability of correct classification')
plt.ylabel('probability');

In [None]:
# bar plot of `prob_true_predict_fc6` per experiment, rows flagged isOutlier excluded
sns.barplot(
    x='experiment',
    y='prob_true_predict_fc6',
    data=K.loc[K['isOutlier'] == False],
)
plt.title('Probability of correct classification')
plt.ylabel('probability');

In [None]:
# For each measure, compare categorydraw-text against instancedraw-text with a
# two-sided t-test; Levene's test decides between Student's and Welch's version.
# NOTE(review): unlike the plots above, rows flagged isOutlier are not removed
# here — confirm that this is intended.
for var in ['numStrokes', 'activeSketchTime', 'totalInk', 'prob_true_predict_fc6']:
    cat_text_values = K[K.experiment == 'categorydraw-text'][var].values
    inst_text_values = K[K.experiment == 'instancedraw-text'][var].values

    print(f"Is {var} significantly different between the two experiments?:")

    # Are the variances approximately equal? (computed once and reused below)
    levene_stat, levene_p = stats.levene(cat_text_values, inst_text_values)
    print("Testing for equality of variance:")
    print(f"Levene test stat: {levene_stat}. p-value: {levene_p}")
    if levene_p < 0.05:
        welchtest = stats.ttest_ind(cat_text_values, inst_text_values, equal_var = False)
        print('The assumption for equality of variance is violated! Using Welch\'s t-test (two-sided), we get:')
        print(f'Welch\'s test stat: {welchtest[0]}. p-value: {welchtest[1]}\n')
    else:
        ttest = stats.ttest_ind(cat_text_values, inst_text_values)
        print('The assumption for equality of variance holds. Using student\'s t-test (two-sided), we get:')
        print(f'Student\'s t-test: {ttest[0]}. p-value: {ttest[1]}\n')

### F-statistic analyses: between category variance vs. within photo-id variance

In [None]:
def get_f_stat(features, metadata, between_groups = 'category'):
    """Return the ratio of between-group to within-group feature variability.

    Between-group variability is the group-size-weighted squared Euclidean
    distance of each group mean from the overall mean, divided by
    ``n_groups - 1``. Within-group variability is the summed squared
    Euclidean distance of every row from the mean of *its own* group, divided
    by ``n_rows - n_groups``.

    Each group's mean, size and rows are all taken from one groupby
    iteration. This fixes two defects: within-group deviations were measured
    from the first group's mean for every group, and group labels in order of
    appearance were zipped against means in sorted groupby order.

    Parameters
    ----------
    features : 2-D numpy array
        Feature matrix; rows are selected by ``metadata.feature_ind``.
    metadata : pandas.DataFrame
        Needs a ``feature_ind`` column and the ``between_groups`` column.
    between_groups : str
        Column of ``metadata`` that defines the groups.

    Returns
    -------
    float
        between-group variance / within-group variance.
    """
    all_rows = features[metadata.feature_ind.values]
    overall_mean = all_rows.mean(axis=0)

    between_group_ss = 0
    within_group_ss = 0
    n_groups = 0
    for _, feature_inds in metadata.groupby(between_groups).feature_ind:
        group_rows = features[feature_inds.values]
        group_mean = group_rows.mean(axis=0)
        between_group_ss += len(group_rows) * np.linalg.norm(group_mean - overall_mean) ** 2
        within_group_ss += (np.linalg.norm(group_rows - group_mean, axis=1) ** 2).sum()
        n_groups += 1

    between_group_var = between_group_ss / (n_groups - 1)
    within_group_var = within_group_ss / (len(all_rows) - n_groups)

    return between_group_var / within_group_var

In [None]:
print(get_f_stat(inst_text_fc6, K_inst_text))
print(get_f_stat(cat_text_fc6, K_cat_text))
# computed once and reused for both the printout and the F_stat plotted later
F_stat = get_f_stat(sketchy_fc6, sketchy_meta_fc6)
print(F_stat)

In [None]:
# one F statistic per category (grouping that category's rows by photo_id),
# followed by the average over categories
fstats = [
    get_f_stat(
        sketchy_fc6,
        sketchy_meta_fc6[sketchy_meta_fc6.category == cat],
        between_groups='photo_id',
    )
    for cat in sketchy_meta_fc6.category.unique()
]
sum(fstats) / len(fstats)

In [None]:
# Plot the pdf of the F-distribution and mark the observed statistic.
# The degrees of freedom are named once and reused in the label, so the label
# can no longer disagree with the distribution that is drawn (it previously
# said df1 = 11, df2 = 24 while f(31, 19490) was plotted).
df_between, df_within = 31, 19490
x = np.linspace(0, 100, 5000)
# `stats.f` is used so the plot does not depend on the short module-level
# name `f` still being bound to the distribution.
plt.plot(x, stats.f(df_between, df_within).pdf(x),
         label=rf'F-distribution, df$_1$ = {df_between}, df$_2$ = {df_within}')
plt.axvline(F_stat, color='green')
plt.xlabel('F')
plt.ylabel('Density')
plt.suptitle('Between-class (category) variability vs within-class (photo-id) variability');

### Construct RDMs

In [None]:
def _labelled_means(mean_vectors, categories, condition):
    """Stack mean vectors into a frame tagged with 'category' and 'condition'."""
    frame = pd.DataFrame(list(mean_vectors))
    frame['category'] = list(categories)
    frame['condition'] = condition
    return frame


def _stack_photo_and_text(photo_means, text_means):
    """Concatenate two labelled frames, indexed by '<category>_<condition>'."""
    stacked = pd.concat([photo_means, text_means], ignore_index=True)
    stacked.index = stacked.category + '_' + stacked.condition
    return stacked.drop(columns=['condition', 'category'])


# Photo condition: mean feature vector per category. The labels come from the
# same groupby iteration as the means; they were previously taken from
# `.unique()` (order of appearance), which only matches the sorted groupby
# order when the metadata is already sorted by category.
photo_groups = list(sketchy_meta_fc6.groupby('category').feature_ind)
mean_category_features_photo = _labelled_means(
    [sketchy_fc6[inds.values].mean(axis=0) for _, inds in photo_groups],
    [category for category, _ in photo_groups],
    'photo',
)

# Text condition: per-category means over both goals pooled, and over each
# goal separately.
# NOTE(review): `cats_fc6` and `inst_fc6` are not defined in the visible part
# of this notebook — confirm where they come from.
K2 = K.sort_values('category', ignore_index=True)
text_categories = []
text_means_all, text_means_cat, text_means_inst = [], [], []
for category, rows in K2.groupby('category'):
    cat_rows = cats_fc6[rows[rows.goal == 'categorydraw'].feature_ind.values]
    inst_rows = inst_fc6[rows[rows.goal == 'instancedraw'].feature_ind.values]
    text_categories.append(category)
    text_means_all.append(np.concatenate((cat_rows, inst_rows), axis=0).mean(axis=0))
    text_means_cat.append(cat_rows.mean(axis=0))
    text_means_inst.append(inst_rows.mean(axis=0))

mean_category_features_text = _labelled_means(text_means_all, text_categories, 'text')
mean_category_features_text_cat = _labelled_means(text_means_cat, text_categories, 'text')
mean_category_features_text_inst = _labelled_means(text_means_inst, text_categories, 'text')

# photo rows first, then text rows, indexed by '<category>_<condition>'
mean_category_features = _stack_photo_and_text(
    mean_category_features_photo, mean_category_features_text)
mean_category_features_cat = _stack_photo_and_text(
    mean_category_features_photo, mean_category_features_text_cat)
mean_category_features_inst = _stack_photo_and_text(
    mean_category_features_photo, mean_category_features_text_inst)

In [None]:
# Pairwise distances between the mean feature vectors (pdist's default
# metric is Euclidean). The title now names the quantity that is plotted;
# it previously said "Correlation coefficient".
between_condition_RDM = pd.DataFrame(squareform(pdist(mean_category_features.values)),
            columns = mean_category_features.index, index = mean_category_features.index)

plt.figure(figsize=(18,25))
sns.heatmap(between_condition_RDM,cbar_kws={'orientation':'horizontal'})
plt.xlabel('category-condition pairs')
plt.ylabel('category-condition pairs')
plt.title('Pairwise euclidean distance of mean feature vectors of each category-condition pair (fc6)', fontsize=26);

In [None]:
# correlation matrix of the mean feature vectors (photo rows, then text rows)
between_condition_RDM = pd.DataFrame(
    data=np.corrcoef(mean_category_features.values),
    index=mean_category_features.index,
    columns=mean_category_features.index,
)

plt.figure(figsize=(18, 25))
sns.heatmap(between_condition_RDM, cbar_kws=dict(orientation='horizontal'))
plt.xlabel('category-condition pairs')
plt.ylabel('category-condition pairs')
plt.title('Correlation coefficient of mean feature vectors of each category-condition pair (fc6)', fontsize=26);

In [None]:
# Correlation matrix of the photo means and the categorydraw text means.
# The title now names the plotted quantity (np.corrcoef); it previously said
# "euclidean distance".
between_condition_RDM_cat = pd.DataFrame(np.corrcoef(mean_category_features_cat.values),
            columns = mean_category_features_cat.index, index = mean_category_features_cat.index)

plt.figure(figsize=(18,25))
sns.heatmap(between_condition_RDM_cat,cbar_kws={'orientation':'horizontal'})
plt.xlabel('category-condition pairs')
plt.ylabel('category-condition pairs')
plt.title('Pairwise correlation coefficients of mean feature vectors of each category-condition pair', fontsize=26);

In [None]:
# correlation matrix of the photo means and the instancedraw text means
between_condition_RDM_inst = pd.DataFrame(
    data=np.corrcoef(mean_category_features_inst.values),
    index=mean_category_features_inst.index,
    columns=mean_category_features_inst.index,
)

plt.figure(figsize=(18, 25))
sns.heatmap(between_condition_RDM_inst, cbar_kws=dict(orientation='horizontal'))
plt.xlabel('category-condition pairs')
plt.ylabel('category-condition pairs')
plt.title('Pairwise correlation coefficients of mean feature vectors of each category-condition pair', fontsize=26);

In [None]:
# element-wise absolute difference between the two correlation matrices
between_condition_RDM_diff = (between_condition_RDM_cat - between_condition_RDM_inst).abs()

plt.figure(figsize=(18, 25))
sns.heatmap(between_condition_RDM_diff, cbar_kws=dict(orientation='horizontal'))
plt.xlabel('category-condition pairs')
plt.ylabel('category-condition pairs')
plt.title('Difference between category and instance feature representations', fontsize=26);

In [None]:
# Compare the [32:, 32:] blocks of the two correlation matrices.
# Only the strictly upper triangle of each block is used: the blocks are
# symmetric with a diagonal of self-correlations, so flattening them whole
# counted every pair twice and added the diagonal as extra observations.
# NOTE(review): both samples cover the same category pairs, so a paired test
# may be more appropriate — confirm.
n_leading_rows = 32
cat_block = between_condition_RDM_cat.iloc[n_leading_rows:, n_leading_rows:].values
inst_block = between_condition_RDM_inst.iloc[n_leading_rows:, n_leading_rows:].values
ttest_ind(cat_block[np.triu_indices(len(cat_block), k=1)],
          inst_block[np.triu_indices(len(inst_block), k=1)])

In [None]:
# Upper-triangle pairwise distances between the category-mean feature vectors
# of each sketch set.
# NOTE: get_correlation_distances and the *_fc6_mean arrays are defined in
# cells further down in this notebook.
# The photo-cued experiments stay disabled:
#cat_photo_corrs = get_correlation_distances(cat_photo_fc6_mean)
#inst_photo_corrs = get_correlation_distances(inst_photo_fc6_mean)
cat_text_corrs, inst_text_corrs = (
    get_correlation_distances(mean_vectors)
    for mean_vectors in (cat_text_fc6_mean, inst_text_fc6_mean)
)
photo_cues_corrs, sketchy_sketches_corrs = (
    get_correlation_distances(mean_vectors)
    for mean_vectors in (photo_cues_fc6_mean, sketchy_sketches_fc6_mean)
)

# verify everything is the same size
assert len(cat_text_corrs) == len(inst_text_corrs)  # == len(cat_photo_corrs) == len(inst_photo_corrs)

#### More RDM analyses

In [None]:
def _add_category_and_id(meta):
    """Derive 'category' and 'id' from photo_id and rename the feature index column.

    photo_id is split on backslashes and the second component is used; it is
    then split from the right on its last two underscores. The leading part
    becomes 'category' and the two trailing parts, joined by '_', become 'id'.
    ``n`` is passed by keyword because pandas 2 no longer accepts it
    positionally in ``str.rsplit``.
    """
    stem_parts = meta.photo_id.str.split('\\', expand=True).iloc[:, 1]\
                              .str.rsplit('_', n=2, expand=True)
    meta['category'] = stem_parts.iloc[:, 0]
    meta['id'] = stem_parts[[1, 2]].agg('_'.join, axis=1)
    return meta.rename(columns={"sketch_feature_ind": "feature_ind"})


# feature representations, one array per sketch set (the photo-cued
# experiments are currently disabled)
cat_text_fc6 = np.load(os.path.join(feature_dir, 'FEATURES_FC6_sketch_no-channel-norm_categorydraw-text.npy'))
inst_text_fc6 = np.load(os.path.join(feature_dir, 'FEATURES_FC6_sketch_no-channel-norm_instancedraw-text.npy'))
#cat_photo_fc6 = np.load(os.path.join(feature_dir, f'FEATURES_FC6_sketch_no-channel-norm_categorydraw-photo.npy'))
#inst_photo_fc6 = np.load(os.path.join(feature_dir, f'FEATURES_FC6_sketch_no-channel-norm_instancedraw-photo.npy'))

photo_cues_fc6 = np.load(os.path.join(feature_dir, 'FEATURES_FC6_sketch_no-channel-norm_photodraw32_stims.npy'))
sketchy_sketches_fc6 = np.load(os.path.join(feature_dir, 'FEATURES_FC6_sketch_no-channel-norm_sketchy_sketches.npy'))


photo_cues_meta = pd.read_csv(os.path.join(feature_dir, 'METADATA_sketch_photodraw32_stims.csv'))
sketchy_sketches_meta = pd.read_csv(os.path.join(feature_dir, 'METADATA_sketch_sketchy_sketches.csv'))

photo_cues_meta = photo_cues_meta.rename(columns={'sketch_id': 'photo_id'})
sketchy_sketches_meta = sketchy_sketches_meta.rename(columns={'sketch_id': 'photo_id'})

photo_cues_meta = _add_category_and_id(photo_cues_meta)
sketchy_sketches_meta = _add_category_and_id(sketchy_sketches_meta)

# for the sketchy sketches the derived id has the form '<id>-<sketchNum>'
sketchy_sketches_meta[['id', 'sketchNum']] = sketchy_sketches_meta.id.str.split('-', expand=True)

In [None]:
def get_mean_feature_vectors(metadata, features, groupby_cat = 'category'):
    """Average the feature rows of every group defined by ``groupby_cat``.

    Returns a pair: the group labels in groupby order, and an array holding
    one mean feature vector per group (stacked along axis 0, same order).
    """
    def _group_mean(rows):
        # rows of `features` are picked through this group's feature_ind values
        return features[rows.feature_ind.values].mean(axis=0)

    per_group = metadata.groupby(groupby_cat).apply(_group_mean)
    labels = per_group.index.values
    stacked = np.stack(per_group.tolist(), axis=0)
    return labels, stacked

def get_correlation_distances(mean_features, upper = True, metric = "euclidean"):
    """Return pairwise distances between the rows of ``mean_features``.

    Despite the name, the values are ``scipy.spatial.distance.pdist``
    distances under ``metric`` (Euclidean by default), not correlations.

    Parameters
    ----------
    mean_features : 2-D array-like
        One row per item.
    upper : bool
        If truthy, return the 1-D vector of distances for all pairs i < j in
        row-major order. This is exactly pdist's condensed output, so the
        square matrix is no longer built only to be sliced again. Otherwise
        return the full symmetric square matrix.
    metric : str
        Any metric accepted by ``pdist``.
    """
    condensed = pdist(mean_features, metric = metric)
    #corrs = stats.spearmanr(mean_features, axis=1)[0]

    if upper:
        return condensed
    return squareform(condensed)

In [None]:
# mean feature representation per category for each sketch set
# (the photo-cued experiments are currently disabled)
c1, cat_text_fc6_mean = get_mean_feature_vectors(K_cat_text, cat_text_fc6)
c2, inst_text_fc6_mean = get_mean_feature_vectors(K_inst_text, inst_text_fc6)
#c3, cat_photo_fc6_mean = get_mean_feature_vectors(K_cat_photo, cat_photo_fc6)
#c4, inst_photo_fc6_mean = get_mean_feature_vectors(K_ins_photo, inst_photo_fc6)

c5, photo_cues_fc6_mean = get_mean_feature_vectors(photo_cues_meta, photo_cues_fc6)
c6, sketchy_sketches_fc6_mean = get_mean_feature_vectors(sketchy_sketches_meta, sketchy_sketches_fc6)

# verify the category labels are in the same order for every set.
# np.array_equal also handles label arrays of different lengths (it returns
# False), where an element-wise `==` would not give a usable result.
assert all(np.array_equal(i, j) for i, j in combinations([c1, c2, c5, c6], 2))

In [None]:
# note: we can subset just the sketch ids used in photodraw2x2 experiments as a placeholder for instancedraw-photo
# `.copy()` gives an independent frame, so re-numbering feature_ind below does
# not write into a slice of sketchy_sketches_meta.
inst_photo_meta = sketchy_sketches_meta[sketchy_sketches_meta.id.isin(photo_cues_meta.id)].copy()
inst_photo_fc6 = sketchy_sketches_fc6[inst_photo_meta.feature_ind.values]
# after subsetting, row i of inst_photo_fc6 belongs to row i of inst_photo_meta
inst_photo_meta['feature_ind'] = np.arange(len(inst_photo_meta))
c4, inst_photo_fc6_mean = get_mean_feature_vectors(inst_photo_meta, inst_photo_fc6)
inst_photo_corrs = get_correlation_distances(inst_photo_fc6_mean)

In [None]:
# note: we can subset just the sketch ids used in photodraw2x2 experiments as a placeholder for instancedraw-photo
# NOTE(review): this cell repeats the preceding cell's computation.
# `.copy()` gives an independent frame, so re-numbering feature_ind below does
# not write into a slice of sketchy_sketches_meta.
inst_photo_meta = sketchy_sketches_meta[sketchy_sketches_meta.id.isin(photo_cues_meta.id)].copy()
inst_photo_fc6 = sketchy_sketches_fc6[inst_photo_meta.feature_ind.values]
# after subsetting, row i of inst_photo_fc6 belongs to row i of inst_photo_meta
inst_photo_meta['feature_ind'] = np.arange(len(inst_photo_meta))
c4, inst_photo_fc6_mean = get_mean_feature_vectors(inst_photo_meta, inst_photo_fc6)
inst_photo_corrs = get_correlation_distances(inst_photo_fc6_mean)

In [None]:
# Spearman correlation between the distance vectors of the five sets (each
# row is one variable). The test is run once; the correlation matrix and the
# p-value matrix are then printed in turn.
rdm_rho, rdm_pvalues = stats.spearmanr([cat_text_corrs, inst_text_corrs, inst_photo_corrs,
                                        photo_cues_corrs, sketchy_sketches_corrs], axis=1)
print(rdm_rho)

print(rdm_pvalues)

In [None]:
# heatmap of 1 - Spearman correlation between the five distance vectors
plt.figure(figsize=(8, 6))
sns.heatmap(
    1 - stats.spearmanr(
        [cat_text_corrs, inst_text_corrs, inst_photo_corrs, photo_cues_corrs, sketchy_sketches_corrs],
        axis=1,
    )[0],
    square=True,
    xticklabels=['cat_text', 'inst_text', 'inst_photo', 'photo_cues', 'sketchy_sketches'],
    yticklabels=['cat_text', 'inst_text', 'inst_photo', 'photo_cues', 'sketchy_sketches'],
)
plt.title('Distance matrix of various RDMs (testing)');

In [None]:
# mean feature vector per `id`, then the full square matrix of pairwise distances
c8, id_feature_vectors = get_mean_feature_vectors(
    inst_photo_meta, inst_photo_fc6, groupby_cat='id')
id_feature_vectors_corrs = get_correlation_distances(
    id_feature_vectors, upper=False)

In [None]:
# heatmap of the per-id distance matrix, in groupby order
plt.figure(figsize=(8, 6))
sns.heatmap(data=id_feature_vectors_corrs, square=True)

In [None]:
# get typicality ratings
norming_data = pd.read_csv(os.path.join(csv_dir, 'photodraw_sketchy32_typicality_ratings.csv'))
# `.copy()` makes the filtered frame independent, so adding a column below
# does not write into a slice of the raw table.
norming_data = norming_data[(norming_data.catch_trial == False) &
                            (norming_data.repeat_offender == False) & 
                            (norming_data.failed_catches == False) & 
                            (norming_data.tooDissimilar == False)].copy()

# sketchy_id = first two '_'-separated tokens of the file name without its
# extension. `n` is passed by keyword because pandas 2 no longer accepts it
# positionally in str.rsplit.
norming_data['sketchy_id'] = norming_data.img_id.str.rsplit('/', n=1, expand = True).iloc[:,1]\
                                                .str.rsplit('.', n=1, expand = True).iloc[:,0]\
                                                .str.split('_',expand=True)[[0,1]].agg('_'.join, axis = 1)
# summary statistics of the ratings per (category, sketchy_id)
norming_data = norming_data.groupby(['category','sketchy_id']).enumerated_ratings.describe().reset_index()

In [None]:
# Order the ids by category, then mean rating, then median rating, and
# recompute the square distance matrix in that order.
sorted_ids = norming_data.sort_values(by=['category', 'mean', '50%']).sketchy_id.values
# position of each id in c8 (and therefore in id_feature_vectors)
photo_ids = {sid: position for position, sid in enumerate(c8)}
rearrangement = [photo_ids[sid] for sid in sorted_ids]
id_feature_vectors_corrs_arr = get_correlation_distances(
    id_feature_vectors[rearrangement], upper=False)

In [None]:
# heatmap of the per-id distance matrix in the rating-sorted order
plt.figure(figsize=(8, 6))
sns.heatmap(data=id_feature_vectors_corrs_arr, square=True)

### How does photo-cue typicality relate to sketch recognizability?

In [None]:
# Reload the typicality ratings and recompute isTypical and the typicality maps.
R = pd.read_csv(os.path.join(csv_dir, 'photodraw_sketchy32_typicality_ratings.csv'))

# Drop flagged rating rows. `.copy()` makes R an independent frame, so the
# column added below is not written into a slice of the raw table.
R = R[(R.repeat_offender == False) & (R.tooDissimilar == False) &
      (R.failed_catches == False) & (R.catch_trial == False)].copy()
# imageURL = first two '_'-separated tokens of the last '/'-separated part of img_id
R['imageURL'] = R.img_id.str.split('/', expand=True).iloc[:,-1].str.split('_', expand=True)\
                                                               .iloc[:,:2].agg('_'.join, axis=1)

# mean rating per (image, category)
rara = R.groupby(['imageURL', 'category']).enumerated_ratings.mean().reset_index()
# An image counts as typical when its mean rating is at least the median of
# the image means in its category. One grouped transform replaces the
# per-row recomputation of the category median.
category_median = rara.groupby('category').enumerated_ratings.transform('median')
rara['isTypical'] = rara.enumerated_ratings >= category_median
K['isTypical'] = K.imageURL.map(dict(zip(rara.imageURL, rara.isTypical)))

cat_ratings_map = R.groupby('category').enumerated_ratings.mean().to_dict()
inst_ratings_map = R.groupby('imageURL').enumerated_ratings.mean().to_dict()
K['cat_typicality'] = K.category.map(cat_ratings_map)
K['inst_typicality'] = K.imageURL.map(inst_ratings_map)

In [None]:
def find_nearest(array,value):
    """Return the index of the element of sorted ``array`` closest to ``value``.

    The left insertion point is looked up first. The left neighbour is
    returned when the insertion point is past the end, or when ``value`` is
    strictly closer to the left neighbour than to the element at the
    insertion point; on a tie the insertion point itself is returned.
    """
    pos = np.searchsorted(array, value, side="left")
    if pos <= 0:
        # nothing to the left to compare against
        return pos
    if pos == len(array):
        return pos - 1
    gap_below = math.fabs(value - array[pos - 1])
    gap_above = math.fabs(value - array[pos])
    return pos - 1 if gap_below < gap_above else pos
    
# Reload the typicality ratings and recompute the per-category rating bins.
R = pd.read_csv(os.path.join(csv_dir, 'photodraw_sketchy32_typicality_ratings.csv'))

# Drop flagged rating rows. `.copy()` makes R an independent frame, so the
# column added below is not written into a slice of the raw table.
R = R[(R.repeat_offender == False) & (R.tooDissimilar == False) &
      (R.failed_catches == False) & (R.catch_trial == False)].copy()
# imageURL = first two '_'-separated tokens of the last '/'-separated part of img_id
R['imageURL'] = R.img_id.str.split('/', expand=True).iloc[:,-1].str.split('_', expand=True)\
                                                               .iloc[:,:2].agg('_'.join, axis=1)

# mean rating per (image, category)
rara = R.groupby(['imageURL', 'category']).enumerated_ratings.mean().reset_index()
# An image counts as typical when its mean rating is at least the median of
# the image means in its category (one grouped transform, not a per-row scan).
category_median = rara.groupby('category').enumerated_ratings.transform('median')
rara['isTypical'] = rara.enumerated_ratings >= category_median
# A small jitter breaks ties so that qcut can form equal-sized bins; the seed
# is fixed so the bin assignment is the same on every run.
jitter = np.random.RandomState(0).rand(len(rara)) - 0.5
rara['enumerated_ratings'] = rara['enumerated_ratings'] + 0.001 * jitter
# NOTE(review): the column is called 'decile' but qcut is asked for 8 bins.
rara['decile'] =  rara.groupby(['category'])['enumerated_ratings'].transform(
                     lambda x: pd.qcut(x, 8, labels=range(8)))

#rara['decile'] = rara.apply(lambda row: find_nearest(pd.cut(rara[rara.category == row.category].enumerated_ratings, \
#                                            9, labels = range(9), retbins=True)[-1], row.enumerated_ratings), axis = 1)
K['decile'] = K.imageURL.map(dict(zip(rara.imageURL, rara.decile)))


In [None]:
sns.set_style('ticks')
# font type 42 for both the PDF and the PS backend
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# `prob_true_predict_fc6_logodds` against `decile`, one line per goal,
# photo condition only, rows flagged isOutlier excluded
plt.figure(figsize=(3, 8))
sns.lineplot(
    data=K[(K.isOutlier == False) & (K.condition == 'photo')],
    x='decile',
    y='prob_true_predict_fc6_logodds',
    hue='goal',
    linewidth=4,
    palette=["#C93312", "#899DA4"],
    legend=False,
)
plt.ylabel('')
plt.xlabel('');
#plt.savefig(os.path.join(plot_dir, 'photodraw2x2_typicality_logodds_lineplot.pdf'), bbox_inches = 'tight', transparent=True)

In [None]:
# Effect of typicality on recognizability by goal:
# mean `prob_true_predict_fc6` per (goal, isTypical), photo condition only
(
    K.loc[(K['isOutlier'] == False) & (K.condition == 'photo')]
     .groupby(['goal', 'isTypical'])['prob_true_predict_fc6']
     .mean()
)

In [None]:
# `prob_true_predict_fc6_logodds` per goal, bars split by isTypical
# (photo condition only, rows flagged isOutlier excluded).
g = sns.catplot(
    data=K[(K['isOutlier'] == False) & (K.condition == 'photo')], kind="bar",
    x="goal", y="prob_true_predict_fc6_logodds", hue="isTypical", palette="dark", alpha=.7, height=5
)
g.despine(left=True)
# The x-axis shows `goal`; `isTypical` is the hue. The x label previously
# read "isTypical".
g.set_axis_labels("goal", "probability (logodds)")
plt.title('Probability of correct classification (logodds)');


### Effect of ImageNet

In [None]:
# `ast` is not among the notebook's top-level imports, so it is imported here.
import ast

# Read the label file and parse its content as a Python literal.
# The handle is not called `f`, because `f` is the name under which the
# F-distribution is imported from scipy.stats at the top of the notebook.
with open('imagenet1000_labels.txt', encoding='utf-8') as labels_file:
    imagenet_labels = labels_file.read()
imagenet_labels = ast.literal_eval(imagenet_labels)

In [None]:
x = pd.DataFrame(inImagenet.items())
x[x[1] == False][0].values

In [None]:
inImagenet = dict([[j, sum([j in i for i in list(imagenet_labels.values())]) != 0] for j in K.category.unique()])
K['in_imagenet'] = K.category.map(inImagenet)

K.groupby('in_imagenet')[['prob_true_predict_fc6', 'true_predict_fc6', 'prob_true_predict_instance',\
                          'true_predict_instance']].apply(np.mean)

In [None]:
# `prob_true_predict_fc6` per experiment, bars split by in_imagenet;
# the legend is anchored outside the axes, to the right
plt.figure(figsize=(8, 6))
sns.barplot(
    x='experiment',
    y='prob_true_predict_fc6',
    hue='in_imagenet',
    data=K,
)
plt.legend(title='in_imagenet', bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);


In [None]:
# `prob_true_predict_fc6` per in_imagenet value, bars split by condition
sns.barplot(
    x='in_imagenet',
    y='prob_true_predict_fc6',
    hue='condition',
    data=K,
)

In [None]:
# share of categories whose name is exactly equal to some ImageNet label
np.mean([any(j == i for i in list(imagenet_labels.values())) for j in K.category.unique()])

In [None]:
# share of categories whose name occurs inside some ImageNet label
np.mean([any(j in i for i in list(imagenet_labels.values())) for j in K.category.unique()])

### Basic demographics

In [None]:
# demographic information for photodraw2x2
# The counts are printed explicitly: a notebook cell only displays its last
# expression, so these three tables were previously computed and discarded.
# NOTE(review): the division by 32 presumably converts row counts into
# participant counts — confirm.
print(K['participantSex'].value_counts() / 32)
print(K.inputDevice.value_counts() / 32)
print(K.participantAge.value_counts() / 32)

# mean age of participants, removing outlier datapoints
# `.copy()` gives an independent frame, so the column conversion below does
# not write into a slice of K. pd.to_numeric also handles numeric strings
# such as "1990.0", which a direct astype(int) rejects.
knew = K[pd.to_numeric(K.participantAge, errors='coerce').notnull()].copy()
knew['participantAge'] = pd.to_numeric(knew.participantAge).astype(int)
# participantAge is subtracted from 2021; only values strictly between 1930
# and 2020 are kept, taking the first value per gameID
(2021 - knew[(knew.participantAge > 1930) & (knew.participantAge < 2020)
            ].groupby('gameID').participantAge.first().values).mean()