### package imports and set up paths


In [None]:
import os, sys
import shutil
import re
from glob import glob

import random
import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
from io import BytesIO
from PIL import Image
import base64
import PIL
from collections import Counter
from skimage import io
import requests
import socket
from scipy.spatial import distance as dist
from scipy.spatial.distance import pdist
from sklearn import preprocessing
from scipy.spatial.distance import squareform
from scipy.stats import f

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

from IPython.display import clear_output
import importlib

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [None]:
def make_dir_if_not_exists(dir_name):
    """Create directory *dir_name* (including missing parents) and return it.

    An already existing directory is left untouched. ``exist_ok=True`` lets
    ``os.makedirs`` deal with that case itself, so there is no gap between an
    existence check and the creation in which another process could create
    the directory and trigger a ``FileExistsError``.

    Raises ``FileExistsError`` if *dir_name* exists but is not a directory.
    """
    os.makedirs(dir_name, exist_ok=True)
    return dir_name
def list_files(path, ext='png'):
    """Return every file under *path* (searched recursively) whose name matches ``*.<ext>``."""
    pattern = '*.%s' % ext
    matches = []
    # Visit each directory in the tree and glob for the extension inside it.
    for dirpath, _dirnames, _filenames in os.walk(path):
        matches.extend(glob(os.path.join(dirpath, pattern)))
    return matches

### load in class-level annotations

In [None]:
## load in the class-level annotations and preview the first rows
annotations_csv = 'sketchy_class_annotations.csv'
X = pd.read_csv(annotations_csv)
X.head()

In [None]:
## assign a new column that only contains the classes that seem pretty diverse
# A class is flagged when its 'Comments' entry starts with 'yes'.
# The vectorized prefix test replaces a row-wise lambda; casting to str first
# means a missing comment (NaN) yields False instead of raising on slicing.
X = X.assign(Diverse=X['Comments'].astype(str).str.startswith('yes'))

In [None]:
## report the number of diverse classes against the total number of classes
num_diverse = X['Diverse'].sum()
total_classes = X.shape[0]
summary = 'There are {} diverse classes out of a total of {} classes.'.format(num_diverse, total_classes)
print(summary)

In [None]:
## keep only the rows whose Diverse flag is True
is_diverse = X['Diverse'].eq(True)
Y = X[is_diverse]

In [None]:
## list the basic-level names of the retained classes
diverse_names = Y['Basic-level'].values
print(diverse_names)

##### informal considerations for photodraw32 compared to photodraw1
- Would be nice for all participants to produce both photo-cued and text-cued sketches from each class
- If not that, would be nice for participants to produce a sketch of every class, even if not from same cue type
- Would be nice to have a larger set of classes overall
- Would be nice to have a larger set of images in each class
- Would be nice to have roughly the same number of images in each class as there are classes in total (or at least in a similar ballpark), so that the set of alternatives for photo-level classification is similar in size to that of class-level classification

In [None]:
## non-null counts per (Natural, Familiar) combination
Y.groupby(by=['Natural', 'Familiar']).count()

In [None]:
## non-null counts per (Natural, Familiar, Large) combination
Y.groupby(by=['Natural', 'Familiar', 'Large']).count()

- (Manually) identify a set of 8 classes from each (Natural,Familiar) combination, yielding 32 classes in total. (done)
- Identify at least a set of 32 photos from each of these 32 classes and copy 32 images to a new file structure that will be photodraw32 (done)

In [None]:
## classes with Natural == False and Familiar == False
not_natural = Y['Natural'].eq(False)
not_familiar = Y['Familiar'].eq(False)
print(Y.loc[not_natural & not_familiar, "Basic-level"].values)

In [None]:
## classes with Natural == False and Familiar == True
not_natural = Y['Natural'].eq(False)
is_familiar = Y['Familiar'].eq(True)
print(Y.loc[not_natural & is_familiar, "Basic-level"].values)

In [None]:
## classes with Natural == True and Familiar == False
is_natural = Y['Natural'].eq(True)
not_familiar = Y['Familiar'].eq(False)
print(Y.loc[is_natural & not_familiar, "Basic-level"].values)

In [None]:
## classes with Natural == True and Familiar == True
is_natural = Y['Natural'].eq(True)
is_familiar = Y['Familiar'].eq(True)
print(Y.loc[is_natural & is_familiar, "Basic-level"].values)

#### Before reducing:

| Natural      | Familiar |   Categories |
| ----------- | ----------- | ----------- |
| True      | True  | beetle, butterfly, cat, dog, fish, flower, mushroom, rabbit, raccoon, seal, spider, squirrel, tree   |
| True   | False | ape, bat, bear, camel, elephant, hermit_crab, jellyfish, kangaroo, lion, ray, scorpion, sheep, snake, starfish |
| False | True | airplane, bread, car_(sedan), church, cup, fan, hat, piano, pickup_truck, shoe, skyscraper, teapot, window |
| False   |  False |  axe, blimp, castle, hotdog, jack-o-lantern, motorcycle, saw, windmill |

#### After reducing:

| Natural      | Familiar |   Categories |
| ----------- | ----------- | ----------- |
| True      | True  | butterfly, cat, fish, flower, mushroom, raccoon, squirrel, tree |
| True   | False | ape, elephant, jellyfish, kangaroo, lion, ray, scorpion, snake |
| False | True | airplane, bread, car_(sedan), cup, hat, piano, skyscraper, window |
| False   |  False |  axe, blimp, castle, hotdog, jack-o-lantern, motorcycle, saw, windmill |

In [None]:
# The four 8-class groups of the reduced set, one per (Natural, Familiar) cell.

# Natural = True, Familiar = True
group1 = [
    'butterfly', 'cat', 'fish', 'flower',
    'mushroom', 'raccoon', 'squirrel', 'tree',
]
# Natural = True, Familiar = False
group2 = [
    'ape', 'elephant', 'jellyfish', 'kangaroo',
    'lion', 'ray', 'scorpion', 'snake',
]
# Natural = False, Familiar = True
group3 = [
    'airplane', 'bread', 'car_(sedan)', 'cup',
    'hat', 'piano', 'skyscraper', 'window',
]
# Natural = False, Familiar = False
group4 = [
    'axe', 'blimp', 'castle', 'hotdog',
    'jack-o-lantern', 'motorcycle', 'saw', 'windmill',
]

In [None]:
#print("True\n" * 32 * 8 * 2, end="")
#print("False\n" * 32 * 8 * 2, end="")

In [None]:
#print("True\n" * 32 * 8 * 1, end="")
#print("False\n" * 32 * 8 * 1, end="")
#print("True\n" * 32 * 8 * 1, end="")
#print("False\n" * 32 * 8 * 1, end="")

In [None]:
#for group in [group1, group2, group3, group4]:
#    for category in group:
#        print((category + '\n') * 32, end = "")

### Made directory with new 32x32 dataset in photodraw directory

In [None]:
## load the table of image paths (later cells read its Category and Path columns)
image_paths_csv = 'sketchy_image_paths.csv'
df = pd.read_csv(image_paths_csv)

In [None]:
# The stimulus root is the parent of the current working directory; the
# photodraw32 stimuli go into a sub-directory of it.
stim_dir = os.path.abspath('..')
photodraw_32_stims = os.path.join(stim_dir, 'photodraw32_stims')

# One sub-directory per category listed in the image-path table.
category_paths = [os.path.join(photodraw_32_stims, category) for category in df.Category.unique()]

# Plain loop: the directories are created purely for the side effect, so no
# throwaway list of return values is built (or echoed as the cell output).
for directory in [stim_dir, photodraw_32_stims] + category_paths:
    make_dir_if_not_exists(directory)

In [None]:
# Guard flag: set to a truthy value to actually perform the copy below.
reallyRun = 0
if reallyRun:
    # copy every listed image into the directory of its category (in photodraw repo)
    for source, category in zip(df.Path, df.Category):
        shutil.copy(source, os.path.join(photodraw_32_stims, category))

### Create metadata file for amazon s3 upload

In [None]:
# Get updated filenames in photodraw32_stims
# Change into the stimulus root computed earlier (stim_dir) instead of a
# relative '../', so re-running this cell does not climb one more level.
os.chdir(stim_dir)
destinationFiles = list_files('photodraw32_stims', 'png') # change to png or jpg if not working


def _stim_sort_key(path):
    """Return (parent directory name, integer just before the extension) for *path*.

    Built on os.path so the key works with either path separator; the integer
    is the last '_'- or '.'-delimited token of the file name before the
    extension.
    """
    category = os.path.basename(os.path.dirname(path))
    number = int(re.split(r'[_.]', os.path.basename(path))[-2])
    return category, number


# Order by category directory first, then numerically within the category.
destinationFiles = sorted(destinationFiles, key=_stim_sort_key)

In [None]:
# Zero-padded two-digit image indices '00'..'31', repeated 32 times.
indices = pd.Series(['%02d' % i for i in range(32)] * 32, dtype=str)

# Sorted category labels, each repeated 32 times.
category_labels = [category for category in sorted(df.Category.unique()) for _ in range(32)]

photodraw32_metadata = pd.DataFrame(data={
    'category': category_labels,
    'index': indices,
    'sketchy_preprocessing_mode': 'tx_000100000000',
    'sketchy_filepath': destinationFiles,
})

# Derived columns, computed column-wise rather than row by row.
sketchy_names = photodraw32_metadata['sketchy_filepath'].map(lambda path: os.path.split(path)[1])
photodraw32_metadata['sketchy_filename'] = sketchy_names
photodraw32_metadata['photodraw32_filename'] = photodraw32_metadata['category'] + '_' + photodraw32_metadata['index']
# s3 name = original stem + '_' + photodraw32 name + original 4-character extension
photodraw32_metadata['s3_filename'] = (sketchy_names.str[:-4] + '_'
                                       + photodraw32_metadata['photodraw32_filename']
                                       + sketchy_names.str[-4:])
photodraw32_metadata['s3_url'] = "https://photodraw32.s3.amazonaws.com/" + photodraw32_metadata['s3_filename']

In [None]:
from PIL import Image

# Guard flag: set to True to convert every listed .jpg file to .png on disk.
convertToPNG = False
if convertToPNG:
    for filename in destinationFiles:
        if not filename.endswith(".jpg"):
            continue
        png_filename = os.path.splitext(filename)[0] + '.png'
        # The context manager releases the source file handle before the
        # original is removed (an open file cannot be deleted on Windows).
        with Image.open(filename) as im:
            im.convert('RGB').save(png_filename)
        os.remove(filename)

In [None]:
# update metadata to reflect png
# Work column by column and assign through .loc so the write lands on the
# DataFrame itself (chained indexing can write to a temporary copy). This
# also avoids Series.iteritems(), which was removed in pandas 2.0.
for columnname in photodraw32_metadata.columns:
    as_text = photodraw32_metadata[columnname].astype(str)
    is_jpg = as_text.str.endswith(".jpg")
    if is_jpg.any():
        # Swap only a trailing 'jpg' extension for 'png'.
        photodraw32_metadata.loc[is_jpg, columnname] = as_text[is_jpg].str[:-3] + "png"

In [None]:
# For every category, shuffle a list holding each batch number 0-7 four times,
# and concatenate the per-category lists in category order.
batch_list = []
for _ in photodraw32_metadata.category.unique():
    batches = list(range(8)) * 4
    random.shuffle(batches)
    batch_list.extend(batches)
photodraw32_metadata['batch_num'] = batch_list

In [None]:
## spot-check: batch-number counts for one randomly chosen category
rand_category = np.random.choice(photodraw32_metadata.category.unique())
in_category = photodraw32_metadata.category == rand_category
photodraw32_metadata.loc[in_category, 'batch_num'].value_counts()

In [None]:
## show one randomly drawn metadata row
photodraw32_metadata.sample(n=1)

In [None]:
## write the metadata table next to the stimuli, without the row index
metadata_filename = 'photodraw32_metadata.csv'
dest_path = os.path.join(stim_dir, metadata_filename)
photodraw32_metadata.to_csv(path_or_buf=dest_path, index=False)

### turn into list of dicts

In [None]:
import csv
import pickle

# Raw strings keep the Windows backslashes literal without relying on
# unrecognised escape sequences such as '\p'.
metadata_csv_path = r'F:\photodraw\stimuli\photodraw32_metadata.csv'
metadata_js_path = r'F:\photodraw\experiments\photodraw_norming\photodraw32_metadata.js'

# Read every CSV row as a plain dict. newline='' is what the csv module
# expects for files it reads; the handle is named csv_file so it does not
# rebind the name `f`.
with open(metadata_csv_path, newline='') as csv_file:
    a = [dict(row) for row in csv.DictReader(csv_file, skipinitialspace=True)]

# Write the printed list of dicts; the with-block flushes and closes the
# output file.
with open(metadata_js_path, 'w') as js_file:
    print(a, file=js_file)

In [None]:
# Raw string: keeps the backslashes of the Windows path literal without
# relying on unrecognised escape sequences.
df = pd.read_csv(r'F:\photodraw\stimuli\photodraw32_metadata.csv')

In [None]:
# partition dataframe into 8 subsets with equal amount of images in each category
num_subsets = 8
subset_frames = [[] for _ in range(num_subsets)]
for cat in df.category.unique():
    # Shuffle the rows of this category, then deal them out to the subsets.
    shuffled = df[df.category == cat].sample(frac=1)
    # Split the row positions (not the frame itself) and slice with iloc.
    position_chunks = np.array_split(np.arange(len(shuffled)), num_subsets)
    for index, positions in enumerate(position_chunks):
        subset_frames[index].append(shuffled.iloc[positions])

# Concatenate each subset once with pd.concat (DataFrame.append was removed
# in pandas 2.0) and convert it to a list of row dicts.
df_list = [pd.concat(frames).to_dict('records') if frames else [] for frames in subset_frames]

# Raw string keeps the Windows backslashes literal; the with-block flushes
# and closes the output file.
sampled_js_path = r'F:\photodraw\experiments\photodraw_norming\photodraw32_metadata_sampled.js'
with open(sampled_js_path, 'w') as js_file:
    print(df_list, file=js_file)