### Import packages and set up paths

In [1]:
import os
import sys
import json
import utils
import socket
import base64
import importlib
import numpy as np
import pandas as pd
import pymongo as pm
from PIL import Image
from io import BytesIO
from scipy import stats

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# directory & file hierarchy
# (all locations are derived from the parent of the current working directory)
proj_dir = os.path.abspath('..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir, 'results')
csv_dir = os.path.join(results_dir, 'csv')
sketch_dir = os.path.abspath(os.path.join(proj_dir, 'sketches'))
gallery_dir = os.path.abspath(os.path.join(proj_dir, 'gallery'))

# on the host named 'nightingale' features are read from a fixed mount point;
# on every other machine they live inside the project folder
feature_dir = os.path.abspath(
    '/mnt/pentagon/photodraw/features/'
    if socket.gethostname() == 'nightingale'
    else os.path.join(proj_dir, 'features')
)

## add helpers to python path
## (guarded so that re-running this cell does not append the entry twice)
if not any(entry == os.path.join(proj_dir, 'utils') for entry in sys.path):
    sys.path.append(os.path.join(proj_dir, 'utils'))

def make_dir_if_not_exists(dir_name):
    """Create `dir_name` (including missing parent folders) and return it.

    Parameters
    ----------
    dir_name : str
        Path of the directory that should exist after the call.

    Returns
    -------
    str
        `dir_name`, unchanged, so the call can be used inline when building paths.

    Raises
    ------
    FileExistsError
        If `dir_name` already exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, which could race with
    # another process creating the same folder between the check and makedirs
    os.makedirs(dir_name, exist_ok=True)
    return dir_name

## create directories that don't already exist
## (`result` holds the same five paths, in the order given below)
result = list(map(
    make_dir_if_not_exists,
    (results_dir, csv_dir, sketch_dir, gallery_dir, feature_dir),
))

In [3]:
def get_coll_from_mongo(coll_name, iter_name, auth, reallyRun = False):
    """Pull the sketch, stroke and survey records of one iteration out of mongo.

    Parameters
    ----------
    coll_name : str
        Collection to read inside the 'photodraw' database.
    iter_name : str
        Value of the 'iterationName' field used to filter records.
    auth : pandas.DataFrame
        Frame whose first cell (`auth.values[0][0]`) is the password of the
        'sketchloop' mongo user. Not read when `reallyRun` is False.
    reallyRun : bool, default False
        Safety switch; the database is only contacted when this is True.

    Returns
    -------
    (K, T, S) : tuple of pandas.DataFrame
        Records whose 'eventType' is 'sketch', 'stroke' and 'survey' respectively.
        Three empty frames are returned when `reallyRun` is False.
    """
    if not reallyRun:
        # K, T and S used to be unbound on this path, so the default call
        # crashed with UnboundLocalError instead of returning something usable
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # set vars
    pswd = auth.values[0][0]
    user = 'sketchloop'

    # the client always talks to 127.0.0.1; only the local port differs by machine
    # (27020 when the hostname's first '_'-separated token is 'Justin', else 27017)
    port = 27020 if socket.gethostname().split('_')[0] == 'Justin' else 27017
    conn = pm.MongoClient('mongodb://' + user + ':' + pswd + '@127.0.0.1:' + str(port))
    try:
        coll = conn['photodraw'][coll_name]

        ## how many records do we have in mongo?
        # NOTE: this is the size of the whole collection, not only of `iter_name`
        print(f'We have {coll.estimated_document_count()} records in {coll_name}: {iter_name}.')

        queries = {event_type: {'iterationName': iter_name, 'eventType': event_type}
                   for event_type in ('sketch', 'stroke', 'survey')}

        # count_documents lets the server do the counting; the previous
        # len(list(coll.find(...))) downloaded every record just to count it
        print('{} of these are sketches.'.format(coll.count_documents(queries['sketch'])))
        print('{} of these are strokes.'.format(coll.count_documents(queries['stroke'])))
        print('{} of these are surveys.'.format(coll.count_documents(queries['survey'])))

        # the cursors are fully materialised here, so the client can be closed afterwards
        K = pd.DataFrame(coll.find(queries['sketch']))   # Sketches
        T = pd.DataFrame(coll.find(queries['stroke']))   # Strokes
        S = pd.DataFrame(coll.find(queries['survey']))   # Surveys
    finally:
        conn.close()
    return K, T, S

def subset_valid_drawings(data):
    """Keep only the rows that belong to complete sessions.

    A session is a (prolificID, gameID) pair; it counts as complete when it
    contains exactly 32 distinct values of `category`. Rows are returned with
    their original index and order.
    """
    cats_per_session = data.groupby(['prolificID', 'gameID'])['category'].nunique()
    session_game_ids = cats_per_session.index.get_level_values('gameID')
    complete_game_ids = session_game_ids[cats_per_session.values == 32]
    return data.loc[data['gameID'].isin(complete_game_ids)]

# method to flag outlier data
def flag_outliers(data, varList, sd = 3):
    """Mark rows whose z-score exceeds `sd` in magnitude on any column of `varList`.

    Z-scores are computed per column with `scipy.stats.zscore` using its
    default settings. Returns one boolean per row of `data`.
    """
    zscores = stats.zscore(data[varList])
    too_extreme = np.abs(zscores) > sd
    return too_extreme.any(axis=1)

In [4]:
def compute_class_predictions(data, labels):
    """Score every sample with a logistic regression that did not see it during fitting.

    Uses 5-fold stratified cross validation (shuffled, random_state 0). `data`
    and `labels` are indexed with integer index arrays, so both are expected to
    be numpy arrays of equal length.

    Returns
    -------
    prob_dict : dict
        sample position -> predicted probability of that sample's true label.
    pred_dict : dict
        sample position -> whether the predicted label equals the true label.
    """
    splitter = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
    classifier = LogisticRegression(max_iter=1000)

    prob_dict, pred_dict = {}, {}
    for train_ind, test_ind in splitter.split(data, labels):
        fitted = classifier.fit(data[train_ind], labels[train_ind])
        held_out_features, held_out_labels = data[test_ind], labels[test_ind]

        # column of predict_proba's output that belongs to each class label
        column_of = dict(zip(fitted.classes_, range(len(fitted.classes_))))

        # probability assigned to the true class of each held-out sample
        probarrs = fitted.predict_proba(held_out_features)
        true_columns = np.asarray([column_of[label] for label in held_out_labels])
        prob_of_classification = probarrs[np.arange(len(true_columns)), true_columns]

        # was the hard prediction right?
        is_correct = fitted.predict(held_out_features) == held_out_labels

        prob_dict.update(zip(test_ind, prob_of_classification))
        pred_dict.update(zip(test_ind, is_correct))

    # sanity check: everything stored must be a valid probability
    all_probs = np.fromiter(prob_dict.values(), dtype = float)
    assert all(all_probs <= 1)
    assert all(all_probs >= 0)

    return prob_dict, pred_dict

# convert raw png data to greyscale image array
def image_to_array(imgdata, imgsize = (224, 224)):
    """Decode a base64-encoded image, resize it and return 255 minus its 4th channel.

    The image is resized to `imgsize` with PIL's default resampling. Channel
    index 3 is read, so the decoded image must have at least four channels
    (presumably RGBA, where index 3 is alpha -- confirm against the stored pngData).
    """
    raw_bytes = base64.b64decode(imgdata)
    resized = Image.open(BytesIO(raw_bytes)).resize(imgsize)
    fourth_channel = np.asarray(resized)[:, :, 3]
    return 255 - fourth_channel

def to_rgb(im):
    """Replicate a 2-D array into three identical channels.

    Returns a new uint8 array of shape (im.shape[0], im.shape[1], 3); raises
    ValueError when `im` is not 2-D.
    """
    n_rows, n_cols = im.shape
    rgb = np.empty((n_rows, n_cols, 3), dtype=np.uint8)
    # one broadcast assignment fills all three channels with the same values
    rgb[...] = np.asarray(im)[:, :, None]
    return rgb

In [5]:
def setup_dataframes(sketch_data, stroke_data, survey_data, experiment_type):
    """Clean one experiment's raw records and derive per-sketch measures.

    Steps, in order: keep complete sessions (32 categories) and a single session
    per prolificID; add `activeSketchTime`, `totalInk` and `isOutlier`; decode
    the sketches to pixel arrays; add cross-validated pixel-based classifier
    scores (`prob_true_predict`, `true_predict`); normalise `imageURL`; join the
    survey answers; tag every frame with `experiment`, `goal` and `batch_num`;
    finally drop `prolificID`.

    Parameters
    ----------
    sketch_data, stroke_data, survey_data : pandas.DataFrame
        Raw records, e.g. as returned by `get_coll_from_mongo`.
    experiment_type : str
        Experiment label such as 'categorydraw-text'; the text before the first
        '-' is stored as `goal`.

    Returns
    -------
    sketch_data, stroke_data, survey_data : pandas.DataFrame
        Cleaned frames.
    img_data_tosave : numpy.ndarray
        Un-centered sketch pixels, shape (number of sketches, 224, 224, 3), uint8,
        in the same row order as the returned `sketch_data`.
    """

    # subset the sessions that were complete (all 32 trials done)
    sketch_data = subset_valid_drawings(sketch_data)
    stroke_data = subset_valid_drawings(stroke_data)

    # then sort the trial values comparably
    sketch_data = sketch_data.sort_values(by=['gameID', 'category']).reset_index(drop=True)
    stroke_data = stroke_data.sort_values(by=['gameID', 'category']).reset_index(drop=True)
    survey_data = survey_data.sort_values(by=['gameID']).reset_index(drop=True)

    # count the number of unique categories in each prolific-gameID combo
    id_counts = pd.DataFrame(sketch_data.groupby(['prolificID','gameID']).category.nunique()).reset_index()
    # get the gameIDs corresponding only to complete sessions, one per participant.
    # `keep` must be passed by keyword: it is keyword-only in pandas >= 2.0.
    # NOTE(review): rows are in groupby order here, so 'last' is the last gameID in
    # sort order for that participant, not necessarily the most recent session.
    valid_gameIDs = id_counts[id_counts.category == 32].drop_duplicates('prolificID', keep='last').gameID.values

    # subset valid sessions
    sketch_data = sketch_data[sketch_data.gameID.isin(valid_gameIDs)]
    stroke_data = stroke_data[stroke_data.gameID.isin(valid_gameIDs)]
    survey_data = survey_data[survey_data.gameID.isin(valid_gameIDs)]

    # ensure that there are 32 completed trials for each session subsetted
    assert all(sketch_data.groupby('gameID').trialNum.nunique() == 32)
    assert all(stroke_data.groupby('gameID').trialNum.nunique() == 32)
    assert all(sketch_data.groupby('gameID').category.nunique() == 32)
    assert all(stroke_data.groupby('gameID').category.nunique() == 32)

    assert all(sketch_data.groupby('prolificID').trialNum.nunique() == 32)
    assert all(stroke_data.groupby('prolificID').trialNum.nunique() == 32)
    assert all(sketch_data.groupby('prolificID').category.nunique() == 32)
    assert all(stroke_data.groupby('prolificID').category.nunique() == 32)

    # ensure that the number of sketches is divisible by the number of trials in a session
    assert len(sketch_data) % 32 == 0
    print(f'After subsetting valid sessions, there are {len(sketch_data)} valid sketches' +
                                                f' and {len(stroke_data)} valid strokes.')
    # reset index once more (this also yields fresh frames that are safe to add columns to)
    sketch_data = sketch_data.sort_values(by=['gameID', 'category']).reset_index(drop=True)
    stroke_data = stroke_data.sort_values(by=['gameID', 'category']).reset_index(drop=True)
    survey_data = survey_data.sort_values(by=['gameID']).reset_index(drop=True)

    # add time spent actively drawing (e.g. mouse click down) per sketch to sketch dataframe
    activeSketchTime = stroke_data.groupby(['gameID', 'category'])[
        ['startStrokeTime', 'endStrokeTime']].apply(lambda x: sum(x.endStrokeTime - x.startStrokeTime))
    sketch_data['activeSketchTime'] = sketch_data.set_index(['gameID', 'category']).index.map(activeSketchTime.to_dict())

    # add total ink spent on each drawing, computed by the sum of each stroke's arc length
    # (.sum() is the non-deprecated spelling of .agg(np.sum))
    totalInk = stroke_data.groupby(['gameID', 'category'])['arcLength'].sum()
    sketch_data['totalInk'] = sketch_data.set_index(['gameID', 'category']).index.map(totalInk.to_dict())

    # flag data marked as outliers according to specified variables
    sketch_data['isOutlier'] = flag_outliers(sketch_data, ['numStrokes', 'activeSketchTime', 'totalInk'], sd = 3)

    # transform raw png images into flattened (num sessions*32 , 224*224) numpy array
    img_data = np.asarray([image_to_array(data).flatten() for data in sketch_data.pngData])
    img_labels = np.asarray(sketch_data.category)
    img_data_tosave = img_data

    # mean-center every pixel across sketches (no rescaling is applied);
    # this creates a new array, so img_data_tosave keeps the raw pixel values
    img_data = img_data - img_data.mean(axis = 0)

    # ensure data and labels match up in length
    assert img_data.shape[0] == img_labels.shape[0]

    # add logistic regression predictions and probabilities to sketch dataframe
    # (dictionary keys are row positions, which equal the freshly reset index)
    prob_dict, pred_dict = compute_class_predictions(img_data, img_labels)
    sketch_data['prob_true_predict'] = sketch_data.index.map(prob_dict)
    sketch_data['true_predict'] = sketch_data.index.map(pred_dict)

    # replace imageURL nans with 'text' in the text condition; otherwise reduce the URL to
    # the first two '_'-separated tokens of its file name (the part after the last '/')
    if all(sketch_data.condition == 'text'):
        sketch_data['imageURL'] = sketch_data.imageURL.fillna('text')
    else:
        # `n` must be passed by keyword: it is keyword-only in pandas >= 2.0
        sketch_data['imageURL'] = sketch_data.imageURL.str.split('/', expand=True).iloc[:, -1]\
                                                      .str.split('_', n=2, expand=True).iloc[:,:2]\
                                                      .agg('_'.join, axis=1)

    # convert the sketch data to RGB format
    img_data_tosave = img_data_tosave.reshape((len(img_data_tosave), 224, 224))
    img_data_tosave = np.asarray([to_rgb(img) for img in img_data_tosave])

    # add in survey data to sketch data: merge all JSON responses of a session into one row
    surveys = []
    for gid in survey_data.gameID.unique():
        resp = survey_data[survey_data.gameID == gid]['responses'].values
        dictresp = [json.loads(response) for response in resp]
        dictresp = {k: v for d in dictresp for k, v in d.items()}
        dictresp['gameID'] =  gid
        surveys.append(dictresp)
    surveys = pd.DataFrame(surveys).set_index('gameID')
    # columns are sorted so that they line up with the target column list just below
    # NOTE(review): this relies on the survey responses containing exactly these six keys
    surveys = surveys.reindex(sorted(surveys.columns), axis=1)
    sketch_data[['TechnicalDifficultiesFreeResp','inputDevice',
       'participantAge','participantComments',
       'participantSex', 'subjectiveSkill']] = sketch_data.apply(lambda x: surveys.loc[x.gameID], axis = 1)

    # add experiment into dataframes
    sketch_data['experiment'] = experiment_type
    stroke_data['experiment'] = experiment_type
    survey_data['experiment'] = experiment_type

    # add goal into dataframes
    sketch_data['goal'] = experiment_type.split('-')[0]
    stroke_data['goal'] = experiment_type.split('-')[0]
    survey_data['goal'] = experiment_type.split('-')[0]

    # add null batch_num column when the sketch records do not carry one
    if 'batch_num' not in sketch_data.columns:
        sketch_data['batch_num'] = np.nan
        stroke_data['batch_num'] = np.nan
        survey_data['batch_num'] = np.nan

    # finally, drop participant identifier now that gameID is equivalent
    sketch_data = sketch_data.drop(columns='prolificID')
    stroke_data = stroke_data.drop(columns='prolificID')
    survey_data = survey_data.drop(columns='prolificID')

    return sketch_data, stroke_data, survey_data, img_data_tosave

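### Establish connection to mongo

The database is reached through an SSH tunnel, so open one before running the next cell. `get_coll_from_mongo` connects to local port 27020 when the part of the hostname before the first underscore is `Justin`, and to local port 27017 otherwise; run the matching command:

`ssh -fNL 27020:127.0.0.1:27017 jyang@cogtoolslab.org`  
`ssh -fNL 27017:127.0.0.1:27017 jyang@cogtoolslab.org`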

In [6]:
%%time 
# Pull every experiment from mongo, clean it, save its pixel arrays, and
# concatenate the cleaned frames into sketch_data / stroke_data / survey_data.

# per-experiment frames are collected under their own names so that
# sketch_data / stroke_data / survey_data only ever refer to DataFrames
sketch_frames, stroke_frames, survey_frames = [], [], []

# this auth.txt file contains the password for the sketchloop user
# (read once: it does not depend on the experiment)
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None)

for experiment in ['categorydraw-text', 'instancedraw-text', 'categorydraw-photo', 'instancedraw-photo']:

    # get raw experiment data from mongo
    sketch, stroke, survey = get_coll_from_mongo(experiment, 'run0', auth = auth, reallyRun = True)

    # set up dataframes for analysis, get raw image arrays
    sketch, stroke, survey, im_data = setup_dataframes(sketch, stroke, survey, experiment)
    sketch_frames.append(sketch)
    stroke_frames.append(stroke)
    survey_frames.append(survey)

    # save image data (path is built once and reused for the message below)
    pixels_path = os.path.abspath(os.path.join(feature_dir, f'{experiment}_flattened_pixels.npy'))
    np.save(pixels_path, im_data)

    print(f'Saved out {experiment} data. Path names are:')
    print(pixels_path)

print('\n')

sketch_data = pd.concat(sketch_frames)
stroke_data = pd.concat(stroke_frames)
survey_data = pd.concat(survey_frames)

# safety switch: set to True to overwrite the CSVs that later cells read back in
reallyRun = False
if reallyRun:

    # save data out to csv
    sketch_data.to_csv(os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv'), index = False)
    stroke_data.to_csv(os.path.join(csv_dir, 'photodraw2x2_stroke_data.csv'), index = False)
    survey_data.to_csv(os.path.join(csv_dir, 'photodraw2x2_survey_data.csv'), index = False)

    print(os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv'))
    print(os.path.join(csv_dir, 'photodraw2x2_stroke_data.csv'))
    print(os.path.join(csv_dir, 'photodraw2x2_survey_data.csv'))

We have 34631 records in categorydraw-text: run0.
After subsetting valid sessions, there are 3072 valid sketches and 27435 valid strokes.
Saved out categorydraw-text data. Path names are:
F:\photodraw\features\categorydraw-text_flattened_pixels.npy
We have 51876 records in instancedraw-text: run0.
After subsetting valid sessions, there are 3072 valid sketches and 39345 valid strokes.
Saved out instancedraw-text data. Path names are:
F:\photodraw\features\instancedraw-text_flattened_pixels.npy
We have 39310 records in categorydraw-photo: run0.
After subsetting valid sessions, there are 3072 valid sketches and 32949 valid strokes.
Saved out categorydraw-photo data. Path names are:
F:\photodraw\features\categorydraw-photo_flattened_pixels.npy
We have 51312 records in instancedraw-photo: run0.
After subsetting valid sessions, there are 3072 valid sketches and 44082 valid strokes.
Saved out instancedraw-photo data. Path names are:
F:\photodraw\features\instancedraw-photo_flattened_pixels.np

### render out sketches for inspection

In [58]:
# Render each experiment's sketches to image files under sketch_dir/<experiment>.

# pick up edits to the helper module once, instead of reloading it on every iteration
importlib.reload(utils)

sketch_df = pd.read_csv(os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv'))
for experiment in ['categorydraw-text', 'instancedraw-text', 'categorydraw-photo', 'instancedraw-photo']:
    sketch_experiment_df = sketch_df[sketch_df.experiment == experiment]

    print(f'Rendering {experiment} sketch data...')
    utils.render_images(sketch_experiment_df, data = 'pngData',
                        metadata = ['gameID', 'trialNum', 'condition','category','imageURL', 'goal'],
                        out_dir = os.path.join(sketch_dir, experiment))


Done rendering 3072 images to F:\photodraw\sketches\instancedraw-photo.


### Make sketch galleries 

In [None]:
# Build one 8x4 contact sheet per participant (gameID) for every experiment.
# File names are split on '_' and read as gameID, trialNum, condition, category
# (presumably the metadata order used by utils.render_images -- TODO confirm).

# set the plot style once, before any figure exists, so every gallery uses it
sns.set_style('white')

for experiment in ['categorydraw-text', 'instancedraw-text', 'categorydraw-photo', 'instancedraw-photo']:

    gallery_path = make_dir_if_not_exists(os.path.join(gallery_dir, experiment))
    sketch_paths = sorted(os.listdir(os.path.join(sketch_dir, experiment)))
    gameids = list(np.unique([i.split('_')[0] for i in sketch_paths]))

    # only generate galleries when the target folder is still empty
    if not os.listdir(gallery_path):
        ## generate gallery for each participant
        for gind, game in enumerate(gameids):
            print('Generating sketch gallery for participant: {} | {} of {}'.format(game,gind+1,len(gameids)))
            # get list of all sketch paths JUST from current game, ordered by category
            game_sketch_paths = [path for path in sketch_paths if path.split('_')[0] == game]
            game_sketch_paths = sorted(game_sketch_paths, key = lambda x: x.split('_')[3].split('.')[0])
            fig = plt.figure(figsize=(12,20))
            for i,f in enumerate(game_sketch_paths):
                # get metadata from the file name
                parts = f.split('_')
                trialNum = parts[1]
                condition = parts[2]
                category = parts[3].split('.')[0]
                # make gallery; the 8x4 grid holds at most 32 sketches per participant
                ax = fig.add_subplot(8,4,i+1)
                # the context manager closes the file once the pixels have been drawn
                with Image.open(os.path.join(sketch_dir, experiment, f)) as im:
                    ax.imshow(im)
                # axis('off') already hides ticks and tick labels
                ax.axis('off')
                ax.set_title('{} {} {}'.format(trialNum,condition,category))
            # title and file name come from the loop's own gameID rather than
            # from a variable left over from the inner loop
            fig.suptitle(game)
            fname = '{}.png'.format(game)
            fig.savefig(os.path.join(gallery_path,fname))
            plt.close(fig)
        print('Done!')

In [41]:
# NOTE(review): this cell repeats the gallery-generation logic of the cell directly
# above it; because of the "folder is empty" guard only the first one to run does
# any work. Consider keeping a single copy.
#
# Build one 8x4 contact sheet per participant (gameID) for every experiment.
# File names are split on '_' and read as gameID, trialNum, condition, category
# (presumably the metadata order used by utils.render_images -- TODO confirm).

# set the plot style once, before any figure exists, so every gallery uses it
sns.set_style('white')

for experiment in ['categorydraw-text', 'instancedraw-text', 'categorydraw-photo', 'instancedraw-photo']:

    gallery_path = make_dir_if_not_exists(os.path.join(gallery_dir, experiment))
    sketch_paths = sorted(os.listdir(os.path.join(sketch_dir, experiment)))
    gameids = list(np.unique([i.split('_')[0] for i in sketch_paths]))

    # only generate galleries when the target folder is still empty
    if not os.listdir(gallery_path):
        ## generate gallery for each participant
        for gind, game in enumerate(gameids):
            print('Generating sketch gallery for participant: {} | {} of {}'.format(game,gind+1,len(gameids)))
            # get list of all sketch paths JUST from current game, ordered by category
            game_sketch_paths = [path for path in sketch_paths if path.split('_')[0] == game]
            game_sketch_paths = sorted(game_sketch_paths, key = lambda x: x.split('_')[3].split('.')[0])
            fig = plt.figure(figsize=(12,20))
            for i,f in enumerate(game_sketch_paths):
                # get metadata from the file name
                parts = f.split('_')
                trialNum = parts[1]
                condition = parts[2]
                category = parts[3].split('.')[0]
                # make gallery; the 8x4 grid holds at most 32 sketches per participant
                ax = fig.add_subplot(8,4,i+1)
                # the context manager closes the file once the pixels have been drawn
                with Image.open(os.path.join(sketch_dir, experiment, f)) as im:
                    ax.imshow(im)
                # axis('off') already hides ticks and tick labels
                ax.axis('off')
                ax.set_title('{} {} {}'.format(trialNum,condition,category))
            # title and file name come from the loop's own gameID rather than
            # from a variable left over from the inner loop
            fig.suptitle(game)
            fname = '{}.png'.format(game)
            fig.savefig(os.path.join(gallery_path,fname))
            plt.close(fig)
        print('Done!')

Generating sketch gallery for participant: 0058-ce87d784-26a0-4ee6-b31e-04b7669d6ffa | 1 of 96
Generating sketch gallery for participant: 0072-4fb13d34-250e-4a72-80f9-bd57bffddfce | 2 of 96
Generating sketch gallery for participant: 0597-1cebe818-5134-457d-a6ae-882fa273e7ae | 3 of 96
Generating sketch gallery for participant: 0712-06cd00bd-1a04-4d87-9315-2e286faecc61 | 4 of 96
Generating sketch gallery for participant: 0986-ad780b37-aac4-417d-a20b-997204e9a3f2 | 5 of 96
Generating sketch gallery for participant: 1235-00cbba14-ad50-482e-be5c-f62e550eabc2 | 6 of 96
Generating sketch gallery for participant: 1239-70860003-a9f6-40d6-bd05-df9e70196840 | 7 of 96
Generating sketch gallery for participant: 1464-3f3ab669-fd0f-448e-abad-dc3b7364927d | 8 of 96
Generating sketch gallery for participant: 1543-8902a2f3-2a2b-47fb-a7a8-37aa7a8e01cf | 9 of 96
Generating sketch gallery for participant: 1613-b960b761-c83f-4cc7-8b13-29898c1d39fc | 10 of 96
Generating sketch gallery for participant: 1745-1

Generating sketch gallery for participant: 8603-55797fab-2893-4f7d-b644-af54210aac46 | 87 of 96
Generating sketch gallery for participant: 8803-1dffbfb2-96e0-4397-b4c8-56172d178a0e | 88 of 96
Generating sketch gallery for participant: 8835-fe725b87-5dd0-469f-8da2-0980223552ec | 89 of 96
Generating sketch gallery for participant: 9016-0c32b658-ab9b-46a1-9c8b-230a85373ee3 | 90 of 96
Generating sketch gallery for participant: 9054-7a3cd65e-55bf-4f03-a3d5-cbff651f2dd7 | 91 of 96
Generating sketch gallery for participant: 9068-867eea6d-0335-413c-b112-beb12c1018ae | 92 of 96
Generating sketch gallery for participant: 9229-3da49b43-d59a-4b84-b777-90339bbb77cd | 93 of 96
Generating sketch gallery for participant: 9526-09266e5f-629d-4604-b73a-47c448e4a5d0 | 94 of 96
Generating sketch gallery for participant: 9834-c61a287a-0706-46e9-a100-416413cbcc57 | 95 of 96
Generating sketch gallery for participant: 9850-f33717b7-3f0c-4aa8-9a84-919b086a10ce | 96 of 96
Done!
Generating sketch gallery for part

Generating sketch gallery for participant: 8251-7b9192f9-e119-4c1d-8b94-20b6380a55f2 | 77 of 96
Generating sketch gallery for participant: 8288-6f9238b7-b958-4186-b88f-5b86dcab2f96 | 78 of 96
Generating sketch gallery for participant: 8314-c5f1d232-f1d5-48b2-b8ef-a40274666e98 | 79 of 96
Generating sketch gallery for participant: 8443-efb42d93-fe58-4a60-89ea-d6853b31eb7d | 80 of 96
Generating sketch gallery for participant: 8450-84086b6f-042d-418b-b43e-a30662d6ed77 | 81 of 96
Generating sketch gallery for participant: 8640-7215bfc0-26ed-47e1-82a0-650372ad8511 | 82 of 96
Generating sketch gallery for participant: 8664-6fd3ed4a-5e42-436b-b4e1-fa732c16694e | 83 of 96
Generating sketch gallery for participant: 8683-5c9bd49e-74be-4c0c-9323-cd538c8569a9 | 84 of 96
Generating sketch gallery for participant: 8824-5363798b-5398-4a77-b68a-f061dbcf4f33 | 85 of 96
Generating sketch gallery for participant: 8968-86fcb2df-875f-4e88-82e3-3d66133a77aa | 86 of 96
Generating sketch gallery for participan

Generating sketch gallery for participant: 6690-3503c5be-4d33-4856-ae5d-ea387462c894 | 67 of 96
Generating sketch gallery for participant: 6921-9a7bc842-f257-4360-a8cb-e5289714b039 | 68 of 96
Generating sketch gallery for participant: 7092-639010a8-023d-4df0-9ed9-f2157701b65e | 69 of 96
Generating sketch gallery for participant: 7139-aa017962-7cad-4b08-bf3d-b3e0f81ca5c6 | 70 of 96
Generating sketch gallery for participant: 7239-82275173-48ff-43ff-81ab-5513eec34a1a | 71 of 96
Generating sketch gallery for participant: 7314-c28cbbba-fdce-4a56-aa9b-eb308b7437c8 | 72 of 96
Generating sketch gallery for participant: 7425-e83b0bb8-29b9-4b26-bd54-84a71b955634 | 73 of 96
Generating sketch gallery for participant: 7446-76165afa-93ac-439e-94a3-1ac6d262d1fe | 74 of 96
Generating sketch gallery for participant: 7528-0df40860-01fa-410d-9b88-c4135cfbd9c7 | 75 of 96
Generating sketch gallery for participant: 7546-6f3083bd-b056-4abd-8cdf-9cea29733456 | 76 of 96
Generating sketch gallery for participan

Generating sketch gallery for participant: 5539-d651fd63-8041-465f-939c-30acdb44d90c | 57 of 96
Generating sketch gallery for participant: 5723-2bea42f8-04c0-49bb-9aeb-8163357a4a51 | 58 of 96
Generating sketch gallery for participant: 5746-dd897e2a-8af7-43b3-9d7c-0586eb396d32 | 59 of 96
Generating sketch gallery for participant: 5858-03751bd9-d1f1-476b-8aea-666adac7405c | 60 of 96
Generating sketch gallery for participant: 5921-c7756a91-d3fb-4ea9-af9f-7d43982056f1 | 61 of 96
Generating sketch gallery for participant: 5929-bf51df99-6ded-471f-9c0e-466f86c561f8 | 62 of 96
Generating sketch gallery for participant: 5942-15df2a49-f232-499a-896f-44dc87c15697 | 63 of 96
Generating sketch gallery for participant: 5997-6f335be2-a8d5-499e-b717-7848b8c34907 | 64 of 96
Generating sketch gallery for participant: 6153-193521ea-2b36-469a-af08-4fb7051c41dd | 65 of 96
Generating sketch gallery for participant: 6177-5ca91079-ad3e-4149-91f2-31a8a6387b1e | 66 of 96
Generating sketch gallery for participan

### render out sketches for feature extraction

In [59]:
# Render every sketch of all four experiments into one folder, sketches/photodraw2x2
# (the folder passed as --data to extract_features.py in the section below).
importlib.reload(utils)

sketch_df = pd.read_csv(os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv'))

utils.render_images(
    sketch_df,
    data = 'pngData',
    metadata = ['gameID', 'trialNum', 'condition','category','imageURL', 'goal'],
    out_dir = os.path.join(sketch_dir, 'photodraw2x2'),
)


Done rendering 12288 images to F:\photodraw\sketches\photodraw2x2.


### Finally, reload sketch data and add fc6 feature predictions

`python extract_features.py --data=/photodraw/sketches/photodraw2x2 --layer_ind=5 --data_type=sketch --spatial_avg=True --channel_norm=False --out_dir=/photodraw/features/ --experiment_flag=photodraw2x2`

`python extract_features.py --data=/photodraw/stimuli/photodraw32_stims_copy --layer_ind=5 --data_type=image --spatial_avg=True --channel_norm=False --out_dir=/photodraw/features/ --experiment_flag=photodraw2x2_stims`

`python extract_features.py --data=/home/AD/juy003/photodraw/sketches/photodraw2x2 --layer_ind=5 --data_type=sketch --spatial_avg=True --channel_norm=False --out_dir=/home/AD/juy003/photodraw/features --experiment_flag=photodraw2x2`


In [43]:
%%time
# Align the extracted fc6 features with the sketch table, score them with the
# cross-validated classifier, and write the enriched table back to the same CSV.
sketch_df = pd.read_csv(os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv'))

M_fc6 = pd.read_csv(os.path.join(feature_dir, 'METADATA_photodraw2x2_sketch.csv'))
F_fc6 = np.load(os.path.join(feature_dir, 'FEATURES_FC6_photodraw2x2_sketch.npy'))

# columns that together identify one sketch; used for sorting and for the alignment check
key_cols = ['gameID', 'trialNum', 'condition', 'category', 'imageURL', 'goal']

# sort sketches into same format as fc6 features
sketch_df['trialNum'] = pd.to_numeric(sketch_df.trialNum, errors='coerce')
sketch_df = sketch_df.sort_values(by=key_cols).reset_index(drop=True)

# parse M_fc6 into same format as sketch_df
# (`n` is passed by keyword everywhere: it is keyword-only in pandas >= 2.0)
# image_id is split as gameID_trialNum_condition_<rest>, and goal is the last token of <rest>
M_fc6[['gameID', 'trialNum', 'condition', 'category']] = M_fc6.image_id.str.split('_', n=3, expand = True)
M_fc6[['category', 'goal']] = M_fc6.category.str.rsplit('_', n=1, expand = True)

is_text = M_fc6.condition == 'text'
not_text = M_fc6.condition != 'text'

# what is left in `category` ends with the imageURL: its last two tokens outside the
# text condition, its last token in the text condition
M_fc6.loc[not_text, 'imageURL'] = M_fc6.loc[not_text, 'category']\
                                                .str.rsplit('_', n=2, expand = True).iloc[:,1:3].agg('_'.join, axis = 1)
M_fc6.loc[is_text, 'imageURL'] = M_fc6.loc[is_text, 'category']\
                                                .str.rsplit('_', n=1, expand = True).iloc[:,1]

# ...and whatever precedes the imageURL is the category itself
M_fc6.loc[not_text, 'category'] = M_fc6.loc[not_text, 'category']\
                                                .str.rsplit('_', n=2, expand = True).iloc[:,0]
M_fc6.loc[is_text, 'category'] = M_fc6.loc[is_text, 'category']\
                                                .str.rsplit('_', n=1, expand = True).iloc[:,0]

# put M_fc6 in the same order as sketch_df
M_fc6['trialNum'] = pd.to_numeric(M_fc6.trialNum, errors='coerce')
M_fc6 = M_fc6.sort_values(by=key_cols).reset_index(drop=True)

# make sure the dataframes and numpy arrays are aligned
a = ['_'.join(str(x) for x in lis) for lis in M_fc6[key_cols].values]
b = ['_'.join(str(x) for x in lis) for lis in sketch_df[key_cols].values]

assert a == b

# assert that the a and b map uniquely between M_fc6 and sketch_df to ensure everything is well aligned
assert len(M_fc6) == len(sketch_df) == len(set(a)) == len(set(b))

# reorder the feature rows to match sketch_df, then mean-center each feature
img_data = F_fc6[M_fc6.feature_ind.values]
img_data = img_data - img_data.mean(axis = 0)

# add logistic regression predictions and probabilities to sketch dataframe for fc6 data
prob_dict, pred_dict = compute_class_predictions(img_data, sketch_df.category.values)
sketch_df['prob_true_predict_fc6'] = sketch_df.index.map(prob_dict)
sketch_df['true_predict_fc6'] = sketch_df.index.map(pred_dict)
# log-odds of the true-class probability, computed on the whole column at once
# (probabilities of exactly 0 or 1 give -inf / +inf)
sketch_df['prob_true_predict_fc6_logodds'] = np.log(sketch_df.prob_true_predict_fc6) \
                                             - np.log(1 - sketch_df.prob_true_predict_fc6)

# add feature indices to sketch df for future ease of access
sketch_df['feature_ind'] = M_fc6.feature_ind.values

# save data out to csv (this overwrites the file that was read at the top of the cell)
sketch_df.to_csv(os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv'), index = False)

Wall time: 11min 10s


### All done!

To summarize the pipeline: 
- get data from mongo and clean it up
- make image and gallery directories
- run both instance and category feature extraction 
- get classification scores