## Follow the instructions in this notebook to create all relevant directories and to load all relevant data into the correct places

### Import packages and set up paths

In [None]:
import os
import sys
import socket
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymongo as pm

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Global plot styling: seaborn 'talk' context with the 'white' style.
sns.set_context('talk')
sns.set_style('white')

In [None]:
# Directory layout, resolved relative to the current working directory:
#   analysis_dir  -> the working directory itself
#   proj_dir      -> its parent
#   results_dir   -> <proj_dir>/results
#   csv_dir       -> <proj_dir>/results/csv
analysis_dir = os.getcwd()
proj_dir = os.path.abspath(os.pardir)
results_dir = os.path.join(proj_dir, 'results')
csv_dir = os.path.join(proj_dir, 'results', 'csv')

In [None]:
# Load the per-sketch table and order it by (gameID, category).
# sort_values keeps the original row labels, so K's index is no longer 0..n-1 in row order.
sketch_csv = os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv')
K = pd.read_csv(sketch_csv).sort_values(['gameID', 'category'])

### establish connection to mongo

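Open an SSH tunnel to the database server before running the cells below. The connection cell uses local port 27020 when the hostname starts with `Justin`, and local port 27017 otherwise:

`ssh -fNL 27020:127.0.0.1:27017 jyang@cogtoolslab.org`  
`ssh -fNL 27017:127.0.0.1:27017 jyang@cogtoolslab.org`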

In [None]:
def mongo_uri(user, password, port, tunnel_host='127.0.0.1'):
    """Build a MongoDB connection URI with percent-escaped credentials.

    Reserved characters in the user name or password (e.g. '@', ':', '/')
    would otherwise be parsed as URI delimiters and corrupt the connection
    string, so both are escaped with ``quote_plus``.

    Parameters
    ----------
    user : str
        MongoDB user name.
    password : str
        Plain-text (unescaped) password.
    port : int
        Local port to connect to.
    tunnel_host : str, optional
        Host to connect to; defaults to the local end of the ssh tunnel.

    Returns
    -------
    str
        URI of the form ``mongodb://<user>:<password>@<host>:<port>``.
    """
    return ('mongodb://' + quote_plus(user) + ':' + quote_plus(password)
            + '@' + tunnel_host + ':' + str(port))


reallyRun = False
if reallyRun:
    # set vars 

    # this auth.txt file contains the password for the sketchloop user
    auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None) 
    pswd = auth.values[0][0]
    # NOTE(review): decoderpswd is not used in any cell visible here, and this line
    # raises ValueError when the password does not end in a digit -- confirm it is still needed.
    decoderpswd = int(pswd[-1])
    user = 'sketchloop'
    host = 'cogtoolslab.org'  # remote server; the client itself connects to 127.0.0.1

    # have to fix this to be able to analyze from local
    # The local port depends on the machine the notebook runs on.
    if socket.gethostname().split('_')[0]=='Justin':
        local_port = 27020
    else:
        local_port = 27017
    conn = pm.MongoClient(mongo_uri(user, pswd, local_port))
    db = conn['photodraw']
    coll_cat = db['recogdraw-category']
    coll_inst = db['recogdraw-instance']

    iterationName = 'run0'

In [None]:
if reallyRun:
    # Pull every trial-level record of iteration 'run0' from both collections.
    trial_filter = {'iterationName':'run0', 'eventType': 'trial'}
    K_responses_cat = pd.DataFrame(coll_cat.find(trial_filter))
    K_responses_inst = pd.DataFrame(coll_inst.find(trial_filter))

    # For each experiment: subtract the delay before a response could be selected,
    # log-transform the response time, and flag trials whose log-RT lies more than
    # three standard deviations above the mean.
    # NOTE(review): np.log yields NaN/-inf if any adjusted rt is <= 0 -- confirm this cannot occur.
    for responses, selection_delay in ((K_responses_cat, 500), (K_responses_inst, 1000)):
        responses['rt'] = responses.rt - selection_delay
        responses['rt_log'] = np.log(responses['rt'])
        cutoff = responses['rt_log'].mean() + 3 * responses['rt_log'].std()
        responses['isOutlier'] = responses['rt_log'] > cutoff

In [None]:
## save out CSVs
# With reallyRun off (the default) the ratings are loaded from the cached CSVs;
# with it on, the frames currently in memory are written to those CSVs instead.
reallyRun = False
cat_ratings_csv = os.path.join(csv_dir, 'photodraw2x2_category_recog_ratings.csv')
inst_ratings_csv = os.path.join(csv_dir, 'photodraw2x2_instance_recog_ratings.csv')
if not reallyRun:
    K_responses_cat = pd.read_csv(cat_ratings_csv)
    K_responses_inst = pd.read_csv(inst_ratings_csv)
else:
    K_responses_cat.to_csv(cat_ratings_csv, index=False)
    K_responses_inst.to_csv(inst_ratings_csv, index=False)

In [None]:
# extract survey responses & save out CSVs
def collapse_survey_rows(survey):
    """Merge the non-'text' survey answers into the 'text' survey rows.

    Both subsets are sorted by gameID and stripped of the prolificID column;
    participantSex and inputDevice are then copied positionally from the
    non-'text' rows onto the 'text' rows.

    NOTE(review): the positional copy assumes each gameID contributes exactly
    one 'text' row and one non-'text' row -- confirm against the data.
    """
    copied_cols = ['participantSex', 'inputDevice']
    text_rows = (survey[survey.choice_or_text == 'text']
                 .sort_values(by='gameID')
                 .drop(columns=['prolificID']))
    choice_rows = (survey[survey.choice_or_text != 'text']
                   .sort_values(by='gameID')
                   .drop(columns=['prolificID']))
    text_rows[copied_cols] = choice_rows[copied_cols].values
    return text_rows


reallyRun = False
cat_survey_csv = os.path.join(csv_dir, 'photodraw2x2_category_recog_survey.csv')
inst_survey_csv = os.path.join(csv_dir, 'photodraw2x2_instance_recog_survey.csv')
if not reallyRun:
    # Default path: load the cached survey tables.
    K_responses_cat_survey = pd.read_csv(cat_survey_csv)
    K_responses_inst_survey = pd.read_csv(inst_survey_csv)
else:
    # Pull the survey events of iteration 'run0' from both collections.
    survey_filter = {'iterationName':'run0', 'eventType': 'survey'}
    K_responses_cat_survey = pd.DataFrame(coll_cat.find(survey_filter))
    K_responses_inst_survey = pd.DataFrame(coll_inst.find(survey_filter))

    K_responses_inst_survey_text = collapse_survey_rows(K_responses_inst_survey)
    K_responses_cat_survey_text = collapse_survey_rows(K_responses_cat_survey)
    K_responses_inst_survey = K_responses_inst_survey_text
    K_responses_cat_survey = K_responses_cat_survey_text

    # take out unnecessary columns
    # NOTE(review): only the category survey is trimmed here; the instance survey
    # keeps these columns -- confirm whether that asymmetry is intended.
    unneeded_cols = ['randomize_question_order', 'type', 'preamble', 'questions',
                     'button_label','choice_or_text', 'responses', 'question_order', 'trial_type',
                     'trial_index', 'internal_node_id']
    K_responses_cat_survey = K_responses_cat_survey.drop(columns=unneeded_cols)

    K_responses_cat_survey.to_csv(cat_survey_csv, index=False)
    K_responses_inst_survey.to_csv(inst_survey_csv, index=False)

In [None]:
# Mean participant age and sex counts: category survey first, then instance survey.
# The category block is followed by an extra newline to separate the two reports.
for survey, trailer in ((K_responses_cat_survey, ('\n',)), (K_responses_inst_survey, ())):
    print(survey.participantAge.mean())
    print(survey.participantSex.value_counts(), *trailer)

<br>

### Save recognition data into main dataframe

<br>

In [None]:
def attach_category_recog_stats(sketches, responses):
    """Add per-sketch category-recognition summaries to ``sketches`` in place.

    ``responses`` is grouped by (sketcher_gameID, sketcher_category) and the
    mean, standard deviation and count of ``rt`` / ``isCorrect`` are written to
    the columns cat_rt_mean, cat_correct_mean, cat_rt_sd, cat_correct_sd and
    cat_numRaters.

    The rows of ``sketches`` must already be ordered by (gameID, category) and
    match the response groups one-to-one; this is asserted. Because the check
    is positional, the values are also assigned positionally (as plain arrays).
    Assigning the grouped frames directly would align on index labels, and the
    labels of a sorted-but-not-reindexed ``sketches`` do not follow row order,
    which would attach the statistics to the wrong sketches.

    Parameters
    ----------
    sketches : pandas.DataFrame
        One row per sketch with ``gameID`` and ``category`` columns.
    responses : pandas.DataFrame
        Trial rows with ``sketcher_gameID``, ``sketcher_category``, ``rt``
        and ``isCorrect`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``sketches`` object, with the five columns added.

    Raises
    ------
    AssertionError
        If the sketch keys and the response groups differ in any position.
    """
    group_keys = ['sketcher_gameID', 'sketcher_category']
    groupdata = responses.groupby(group_keys)[['rt', 'isCorrect']]

    def ordered(summary):
        # Flatten the group keys into columns and order rows by those keys.
        return summary.reset_index().sort_values(group_keys)

    counts = ordered(groupdata.count())
    assert (sketches[['gameID', 'category']].values == counts[group_keys].values).all()

    means = ordered(groupdata.mean())
    sds = ordered(groupdata.std())
    # .to_numpy() drops the index so pandas cannot re-align the rows by label.
    sketches['cat_rt_mean'] = means['rt'].to_numpy()
    sketches['cat_correct_mean'] = means['isCorrect'].to_numpy()
    sketches['cat_rt_sd'] = sds['rt'].to_numpy()
    sketches['cat_correct_sd'] = sds['isCorrect'].to_numpy()
    sketches['cat_numRaters'] = counts['rt'].to_numpy()
    return sketches


reallyRun = False
if reallyRun:
    # Keep only main trials: no catch trials, no prep trials, no RT outliers.
    K_responses_cat_main = K_responses_cat[(K_responses_cat.catch_trial == False) & 
                                       (K_responses_cat.prep_trial  == False) &
                                       (K_responses_cat.isOutlier   == False)  ]
    K_responses_inst_main = K_responses_inst[(K_responses_inst.catch_trial == False) & 
                                             (K_responses_inst.prep_trial  == False) &
                                             (K_responses_inst.isOutlier   == False)  ]

    attach_category_recog_stats(K, K_responses_cat_main)

In [None]:
reallyRun = False
if reallyRun:
    rater_keys = ['sketcher_gameID', 'sketcher_category']
    groupdata = K_responses_inst_main.groupby(rater_keys)[['rt','isCorrect']]

    # Sanity check: the rated sketches must be exactly the 'photo'-condition sketches
    # of K, in the same (gameID, category) order.
    rated_sketches = groupdata.count().reset_index().sort_values(rater_keys)[rater_keys].values
    photo_sketches = K[K.condition == 'photo'][['gameID', 'category']].values
    assert (photo_sketches == rated_sketches).all()

    # (gameID, category) key of every row of K, used to look statistics up by key.
    sketch_keys = K.set_index(['gameID', 'category']).index

    def as_lookup(summary):
        # {column: {(gameID, category): value}} for one grouped summary.
        return summary.reset_index().sort_values(rater_keys).set_index(rater_keys).to_dict()

    instmean = as_lookup(groupdata.mean())
    K['inst_rt_mean'] = sketch_keys.map(instmean['rt'])
    K['inst_correct_mean'] = sketch_keys.map(instmean['isCorrect'])

    instsd = as_lookup(groupdata.std())
    K['inst_rt_sd'] = sketch_keys.map(instsd['rt'])
    K['inst_correct_sd'] = sketch_keys.map(instsd['isCorrect'])

    instcount = as_lookup(groupdata.count())
    K['inst_numRaters'] = sketch_keys.map(instcount['rt'])

In [None]:
## save out CSV and sync to git
# With reallyRun off (the default) K is reloaded from the CSV; with it on, K is
# re-sorted, re-indexed from 0 and written back to that CSV.
reallyRun = False
sketch_data_csv = os.path.join(csv_dir, 'photodraw2x2_sketch_data.csv')
if not reallyRun:
    K = pd.read_csv(sketch_data_csv)
else:
    row_order = ['gameID', 'trialNum', 'condition', 'category', 'imageURL', 'goal']
    K = K.sort_values(by=row_order).reset_index(drop = True)
    K.to_csv(sketch_data_csv, index = False)

# Analysis

### Basic barplots

In [None]:
# One bar plot per measure: goal on the x-axis, bars split by condition,
# restricted to the rows of K that are not flagged as outliers.
non_outlier_sketches = K[K['isOutlier'] == False]
category_panels = [
    # (column of K, y-axis label, title)
    ("cat_rt_mean", "Reaction time (ms)",
     'Category-level reaction time per sketch (recognition task)'),
    ("cat_correct_mean", "Recognizer accuracy",
     'Category recognizer accuracy per sketch (recognition task)'),
    ("prob_true_predict_fc6", "probability",
     'Probability of correct classification (fc6)'),
]
for measure, y_label, panel_title in category_panels:
    g = sns.catplot(
        data=non_outlier_sketches, kind="bar",
        x="goal", y=measure, hue="condition", palette="dark", alpha=.7, height=5
    )
    g.despine(left=True)
    g.set_axis_labels("", y_label)
    plt.title(panel_title)

In [None]:
# One bar plot per measure: goal on the x-axis, bars split by condition,
# restricted to the rows of K that are not flagged as outliers.
non_outlier_sketches = K[K['isOutlier'] == False]
instance_panels = [
    # (column of K, y-axis label, title)
    ("prob_true_predict_instance", "probability",
     'Probability of correct classification (instance)'),
    ("inst_correct_mean", "Recognizer accuracy",
     'Instance recognizer accuracy per sketch (recognition task)'),
    ("inst_rt_mean", "rt (ms)",
     'Reaction time of instance recognizers'),
]
for measure, y_label, panel_title in instance_panels:
    g = sns.catplot(
        data=non_outlier_sketches, kind="bar",
        x="goal", y=measure, hue="condition", palette="dark", alpha=.7, height=5
    )
    g.despine(left=True)
    g.set_axis_labels("", y_label)
    plt.title(panel_title)

In [None]:
# Histogram of the rt_log column of the category recognition ratings.
sns.histplot(K_responses_cat['rt_log'])

In [None]:
def rt_cutoff_sweep(responses, n_multiples=50):
    """Count and score the trials slower than successive multiples of the median RT.

    For each integer multiple m = 0 .. n_multiples-1, the cutoff is
    m * median(responses.rt); trials with rt strictly above the cutoff are
    counted and their mean ``isCorrect`` is computed. The median is computed
    once and both statistics come from a single pass over the cutoffs.

    Parameters
    ----------
    responses : pandas.DataFrame
        Frame with ``rt`` and ``isCorrect`` columns.
    n_multiples : int, optional
        Number of cutoffs to evaluate, starting at 0 times the median.

    Returns
    -------
    tuple of numpy.ndarray
        (multiples, n_trials_above, accuracy_above), each of length
        ``n_multiples``. Accuracy is NaN where no trial exceeds the cutoff.
    """
    median_rt = responses.rt.median()
    multiples = np.arange(n_multiples)
    n_trials_above = np.zeros(n_multiples)
    accuracy_above = np.zeros(n_multiples)
    for pos, multiple in enumerate(multiples):
        is_slower = responses.rt > median_rt * multiple
        n_trials_above[pos] = is_slower.sum()
        accuracy_above[pos] = responses.loc[is_slower, 'isCorrect'].mean()
    return multiples, n_trials_above, accuracy_above


multiples, n_trials_above, accuracy_above = rt_cutoff_sweep(K_responses_cat)

# Left panel: how many trials survive each cutoff; right panel: their accuracy.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))
ax1.plot(multiples, n_trials_above)
ax1.set(xlabel='Multiples of RT above median', ylabel = 'Number of trials', title='Number of trials above RT cutoff')
ax2.plot(multiples, accuracy_above)
ax2.set(xlabel='Multiples of RT above median', ylabel='Accuracy', title='Accuracy of responses above RT cutoff')
plt.tight_layout()