In [None]:
import os, sys

import pymongo as pm
import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re
from io import BytesIO
from PIL import Image
import base64
import PIL

from bezier import curve
from svg.path import Path, Line, Arc, CubicBezier, QuadraticBezier, Close, parse_path

import matplotlib
from matplotlib import pylab, mlab, pyplot
import matplotlib.patches as mpatches
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

from IPython.display import clear_output
import importlib

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# so dataframes don't get cut off in display:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

### set up paths

In [None]:
# directory & file hierarchy
# everything is anchored on the parent of the current working directory
analysis_dir = os.getcwd()
proj_dir = os.path.abspath('..')
results_dir = os.path.join(proj_dir, 'results')
plot_dir, csv_dir = (os.path.join(results_dir, leaf) for leaf in ('plots', 'csv'))
exp_dir, sketch_dir, gallery_dir = (
    os.path.abspath(os.path.join(proj_dir, leaf))
    for leaf in ('experiments', 'sketches', 'gallery')
)

## add helpers to python path
utils_dir = os.path.join(proj_dir, 'utils')
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

def make_dir_if_not_exists(dir_name):
    """
    Create dir_name (including any missing parent directories) if it is not
    already there, and return dir_name unchanged so calls can be chained.

    Uses exist_ok=True instead of a separate os.path.exists() check, so there
    is no window in which another process can create the directory between
    the check and the makedirs call.

    Raises FileExistsError if dir_name exists but is not a directory.
    """
    os.makedirs(dir_name, exist_ok=True)
    return dir_name

## create directories that don't already exist
# (exp_dir is not in this list, so it is never created here)
result = [make_dir_if_not_exists(x) for x in [results_dir,plot_dir,csv_dir,sketch_dir,gallery_dir]]

## import the project helpers
# proj_dir/utils is already appended to sys.path by the check earlier in this
# cell, and sys is imported at the top of the notebook, so no second
# `import sys` / path check is needed before importing
import utils

### establish connection to mongo
The first thing you need to do is establish an ssh tunnel (local port forwarding) to the server, so that requests to mongodb can be made "as if" the mongodb server were running on your local computer. If you plan to fetch data from mongo, run this from the command line before you begin data analysis:

ssh -fNL 27020:127.0.0.1:27017 USER@cogtoolslab.org

In [None]:
! ssh -fNL 27020:127.0.0.1:27017 sholt@cogtoolslab.org

In [None]:
# set vars
# auth.txt contains the password for the sketchloop user (first field of the
# first line). dtype=str keeps an all-digit password from being parsed as a
# number, which would break the string handling below.
# NOTE(review): the file is expected to hold the raw, un-escaped password.
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None, dtype = str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'

# have to fix this to be able to analyze from local
import pymongo as pm
import socket
from urllib.parse import quote_plus

# 27017 reaches a mongod listening on this machine. When working through the
# ssh tunnel described above, set this to the forwarded local port (27020).
mongo_port = 27017

# credentials embedded in a mongodb:// URI must be percent-escaped, otherwise
# characters such as '@', ':' or '/' in the password corrupt the URI
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1:{}'.format(quote_plus(user), quote_plus(pswd), mongo_port))
db = conn['iterated_number']
coll = db['num8_shape4']

# which iteration name should we use?
iterationName = 'sandbox3' #increment when needed
# this has previously been run1, but switched to sandbox3 for testing the url recording function


In [None]:
## here is what one of these records looks like
## (no filter is passed, so the record shown is not restricted to iterationName)
coll.find_one()

### Initialize dataframes

In [None]:
## trials
# one row per 'clickedObj' event recorded under the current iterationName
k = coll.find({'iterationName':iterationName, 'eventType':'clickedObj'})
K = pd.DataFrame(k)

## strokes
# one row per 'stroke' event recorded under the current iterationName
t = coll.find({'iterationName':iterationName, 'eventType':'stroke'})
T = pd.DataFrame(t)

## get list of valid game IDs (i.e, subject number)
from collections import Counter
num_trials_per_game = 32 ## number of 'clickedObj' records a game needs to count as complete
game_dict = Counter(K['gameid']) ## get dictionary mapping gameIDs to number of sketches 
complete_gameids = [gameid for (gameid, count) in game_dict.items() if count == num_trials_per_game] ## get gameids that contributed exactly the right number of sketches

## subset stroke/sketch dataframes to complete games only
## NOTE(review): nothing here filters out practice trials; add that filter if it is needed
subset = True
if subset:
    # always filter: comparing only the *number* of unique gameids in T against
    # len(complete_gameids) can match by coincidence and leave incomplete games in
    T = T[(T['gameid'].isin(complete_gameids))].reset_index(drop=True)
    K = K[(K['gameid'].isin(complete_gameids))].reset_index(drop=True)
    
print('We have {} unique stroke records in all {} of our complete games.'.format(T.shape[0],len(complete_gameids)))
print('We have {} unique sketch records in all {} of our complete games.'.format(K.shape[0],len(complete_gameids)))

## save out to csv
T.to_csv(os.path.join(csv_dir,'photodraw_stroke_data.csv'),index=False)
K.to_csv(os.path.join(csv_dir,'photodraw_sketch_data.csv'),index=False)

## generate group dataframe and save out to file
# reload first so edits to utils are picked up; csv_dir is handed to the helper,
# presumably so it can write the dataframe to disk (TODO confirm in utils)
importlib.reload(utils)
D = utils.generate_dataframe(coll, complete_gameids, iterationName, csv_dir)

# Cast every column that holds numeric values to float
numeric_columns = ['trialNum', 'cardinality', 'drawDuration', 'outcome',
                   'numStrokes', 'meanPixelIntensity',
                   'numCurvesPerSketch', 'numCurvesPerStroke',
                   'D1_Car', 'D2_Car', 'D3_Car']
D = D.astype(dict.fromkeys(numeric_columns, 'float'))

def GetArcLenData(df):
    """
    Add a 'stroke_len_means' column holding, for each row (sketch), the mean
    total arc length of its strokes.

    df must have an 'svgString' column. Each entry is iterated stroke by
    stroke and every stroke is handed to svg.path's parse_path, so it is
    assumed to be a sequence of SVG path strings, one per stroke (TODO confirm
    against utils.generate_dataframe). A stroke's length is the sum of the
    lengths of the segments parse_path yields for it.

    The column is added to df in place and the same dataframe object is
    returned. A row with no strokes gets NaN.

    Requires parse_path from svg.path (https://pypi.org/project/svg.path/).
    """
    stroke_len_means = []
    # iterate over the values of the frame that was passed in (not a global),
    # and do so positionally so a non-default index does not matter
    for strokes in df['svgString']:
        stroke_lengths = [
            sum(segment.length(error=1e-5) for segment in parse_path(stroke))
            for stroke in strokes
        ]
        # np.mean([]) would also give NaN, but with a RuntimeWarning
        stroke_len_means.append(np.mean(stroke_lengths) if stroke_lengths else np.nan)
    df['stroke_len_means'] = stroke_len_means
    return df

# add the per-sketch mean stroke length ('stroke_len_means') to the group dataframe
D = GetArcLenData(D)



In [None]:
## monitor how far along games-in-progress are
# NOTE(review): K may already have been restricted to complete games by the
# cell that builds it, in which case only finished games show up here
all_games = K['gameid'].unique()
num_games = len(all_games)
print('There are a total of {} unique gameids in mongo.'.format(num_games))

print('\n')
print('These are the games and how many trials have been completed so far:')
# one line per game: its id and how many rows of K belong to it
for name, n_trials in K.groupby('gameid').size().items():
    print('gameid: {} | number of trials : {}'.format(name, n_trials))

In [None]:
## TODO: hash 'workerId' so that we do not save actual workerIDs to file.

### render out all the sketches

In [None]:
# reload utils so edits to the helper module are picked up without restarting the kernel
importlib.reload(utils)
# render sketches from K into sketch_dir: the 'pngString' column is passed as the
# image data and gameid / intendedName / trialNum as metadata
# (presumably used to name the output files — TODO confirm in utils.render_images)
utils.render_images(K,data = 'pngString',
                    metadata = ['gameid','intendedName','trialNum'],
                    out_dir = sketch_dir)

### make sketch gallery (for complete games only)

In [None]:
# reload utils so edits to the helper module are picked up without restarting the kernel
importlib.reload(utils)
## actually render sketch gallery for each complete game
## (reads from sketch_dir, writes to gallery_dir; num_trials = 32 is the same
##  count used to decide which games are complete)
utils.render_sketch_gallery(complete_gameids, 
                     sketch_dir = sketch_dir,
                     gallery_dir = gallery_dir,
                     num_trials = 32)

## Analysis things

In [None]:
# list the columns available in the group dataframe before deriving new measures
D.columns

### Additional derived measures we might want

In [None]:
# Trials on which none of the three distractors shares the target's animal category
target_cat = D['category']
all_distractors_differ = (
    (target_cat != D['D1_Cat'])
    & (target_cat != D['D2_Cat'])
    & (target_cat != D['D3_Cat'])
)
D_animal = D.loc[all_distractors_differ]
# (not the last expression of the cell, so this selection is evaluated but not displayed)
D_animal[['trialNum','category','cardinality','D1_Cat','D1_Car','D2_Cat','D2_Car','D3_Cat','D3_Car']]


# Quarter of the experiment each trial falls in: blocks of 8 trials
# NOTE(review): assumes trialNum starts at 1; a trialNum of 0 would land in quarter 0
D['quarter'] = np.ceil(D['trialNum'].div(8))
D = D.astype({'quarter': 'float'})

# one dataframe per quarter, used for the quarter-by-quarter comparisons
D_1st, D_2nd, D_3rd, D_4th = (D.loc[D['quarter'] == q] for q in (1.0, 2.0, 3.0, 4.0))



# difference in mean pixel intensity between the 1st and the 4th quarter (1st minus 4th)
# NOTE(review): rows are paired purely by position after sorting on cardinality,
# so which 1st-quarter row meets which 4th-quarter row within a cardinality is
# arbitrary, and the subtraction needs both quarters to have the same number of rows
measure_cols = ['cardinality','meanPixelIntensity']
D1_sorted = D_1st.sort_values(by='cardinality')[measure_cols].to_numpy()
D4_sorted = D_4th.sort_values(by='cardinality')[measure_cols].to_numpy()

intensity_diff = D1_sorted[:,1] - D4_sorted[:,1]
D_diff = pd.DataFrame({'cardinality': D1_sorted[:,0],
                       'pixelIntensityDiff': intensity_diff})


# General purpose function for plotting measurements separately by quarter of the experiment
def PlotByQuarter(title,xvar='cardinality',yvar='numStrokes',ylabel='Strokes per Sketch'):
    """
    Overlay one seaborn pointplot per quarter of the experiment on one figure.

    Reads the module-level dataframes D_1st, D_2nd, D_3rd and D_4th, so those
    must exist before this is called.

    title  : used both as the matplotlib figure identifier and as the plot title
    xvar   : column plotted on the x axis
    yvar   : column plotted on the y axis
    ylabel : label for the y axis

    The x-axis label is 'Cardinality' for xvar='cardinality', 'Animal' for
    xvar='category', and the column name itself for any other xvar.
    """
    # fall back to the column name so that an xvar other than the two known
    # ones still gets a label instead of leaving xlabel undefined
    xlabel = {'cardinality': 'Cardinality', 'category': 'Animal'}.get(xvar, xvar)

    # (dataframe, line colour, legend label) for each quarter, in plotting order
    quarters = [(D_1st, '#000000', '1st round'),
                (D_2nd, '#bb3f3f', '2nd round'),
                (D_3rd, '#edda07', '3rd round'),
                (D_4th, '#6a0dad', '4th round')]

    plt.figure(title)
    plt.title(title)
    for quarter_df, colour, _ in quarters:
        sns.pointplot(data=quarter_df, x=xvar, y=yvar, color=colour, markers='.')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # legend entries are built by hand: one colour patch per quarter
    plt.legend(handles=[mpatches.Patch(color=colour, label=label)
                        for _, colour, label in quarters])

# one figure per entry, drawn in this order: (title, keyword arguments for PlotByQuarter)
quarter_plots = [
    # by cardinality (the default xvar)
    ("Ink by Cardinality", dict(yvar='meanPixelIntensity', ylabel="Mean Pixel Intensity")),
    ("Strokes per Sketch", dict(yvar='numStrokes', ylabel="Strokes per Sketch")),
    ("Curves per Stroke", dict(yvar='numCurvesPerStroke', ylabel="Curves per Stroke")),
    # by animal category
    ("Curves per Stroke (Animal)", dict(xvar='category', yvar='numCurvesPerStroke', ylabel="Curves per Stroke")),
    ("Strokes per Sketch (Animal)", dict(xvar='category', yvar='numStrokes', ylabel="Strokes per Sketch")),
    # stroke length and drawing time, again by cardinality
    ("Mean Stroke Length by Cardinality", dict(yvar='stroke_len_means', ylabel="Mean Stroke Length")),
    ("Sketch Time by Cardinality", dict(yvar='drawDuration', ylabel="Sketch Time")),
]
for plot_title, plot_kwargs in quarter_plots:
    PlotByQuarter(plot_title, **plot_kwargs)



### analyze performance (accuracy and RT)

In [None]:
# per-game accuracy: mean of 'outcome' within each gameID, one row per game
acc_table = D.groupby('gameID', as_index=False)['outcome'].mean()
acc_table.head()

In [None]:
# per-game mean drawing time, one row per game
# NOTE(review): this reuses the name acc_table, so the accuracy table computed
# in the previous cell is overwritten once this cell runs
D['drawDuration'] = pd.to_numeric(D['drawDuration'])
acc_table = D.groupby('gameID', as_index=False)['drawDuration'].mean()
acc_table

In [None]:
# Per-cardinality means computed as sum / count, because .mean() did not work
# on these columns when this was written
# NOTE(review): if the columns are float by the time this cell runs, .mean() may work directly
by_cardinality = D.groupby(['cardinality'])[['cardinality','numCurvesPerStroke']]
x = by_cardinality.sum()
y = by_cardinality.count()

print(x.div(y))
# curves per stroke over the course of the experiment
sns.pointplot(data=D, x='trialNum', y='numCurvesPerStroke')

## What statistics do we want to look at?

### Measures of iconicity:

In [None]:
# Complexity (number of curves) and ink (pixel intensity) per cardinality and between animals

# one bar plot per measure, each in its own figure, with cardinality on the x axis
for measure in ('numStrokes', 'numCurvesPerSketch', 'meanPixelIntensity'):
    plt.figure()
    ax = sns.barplot(data=D, x='cardinality', y=measure)
ax  # keep the last plot's axes as the cell's final expression

In [None]:
# one bar plot per measure, each in its own figure, with animal category on the x axis
for measure in ('numStrokes', 'numCurvesPerSketch', 'meanPixelIntensity'):
    plt.figure()
    ax = sns.barplot(data=D, x='category', y=measure)
ax  # keep the last plot's axes as the cell's final expression

In [None]:
# linear regression (TODO: not implemented yet)
# The commented-out call below is an earlier attempt that was left disabled.
# The live line only builds a groupby selection of 'quarter' and 'drawDuration'
# per cardinality; nothing is aggregated or fitted yet.

#stats.linregress(D.groupby('cardinality')['drawDuration'])
D.groupby(['cardinality'])[['quarter','drawDuration']]
