In [1]:
import os
import sys
import urllib, io
# NOTE(review): bare expression in the middle of a cell -- its value is neither
# displayed nor stored, so this line has no visible effect.
os.getcwd()
# Extend the import search path with the parent directory and its utils/
# folder. Both entries are relative, so they resolve against the kernel's
# working directory at import time.
sys.path.append("..")
sys.path.append("../utils")
# Project root: two levels above the current working directory.
proj_dir = os.path.abspath('../..')

## add helpers to python path
# Guarded so that re-running the cell does not add the stimuli folder twice.
if os.path.join(proj_dir,'stimuli') not in sys.path:
    sys.path.append(os.path.join(proj_dir,'stimuli'))

import numpy as np
import scipy.stats as stats
import pandas as pd
from scipy import stats
from scipy.spatial import distance
from scipy import ndimage
from scipy.stats import entropy
from random import random
from sklearn.cluster import SpectralBiclustering
import itertools

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
from matplotlib import colors

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import plotly
import plotly.graph_objects as go
import plotly.io as pio
pio.orca.config.use_xvfb = True
plotly.io.orca.config.save()

import importlib
import trajectory as g

In [2]:
## directory & file hierarchy
# Everything is derived from two anchors, both relative to the kernel's
# working directory: the project root (two levels up) and the analysis
# directory (one level up).
proj_dir = os.path.abspath('../..')
analysis_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# inputs that live under the project root
datavol_dir = os.path.join(proj_dir, 'data')
stim_dir = os.path.join(proj_dir, 'stimuli')
exp_dir = os.path.abspath(os.path.join(proj_dir, 'experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir, 'png'))

# outputs that live under the analysis directory
results_dir = os.path.join(analysis_dir, 'results')
plot_dir = os.path.join(results_dir, 'plots')
csv_dir = os.path.join(results_dir, 'csv')
json_dir = os.path.join(results_dir, 'json')

# per-author scratch folders
jefan_dir = os.path.join(analysis_dir, 'jefan')
will_dir = os.path.join(analysis_dir, 'will')

# Create the output folders this notebook uses. exist_ok=True makes the cell
# idempotent on re-run and avoids the check-then-create race of testing
# os.path.exists() first. json_dir is intentionally not created here, matching
# the previous behaviour.
os.makedirs(results_dir, exist_ok=True)
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(csv_dir, exist_ok=True)

### load in data

In [3]:
# Experiment iteration to analyse; this string is interpolated into the names
# of the CSV files that are read in.
iteration_name = 'Exp2Pilot3'
# NOTE(review): presumably the expected number of trials per participant --
# it is not referenced by any of the cells shown here; confirm it is still needed.
num_trials = 24  # for sanity checks

In [4]:
# Data already compiled into dataframes in CogSci 2020 Dataframe Generator

# trial_end data
trial_path = os.path.join(csv_dir, 'block_silhouette_{}_good.csv'.format(iteration_name))
df = pd.read_csv(trial_path)

# initial_block data
initial_path = os.path.join(csv_dir, 'block_silhouette_initial_{}_good.csv'.format(iteration_name))
dfi = pd.read_csv(initial_path)

# settled_block data
settled_path = os.path.join(csv_dir, 'block_silhouette_settled_{}_good.csv'.format(iteration_name))
dfs = pd.read_csv(settled_path)

# Sanity check: the same participants must appear in every dataset.
# The initial-block frame is checked too, because it is the one all later
# analyses are built from. unique() already de-duplicates, so comparing sets
# is equivalent to comparing Counters of the unique IDs.
df_participants = df.gameID.unique()
dfi_participants = dfi.gameID.unique()
dfs_participants = dfs.gameID.unique()
assert set(df_participants) == set(dfi_participants), \
    'trial-end and initial-block data contain different participants'
assert set(df_participants) == set(dfs_participants), \
    'trial-end and settled-block data contain different participants'

n_before_outliers = len(df_participants)
print(str(n_before_outliers) + ' participants total')

49 participants total


### apply preprocessing

In [5]:
# Sorted unique levels of the main design factors.
targets = np.sort(df['targetName'].unique())
ppts = np.sort(df['gameID'].unique())
reps = np.sort(df['repetition'].unique())

# 'discreteWorld' holds the string repr of a nested list. Parse every row once
# and map each cell value v to 1 - v (presumably flipping a 0/1 occupancy
# grid -- TODO confirm against the dataframe generator).
dfi['usableDiscreteWorld'] = dfi['discreteWorld'].apply(
    lambda a: 1 + (-1) * np.array(ast.literal_eval(a)))
# The flat version is derived from the array parsed above instead of parsing
# the string a second time; flatten() returns an independent copy.
dfi['flatDiscreteWorld'] = dfi['usableDiscreteWorld'].apply(lambda world: world.flatten())

# Working frame with only the columns the trajectory analyses need. The
# explicit .copy() after the column selection makes dfic an independent frame,
# so the assignments below neither touch dfi nor raise SettingWithCopyWarning.
# (To analyse a single condition, filter dfi on its 'condition' column,
# e.g. 'repeated', before selecting columns.)
dfic = dfi[['targetName', 'gameID', 'blockNum', 'repetition', 'phase_extended',
            'flatDiscreteWorld', 'usableDiscreteWorld', 'rawF1DiscreteScore']].copy()
dfic['discreteWorld'] = dfic['usableDiscreteWorld']
dfic['flatDiscreteWorldStr'] = dfic['flatDiscreteWorld'].apply(g.convert_to_str)

# Largest block index observed in any row.
max_actions = dfic['blockNum'].max()

### make trajectory graph

In [6]:
importlib.reload(g) ## reimport graph utils

# Flip to True to draw one trajectory graph per (target, phase) pair.
make_plot = False
if make_plot:
    phases = ['pre', 'post']
    # Settings shared by every figure; only target and phase vary.
    plot_options = dict(save=False,
                        out_dir=plot_dir,
                        extension='test',
                        x_lower_bound=4,
                        x_upper_bound=13,
                        edge_width_scale_factor=0.8,
                        node_size_scale_factor=0.8)
    # product() iterates targets in the outer position and phases in the inner one.
    for this_target, this_phase in itertools.product(targets, phases):
        g.plot_trajectory_graph(data=dfic,
                                target=this_target,
                                phase=this_phase,
                                **plot_options)

### analyze sparsity over world states

In [7]:
importlib.reload(g)
phases = ['pre', 'post']

# Nested results, indexed as H[target][phase] and P[target][phase]:
#   H -- first value returned by get_sparsity_over_states (computed with metric='mean')
#   P -- second value returned; a later cell reads its .keys() as the visited states
H = {target: dict() for target in targets}
P = {target: dict() for target in targets}

# Visit every (target, phase) pair, targets in the outer position.
for target, phase in itertools.product(targets, phases):
    print('Calculating sparsity metric for {} {}'.format(target, phase))
    h, p = g.get_sparsity_over_states(data=dfic, target=target, phase=phase, metric='mean')
    H[target][phase], P[target][phase] = h, p
    # replace the progress line rather than accumulating output
    clear_output(wait=True)
print('Done!')

Done!


In [8]:
importlib.reload(g)

# One row per target with a column per phase, plus the post-minus-pre change.
H2 = pd.DataFrame(H).T.reset_index()
H2 = H2.assign(diff=H2['post'] - H2['pre'])

print('**Mean difference in states visited in each phase**')
## bootstrapped CI on the per-target post-minus-pre difference
# NOTE(review): the captured output of this call reports "p>0=2.0"; a
# probability above 1 suggests the p-value scaling in g.bootstrapCI needs checking.
U, lb, ub, p1, p2 = g.bootstrapCI(H2['diff'].values, verbose=True)
## paired t-test, run on the same per-target differences
g.prettyTtest(H2['diff'].values)

print('-------')
print('**Mean states visited in each phase**')
## bootstrapped CI for each phase on its own; compareNull=False, and the
## captured output for these calls has no p<0 / p>0 line
for phase in phases:
    print('{}'.format(phase))
    U, lb, ub, p1, p2 = g.bootstrapCI(H2[phase].values, verbose=True, compareNull=False)

**Mean difference in states visited in each phase**
Original mean = 0.33143. Bootstrapped mean = 0.33024.
95% CI = [0.18518, 0.49564].
p<0=0.0 | p>0=2.0.
t(7) = 3.76217 | p = 0.00705
-------
**Mean states visited in each phase**
pre
Original mean = 1.7711. Bootstrapped mean = 1.77035.
95% CI = [1.57875, 1.95293].
post
Original mean = 2.10254. Bootstrapped mean = 2.10059.
95% CI = [1.91135, 2.2988].


### Analyze set relationships between pre states and post states

In [9]:
# Per-target comparison of the states seen in the 'pre' and 'post' phases.
# NOTE: P must still hold the per-state output of g.get_sparsity_over_states;
# the state-state transition cell rebinds P, so run this cell before that one.
NS = {}  # num states dict, keyed by target
for target in targets:
    PRE = set(P[target]['pre'].keys())
    POST = set(P[target]['post'].keys())
    xsect = PRE & POST   # states seen in both phases
    u = PRE | POST       # states seen in either phase
    overlap = list(xsect)
    # Total of the per-state values over the shared states, within each phase
    # (presumably visit proportions -- TODO confirm in get_sparsity_over_states).
    pre_prop = np.sum([P[target]['pre'][o] for o in overlap])
    post_prop = np.sum([P[target]['post'][o] for o in overlap])
    # Key order fixes the column order of ND below.
    NS[target] = {
        'pre': len(PRE),
        'post': len(POST),
        'xsect': len(xsect),
        'union': len(u),
        'pre_overlap': pre_prop,
        'post_overlap': post_prop,
        'diff': len(POST) - len(PRE),
    }
    summary = '{} | Pre: {} | Post: {} | Intrsct: {} | Union: {} | PreOverlap: {} | PostOverlap: {}'
    print(summary.format(target, len(PRE), len(POST), len(xsect), len(u),
                         pre_prop.round(4), post_prop.round(4)))

## create num states dataframe: one row per target, one column per statistic
ND = pd.DataFrame.from_dict(NS, orient='index').reset_index()

hand_selected_004 | Pre: 282 | Post: 253 | Intrsct: 74 | Union: 461 | PreOverlap: 0.481 | PostOverlap: 0.5828
hand_selected_005 | Pre: 156 | Post: 176 | Intrsct: 48 | Union: 284 | PreOverlap: 0.658 | PostOverlap: 0.6617
hand_selected_006 | Pre: 214 | Post: 249 | Intrsct: 52 | Union: 411 | PreOverlap: 0.4585 | PostOverlap: 0.4369
hand_selected_008 | Pre: 214 | Post: 214 | Intrsct: 57 | Union: 371 | PreOverlap: 0.5495 | PostOverlap: 0.6409
hand_selected_009 | Pre: 252 | Post: 193 | Intrsct: 68 | Union: 377 | PreOverlap: 0.5853 | PostOverlap: 0.7123
hand_selected_011 | Pre: 182 | Post: 226 | Intrsct: 62 | Union: 346 | PreOverlap: 0.6499 | PostOverlap: 0.6373
hand_selected_012 | Pre: 264 | Post: 261 | Intrsct: 80 | Union: 445 | PreOverlap: 0.5759 | PostOverlap: 0.6313
hand_selected_016 | Pre: 299 | Post: 281 | Intrsct: 83 | Union: 497 | PreOverlap: 0.465 | PostOverlap: 0.5877


In [10]:
## difference in total num states visited between pre and post
# Reload the helper module so edits to it are picked up before it is called.
importlib.reload(g)
# ND['diff'] is (number of post states) - (number of pre states), one value per target.
# NOTE(review): the captured output reports "p<0=1.072"; a probability above 1
# suggests the p-value scaling in the bootstrap helper needs checking.
g.statsPrint(ND['diff'].values)

t(7) = -0.10328 | p = 0.92064
Original mean = -1.25. Bootstrapped mean = -1.22888.
95% CI = [-22.00625, 20.00312].
p<0=1.072 | p>0=0.922.


### calculate sparsity over state-state transitions

In [11]:
importlib.reload(g)
phases = ['pre', 'post']

# NOTE: this rebinds H and P, which held the per-state results until now.
# From here on they are indexed as H[target][phase] and P[target][phase] with:
#   H -- first value returned by get_sparsity_over_edges (computed with metric='mean')
#   P -- second value returned (named E below)
H = {target: dict() for target in targets}
P = {target: dict() for target in targets}

# Visit every (target, phase) pair, targets in the outer position.
for target, phase in itertools.product(targets, phases):
    print('Calculating sparsity for {} {}'.format(target, phase))
    h, E = g.get_sparsity_over_edges(data=dfic, target=target, phase=phase, metric='mean')
    H[target][phase], P[target][phase] = h, E
    # replace the progress line rather than accumulating output
    clear_output(wait=True)
print('Done!')

Done!


In [12]:
importlib.reload(g)

# One row per target with a column per phase, plus the post-minus-pre change.
# H holds the state-state transition results at this point.
H2 = pd.DataFrame(H).T.reset_index()
H2 = H2.assign(diff=H2['post'] - H2['pre'])

print('**Mean difference in state-state transitions taken in each phase**')
## bootstrapped CI on the per-target post-minus-pre difference in transitions
# NOTE(review): the captured output of this call reports "p>0=2.0"; a
# probability above 1 suggests the p-value scaling in g.bootstrapCI needs checking.
U, lb, ub, p1, p2 = g.bootstrapCI(H2['diff'].values, verbose=True)
## paired t-test, run on the same per-target differences
g.prettyTtest(H2['diff'].values)

print('-------')
print('**Mean state-state transitions taken in each phase**')
## bootstrapped CI for each phase on its own; compareNull=False, and the
## captured output for these calls has no p<0 / p>0 line
for phase in phases:
    print('{}'.format(phase))
    U, lb, ub, p1, p2 = g.bootstrapCI(H2[phase].values, verbose=True, compareNull=False)

**Mean difference in state-state transitions taken in each phase**
Original mean = 0.24753. Bootstrapped mean = 0.24672.
95% CI = [0.15633, 0.34251].
p<0=0.0 | p>0=2.0.
t(7) = 4.66106 | p = 0.00231
-------
**Mean state-state transitions taken in each phase**
pre
Original mean = 1.54281. Bootstrapped mean = 1.54241.
95% CI = [1.411, 1.6706].
post
Original mean = 1.79034. Bootstrapped mean = 1.78913.
95% CI = [1.65145, 1.91795].
