In [7]:
import pandas as pd
import numpy as np
import os
import glob
import math
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import copy
from random import randrange
import random
from sklearn.metrics import jaccard_score
from tqdm import tqdm

In [9]:
# Participant table: each row carries a story id, a keypress string and a
# priming type (the columns read further down in this notebook).
data_dict = pd.read_csv('online_boundary_data.csv')

# Per-story annotation files: read every CSV found in the story directory.
directory_stories = '../story_csv'
filenames_stories = glob.glob(directory_stories + '/*.csv')
dfs_stories = [pd.read_csv(story_path) for story_path in filenames_stories]

In [11]:
# Arrange putative event boundaries into arrays.
#
# Outputs (both keyed by the integer story id found in each story file):
#   story_boundaries[story] -> int array, one row per sentence, with
#       column 0 = location boundaries and column 1 = social boundaries.
#       A sentence is marked 1 when its event id is larger than the event id
#       of the sentence before it, otherwise 0.
#   story_targets[story]    -> int array, same layout, marked 1 wherever the
#       'locationAnswer' / 'socialAnswer' column is greater than 0.
# The first sentence (first row) of every story is dropped from both arrays.


def _event_onsets(event_ids):
    """Return a 0/1 int vector flagging where ``event_ids`` increases.

    Each element is compared with the *unmodified* previous element, so the
    result does not depend on the magnitude of the ids (no in-place sentinel
    value is written into the array that is being compared). The first
    element is always flagged, mirroring the previous behaviour of forcing
    the first sentence to be a boundary.
    """
    event_ids = np.asarray(event_ids)
    onsets = np.zeros(len(event_ids), dtype=int)
    if len(event_ids) > 0:
        onsets[0] = 1
        onsets[1:] = event_ids[1:] > event_ids[:-1]
    return onsets


story_boundaries = dict()
story_targets = dict()

# Iterate over every loaded story file instead of a hard-coded count, so the
# cell neither crashes with fewer files nor silently ignores extra ones.
for story_df in dfs_stories:
    this_story = int(story_df['story'].iloc[0])

    boundaries = np.column_stack((
        _event_onsets(story_df['locationEvent'].values),
        _event_onsets(story_df['socialEvent'].values),
    ))
    targets = np.column_stack((
        (story_df['locationAnswer'].values > 0).astype(int),
        (story_df['socialAnswer'].values > 0).astype(int),
    ))

    # Remove the first sentence/row from all stories.
    story_boundaries[this_story] = np.delete(boundaries, 0, axis=0)
    story_targets[this_story] = np.delete(targets, 0, axis=0)

In [22]:
# Organize and process keypresses.
#
# Raw dictionaries (suffix ``_r``) map story -> 2-D array with one column per
# participant, still in the original 9/1 coding. The recoded dictionaries
# (no suffix) hold the same data with 1 -> 0 and 9 -> 1, and with the first
# row (first sentence) removed.


def _stack_participants(columns_by_story):
    """Turn {story: [1-D arrays]} into {story: 2-D array}, one column each."""
    return {story: np.column_stack(columns)
            for story, columns in columns_by_story.items()}


def _recode_and_trim(raw_by_story):
    """Recode 1 -> 0 and 9 -> 1 on a copy, then drop the first row."""
    recoded_by_story = dict()
    for story, raw in raw_by_story.items():
        recoded = raw.copy()
        was_one = raw == 1
        was_nine = raw == 9
        recoded[was_one] = 0
        recoded[was_nine] = 1
        recoded_by_story[story] = np.delete(recoded, 0, axis=0)
    return recoded_by_story


# Gather each participant's keypress vector under its story, once for the
# pooled set and once for the participant's own priming condition.
pooled_columns = dict()
columns_by_prime = {'np': dict(), 'loc': dict(), 'soc': dict()}

for row_idx in range(len(data_dict)):
    participant = data_dict.iloc[row_idx]
    story_id = participant['story']
    presses = np.array([int(digit) for digit in list(participant['story_presses'])])
    condition = participant['p_type']

    # all participants, regardless of priming
    pooled_columns.setdefault(story_id, []).append(presses)

    # only the three known priming types are grouped; anything else is
    # left out of the per-condition dictionaries
    if condition == 'np' or condition == 'loc' or condition == 'soc':
        columns_by_prime[condition].setdefault(story_id, []).append(presses)

all_keys_r = _stack_participants(pooled_columns)
none_keys_r = _stack_participants(columns_by_prime['np'])
loc_keys_r = _stack_participants(columns_by_prime['loc'])
soc_keys_r = _stack_participants(columns_by_prime['soc'])

# Convert keypresses from '9's and '1's to '1's and '0's and remove the
# first sentence (first row) from all keypresses.
all_keys = _recode_and_trim(all_keys_r)
none_keys = _recode_and_trim(none_keys_r)
loc_keys = _recode_and_trim(loc_keys_r)
soc_keys = _recode_and_trim(soc_keys_r)

list_keys = [all_keys, none_keys, loc_keys, soc_keys]

In [23]:
# Score every participant's keypress vector against the story's putative
# boundaries and targets with the Jaccard score, one output row per
# participant:
#   sub          running row counter (not an identifier from the data file)
#   story        story id
#   priming      'np', 'loc' or 'soc'
#   loc_bounds / soc_bounds    score vs. column 0 / 1 of story_boundaries
#   loc_targets / soc_targets  score vs. column 0 / 1 of story_targets
#
# Rows are collected in a list and the DataFrame is built once at the end.
# This avoids pre-sizing the frame to len(data_dict), which left all-NaN
# placeholder rows whenever a participant's priming type was not one of the
# three scored here, and it gives the columns proper numeric dtypes.

key_type = [none_keys, loc_keys, soc_keys]
prime_type = ['np', 'loc', 'soc']
df_columns = ['sub', 'story', 'priming', 'loc_bounds', 'soc_bounds', 'loc_targets', 'soc_targets']

records = []
count_index = 0
for this_story in all_keys:
    # Reference vectors in output-column order: boundaries first, then targets.
    reference_vectors = [
        story_boundaries[this_story][:, 0],
        story_boundaries[this_story][:, 1],
        story_targets[this_story][:, 0],
        story_targets[this_story][:, 1],
    ]
    for prime, keys_by_story in zip(prime_type, key_type):
        story_keys = keys_by_story.get(this_story)
        if story_keys is None:
            # No participant read this story under this priming condition.
            continue
        for this_subj in range(story_keys.shape[1]):
            subj_keys = story_keys[:, this_subj]
            scores = [jaccard_score(subj_keys, reference) for reference in reference_vectors]
            records.append([count_index, this_story, prime] + scores)
            count_index += 1

df = pd.DataFrame(records, columns=df_columns)

In [24]:
# Write `df` to a CSV file in the working directory, without the index column.
df.to_csv('comparison_to_EB_vals.csv', index = False) 

In [25]:
from scipy.stats import f_oneway


def _presses_per_participant(keys_by_story):
    """Sum each story's array down its columns and join the sums into one vector."""
    return np.concatenate([story_keys.sum(0) for story_keys in keys_by_story.values()])


# Per-participant keypress totals for each priming condition.
none_keytotal = _presses_per_participant(none_keys)
print('Avg keys pushed by no prime:', none_keytotal.mean())

loc_keytotal = _presses_per_participant(loc_keys)
print('Avg keys pushed by loc prime:', loc_keytotal.mean())

soc_keytotal = _presses_per_participant(soc_keys)
print('Avg keys pushed by soc prime:', soc_keytotal.mean())

# One-way ANOVA across the three conditions, followed by its degrees of
# freedom: (number of groups - 1) and (total observations - number of groups).
anova_groups = (none_keytotal, loc_keytotal, soc_keytotal)
n_groups = len(anova_groups)
print(f_oneway(*anova_groups))
print('df_between=', n_groups - 1)
print('df_within=', sum(len(group) for group in anova_groups) - n_groups)

Avg keys pushed by no prime: 6.792079207920792
Avg keys pushed by loc prime: 7.6415094339622645
Avg keys pushed by soc prime: 7.02970297029703
F_onewayResult(statistic=1.70706355741533, pvalue=0.1831261628514985)
df_between= 2
df_within= 305
