In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import ndtr
from scipy.optimize import curve_fit
from scipy.stats import linregress
from pathlib import Path
from math import log, sqrt, e
import json

# 1. CHECKING OUTPUT ON MOCK DATA

In [7]:
# creating empty dataframe:
humans_300_complete_df = pd.DataFrame()
# accessing all json files in 'data' directory:
files = Path('./data/').glob('*.csv')
file_counter = 0
for file in files:
    # reading single csv file as dataframe
    df = pd.read_csv(file)
    # adding single dataframe to general dataframe:
    humans_300_complete_df = pd.concat([humans_300_complete_df,df])
    # incrementing file counter (will be used to label the graph):
    file_counter += 1
    print(file_counter)

1
2


In [8]:
humans_300_complete_df

Unnamed: 0,success,timeout,failed_images,failed_audio,failed_video,trial_type,trial_index,time_elapsed,internal_node_id,view_history,...,stimulus,response,nodes_order,correct_response,block_index,presentation_index,clique_size,graph_size,graphs_couple,correct
0,True,False,[],[],[],preload,0,432,0.0-0.0,,...,,,,,,,,,,
1,,,,,,instructions,1,6142,0.0-1.0,"[{""page_index"":0,""viewing_time"":5103},{""page_i...",...,,,,,,,,,,
2,True,False,[],[],[],preload,2,6369,0.0-2.0,,...,,,,,,,,,,
3,,,,,,instructions,3,15539,0.0-3.0,"[{""page_index"":0,""viewing_time"":697},{""page_in...",...,,,,,,,,,,
4,,,,,,instructions,4,16793,0.0-4.0,"[{""page_index"":0,""viewing_time"":1253}]",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,,,,,,audio-keyboard-response,105,90391,0.0-77.0-0.0-1.0,,...,audio_feedback/right/coin01.mp3,,,,0.0,28.0,33.0,1000.0,"[""0047_N1000_K0033_CLIQUE"",""0037_N1000_K0033_N...",
106,,,,,,canvas-keyboard-response,106,91902,0.0-78.0-0.0-0.0,,...,,arrowleft,"[221,630,638,191,517,97,261,632,64,452,246,505...",arrowright,0.0,29.0,33.0,1000.0,"[""0095_N1000_K0033_NOCLIQUE"",""0098_N1000_K0033...",False
107,,,,,,audio-keyboard-response,107,92892,0.0-78.0-0.0-1.0,,...,audio_feedback/wrong/error01.mp3,,,,0.0,29.0,33.0,1000.0,"[""0095_N1000_K0033_NOCLIQUE"",""0098_N1000_K0033...",
108,,,,,,instructions,108,93636,0.0-79.0,"[{""page_index"":0,""viewing_time"":742}]",...,,,,,,,,,,


In [34]:
# isolating survey trials and calculating average age:
humans_300_survey = humans_300_complete_df[humans_300_complete_df.trial_type == "survey-html-form"].response
ages_list = []
for i in humans_300_survey:
    # Convert the JSON string to a Python dictionary (responses are stored as json strings)
    i_dict = json.loads(i)
    # adding age to list:
    ages_list.append(int(i_dict['age']))

# printing age list and average age:
print(ages_list)
print(np.mean(ages_list))

[76, 34]
55.0


In [38]:
# isolating experiment trials ("canvas-keyboard-response" ones) and dropping irrelevant variables:
humans_300_cleaned = humans_300_complete_df[humans_300_complete_df.trial_type == "canvas-keyboard-response"]
# dropping irrelevant variables:
humans_300_cleaned.drop(["timeout","failed_images","failed_audio","failed_video","view_history","trial_index", "time_elapsed","internal_node_id" ,"success","stimulus"], axis=1, inplace=True)
# isolating trials were final responses were given (shuffles have " " as response):
humans_300_final = humans_300_cleaned[humans_300_cleaned['response'].isin(['arrowright','arrowleft'])]
print(humans_300_final['clique_size'].value_counts() )  # final answers for each clique size, in this case should be 2*(number of participants)


300.0    4
267.0    4
233.0    4
217.0    4
200.0    4
183.0    4
167.0    4
150.0    4
133.0    4
117.0    4
100.0    4
83.0     4
67.0     4
50.0     4
33.0     4
Name: clique_size, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  humans_300_cleaned.drop(["timeout","failed_images","failed_audio","failed_video","view_history","trial_index", "time_elapsed","internal_node_id" ,"success","stimulus"], axis=1, inplace=True)


In [58]:
# loop through trials to retrieve graph names:
for clique_size in humans_300_final['clique_size'].unique():
    # isolating data of current clique size:
    current_data = humans_300_final[humans_300_final.clique_size == clique_size]
    # counting number of trials for current clique size:
    print(type(current_data.graphs_couple))
    # unpacking the series:
    for i in current_data.graphs_couple:
        array_names = json.loads(i)
        print(array_names)
        print(array_names[1])
        # storing graph names in an array to be used to retrieve the graph in the "graph_pools" folder and calculate properties

<class 'pandas.core.series.Series'>
['0007_N1000_K0300_NOCLIQUE', '0073_N1000_K0300_CLIQUE']
0073_N1000_K0300_CLIQUE
['0045_N1000_K0300_NOCLIQUE', '0033_N1000_K0300_CLIQUE']
0033_N1000_K0300_CLIQUE
['0003_N1000_K0300_CLIQUE', '0040_N1000_K0300_NOCLIQUE']
0040_N1000_K0300_NOCLIQUE
['0025_N1000_K0300_NOCLIQUE', '0047_N1000_K0300_CLIQUE']
0047_N1000_K0300_CLIQUE
separate
<class 'pandas.core.series.Series'>
['0069_N1000_K0267_CLIQUE', '0099_N1000_K0267_NOCLIQUE']
0099_N1000_K0267_NOCLIQUE
['0060_N1000_K0267_CLIQUE', '0001_N1000_K0267_NOCLIQUE']
0001_N1000_K0267_NOCLIQUE
['0046_N1000_K0267_CLIQUE', '0043_N1000_K0267_NOCLIQUE']
0043_N1000_K0267_NOCLIQUE
['0074_N1000_K0267_CLIQUE', '0050_N1000_K0267_NOCLIQUE']
0050_N1000_K0267_NOCLIQUE
separate
<class 'pandas.core.series.Series'>
['0028_N1000_K0233_NOCLIQUE', '0024_N1000_K0233_CLIQUE']
0024_N1000_K0233_CLIQUE
['0012_N1000_K0233_NOCLIQUE', '0094_N1000_K0233_CLIQUE']
0094_N1000_K0233_CLIQUE
['0031_N1000_K0233_CLIQUE', '0069_N1000_K0233_NOCLIQUE

# 2. CHECKING OUTPUT ON PILOT 2 DATA
(IMPORTANT NOTE!!
In this iteration, due to an imprecision, there were two variables for graph names:
- "graphs_names": actually contains the graphs names;
- "graphs_couple": is empty
)
In the next iterations, only "graphs_couple" will contain the graph names, and the other variable will not be present anymore

In [60]:
# creating empty dataframe:
humans_1000_complete_df = pd.DataFrame()
# accessing all json files in 'data' directory:
files = Path('./data/pilot2_csv_copy/').glob('*.csv')
file_counter = 0
for file in files:
    # reading single csv file as dataframe
    df = pd.read_csv(file)
    # adding single dataframe to general dataframe:
    humans_1000_complete_df = pd.concat([humans_1000_complete_df,df])
    # incrementing file counter (will be used to label the graph):
    file_counter += 1
    print(file_counter)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [62]:
# isolating survey trials and calculating average age:
humans_1000_survey = humans_1000_complete_df[humans_1000_complete_df.trial_type == "survey-html-form"].response
ages_list = []
for i in humans_1000_survey:
    # Convert the JSON string to a Python dictionary (responses are stored as json strings)
    i_dict = json.loads(i)
    # adding age to list:
    ages_list.append(int(i_dict['age']))

# printing age list and average age:
print(ages_list)
print(np.mean(ages_list))

[25, 23, 24, 25, 28, 27, 27, 26, 27, 27, 23, 24, 32, 23, 25]
25.733333333333334


In [64]:
# isolating experiment trials ("canvas-keyboard-response" ones) and dropping irrelevant variables:
humans_1000_cleaned = humans_1000_complete_df[humans_1000_complete_df.trial_type == "canvas-keyboard-response"]
# dropping irrelevant variables:
humans_1000_cleaned.drop(["timeout","failed_images","failed_audio","failed_video","view_history","trial_index", "time_elapsed","internal_node_id" ,"success","stimulus"], axis=1, inplace=True)
# isolating trials were final responses were given (shuffles have " " as response):
humans_1000_final = humans_1000_cleaned[humans_1000_cleaned['response'].isin(['arrowright','arrowleft'])]
print(humans_1000_final['clique_size'].value_counts() )  # final answers for each clique size, in this case should be 12*(number of participants)

300.0    180
267.0    180
233.0    180
217.0    180
200.0    180
183.0    180
167.0    180
150.0    180
133.0    180
117.0    180
100.0    180
83.0     180
67.0     180
50.0     180
33.0     180
Name: clique_size, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  humans_1000_cleaned.drop(["timeout","failed_images","failed_audio","failed_video","view_history","trial_index", "time_elapsed","internal_node_id" ,"success","stimulus"], axis=1, inplace=True)


In [73]:
# printing column names:
print(humans_1000_final.columns)
print(humans_1000_final['clique_size'].unique())

Index(['trial_type', 'rt', 'response', 'nodes_order', 'correct_response',
       'block_index', 'presentation_index', 'clique_size', 'graph_size',
       'graphs_names', 'graphs_couple', 'accuracy'],
      dtype='object')
[300. 267. 233. 217. 200. 183. 167. 150. 133. 117. 100.  83.  67.  50.
  33.]


In [79]:
# loop through trials to retrieve graph names:
for clique_size in humans_1000_final['clique_size'].unique():
    # isolating data of current clique size:
    current_data = humans_1000_final[humans_1000_final.clique_size == clique_size]
    # unpacking the series:
    for i in current_data.graphs_names:
        array_names = json.loads(i)
        print(array_names)
        print(array_names[1])
        # storing graph names in an array to be used to retrieve the graph in the "graph_pools" folder and calculate properties

['0085_N1000_K0300_NOCLIQUE', '0044_N1000_K0300_CLIQUE']
0044_N1000_K0300_CLIQUE
['0042_N1000_K0300_NOCLIQUE', '0023_N1000_K0300_CLIQUE']
0023_N1000_K0300_CLIQUE
['0088_N1000_K0300_CLIQUE', '0005_N1000_K0300_NOCLIQUE']
0005_N1000_K0300_NOCLIQUE
['0004_N1000_K0300_NOCLIQUE', '0016_N1000_K0300_CLIQUE']
0016_N1000_K0300_CLIQUE
['0015_N1000_K0300_NOCLIQUE', '0066_N1000_K0300_CLIQUE']
0066_N1000_K0300_CLIQUE
['0075_N1000_K0300_NOCLIQUE', '0087_N1000_K0300_CLIQUE']
0087_N1000_K0300_CLIQUE
['0039_N1000_K0300_CLIQUE', '0067_N1000_K0300_NOCLIQUE']
0067_N1000_K0300_NOCLIQUE
['0071_N1000_K0300_NOCLIQUE', '0057_N1000_K0300_CLIQUE']
0057_N1000_K0300_CLIQUE
['0054_N1000_K0300_CLIQUE', '0097_N1000_K0300_NOCLIQUE']
0097_N1000_K0300_NOCLIQUE
['0032_N1000_K0300_NOCLIQUE', '0009_N1000_K0300_CLIQUE']
0009_N1000_K0300_CLIQUE
['0074_N1000_K0300_CLIQUE', '0069_N1000_K0300_NOCLIQUE']
0069_N1000_K0300_NOCLIQUE
['0073_N1000_K0300_CLIQUE', '0011_N1000_K0300_NOCLIQUE']
0011_N1000_K0300_NOCLIQUE
['0035_N1000_K0300