# Test of Jupyter for data analysis and visualization

Some example data is loaded from CSV files and analyzed.


In [1]:
%matplotlib notebook
import os
import re
import qgrid
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): presumably installs qgrid's notebook extension files (replacing
# any existing copy, given overwrite=True) so the interactive grid used below
# can render — confirm against the installed qgrid version.
qgrid.nbinstall(overwrite=True)

## Reading data from files into a dictionary that is indexed by subject id

In [2]:
# Directory that holds one result file per subject.
path = "./results/"
# File names look like "std-toj-subject-<id>.csv". The dot is escaped and the
# pattern is anchored at the end so that look-alikes such as
# "std-toj-subject-1.csv.bak" or "std-toj-subject-1xcsv" are not picked up.
pattern = r'(?:std-toj-subject-)([0-9]+)(?:\.csv)$'
# Matching file names, sorted alphabetically.
files = sorted(f for f in os.listdir(path) if re.match(pattern, f))
# Numeric subject id taken from the capture group of each file name.
subject_ids = [int(re.match(pattern, f).group(1)) for f in files]
# Map subject id -> DataFrame; text following '#' in a file is treated as a comment.
dfs = {}
for sid, f in zip(subject_ids, files):
    dfs[sid] = pd.read_csv(os.path.join(path, f), comment='#')

## Excel-like view of the raw data

Data of the first subject. Relevant columns are selected and shown in a table that can be sorted by clicking the column headings.

I have not found a way of using tabs for multiple tables. 

In [3]:
# Columns selected for the interactive grid of subject 1's data.
grid_columns = ['t1ori', 't2ori', 't1lum', 't2lum', 'correct', 'soa', 'condition']
qgrid.show_grid(dfs[1][grid_columns])

A summary may be more informative about the raw data. This time a non-interactive table is used that includes all columns of the original file. Apparently the middle columns are replaced by "...", probably because the table would otherwise be too wide (pandas limits the number of displayed columns).

In [4]:
# Summary statistics for the data frame stored under subject id 1.
subject_one = dfs[1]
subject_one.describe()

Unnamed: 0.1,Unnamed: 0,acc,accuracy,average_response_time,avg_rt,bg,correct,correct_keyboard_response,count_exp,count_keyboard_response,...,response_time_keyboard_response,soa,t1lum,t1ori,t2lum,t2ori,time_exp,time_keyboard_response,time_sequence,condition
count,960.0,960.0,960.0,960.0,960.0,0.0,960.0,960.0,960.0,960.0,...,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0
mean,15.4375,66.682027,66.682027,863.547581,863.547581,,0.566667,0.658333,0.0,525.5,...,844.213377,0.020833,117.0,13.125,117.0,13.125,186266.9,5852650.0,5851375.0,8.5
std,10.112828,1.259894,1.259894,73.532174,73.532174,,0.495794,0.474516,0.0,277.272429,...,2226.734016,6.458303,45.969569,20.464497,45.969569,20.464497,2.853662e-09,3416229.0,3416229.0,4.612175
min,0.0,63.829787,63.829787,796.693585,796.693585,,0.0,0.0,0.0,46.0,...,60.16415,-10.0,88.0,0.0,88.0,0.0,186266.9,187589.4,186271.1,1.0
25%,7.0,65.978848,65.978848,813.533927,813.533927,,0.0,0.0,0.0,285.75,...,455.025902,-5.0,88.0,0.0,88.0,0.0,186266.9,864100.2,862808.1,4.75
50%,15.0,66.299963,66.299963,841.672844,841.672844,,1.0,1.0,0.0,525.5,...,601.589341,0.0,88.0,0.0,88.0,0.0,186266.9,6413248.0,6411945.0,8.5
75%,23.0,66.748998,66.748998,871.632624,871.632624,,1.0,1.0,0.0,765.25,...,895.859571,5.0,128.0,18.75,128.0,18.75,186266.9,8966132.0,8964819.0,12.25
max,59.0,71.25,71.25,1164.573741,1164.573741,,1.0,1.0,0.0,1005.0,...,65654.215599,10.0,224.0,60.0,224.0,60.0,186266.9,9802383.0,9801112.0,16.0


In [5]:
# Collect the column labels of subject 1's data frame as a plain list and show them.
column_names = dfs[1].columns.tolist()
print(column_names)

['Unnamed: 0', 'acc', 'accuracy', 'average_response_time', 'avg_rt', 'bg', 'correct', 'correct_keyboard_response', 'count_exp', 'count_keyboard_response', 'count_sequence', 'response_keyboard_response', 'response_time_keyboard_response', 'soa', 't1first_resp', 't1lum', 't1ori', 't1right', 't2lum', 't2ori', 'time_exp', 'time_keyboard_response', 'time_sequence', 'condition']


Interesting columns can also be selected from the dataframe if not all columns are shown in these tables.

In [6]:
# Summarize only a slice of the columns (list positions 5 up to, but excluding, 15).
selected_columns = column_names[5:15]
dfs[1][selected_columns].describe()

Unnamed: 0,bg,correct,correct_keyboard_response,count_exp,count_keyboard_response,count_sequence,response_time_keyboard_response,soa
count,0.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0
mean,,0.566667,0.658333,0.0,525.5,479.5,844.213377,0.020833
std,,0.495794,0.474516,0.0,277.272429,277.272429,2226.734016,6.458303
min,,0.0,0.0,0.0,46.0,0.0,60.16415,-10.0
25%,,0.0,0.0,0.0,285.75,239.75,455.025902,-5.0
50%,,1.0,1.0,0.0,525.5,479.5,601.589341,0.0
75%,,1.0,1.0,0.0,765.25,719.25,895.859571,5.0
max,,1.0,1.0,0.0,1005.0,959.0,65654.215599,10.0


## Plot of the sixteen conditions for all subjects

Typical visualization as TOJ curves. Limited to 4 subjects as it is merely an example.

In [7]:
# Distinct condition labels, in order of first appearance in subject 1's data.
conditions = dfs[1]["condition"].unique()
# Distinct SOA values of subject 1, sorted in ascending order.
soas = np.sort(dfs[1]["soa"].unique())

In [12]:
# One figure per subject with a 4x4 grid of panels, one panel per condition.
# The accompanying text announces 4 subjects, so the first four ids are used
# (a slice of [1:4] would skip the first subject and give only three figures).
for subject_id in subject_ids[:4]:
    subject_df = dfs[subject_id]
    fig, axarr = plt.subplots(4, 4)
    for condition in conditions:
        in_condition = subject_df["condition"] == condition
        # Mean of "correct" at each SOA. Both masks must come from this
        # subject's own frame; mixing in another subject's "soa" column would
        # select rows by the wrong trial order.
        x = [subject_df.loc[in_condition & (subject_df["soa"] == soa), "correct"].mean()
             for soa in soas]
        # Conditions are numbered from 1; lay them out row by row on the grid.
        # divmod on an int keeps the indices integral (plain "/" yields a float).
        row, col = divmod(int(condition) - 1, 4)
        axarr[row, col].plot(soas, x)
        axarr[row, col].set_title("Condition: {0}".format(condition))
    # Keep the 16 panel titles from overlapping the neighbouring axes.
    fig.tight_layout()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Now a model-based analysis would follow...