In [1]:
import pandas as pd
import numpy as np
import os

from bokeh.plotting import figure, show, output_file,gridplot
from bokeh.models import FixedTicker,Range1d

import seaborn as sns

In [31]:
# Top-level folder holding the CSV files; it is the first path component
# passed to os.path.join() whenever a table is read below.
data_fold = 'figure_csv'

# Subject results

In [32]:
# Sub-folder of `data_fold` that the subject-level cells read from.
type_fold = 'subj'

# Cluster dispersion

In [4]:
# Per-iteration mean and standard error of the values in subj_dispersion.csv.
disp_fname = 'subj_dispersion.csv'
dispersion_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

# Collapse to one mean per (iter, disp) pair, then group those means by iteration.
disperion_by_display = (
    dispersion_df[['iter', 'disp', 'value']]
    .groupby(['iter', 'disp'])
    .mean()
    .reset_index()
)
dispersion_by_iter = disperion_by_display[['value', 'iter']].groupby('iter')

dispersion_mean = dispersion_by_iter.mean().reset_index()
dispersion_std = dispersion_by_iter.std()
dispersion_num = dispersion_by_iter.size()

# SEM = std / sqrt(group size); both operands are indexed by `iter`.
dispersion_sem = (dispersion_std['value'] / np.sqrt(dispersion_num)).to_frame().reset_index()
dispersion_sem.columns = ['iter', 'dispersion_sem']

dispersion_stats = dispersion_mean.merge(dispersion_sem, on='iter')
dispersion_stats.columns = ['iter', 'dispersion', 'dispersion_sem']

In [5]:
# Mean dispersion per iteration with +/- 1 SEM bars, written to group_dispersion.html.
plot = figure(width=1000, height=600)

# Mean trace plus a marker at every iteration.
plot.line(x=dispersion_stats['iter'], y=dispersion_stats['dispersion'],
          color='cornflowerblue', line_width=6)
plot.scatter(x=dispersion_stats['iter'], y=dispersion_stats['dispersion'],
             color='cornflowerblue', size=15)

# One vertical segment per iteration spanning mean - SEM to mean + SEM.
err_xs = [(x, x) for x in dispersion_stats['iter']]
err_ys = [
    (y - yerr, y + yerr)
    for y, yerr in zip(dispersion_stats['dispersion'], dispersion_stats['dispersion_sem'])
]
plot.multi_line(err_xs, err_ys, color='cornflowerblue', line_width=6)

# Same label styling on both axes.
for axis, label in ((plot.xaxis, 'Iteration'), (plot.yaxis, 'Log Group Dispersion')):
    axis.axis_label = label
    axis.axis_label_text_font_size = '32px'
    axis.major_label_text_font_size = '24px'

plot.xgrid.grid_line_color = None
plot.ygrid[0].ticker.desired_num_ticks = 4

output_file('group_dispersion.html')
show(plot)

# Line proportion

In [6]:
# Per-iteration mean and standard error of the values in subj_line_proportion.csv.
disp_fname = 'subj_line_proportion.csv'
line_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

# Collapse to one mean per (iter, disp) pair, then group those means by iteration.
line_by_display = (
    line_df[['iter', 'disp', 'value']]
    .groupby(['iter', 'disp'])
    .mean()
    .reset_index()
)
line_by_iter = line_by_display[['value', 'iter']].groupby('iter')

line_mean = line_by_iter.mean().reset_index()
line_std = line_by_iter.std()
line_num = line_by_iter.size()

# SEM = std / sqrt(group size); both operands are indexed by `iter`.
line_sem = (line_std['value'] / np.sqrt(line_num)).to_frame().reset_index()
line_sem.columns = ['iter', 'line_sem']

line_stats = line_mean.merge(line_sem, on='iter')
line_stats.columns = ['iter', 'line', 'line_sem']

In [7]:
# Mean line proportion per iteration with +/- 1 SEM bars, written to line_proportion.html.
plot = figure(width=1000, height=600)

# Mean trace plus a marker at every iteration.
plot.line(x=line_stats['iter'], y=line_stats['line'],
          color='cornflowerblue', line_width=6)
plot.scatter(x=line_stats['iter'], y=line_stats['line'],
             color='cornflowerblue', size=15)

# One vertical segment per iteration spanning mean - SEM to mean + SEM.
err_xs = [(x, x) for x in line_stats['iter']]
err_ys = [
    (y - yerr, y + yerr)
    for y, yerr in zip(line_stats['line'], line_stats['line_sem'])
]
plot.multi_line(err_xs, err_ys, color='cornflowerblue', line_width=6)

# Same label styling on both axes.
for axis, label in ((plot.xaxis, 'Iteration'), (plot.yaxis, 'Proportion of Line Groups')):
    axis.axis_label = label
    axis.axis_label_text_font_size = '32px'
    axis.major_label_text_font_size = '24px'

plot.xgrid.grid_line_color = None
plot.ygrid[0].ticker.desired_num_ticks = 3

output_file('line_proportion.html')
show(plot)

# Similarity comparison

In [33]:
# Per-iteration mean and standard error of the values in subj_len_sim.csv.
disp_fname = 'subj_len_sim.csv'
len_sim_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

# Collapse to one mean per (iter, disp) pair, then group those means by iteration.
len_sim_by_display = (
    len_sim_df[['iter', 'disp', 'value']]
    .groupby(['iter', 'disp'])
    .mean()
    .reset_index()
)
len_sim_by_iter = len_sim_by_display[['value', 'iter']].groupby('iter')

len_sim_mean = len_sim_by_iter.mean().reset_index()
len_sim_std = len_sim_by_iter.std()
len_sim_num = len_sim_by_iter.size()

# SEM = std / sqrt(group size); both operands are indexed by `iter`.
len_sim_sem = (len_sim_std['value'] / np.sqrt(len_sim_num)).to_frame().reset_index()
len_sim_sem.columns = ['iter', 'len_sim_sem']

len_sim_stats = len_sim_mean.merge(len_sim_sem, on='iter')
len_sim_stats.columns = ['iter', 'num_sim', 'num_sim_sem']

In [34]:
# Mean length-similarity measure per iteration with +/- 1 SEM bars, written to len_sim.html.
plot = figure(width=1000, height=600)

# Mean trace plus a marker at every iteration.
plot.line(x=len_sim_stats['iter'], y=len_sim_stats['num_sim'],
          color='cornflowerblue', line_width=6)
plot.scatter(x=len_sim_stats['iter'], y=len_sim_stats['num_sim'],
             color='cornflowerblue', size=15)

# One vertical segment per iteration spanning mean - SEM to mean + SEM.
err_xs = [(x, x) for x in len_sim_stats['iter']]
err_ys = [
    (y - yerr, y + yerr)
    for y, yerr in zip(len_sim_stats['num_sim'], len_sim_stats['num_sim_sem'])
]
plot.multi_line(err_xs, err_ys, color='cornflowerblue', line_width=6)

# Same label styling on both axes.
for axis, label in ((plot.xaxis, 'Iteration'), (plot.yaxis, 'Similar-Different Length Line Pairs')):
    axis.axis_label = label
    axis.axis_label_text_font_size = '32px'
    axis.major_label_text_font_size = '24px'

plot.xgrid.grid_line_color = None

output_file('len_sim.html')
show(plot)

In [10]:
# Per-iteration mean and standard error of the values in subj_ang_sim.csv.
disp_fname = 'subj_ang_sim.csv'
ang_sim_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

# Collapse to one mean per (iter, disp) pair, then group those means by iteration.
ang_sim_by_display = (
    ang_sim_df[['iter', 'disp', 'value']]
    .groupby(['iter', 'disp'])
    .mean()
    .reset_index()
)
ang_sim_by_iter = ang_sim_by_display[['value', 'iter']].groupby('iter')

ang_sim_mean = ang_sim_by_iter.mean().reset_index()
ang_sim_std = ang_sim_by_iter.std()
ang_sim_num = ang_sim_by_iter.size()

# SEM = std / sqrt(group size); both operands are indexed by `iter`.
ang_sim_sem = (ang_sim_std['value'] / np.sqrt(ang_sim_num)).to_frame().reset_index()
ang_sim_sem.columns = ['iter', 'ang_sim_sem']

ang_sim_stats = ang_sim_mean.merge(ang_sim_sem, on='iter')
ang_sim_stats.columns = ['iter', 'num_sim', 'num_sim_sem']

In [11]:
# Mean angle-similarity measure per iteration with +/- 1 SEM bars, written to ang_sim.html.
plot = figure(width=1000, height=600)

# Mean trace plus a marker at every iteration.
plot.line(x=ang_sim_stats['iter'], y=ang_sim_stats['num_sim'],
          color='cornflowerblue', line_width=6)
plot.scatter(x=ang_sim_stats['iter'], y=ang_sim_stats['num_sim'],
             color='cornflowerblue', size=15)

# One vertical segment per iteration spanning mean - SEM to mean + SEM.
err_xs = [(x, x) for x in ang_sim_stats['iter']]
err_ys = [
    (y - yerr, y + yerr)
    for y, yerr in zip(ang_sim_stats['num_sim'], ang_sim_stats['num_sim_sem'])
]
plot.multi_line(err_xs, err_ys, color='cornflowerblue', line_width=6)

# Same label styling on both axes.
for axis, label in ((plot.xaxis, 'Iteration'), (plot.yaxis, 'Similar-Different Angle Line Pairs')):
    axis.axis_label = label
    axis.axis_label_text_font_size = '32px'
    axis.major_label_text_font_size = '24px'

plot.xgrid.grid_line_color = None
plot.ygrid[0].ticker.desired_num_ticks = 4

output_file('ang_sim.html')
show(plot)

# Compare subjects to models

## Dispersion

In [12]:
# Datasets to load for the dispersion comparison. Each name is used both as the
# sub-folder and as the file-name prefix by the loading loop that follows.
data_names = ['subj', 'isotropic_v1']

In [13]:
# Build one per-iteration mean/SEM table for every dataset in `data_names`.
# Tables are appended to `model_dispersion_df` in the same order as `data_names`.
model_dispersion_df = []
for data in data_names:
    # The dataset name doubles as the sub-folder and the file-name prefix.
    type_fold = data
    disp_fname = data + '_dispersion.csv'
    # print() with a single argument behaves the same on Python 2 and Python 3;
    # the bare `print x` statement is a SyntaxError on Python 3.
    print(disp_fname)
    dispersion_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

    # Average within each (iter, disp) pair first, then group those means by iteration.
    disperion_by_display = dispersion_df.loc[:, ['iter', 'disp', 'value']].groupby(['iter', 'disp']).mean().reset_index()
    dispersion_by_iter = disperion_by_display.loc[:, ['value', 'iter']].groupby('iter')

    dispersion_mean = dispersion_by_iter.mean().reset_index()
    dispersion_std = dispersion_by_iter.std()
    dispersion_num = dispersion_by_iter.size()
    # Standard error of the mean: std / sqrt(group size).
    dispersion_sem = pd.DataFrame(np.divide(dispersion_std.value, np.sqrt(dispersion_num))).reset_index()
    dispersion_sem.columns = ['iter', 'dispersion_sem']
    dispersion_stats = pd.merge(dispersion_mean, dispersion_sem, on='iter')
    dispersion_stats.columns = ['iter', 'dispersion', 'dispersion_sem']
    model_dispersion_df.append(dispersion_stats)

subj_dispersion.csv
isotropic_v1_dispersion.csv


IOError: File figure_csv/isotropic_v1/isotropic_v1_dispersion.csv does not exist

In [14]:
# The first table is the participants' data; every table after it is a model.
subj_dispersion_data, mod_dispersion_data = model_dispersion_df[0], model_dispersion_df[1:]

In [15]:
# Scatter of participant vs. first-model dispersion with model SEM bars,
# written to subj_mod_dispersion.html.
curr_mod_data = mod_dispersion_data[0]

plot = figure(width=650, height=600,
              x_range=Range1d(5, 6.35), y_range=Range1d(5, 6.35))

# Dashed identity line across the fixed axis range.
plot.line(x=[5, 6.35], y=[5, 6.35], color='grey', line_width=6, line_dash=[20, 20])
plot.scatter(x=subj_dispersion_data['dispersion'], y=curr_mod_data['dispersion'],
             color='firebrick', size=12)

# Vertical model mean +/- SEM segment at each participant value.
err_segments = [
    ((x, x), (y - yerr, y + yerr))
    for x, y, yerr in zip(subj_dispersion_data['dispersion'],
                          curr_mod_data['dispersion'],
                          curr_mod_data['dispersion_sem'])
]
err_xs = [seg[0] for seg in err_segments]
err_ys = [seg[1] for seg in err_segments]
plot.multi_line(err_xs, err_ys, color='firebrick', line_width=6)

# Same label styling on both axes.
for axis, label in ((plot.xaxis, 'Participant Dispersion'), (plot.yaxis, 'Isotropic Dispersion')):
    axis.axis_label = label
    axis.axis_label_text_font_size = '32px'
    axis.major_label_text_font_size = '24px'

plot.xgrid.grid_line_color = None

output_file('subj_mod_dispersion.html')
show(plot)

## Line proportion

In [14]:
# Datasets loaded for the line-proportion comparison (participants first).
# The longer list that used to be assigned here and was immediately overwritten
# is kept for reference only:
#   ['subj', 'isotropic_v1', 'anisotropic_v1', 'line_v1', 'line_hierarchy_v1']
data_names = ['subj', 'line_v1', 'line_hierarchy_v1']

In [15]:
# Build one per-iteration mean/SEM table for every dataset in `data_names`.
# Tables are appended to `model_line_proportion_df` in the same order as `data_names`.
model_line_proportion_df = []
for data in data_names:
    # The dataset name doubles as the sub-folder and the file-name prefix.
    type_fold = data
    disp_fname = data + '_line_proportion.csv'
    # print() with a single argument behaves the same on Python 2 and Python 3;
    # the bare `print x` statement is a SyntaxError on Python 3.
    print(disp_fname)
    line_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

    # Average within each (iter, disp) pair first, then group those means by iteration.
    line_by_display = line_df.loc[:, ['iter', 'disp', 'value']].groupby(['iter', 'disp']).mean().reset_index()
    line_by_iter = line_by_display.loc[:, ['value', 'iter']].groupby('iter')

    line_mean = line_by_iter.mean().reset_index()
    line_std = line_by_iter.std()
    line_num = line_by_iter.size()
    # Standard error of the mean: std / sqrt(group size).
    line_sem = pd.DataFrame(np.divide(line_std.value, np.sqrt(line_num))).reset_index()
    line_sem.columns = ['iter', 'line_sem']
    line_stats = pd.merge(line_mean, line_sem, on='iter')
    line_stats.columns = ['iter', 'line', 'line_sem']
    model_line_proportion_df.append(line_stats)

subj_line_proportion.csv
line_v1_line_proportion.csv
line_hierarchy_v1_line_proportion.csv


In [16]:
# The first table is the participants' data; every table after it is a model.
subj_line_data, mod_line_data = model_line_proportion_df[0], model_line_proportion_df[1:]

# Display names paired positionally with `mod_line_data`.
# (labels for a four-model run: ['Isotropic','Anisotropic','Line','Line Hierarchy'])
mod_names = ['Line', 'Line Hierarchy']

In [17]:
# One participant-vs-model scatter per model, laid out in a single row and
# written to subj_mod_line.html.
line_plots = []
for curr_mod_data, curr_mod_name in zip(mod_line_data, mod_names):
    plot = figure(width=650, height=600,
                  x_range=Range1d(0, .2), y_range=Range1d(0, .2))

    # Dashed identity line across the fixed axis range.
    plot.line(x=[0, .2], y=[0, .2], color='grey', line_width=6, line_dash=[20, 20])
    plot.scatter(x=subj_line_data['line'], y=curr_mod_data['line'],
                 color='firebrick', size=12)

    # Vertical model mean +/- SEM segment at each participant value.
    err_segments = [
        ((x, x), (y - yerr, y + yerr))
        for x, y, yerr in zip(subj_line_data['line'],
                              curr_mod_data['line'],
                              curr_mod_data['line_sem'])
    ]
    err_xs = [seg[0] for seg in err_segments]
    err_ys = [seg[1] for seg in err_segments]
    plot.multi_line(err_xs, err_ys, color='firebrick', line_width=6)

    # Same label styling on both axes.
    for axis, label in ((plot.xaxis, 'Participant Line Proportion'),
                        (plot.yaxis, curr_mod_name+' Model Line Proportion')):
        axis.axis_label = label
        axis.axis_label_text_font_size = '32px'
        axis.major_label_text_font_size = '24px'

    plot.xgrid.grid_line_color = None
    line_plots.append(plot)

grid_line_plots = gridplot([line_plots])
output_file('subj_mod_line.html')
show(grid_line_plots)

In [20]:
# Display the list of figures currently held in `line_plots`.
# NOTE(review): the saved output lists four figures, while `mod_names` in the
# cells above has two entries; the output appears to come from an earlier run.
# Confirm, then re-run or drop this cell.
line_plots

[<bokeh.plotting.figure.Figure at 0x117716d10>,
 <bokeh.plotting.figure.Figure at 0x117712650>,
 <bokeh.plotting.figure.Figure at 0x117712d10>,
 <bokeh.plotting.figure.Figure at 0x11772a610>]

## Line length similarity

In [18]:
# Datasets loaded for the length-similarity comparison (participants first).
data_names = ['subj', 'line_v1', 'line_hierarchy_v1']

In [19]:
# Build one per-iteration mean/SEM table for every dataset in `data_names`.
# Tables are appended to `model_len_sim_df` in the same order as `data_names`.
model_len_sim_df = []
for data in data_names:
    # The dataset name doubles as the sub-folder and the file-name prefix.
    type_fold = data
    disp_fname = data + '_len_sim.csv'
    # print() with a single argument behaves the same on Python 2 and Python 3;
    # the bare `print x` statement is a SyntaxError on Python 3.
    print(disp_fname)
    len_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

    # Average within each (iter, disp) pair first, then group those means by iteration.
    len_by_display = len_df.loc[:, ['iter', 'disp', 'value']].groupby(['iter', 'disp']).mean().reset_index()
    len_by_iter = len_by_display.loc[:, ['value', 'iter']].groupby('iter')

    len_mean = len_by_iter.mean().reset_index()
    len_std = len_by_iter.std()
    len_num = len_by_iter.size()
    # Standard error of the mean: std / sqrt(group size).
    len_sem = pd.DataFrame(np.divide(len_std.value, np.sqrt(len_num))).reset_index()
    len_sem.columns = ['iter', 'len_sem']
    len_stats = pd.merge(len_mean, len_sem, on='iter')
    len_stats.columns = ['iter', 'len_sim', 'len_sim_sem']
    model_len_sim_df.append(len_stats)

subj_len_sim.csv
line_v1_len_sim.csv
line_hierarchy_v1_len_sim.csv


In [20]:
# The first table is the participants' data; every table after it is a model.
subj_len_sim_data, mod_len_sim_data = model_len_sim_df[0], model_len_sim_df[1:]

# Display names paired positionally with `mod_len_sim_data`.
mod_names = ['Line', 'Hierarchical']

In [21]:
# One participant-vs-model length-similarity scatter per model, laid out in a
# single row and written to subj_mod_len_sim.html.
line_plots = []
for curr_mod_data, curr_mod_name in zip(mod_len_sim_data, mod_names):

    # No fixed x_range/y_range here: the axes are auto-ranged from the data.
    plot = figure(width=650, height=600)

    subj_vals = subj_len_sim_data.len_sim
    mod_vals = curr_mod_data.len_sim
    mod_lo = mod_vals - curr_mod_data.len_sim_sem
    mod_hi = mod_vals + curr_mod_data.len_sim_sem

    # Span the identity line over everything that is drawn (points and error
    # bars) instead of the 0-0.2 limits used by the line-proportion figure.
    # nanmin/nanmax ignore SEM values that are NaN.
    ident_lo = np.nanmin([subj_vals.min(), mod_vals.min(), mod_lo.min()])
    ident_hi = np.nanmax([subj_vals.max(), mod_vals.max(), mod_hi.max()])
    plot.line(x=[ident_lo, ident_hi], y=[ident_lo, ident_hi],
              color='grey', line_width=6, line_dash=[20, 20])
    plot.scatter(x=subj_vals, y=mod_vals, color='firebrick', size=12)

    # Vertical model mean +/- SEM segment at each participant value.
    err_xs = []
    err_ys = []
    for x, lo, hi in zip(subj_vals, mod_lo, mod_hi):
        err_xs.append((x, x))
        err_ys.append((lo, hi))

    plot.multi_line(err_xs, err_ys, color='firebrick', line_width=6)

    plot.xaxis.axis_label = 'Participant Length Similarity'
    plot.xaxis.axis_label_text_font_size = '32px'
    plot.xaxis.major_label_text_font_size = '24px'

    plot.yaxis.axis_label = curr_mod_name + ' Model Length Similarity'
    plot.yaxis.axis_label_text_font_size = '32px'
    plot.yaxis.major_label_text_font_size = '24px'

    plot.xgrid.grid_line_color = None
    line_plots.append(plot)

grid_line_plots = gridplot([line_plots])
output_file('subj_mod_len_sim.html')
show(grid_line_plots)

## Line angle similarity

In [22]:
# Datasets loaded for the angle-similarity comparison (participants first).
data_names = ['subj', 'line_v1', 'line_hierarchy_v1']

In [23]:
# Build one per-iteration mean/SEM table for every dataset in `data_names`.
# Tables are appended to `model_ang_sim_df` in the same order as `data_names`.
model_ang_sim_df = []
for data in data_names:
    # The dataset name doubles as the sub-folder and the file-name prefix.
    type_fold = data
    disp_fname = data + '_ang_sim.csv'
    # print() with a single argument behaves the same on Python 2 and Python 3;
    # the bare `print x` statement is a SyntaxError on Python 3.
    print(disp_fname)
    ang_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

    # Average within each (iter, disp) pair first, then group those means by iteration.
    ang_by_display = ang_df.loc[:, ['iter', 'disp', 'value']].groupby(['iter', 'disp']).mean().reset_index()
    ang_by_iter = ang_by_display.loc[:, ['value', 'iter']].groupby('iter')

    ang_mean = ang_by_iter.mean().reset_index()
    ang_std = ang_by_iter.std()
    ang_num = ang_by_iter.size()
    # Standard error of the mean: std / sqrt(group size).
    ang_sem = pd.DataFrame(np.divide(ang_std.value, np.sqrt(ang_num))).reset_index()
    ang_sem.columns = ['iter', 'ang_sem']
    ang_stats = pd.merge(ang_mean, ang_sem, on='iter')
    ang_stats.columns = ['iter', 'ang_sim', 'ang_sim_sem']
    model_ang_sim_df.append(ang_stats)

subj_ang_sim.csv
line_v1_ang_sim.csv
line_hierarchy_v1_ang_sim.csv


In [24]:
# The first table is the participants' data; every table after it is a model.
subj_ang_sim_data, mod_ang_sim_data = model_ang_sim_df[0], model_ang_sim_df[1:]

# Display names paired positionally with `mod_ang_sim_data`.
mod_names = ['Line', 'Hierarchical']

In [25]:
# One participant-vs-model angle-similarity scatter per model, laid out in a
# single row and written to subj_mod_ang_sim.html.
line_plots = []
for curr_mod_data, curr_mod_name in zip(mod_ang_sim_data, mod_names):

    # No fixed x_range/y_range here: the axes are auto-ranged from the data.
    plot = figure(width=650, height=600)

    subj_vals = subj_ang_sim_data.ang_sim
    mod_vals = curr_mod_data.ang_sim
    mod_lo = mod_vals - curr_mod_data.ang_sim_sem
    mod_hi = mod_vals + curr_mod_data.ang_sim_sem

    # Span the identity line over everything that is drawn (points and error
    # bars) instead of the 0-0.2 limits used by the line-proportion figure.
    # nanmin/nanmax ignore SEM values that are NaN.
    ident_lo = np.nanmin([subj_vals.min(), mod_vals.min(), mod_lo.min()])
    ident_hi = np.nanmax([subj_vals.max(), mod_vals.max(), mod_hi.max()])
    plot.line(x=[ident_lo, ident_hi], y=[ident_lo, ident_hi],
              color='grey', line_width=6, line_dash=[20, 20])
    plot.scatter(x=subj_vals, y=mod_vals, color='firebrick', size=12)

    # Vertical model mean +/- SEM segment at each participant value.
    err_xs = []
    err_ys = []
    for x, lo, hi in zip(subj_vals, mod_lo, mod_hi):
        err_xs.append((x, x))
        err_ys.append((lo, hi))

    plot.multi_line(err_xs, err_ys, color='firebrick', line_width=6)

    plot.xaxis.axis_label = 'Participant Angle Similarity'
    plot.xaxis.axis_label_text_font_size = '32px'
    plot.xaxis.major_label_text_font_size = '24px'

    plot.yaxis.axis_label = curr_mod_name + ' Model Angle Similarity'
    plot.yaxis.axis_label_text_font_size = '32px'
    plot.yaxis.major_label_text_font_size = '24px'

    plot.xgrid.grid_line_color = None
    line_plots.append(plot)

grid_line_plots = gridplot([line_plots])
output_file('subj_mod_ang_sim.html')
show(grid_line_plots)

# Proportion-based analysis

In [26]:
# Datasets loaded for the proportion-based analysis (participants first).
# The longer list that used to be assigned here and was immediately overwritten
# is kept for reference only:
#   ['subj', 'line_v1', 'line_hierarchy_v1']
data_names = ['subj', 'line_hierarchy_v1']

In [27]:
# Build one per-iteration mean/SEM table from each dataset's *_ang_sim_prop.csv.
# Tables are appended to `model_ang_sim_df` in the same order as `data_names`.
# After the loop `ang_df` still holds the last file read; the cells that follow
# operate on that frame.
model_ang_sim_df = []
for data in data_names:
    # The dataset name doubles as the sub-folder and the file-name prefix.
    type_fold = data
    disp_fname = data + '_ang_sim_prop.csv'
    # print() with a single argument behaves the same on Python 2 and Python 3;
    # the bare `print x` statement is a SyntaxError on Python 3.
    print(disp_fname)
    ang_df = pd.read_csv(os.path.join(data_fold, type_fold, disp_fname))

    # Average within each (iter, disp) pair first, then group those means by iteration.
    ang_by_display = ang_df.loc[:, ['iter', 'disp', 'value']].groupby(['iter', 'disp']).mean().reset_index()
    ang_by_iter = ang_by_display.loc[:, ['value', 'iter']].groupby('iter')

    ang_mean = ang_by_iter.mean().reset_index()
    ang_std = ang_by_iter.std()
    ang_num = ang_by_iter.size()
    # Standard error of the mean: std / sqrt(group size).
    ang_sem = pd.DataFrame(np.divide(ang_std.value, np.sqrt(ang_num))).reset_index()
    ang_sem.columns = ['iter', 'ang_sem']
    ang_stats = pd.merge(ang_mean, ang_sem, on='iter')
    ang_stats.columns = ['iter', 'ang_sim', 'ang_sim_sem']
    model_ang_sim_df.append(ang_stats)

subj_ang_sim_prop.csv
line_hierarchy_v1_ang_sim_prop.csv


In [28]:
# Share of each iteration's summed `value` that falls in categories <= 2.

# Total of `value` per iteration.
num_pairs = ang_df[['iter', 'value']].groupby(['iter']).sum().reset_index()

# Flag rows whose category is 1 or 2 (adds/overwrites the `len_sim` column on ang_df).
ang_df['len_sim'] = ang_df['cat'] <= 2

# Summed `value` per (flag, iteration).
sim_ang = (
    ang_df[['value', 'len_sim', 'iter']]
    .groupby(['len_sim', 'iter'])
    .sum()
    .reset_index()
)

# Attach the per-iteration total (value_x = subgroup sum, value_y = total) and normalise.
sim_ang2 = sim_ang.merge(num_pairs, on='iter')
sim_ang2['sim_prop'] = sim_ang2['value_x'] / sim_ang2['value_y']

# Show only the flagged rows.
sim_ang2[sim_ang2['len_sim']]

Unnamed: 0,len_sim,iter,value_x,value_y,sim_prop
1,True,1,4,8,0.5
3,True,2,6,12,0.5
5,True,3,8,18,0.444444
7,True,4,8,18,0.444444
9,True,5,10,20,0.5
11,True,6,14,28,0.5
13,True,7,18,38,0.473684
15,True,8,26,48,0.541667
17,True,9,26,46,0.565217
19,True,10,24,48,0.5


In [29]:
# NOTE(review): this cell repeats the preceding cell's computation verbatim;
# confirm whether it was meant to run on a different dataset.

# Total of `value` per iteration.
num_pairs = ang_df[['iter', 'value']].groupby(['iter']).sum().reset_index()

# Flag rows whose category is 1 or 2 (adds/overwrites the `len_sim` column on ang_df).
ang_df['len_sim'] = ang_df['cat'] <= 2

# Summed `value` per (flag, iteration).
sim_ang = (
    ang_df[['value', 'len_sim', 'iter']]
    .groupby(['len_sim', 'iter'])
    .sum()
    .reset_index()
)

# Attach the per-iteration total (value_x = subgroup sum, value_y = total) and normalise.
sim_ang2 = sim_ang.merge(num_pairs, on='iter')
sim_ang2['sim_prop'] = sim_ang2['value_x'] / sim_ang2['value_y']

# Show only the flagged rows.
sim_ang2[sim_ang2['len_sim']]

Unnamed: 0,len_sim,iter,value_x,value_y,sim_prop
1,True,1,4,8,0.5
3,True,2,6,12,0.5
5,True,3,8,18,0.444444
7,True,4,8,18,0.444444
9,True,5,10,20,0.5
11,True,6,14,28,0.5
13,True,7,18,38,0.473684
15,True,8,26,48,0.541667
17,True,9,26,46,0.565217
19,True,10,24,48,0.5


In [27]:
# Display the per-iteration totals held in `num_pairs`.
# NOTE(review): the saved output's totals differ from the `value_y` column shown
# by the cells above, so it appears to come from a different run. Confirm.
num_pairs

Unnamed: 0,iter,value
0,1,4
1,2,4
2,3,6
3,4,4
4,5,2
5,6,0
6,7,4
7,8,8
8,9,16
9,10,14


In [67]:
# Peek at the first rows of `ang_df`.
# NOTE(review): the saved output shows a `cat_sim` column, whereas the cells
# above add a column named `len_sim`; the output appears to predate the
# current code. Confirm.
ang_df.head()

Unnamed: 0,disp,iter,cat,value,cat_sim
0,1,1,1,0,True
1,1,1,2,0,True
2,1,1,3,0,False
3,1,1,4,2,False
4,1,2,1,0,True
