In [172]:
import pandas as pd
import numpy as np
import os

from bokeh.plotting import figure, show, output_file
import seaborn as sns

In [7]:
# Directory (relative to the notebook) that holds the CSV inputs read by the cells below.
data_fold = 'figure_csv'

# Cluster dispersion

In [89]:
# Load the dispersion measurements and reduce them to per-iteration mean and SEM.
disp_fname = 'dispersion_example.csv'
dispersion_df = pd.read_csv(os.path.join(data_fold, disp_fname))

# First average within every (iter, disp) pair so each display contributes
# a single value per iteration. (Variable name spelling kept as-is.)
disperion_by_display = (
    dispersion_df[['iter', 'disp', 'dispersion']]
    .groupby(['iter', 'disp'])
    .mean()
    .reset_index()
)

# Then group those per-display averages by iteration.
dispersion_by_iter = disperion_by_display[['dispersion', 'iter']].groupby('iter')

dispersion_mean = dispersion_by_iter.mean().reset_index()
dispersion_std = dispersion_by_iter.std()
dispersion_num = dispersion_by_iter.size()

# Standard error of the mean: std / sqrt(group size), indexed by iteration.
dispersion_sem = pd.DataFrame(
    dispersion_std['dispersion'] / np.sqrt(dispersion_num)
).reset_index()
dispersion_sem.columns = ['iter', 'dispersion_sem']

# One row per iteration with columns: iter, dispersion (mean), dispersion_sem.
dispersion_stats = dispersion_mean.merge(dispersion_sem, on='iter')

In [86]:
# Line plot of the mean dispersion per iteration with +/- one SEM error bars,
# written to group_dispersion.html.
plot = figure(width=1000, height=600)

plot.line(
    x=dispersion_stats.iter,
    y=dispersion_stats.dispersion,
    color='#666699',
    line_width=4,
)

# Each error bar is a vertical segment from (x, mean - sem) to (x, mean + sem).
err_xs = [(x, x) for x in dispersion_stats.iter]
err_ys = [
    (y - yerr, y + yerr)
    for y, yerr in zip(dispersion_stats.dispersion, dispersion_stats.dispersion_sem)
]
plot.multi_line(err_xs, err_ys, color='#666699', line_width=4)

# Identical font sizing on both axes; only the label text differs.
for axis, label in ((plot.xaxis, 'Iteration'), (plot.yaxis, 'Log Group Dispersion')):
    axis.axis_label = label
    axis.axis_label_text_font_size = '32px'
    axis.major_label_text_font_size = '24px'

# Hide the vertical grid lines.
plot.xgrid.grid_line_color = None

output_file('group_dispersion.html')
show(plot)

# Stack plot

In [237]:
from bokeh.charts import Area
from bokeh.models import Range1d
from collections import OrderedDict

In [168]:
# Load the table used for the stacked-area figure.
line_fname = 'line_example.csv'
line_df = pd.read_csv(os.path.join(data_fold, line_fname))

# Keep every column except the first one; the remaining columns are the
# series that get stacked (presumably the first column is 'iter' - verify against the CSV).
line_df2 = line_df.iloc[:, 1:]

In [249]:
def rgb_to_hex(rgb):
    """Convert an (r, g, b) triple on the 0-255 scale to a '#rrggbb' string.

    Channels may be ints or floats (e.g. ``255 * c`` for a 0-1 colour
    component). Each channel is rounded to the nearest integer and clamped
    to 0..255, because the ``%x`` format rejects floats on Python 3 and an
    out-of-range value would otherwise yield a malformed hex colour.

    Parameters
    ----------
    rgb : iterable of three numbers
        Red, green and blue components.

    Returns
    -------
    str
        Lower-case hex colour such as ``'#ff0080'``.

    Raises
    ------
    TypeError
        If ``rgb`` does not contain exactly three components (raised by the
        string formatting).
    """
    channels = tuple(min(255, max(0, int(round(c)))) for c in rgb)
    return '#%02x%02x%02x' % channels
# Build the stack-plot palette from seaborn's "RdBu" colours:
# scale each 0-1 channel to 0-255, convert to hex, reverse the order,
# keep the first four entries and put 'darkred' in front of them.
temp = [[255 * channel for channel in rgb] for rgb in sns.color_palette("RdBu")]
cols = [rgb_to_hex(rgb) for rgb in temp][::-1][:4]
cols = ['darkred'] + cols

In [251]:
def stacked(df, categories):
    """Build closed-polygon y-coordinates for a stacked area plot.

    For each category, in the order given, the values of ``df[cat]`` are
    added on top of the running total of the previous categories. The
    polygon for a category is the previous total reversed, followed by the
    new total, so it pairs with x-coordinates laid out as
    ``hstack((x[::-1], x))``.

    Parameters
    ----------
    df : mapping of column name -> 1-D numeric sequence (e.g. a DataFrame)
        Source of the per-category values; all columns must share a length.
    categories : sequence of column names
        Columns to stack, bottom-most first. May be empty.

    Returns
    -------
    dict
        Maps each category to a 1-D numpy array of length ``2 * n`` where
        ``n`` is the column length. Empty when ``categories`` is empty.
    """
    areas = dict()

    # Nothing to stack: avoid indexing categories[0] on an empty sequence.
    # len() is used because truth-testing a pandas Index raises.
    if len(categories) == 0:
        return areas

    lower = np.zeros(len(df[categories[0]]))
    for cat in categories:
        # Plain numpy arithmetic keeps the running total positional.
        upper = lower + np.asarray(df[cat])
        areas[cat] = np.hstack((lower[::-1], upper))
        lower = upper
    return areas

# Stack every column of line_df2 into cumulative polygons.
categories = line_df2.columns
areas = stacked(line_df2, categories)

colors = cols  # ['indianred','dodgerblue','orange','teal','red']

# x-coordinates for the closed polygons: iterations reversed, then forward,
# matching the (reversed lower edge, upper edge) layout produced by stacked().
iterations = line_df['iter']
x2 = np.hstack((iterations[::-1], iterations))

plot = figure(
    width=1000,
    height=600,
    x_range=Range1d(0, 5),
    y_range=Range1d(0, 1),
)

# Categories are drawn in sorted-name order; colors are assigned by that position.
for a, area in enumerate(sorted(areas)):
    plot.patch(x2, areas[area], color=colors[a], legend=area)

# bells and whistles
for grid in (plot.xgrid, plot.ygrid):
    grid.grid_line_color = None

# Identical font sizing on both axes; only the label text differs.
for axis, label in ((plot.xaxis, 'Iteration'), (plot.yaxis, 'Proportion')):
    axis.axis_label = label
    axis.axis_label_text_font_size = '32px'
    axis.major_label_text_font_size = '24px'

output_file("line_plot.html")
show(plot)

line
q1
q2
q3
q4


# Similarity comparison

In [220]:
# Load the similarity measurements and list the distinct 'split' values
# (sorted, as returned by np.unique) that the plot below iterates over.
similar_fname = 'similarity_example.csv'
similarity_all_df = pd.read_csv(os.path.join(data_fold, similar_fname))

split_cats = np.unique(similarity_all_df['split'])

In [225]:
# One line per 'split' value: mean of 'num' per iteration with +/- one SEM
# error bars, written to similarity_analysis.html.
plot = figure(width=1000, height=600)
color = ['indianred', 'dodgerblue']

# Plot lines
# NOTE: zip() stops at the shorter input, so only the first len(color)
# split categories are drawn.
for i, col in zip(split_cats, color):
    similar_df = similarity_all_df.loc[similarity_all_df.loc[:, 'split'] == i]

    # Average within every (iter, disp) pair, then group those averages by iteration.
    similar_by_display = (
        similar_df.loc[:, ['iter', 'disp', 'num']]
        .groupby(['iter', 'disp'])
        .mean()
        .reset_index()
    )
    similar_by_iter = similar_by_display.loc[:, ['num', 'iter']].groupby('iter')

    similar_mean = similar_by_iter.mean().reset_index()
    similar_std = similar_by_iter.std()
    similar_num = similar_by_iter.size()

    # Standard error of the mean: std / sqrt(group size), indexed by iteration.
    similar_sem = pd.DataFrame(np.divide(similar_std.num, np.sqrt(similar_num))).reset_index()
    similar_sem.columns = ['iter', 'num_sem']
    similar_stats = pd.merge(similar_mean, similar_sem, on='iter')

    # x comes from this split's own statistics. The previous version used
    # dispersion_stats.iter, which tied this cell to the dispersion cell and
    # could mismatch the length of y.
    plot.line(x=similar_stats.iter, y=similar_stats.num, color=col, line_width=4, legend=str(i))

    # Each error bar is a vertical segment from (x, mean - sem) to (x, mean + sem).
    err_xs = []
    err_ys = []

    for x, y, yerr in zip(similar_stats.iter, similar_stats.num, similar_stats.num_sem):
        err_xs.append((x, x))
        err_ys.append((y - yerr, y + yerr))

    plot.multi_line(err_xs, err_ys, color=col, line_width=4)


plot.xaxis.axis_label = 'Iteration'
plot.xaxis.axis_label_text_font_size = '32px'
plot.xaxis.major_label_text_font_size = '24px'

plot.yaxis.axis_label = 'Number in groups'
plot.yaxis.axis_label_text_font_size = '32px'
plot.yaxis.major_label_text_font_size = '24px'

# Hide the vertical grid lines.
plot.xgrid.grid_line_color = None

output_file('similarity_analysis.html')
show(plot)