## Data visualization (and statistics)

- create mock data for the experiments 
- make visualizations
    - bar graph
(- statistical test )

In [6]:
# imports

# data manipulation and processing
import pandas as pd
import numpy as np

# visualizations
from bokeh.models import FactorRange, Legend
from bokeh.palettes import Colorblind, ColumnDataSource
from bokeh.plotting import figure, show, output_notebook

output_notebook()


### create mock data fingerprint experiment

In [7]:
rows = ['1', '2', '3', '4', '5', '6']
columns = ['HS1-C1', 'HS1-C2', 'HS1-C3', 'HS1-1', 'HS1-2', 'HS1-3', 'HS2-C1', 'HS2-C2', 'HS2-C3', 'HS2-1', 'HS2-2', 'HS2-3']

df = pd.DataFrame(np.random.randint(0, 150, size=(6, 12)), columns=columns, index=rows)

In [8]:
df

Unnamed: 0,HS1-C1,HS1-C2,HS1-C3,HS1-1,HS1-2,HS1-3,HS2-C1,HS2-C2,HS2-C3,HS2-1,HS2-2,HS2-3
1,147,68,129,50,17,146,103,70,129,116,22,62
2,64,122,60,129,58,108,105,108,64,131,91,119
3,108,48,92,2,55,67,5,32,22,31,24,76
4,5,129,65,114,49,21,35,106,145,19,80,144
5,147,38,142,85,133,137,6,39,115,15,82,34
6,107,93,124,35,60,43,118,96,103,128,121,58


In [9]:
# get the mean for each triplicate
df['HS1_S'] = df[df.columns[df.columns.str.contains('HS1-\d+')]].mean(axis=1)
df['HS1_C'] = df[df.columns[df.columns.str.contains('HS1-C')]].mean(axis=1)
df['HS2_S'] = df[df.columns[df.columns.str.contains('HS2-\d+')]].mean(axis=1)
df['HS2_C'] = df[df.columns[df.columns.str.contains('HS2-C')]].mean(axis=1) 

df_res = df.iloc[:, -4:].T
df_res['count'] = df_res[df_res.columns].mean(axis=1)

df_res

Unnamed: 0,1,2,3,4,5,6,count
HS1_S,71.0,98.333333,41.333333,61.333333,118.333333,46.0,72.722222
HS1_C,114.666667,82.0,82.666667,66.333333,109.0,108.0,93.777778
HS2_S,66.666667,113.666667,43.666667,81.0,43.666667,102.333333,75.166667
HS2_C,100.666667,92.333333,19.666667,95.333333,53.333333,105.666667,77.833333


In [10]:
df_res.reset_index(inplace=True)
df_res[['HS', 'sample_type']] = df_res['index'].str.split('_', n=1, expand=True)
df_res.drop(['index'], axis=1, inplace=True)

## visualization

In [11]:
df_res['sample_type'] = df_res['sample_type'].map({'S': 'treated', 'C': 'control'})
df_res['x'] = df_res[['HS', 'sample_type']].apply(lambda x: (x[0],str(x[1])), axis=1)
df_res

  df_res['x'] = df_res[['HS', 'sample_type']].apply(lambda x: (x[0],str(x[1])), axis=1)


Unnamed: 0,1,2,3,4,5,6,count,HS,sample_type,x
0,71.0,98.333333,41.333333,61.333333,118.333333,46.0,72.722222,HS1,treated,"(HS1, treated)"
1,114.666667,82.0,82.666667,66.333333,109.0,108.0,93.777778,HS1,control,"(HS1, control)"
2,66.666667,113.666667,43.666667,81.0,43.666667,102.333333,75.166667,HS2,treated,"(HS2, treated)"
3,100.666667,92.333333,19.666667,95.333333,53.333333,105.666667,77.833333,HS2,control,"(HS2, control)"


In [13]:
# nested bar graph: https://stackoverflow.com/questions/67901133/create-nested-bar-graph-in-bokeh-from-a-dataframe
# legend append: https://stackoverflow.com/questions/46730609/position-the-legend-outside-the-plot-area-with-bokeh

p = figure(
    x_range=FactorRange(*list(df_res["x"].unique())),
    width=500
)

factors = df_res['sample_type'].unique()

# Manually specify colors for each factor
colors = Colorblind[3][:len(factors)]

legend_items = []
for i, factor in enumerate(factors):
    source = df_res[df_res['sample_type'] == factor]
    vbar = p.vbar(x='x', top='count', width=0.9, source=source,
                  color=colors[i])
    legend_items.append((factor, [vbar]))

p.y_range.start = 0
p.y_range.end = df_res['count'].max() * 1.2
p.x_range.range_padding = 0.25

p.title = "Number of colonies per handsanitizer"
p.title.text_font_size = '15px'
p.yaxis.axis_label = "Number of colonies"
p.xaxis.axis_label = "Hand Sanitizers"
p.xgrid.grid_line_color = None

# Create a legend
legend = Legend(items=legend_items, location="top_center")
legend.label_text_font_size = "12px"
legend.spacing = 5
legend.click_policy = "hide"  

p.add_layout(legend, 'below')

# Show the plot
show(p)
