## Data visualization (and statistics)

Visualization and analysis for the fingerprint experiment (hand sanitizer project DSPH). 

Improvements:
- add error bars
- create a function for the bar graph code
- create a function that incorporates the normality tests and the significance testing


In [None]:
# imports

# data manipulation and processing
import pandas as pd

# visualizations
from bokeh.models import FactorRange, Legend, ColumnDataSource, Whisker, LegendItem
from bokeh.palettes import Colorblind
from bokeh.plotting import figure, show, output_notebook, output_file

# statistics
from scipy.stats import ttest_ind, mannwhitneyu, anderson, shapiro

# Direct bokeh output to the file 'results.html' (used when a plot is shown).
output_file('results.html')


### import data fingerprint experiment

In [None]:
# Read the spreadsheet, index it by the 'experiment' column and flip it so
# that every experiment label becomes a column.
df = (
    pd.read_excel('fingerprinting.xlsx')
    .set_index('experiment')
    .transpose()
)
df

In [None]:
# Get the mean for each triplicate.
# For every hand sanitizer '<HS>' two mean columns are added to df:
#   '<HS>_S' - mean of the columns whose label contains '<HS>-<digits>'
#   '<HS>_C' - mean of the columns whose label contains '<HS>-C<digits>'
sanitizers = ['HS1']  # extend with 'HS2', ... when more sanitizers are measured

mean_columns = []
for hs in sanitizers:
    # Raw strings keep '\d' a regex token instead of an invalid string escape.
    # Both means are computed before either column is added, so the boolean
    # masks always match the current number of columns.
    sample_mean = df.loc[:, df.columns.str.contains(rf'{hs}-\d+', na=False)].mean(axis=1)
    control_mean = df.loc[:, df.columns.str.contains(rf'{hs}-C\d+', na=False)].mean(axis=1)
    df[f'{hs}_S'] = sample_mean
    df[f'{hs}_C'] = control_mean
    mean_columns += [f'{hs}_S', f'{hs}_C']

# Pick the mean columns by name instead of by position, so nothing has to be
# adjusted by hand when the number of sanitizers changes.
df_res = df[mean_columns].T
df_res['count'] = df_res.mean(axis=1)

df_res



In [None]:
# Turn the 'experiment' index into a column and split each label once at '_'
# into the sanitizer id ('HS') and the sample-type code ('sample_type').
df_res = df_res.reset_index()
label_parts = df_res['experiment'].str.split('_', n=1, expand=True)
df_res['HS'] = label_parts[0]
df_res['sample_type'] = label_parts[1]
df_res = df_res.drop(columns='experiment')


In [None]:
# Inspect df_res after splitting the experiment label into 'HS' and 'sample_type'.
df_res

In [None]:
# Standard deviation over the replicate columns only.
# The replicate columns are selected by name (everything that is not a
# summary or label column) instead of a hard-coded positional slice, so the
# result neither drops a replicate nor includes the 'count' mean when the
# number of replicates differs from the hard-coded value.
non_replicate_cols = ['count', 'HS', 'sample_type', 'std_dev', 'x']
replicate_values = df_res.drop(columns=non_replicate_cols, errors='ignore')
df_res['std_dev'] = replicate_values.std(axis=1)


## visualization

In [None]:
# Replace the one-letter sample-type codes with readable labels.
sample_type_labels = {'S': 'treated', 'C': 'control'}
df_res['sample_type'] = df_res['sample_type'].map(sample_type_labels)

# Build the (sanitizer, sample type) tuple per row; it is later passed to
# nested_bar as the x-axis column.
df_res['x'] = [
    (hs, str(sample_type))
    for hs, sample_type in zip(df_res['HS'], df_res['sample_type'])
]

In [None]:
# Inspect df_res with the readable 'sample_type' labels and the 'x' tuple column.
df_res

In [None]:
# nested bar graph: https://stackoverflow.com/questions/67901133/create-nested-bar-graph-in-bokeh-from-a-dataframe
# legend append: https://stackoverflow.com/questions/46730609/position-the-legend-outside-the-plot-area-with-bokeh

def nested_bar(df, sample_col, value_col, tuple_experiment):
    '''
    Create a nested bar graph using bokeh and display it with ``show``.

    One series of bars is drawn per unique value in ``sample_col``; each
    series gets its own colour and its own (clickable, hide-on-click) legend
    entry. The legend is placed below the plot. The dataframe passed in is
    not modified.

    Args:
        df                pandas dataframe containing sample ID and column with mean counts.
        sample_col        column containing experiment type (e.g. treated and control) (str)
        value_col         column with the count data (str)
        tuple_experiment  column containing a tuple with (experiment number, sample type) e.g. (HS1, treated) (str)

    Returns:
        None. The figure is shown as a side effect.

    TODO:
        - add possibility to change title and axis names when calling function
        - add checks to make sure the arguments are of the correct type
        - add error bars
    '''

    # Column labels are converted to str (presumably because bokeh's
    # ColumnDataSource expects string column names). Work on a copy so the
    # caller's dataframe keeps its original column labels.
    df = df.copy()
    df.columns = df.columns.astype(str)

    p = figure(
        x_range=FactorRange(*list(df[tuple_experiment].unique())),
        width=500
    )

    factors = df[sample_col].unique()

    # The Colorblind palettes come in sizes 3 to 8. Use the smallest one that
    # fits (size 3 for up to three factors, as before) and wrap around when
    # there are more factors than colours instead of raising IndexError.
    palette = Colorblind[min(max(len(factors), 3), 8)]

    legend_items = []

    for i, factor in enumerate(factors):
        # One data source / renderer per factor so each can be toggled via the legend
        source = ColumnDataSource(df[df[sample_col] == factor])

        vbar = p.vbar(x=tuple_experiment, top=value_col, width=0.9, source=source,
                      color=palette[i % len(palette)])
        legend_items.append((factor, [vbar]))

    # Start the y-axis at zero and leave 20 % headroom above the tallest bar
    p.y_range.start = 0
    p.y_range.end = df[value_col].max() * 1.2
    p.x_range.range_padding = 0.25

    p.title = "Number of colonies per handsanitizer"
    p.title.text_font_size = '15px'
    p.yaxis.axis_label = "Number of colonies"
    p.xaxis.axis_label = "Hand Sanitizers"
    p.xgrid.grid_line_color = None

    # Create a legend
    legend = Legend(items=legend_items, location="top_center")
    legend.label_text_font_size = "12px"
    legend.spacing = 5
    legend.click_policy = "hide"

    p.add_layout(legend, 'below')

    # Show the plot
    show(p)



In [None]:
# Plot the mean 'count' per (HS, sample type) tuple in 'x', one bar series per sample type.
nested_bar(df_res, 'sample_type', 'count', 'x')

In [None]:
# prepare for statistics
# Create a new column by combining 'HS' and 'sample_type' (e.g. 'HS1_treated')
# and use it as the row index.
df_res['HS_sample_type'] = df_res['HS'] + '_' + df_res['sample_type']
df_res.set_index('HS_sample_type', inplace=True)

# Number of replicate columns = every column that is not a summary or label
# column. Deriving it from the column names (instead of hard-coding 6) keeps
# the 'count' mean out of the test data and does not drop replicates when the
# number of replicates changes.
summary_labels = {'count', 'HS', 'sample_type', 'std_dev', 'x'}
n_replicates = sum(col not in summary_labels for col in df_res.columns)

# After transposing, each group is a column. The replicate values are the
# first n_replicates rows, because the replicate columns precede the
# summary/label columns in df_res.
df_transposed = df_res.T
df_transposed.reset_index(inplace=True)

# Convert 'HS1_treated' and 'HS1_control' to numeric (the remaining,
# non-replicate rows become NaN through index alignment)
for group in ('HS1_treated', 'HS1_control'):
    df_transposed[group] = pd.to_numeric(df_transposed[group][:n_replicates])

treated = df_transposed['HS1_treated'].iloc[:n_replicates]
control = df_transposed['HS1_control'].iloc[:n_replicates]

In [None]:
# Anderson-Darling normality test for each group.
# The second line used to be labelled 'control' although it reports the
# treated group.
print(f'control : {anderson(control)}')
print(f'treated : {anderson(treated)}')


In [None]:
# Shapiro-Wilk normality test, run once per group
shapiro_treated = shapiro(treated)
shapiro_control = shapiro(control)

stat_treated, p_value_treated = shapiro_treated.statistic, shapiro_treated.pvalue
stat_control, p_value_control = shapiro_control.statistic, shapiro_control.pvalue

# Report statistic and p-value for both groups
for report_line in (
    f'Shapiro-Wilk test for normality - HS1 treated: Statistic = {stat_treated}, P-value = {p_value_treated}',
    f'Shapiro-Wilk test for normality - HS1 control: Statistic = {stat_control}, P-value = {p_value_control}',
):
    print(report_line)


**conclusion** 

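Shapiro-Wilk test: the p-value is above 0.05, so the null hypothesis (H0: the data are normally distributed) is not rejected.

Anderson-Darling test: at a significance level of 0.05, the test statistic is below the critical value.

Thus H0 is not rejected, and the data are treated as normally distributed.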

In [None]:
# Two-sample t-test (treated vs control), called with scipy's default options
ttest_result = ttest_ind(treated, control)
t_statistic = ttest_result.statistic
p_value = ttest_result.pvalue

# Report the outcome
print(f'Two-sample t-test results for HS1: T-statistic = {t_statistic}, P-value = {p_value}')

In [None]:
# Mann-Whitney U test (treated vs control), called with scipy's default options
mwu_result = mannwhitneyu(treated, control)
u_statistic = mwu_result.statistic
p_value = mwu_result.pvalue

# Report the outcome
print(f'Mann-Whitney U test results for HS1: U-statistic = {u_statistic}, P-value = {p_value}')

# conclusion
The p-value of the t-test is 0.225 (T-statistic = 1.29); thus there is no statistically significant difference between the efficacy of the hand sanitizer and that of the control (washing with water and soap).