# kChip QC Analysis_lysis
**Written:** 20220501\
**Last Updated:** 20231230

## Table of Contents
- **Image QC & Stats**
    - uses d_area_trimmed
    - cleans data: remove incomplete kinetic data, correct labels --> d_area_trimmed
    - counts, chip efficiency
    - combo distributions, missing combos
    
    
- **Media Subtract**
    - uses d_area_trimmed
    - blanks all values given a media/media combo 
    - & produces an intermediate df --> trimmed_subdf
    - correct all negative RFU values to 0 (floored)
    - final is blanked_df
    
        
- **Z' Analysis**
    - positive: 12B09_high / 1F97_high


**Output format:** 

path/date_chipID_fileDescriptor_unique(e.g. combo)_analysis(e.g. mean vs median, tp).file

# Inputs (RUN FIRST)

## configurable

In [None]:
# YYYYMMDD_name_ as prefix for output
# (paths are built by plain string concatenation, so keep the trailing underscore)
chip_id = 'YYYYMMDD_chipName_' # file output
chip_name = 'chipName' # for summary produced

# path to data analysis scripts
# (inserted into sys.path and concatenated with 'sytox_scripts/...' below, so keep the trailing slash)
script_reroute = '../path/'

# total imaging tp
# (used as the number of x-axis points when integrating AUCs in the Z' analysis)
num_tp = 48

# max microwells for design
# (denominator for chip efficiency and for the expected replicate count)
num_wells = 124000 # dis148_v7

In [None]:
import os

# Each *_path is "<category directory>/" + chip_id: chip_id acts as a file-name
# prefix (not a sub-folder), so outputs land directly in the category directory.
out_path = f'./output/image_analysis/{chip_id}'
qc_path = f'./output/qc/{chip_id}'
core_path = f'./output/core/{chip_id}'
cc_path = f'./output/coculture/{chip_id}'
z_path = f'./output/z_analysis/{chip_id}'

# Create the directories written to by this notebook (no error if they exist).
# './output/image_analysis/' is not created here.
for _directory in ('./output/',
                   './output/qc/',
                   './output/core/',
                   './output/coculture/',
                   './output/z_analysis/'):
    os.makedirs(_directory, exist_ok=True)

# Imports

In [None]:
import numpy as np
import pandas as pd
import scipy
from sklearn.metrics import auc

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.insert(1, script_reroute)
import sytox_scripts.bootstrap_and_z as bsz
import sytox_scripts.supplementary as helper
import sytox_scripts.cocultures as cocultures

In [None]:
# plotting style
plot_path = script_reroute+'sytox_scripts/plotting_parameters.py'
%run $plot_path

# Image QC & Stats
- remove incomplete kinetic data (microwell not imaged at all tps)
- correct mis-assigned barcodes 
- review kChip efficiency & counts post-image analysis

## drop incomplete kinetic data

In [None]:
# remove microwells with incomplete kinetic data too
w_incomplete = pd.read_csv(out_path+'distance_and_area_filtered.csv',index_col=0)
d_area_trimmed = pd.read_csv(out_path+'distance_and_area_filtered.csv',index_col=0).dropna().reset_index()

In [None]:
# Persist both versions: the table as loaded, and the table restricted to
# microwells with complete kinetic data (re-read by the later sections).
for _frame, _file_name in (
        (w_incomplete, 'distance_and_area_filtered_original.csv'),
        (d_area_trimmed, 'distance_and_area_filtered_complete_kinetic.csv')):
    _frame.to_csv(out_path + _file_name)

In [None]:
print(w_incomplete.shape, d_area_trimmed.shape)

## generate barcode correction map
- manual corrections based on reference and assignments by image_analysis nb

In [None]:
# Barcodes as currently assigned, and what each should become (parallel lists).
og = ['barcode1', 'barcode3']
corrected = ['barcode2', 'barcode4']

# One unique placeholder label ('abc0', 'abc1', ...) per original barcode.
# NOTE(review): presumably an intermediate name so swapped barcodes cannot collide
# while being relabelled — confirm against helper.correct_labels.
temp = [f'abc{position}' for position, _ in enumerate(og)]

In [None]:
# Assemble the three parallel lists into one table, one row per barcode to fix.
_map_columns = {
    'original': og,
    'temp': temp,
    'corrected': corrected,
}
map_df = pd.DataFrame(_map_columns)
map_df

In [None]:
# Written with the default index, so the CSV gains a leading unnamed index column.
map_df.to_csv('./barcode_map.csv')

## correct barcodes

In [None]:
# Reload the complete-kinetic table and the barcode correction map from disk.
d_area_trimmed = pd.read_csv(out_path+'distance_and_area_filtered_complete_kinetic.csv', index_col=0)
# NOTE(review): read without index_col, so the index column written by
# map_df.to_csv comes back as an extra 'Unnamed: 0' column — confirm that
# helper.correct_labels tolerates it.
label_map = pd.read_csv('./barcode_map.csv')

In [None]:
# Apply the barcode corrections; save_dir hands the helper the core output prefix.
# NOTE(review): the return value is discarded, and the following sections re-read
# 'distance_and_area_filtered_complete_kinetic.csv' from out_path. Unless the helper
# rewrites that file, the corrected labels may never reach downstream analysis — confirm.
helper.correct_labels(d_area_trimmed, label_map, save_dir=core_path)

## microwell efficiency (technical replicates)

In [None]:
d_area_trimmed = pd.read_csv(out_path+'distance_and_area_filtered_complete_kinetic.csv', index_col=0)

In [None]:
# expected values
labels = np.unique(np.concatenate((d_area_trimmed.Label_left.unique(),\
                                   d_area_trimmed.Label_right.unique()), axis=None))
# num_wells in first cell
num_inputs = len(labels)

num_combo = scipy.special.comb(num_inputs,2)+num_inputs
exp_reps = int((num_wells/num_combo).round()) # if all combos equal

wells_passed = d_area_trimmed.shape[0]
chip_eff = wells_passed/num_wells
reps_eff = int(chip_eff*exp_reps) 

In [None]:
# Design expectations first, then what was actually observed on this chip.
print(f'Total No. of Inputs: {num_inputs}')
print(f'Expected Replicate Count @ 100%: {exp_reps}')
print('')

print(f'No. Microwells after Filtering: {wells_passed}')
print(f'Chip Efficiency: {chip_eff:.3f}')
print(f'Expected Replicate Count: {reps_eff}')

## counts & distributions

In [None]:
# store counts of unique Hashes
counts_df = d_area_trimmed[['Label_left', 'Label_right', 'Hash']]

In [None]:
# summarize all unique combinations
counts_all = counts_df.groupby(['Label_left', 'Label_right']).count()
av_counts_all = counts_all.Hash.mean()
med_counts_all = counts_all.Hash.median()

In [None]:
# mm_count is not assigned anywhere else in this notebook, so compute it here.
# Labels are matched with str.contains('MEDIA'), the same substring convention
# count_combo_types uses.
_is_media_pair = (counts_df.Label_left.str.contains('MEDIA')
                  & counts_df.Label_right.str.contains('MEDIA'))
mm_count = int(_is_media_pair.sum())

print('Average replicate count for all combos: '+str(av_counts_all))
print('Median replicate count for all combos: '+str(med_counts_all))
print('')
print('No. media/media counts: '+str(mm_count))

In [None]:
def count_combo_types(left, right, df=None):
    ''' Summarize replicate counts for the label combinations matching a pattern.

    left: (str) pattern matched against Label_left with str.contains
          (substring / regular-expression match, not exact equality)
    right: (str) pattern matched against Label_right in the same way
    df: dataframe with Label_left, Label_right and Hash columns;
        defaults to the notebook-level counts_df, looked up at call time

    Returns (mean, median, std, total, counts): counts is the per-combination
    count table and the four statistics are taken over its Hash column.
    '''
    if df is None:
        # Resolve the default lazily: binding counts_df in the signature would
        # freeze whichever frame existed when this cell was executed.
        df = counts_df

    matches = df.Label_left.str.contains(left) & df.Label_right.str.contains(right)
    counts = df[matches].groupby(['Label_left', 'Label_right']).count()
    reps = counts.Hash

    return reps.mean(), reps.median(), reps.std(), reps.sum(), counts

In [None]:
def plot_count_distr(counts, title, save_desc, num_bin=20, save_dir=qc_path):
    ''' Coplots histogram distribution & boxplot from df, then saves the figure.

    counts: per-combination replicate counts, e.g. the Hash column of the
            table returned by count_combo_types [-1]
    title: (str) title placed above the histogram
    save_desc: (str) unique identifier to file
    num_bin: (int) number of histogram bins
    save_dir: (str) path prefix; the figure is written to
              save_dir + 'counts_distribution_' + save_desc + '.png'
    '''
    f, (ax_hist, ax_box) = plt.subplots(2, sharex=True,
                                        gridspec_kw={"height_ratios": (.95, .05)})

    # Pass the counts as x explicitly: newer seaborn releases read a first
    # positional argument as `data`, which would no longer draw the box
    # horizontally along the x-axis shared with the histogram.
    sns.boxplot(x=counts, ax=ax_box, fliersize=0.1, boxprops=dict(alpha=.5))
    # NOTE(review): sns.distplot is deprecated in newer seaborn; kept so the cell
    # still runs on the seaborn version this notebook was written against.
    sns.distplot(counts, ax=ax_hist, kde=False, bins=num_bin)
    ax_hist.set_xlabel('')
    ax_hist.set_title(title)
    # The x label belongs to the bottom (box) axes; the two axes share x.
    ax_box.set_xlabel('No. Microwells Observed')
    ax_hist.set_ylabel('Counts of Combinations')

    f.savefig(save_dir+'counts_distribution_'+save_desc+'.png')
    return

In [None]:
plot_count_distr(counts_all.Hash, 'all_combinations', 'all_combo')

## check if there are unrepresented combinations

In [None]:
expected_num_combos = num_combo

In [None]:
counts_concat = helper.concatenate_labels(counts_df)

In [None]:
actual_num_combo = len(counts_concat.Labels_combo.unique())
actual_num_combo

In [None]:
num_missing = num_combo - actual_num_combo
num_missing

# Media Subtract
- export a .csv, trimming excess columns not used in downstream analysis
- subtract the media/media background RFU value at each time point (tp)

## trim for columns relevant downstream
- only need the labels and time point RFU values for calling hits

In [None]:
d_area_trimmed = pd.read_csv(out_path+'distance_and_area_filtered_complete_kinetic.csv',index_col=0)

In [None]:
# Keep the two label columns plus every column whose name contains '_norm',
# then save the slimmed-down table for the downstream steps.
_norm_cols = [name for name in d_area_trimmed.columns if '_norm' in name]
subcols = ['Label_left', 'Label_right'] + _norm_cols
trimmed_subdf = d_area_trimmed.loc[:, subcols]
trimmed_subdf.to_csv(core_path+'trimmed_for_labels_and_rfu.csv')

## subtraction with min zero
**default:** subtract using median media/media signal

In [None]:
trimmed_subdf = pd.read_csv(core_path+'trimmed_for_labels_and_rfu.csv',index_col=0)

In [None]:
def subtract_media_per_tp(df, media_summ, mode='median'):
    '''
    Blank every microwell by the media/media background of the same timepoint.

    For each column of media_summ, the value in its `mode` row is subtracted
    from the column of the same name in df. Columns of df that do not appear
    in media_summ are left as they are, and df itself is not modified.

    df: unblanked dataframe
    media_summ: dataframe of summarized media/media values, one column per
                timepoint and one row per summary statistic
    mode: row label of media_summ to subtract, 'mean' or 'median'

    Returns a blanked copy of df.
    '''
    blanked = df.copy()
    for tp_col in media_summ.columns:
        background = media_summ.at[mode, tp_col]
        blanked[tp_col] = blanked[tp_col] - background

    return blanked

In [None]:
# Suffix identifying the media/media summary file.
media_save = '_media_media'
# Summarize the MEDIA/MEDIA microwells; the next cell reads the result back from
# core_path + 'summarized' + media_save + '.csv'.
helper.summarize_single_combo(df=trimmed_subdf, left='MEDIA', right='MEDIA', save_dir=core_path, save_desc=media_save)

In [None]:
media_med = pd.read_csv(core_path+'summarized'+media_save+'.csv', index_col=0)

In [None]:
media_med

In [None]:
# Subtract the media/media background (default mode: median) from every
# timepoint column, and save the result before negative values are floored.
blanked_df = subtract_media_per_tp(trimmed_subdf, media_med)
blanked_df.to_csv(core_path+'trimmed_label_rfu_blanked_notfloored.csv')

In [None]:
blanked_df

In [None]:
# Floor negative blanked RFU values at zero.
# clip() works on whole columns at once, replacing the per-element applymap
# (slow on a full chip and deprecated in recent pandas). Only floating-point
# columns are touched, so the label columns pass through unchanged; NaN stays NaN.
_float_cols = blanked_df.select_dtypes(include=[np.floating]).columns
blanked_df = blanked_df.copy()  # rebind to a copy, as the original cell produced a new frame
blanked_df[_float_cols] = blanked_df[_float_cols].clip(lower=0)
blanked_df.to_csv(core_path+'trimmed_label_rfu_blanked.csv')
blanked_df

# Z' Analysis
- assess effect size on chip

## coculture - 1F97/12B09

### empirical values

In [None]:
blanked = pd.read_csv(core_path+'trimmed_label_rfu_blanked.csv', index_col=0)

In [None]:
# cocultures
pos_co = helper.extract_combos(blanked, '12B09_high', '1F97_high')
neg_co = helper.extract_combos(blanked, '1F97_high', '1F97_high')

# monocultures
mono_12B = helper.extract_combos(blanked, '12B09_high', 'MEDIA')
mono_1F = helper.extract_combos(blanked, '1F97_high', 'MEDIA')

In [None]:
# not error-adjusted yet
co_auc = auc(range(0, num_tp),pos_co.median())
neg_auc = auc(range(0, num_tp),neg_co.median())

mono12B_auc = auc(range(0, num_tp), mono_12B.median())
mono1F_auc = auc(range(0, num_tp), mono_1F.median())

sum_auc = mono12B_auc + mono1F_auc 
negsum_auc = mono1F_auc*2

pos_dAUC = helper.calc_dAUC(co_auc, sum_auc)
neg_dAUC = helper.calc_dAUC(neg_auc, negsum_auc)
print(pos_dAUC, neg_dAUC)

### bootstrap for SE

In [None]:
# mean of medians, SEM, all bs AUCs
pos_bs = bsz.boot_microwells(pos_co.filter(like='norm'))
neg_bs = bsz.boot_microwells(neg_co.filter(like='norm'))

mono12B_bs = bsz.boot_microwells(mono_12B.filter(like='norm'))
mono1F_bs = bsz.boot_microwells(mono_1F.filter(like='norm'))

In [None]:
# One column of bootstrapped AUCs per condition.
# 'co_1F1Fhh' holds the bootstrapped 1F97_high/1F97_high coculture (neg_bs), so
# dAUC_1F1F compares an actual coculture against its monoculture sum, matching
# how the empirical neg_dAUC is computed.
# NOTE(review): no 12B09_high/12B09_high coculture is bootstrapped in this notebook,
# so 'co_12B12Bhh' still reuses the 12B09_high monoculture AUCs — confirm intended.
all_bs = pd.DataFrame({'co_12B1Fhh': pos_bs[2], 'co_12B12Bhh': mono12B_bs[2], 'co_1F1Fhh': neg_bs[2],
                       'mono_12Bh': mono12B_bs[2], 'mono_1Fh': mono1F_bs[2]})

In [None]:
# For each bootstrap row, compute a dAUC from a coculture column and two monoculture
# columns; the last argument names the resulting column.
# For the self-pairings the same monoculture column is passed twice.
updated_bs = helper.calc_dAUC_per_bs(all_bs, 'co_12B1Fhh', 'mono_12Bh', 'mono_1Fh', 'dAUC_12B1F')
updated_bs1 = helper.calc_dAUC_per_bs(all_bs, 'co_12B12Bhh', 'mono_12Bh', 'mono_12Bh', 'dAUC_12B12B')
updated_bs2 = helper.calc_dAUC_per_bs(all_bs, 'co_1F1Fhh', 'mono_1Fh', 'mono_1Fh', 'dAUC_1F1F')

In [None]:
# NOTE(review): pd.concat defaults to axis=0, so the three frames are stacked
# row-wise and join='inner' keeps only the columns common to all of them. The
# dAUC_* columns read below therefore survive only if every frame carries all
# three (e.g. if calc_dAUC_per_bs adds its column to all_bs in place) — in which
# case each bootstrap row is written three times. Confirm whether axis=1 was intended.
all_bs = pd.concat([updated_bs, updated_bs1, updated_bs2], join='inner')
all_bs.to_csv(z_path+'bootstrapped_AUCs_for_z.csv')

In [None]:
all_bs = pd.read_csv(z_path+'bootstrapped_AUCs_for_z.csv', index_col=0)

In [None]:
all_bs

In [None]:
# Centre and spread of the bootstrapped positive-control dAUC.
_pos_dauc_bs = all_bs.dAUC_12B1F
pos_score_bs = _pos_dauc_bs.mean()
pos_score_bs_std = _pos_dauc_bs.std()

print(pos_score_bs, pos_score_bs_std)

### calculate Z'

In [None]:
# Z' from the empirical dAUCs of the positive (12B09/1F97) and negative (1F97/1F97)
# controls, with the spread of each taken from the bootstrapped dAUCs.
# NOTE(review): np.std uses ddof=0 here, whereas the .std() printed in the
# bootstrap summary uses pandas' default ddof=1 — confirm which is intended.
z_1F = bsz.calc_z(pos_dAUC, neg_dAUC, np.std(all_bs.dAUC_12B1F), np.std(all_bs.dAUC_1F1F))
z_1F

### visualize positive control curve
- also error-adjusts empirical kinetic data

In [None]:
# Time axis, plotted as 'time (h)' — x/2 assumes two timepoints per hour; TODO confirm.
tp = [x/2 for x in range(len(pos_bs[0]))]

# Empirical median curves over the 'norm' columns only (the columns that were
# bootstrapped), computed once so label columns never reach median().
_co_med = pos_co.filter(like='norm').median()
_mono1_med = mono_12B.filter(like='norm').median()
_mono2_med = mono_1F.filter(like='norm').median()

# Coculture band: median -/+ the bootstrap error term (index [1]).
co_low = np.subtract(_co_med, pos_bs[1])
co_up = np.add(_co_med, pos_bs[1])
# The error-adjusted score uses the LOWER coculture bound ...
co_auc = auc(tp, co_low)

mono1_up = np.add(_mono1_med, mono12B_bs[1])
mono2_up = np.add(_mono2_med, mono1F_bs[1])
mono1_low = np.subtract(_mono1_med, mono12B_bs[1])
mono2_low = np.subtract(_mono2_med, mono1F_bs[1])

# ... against the sum of the UPPER monoculture bounds.
mono_sum = np.add(mono1_up, mono2_up)
sum_auc = auc(tp, mono_sum)

pos_adj_score = helper.calc_dAUC(co_auc, sum_auc)

# for lineplot
# NOTE(review): 'mono1'/'mono2' plot the bootstrap centre (index [0]) while their
# bands are built around the empirical medians — confirm this mix is intended.
pos_df = pd.DataFrame({'tp': tp,
                       'co': _co_med, 'co_up': co_up, 'co_low': co_low,
                       'mono1': mono12B_bs[0], 'mono1_up': mono1_up, 'mono1_low': mono1_low,
                       'mono2': mono1F_bs[0], 'mono2_up': mono2_up, 'mono2_low': mono2_low,
                       'mono_sum': mono_sum})

In [None]:
# error-adjusted empirical lysis score
# (the value computed in the previous cell is named pos_adj_score)
pos_adj_score

In [None]:
plt.figure(figsize=(10,9))
# Each sns.lineplot call draws on the current Axes and returns it; the shaded
# band around each curve comes from the *_low / *_up columns of pos_df.
co = sns.lineplot(data=pos_df, x='tp', y='co', ci=None, linewidth=2)
co.fill_between(pos_df.tp, pos_df.co_low, pos_df.co_up, alpha=0.2)

# Named sum_ax (not mono_sum) so the mono_sum values computed earlier are not
# overwritten by an Axes object.
sum_ax = sns.lineplot(data=pos_df, x='tp', y='mono_sum', ci=None, linewidth=2)
# Zero-height band: no error band is drawn for the summed curve.
# NOTE(review): presumably kept so the fill colours stay in step with the line
# colours of the curves that follow — confirm.
sum_ax.fill_between(pos_df.tp, [0]*len(pos_df.tp), [0]*len(pos_df.tp), alpha=0.2)

left = sns.lineplot(data=pos_df, x='tp', y='mono1', ci=None, linewidth=2)
left.fill_between(pos_df.tp, pos_df.mono1_low, pos_df.mono1_up, alpha=0.2)

right = sns.lineplot(data=pos_df, x='tp', y='mono2', ci=None, linewidth=2)
right.fill_between(pos_df.tp, pos_df.mono2_low, pos_df.mono2_up, alpha=.2)

ax = plt.gca() # get current axes
ax.yaxis.set_major_locator(plt.MaxNLocator(4)) # reduce noise on axis
ax.xaxis.set_major_locator(plt.MaxNLocator(6))
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.ylabel('RFU', fontsize=20)
plt.xlabel('time (h)', fontsize=20)

# The plotted coculture is 12B09_high with 1F97_high (see the legend entries).
plt.title('12B09_high/1F97_high')
# Legend labels follow the order in which the four lines were drawn.
plt.legend(['coculture', 'monoculture sum', '12B09_high', '1F97_high'], bbox_to_anchor=(1.2,0.5))
plt.savefig(qc_path+'visualize_individual_12B1F_sns.png')

# Save QC Summary

In [None]:
# mm_count, bugs and mono are used by the summary but are not assigned anywhere
# else in this notebook, so they are derived here from counts_all (replicate
# count per label pair, held in its Hash column).
# NOTE(review): the coculture / monoculture split is inferred from the summary
# column names and from monocultures being extracted as <strain>/MEDIA — confirm.
_combo_counts = counts_all.reset_index()
_left_media = _combo_counts.Label_left.str.contains('MEDIA')
_right_media = _combo_counts.Label_right.str.contains('MEDIA')
mm_count = int(_combo_counts.Hash[_left_media & _right_media].sum())
bugs = _combo_counts[~_left_media & ~_right_media]  # neither partner is media
mono = _combo_counts[_left_media ^ _right_media]    # exactly one partner is media

# One-row table of QC metrics, transposed so that each metric becomes a row.
# 'Median Rep (Mono)' uses median() to match its label.
summary_df = pd.DataFrame({'Chip': [chip_name], 'No. Inputs': [num_inputs], 'No. Combos': [num_combo],
                           'Pre-Incomplete Kinetic Removed': [w_incomplete.shape[0]],
                           'Incomplete Kinetic Removed': [d_area_trimmed.shape[0]],
                           'Expected Replicates': [exp_reps],
                           'Chip Efficiency': [chip_eff], 'Expected Replicates_Eff': [reps_eff],
                           'Average Rep (All)': [av_counts_all], 'Median Rep (All)': [med_counts_all],
                           'Lowest Combo Rep':[counts_all.Hash.min()], 'Media/Media Counts': [mm_count],
                           'Median Rep (Coculture)': [bugs.Hash.median()], 'Median Rep (Mono)': [mono.Hash.median()],
                           'No. Missing Combos': [num_missing],
                           'Empirical 1F/12B score': [pos_dAUC],
                           'Error-Adjusted Empirical 1F/12B score': [pos_adj_score],
                           'z_1F12B_to1F': [z_1F]}).T

In [None]:
summary_df

In [None]:
summary_df.to_csv(qc_path+'chip_QC_summary.csv')