# Plate Batch Hit Analysis
**Written by:** Julie Chen\
**Written on:** 20230122\
**Last Updated:** 20230319

# Inputs & Imports

## configurable

In [None]:
# prefix prepended to the name of every output file written by this notebook
# (the output paths below are built as <folder> + file_id)
file_id = 'batch_'

# path to scripts: inserted into sys.path so the sytox_scripts package can be imported
script_reroute = '../path/'

In [None]:
# number of timepoints used to generate the kinetic figure time axis
total_tps = 48

# timepoints spaced half an hour apart: 0, 0.5, 1.0, ...
exact_tp = [0.5 * step for step in range(total_tps)]

## standard

In [None]:
import os

# output folders: general, batch-level and network
out_base = './output/'
batch_base = './output/batch/'
network_base = './output/network/'

# path prefixes for files written into each folder: <folder> + file_id
out_path = out_base+file_id
batch_path = batch_base+file_id
network_path = network_base+file_id

# create the folders up front; exist_ok makes this cell safe to re-run
os.makedirs(out_base, exist_ok=True)
os.makedirs(batch_base, exist_ok=True)
os.makedirs(network_base, exist_ok=True)

## packages & scripts 

In [None]:
import re
import glob
import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
from sklearn.metrics import auc
from statsmodels.stats.multitest import multipletests
import itertools

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
%matplotlib inline

import sys
sys.path.insert(1, script_reroute)
import sytox_scripts.bootstrap_and_z as bsz
import sytox_scripts.supplementary as helper
import sytox_scripts.cocultures as cocultures

In [None]:
# plotting style
plot_path = script_reroute+'sytox_scripts/plotting_parameters.py'
%run $plot_path

# Dataset Formatting

## import files & concatenate

In [None]:
def concatenate_files(suffix, import_dir=out_base, save_desc='', save_dir=batch_path):
    '''
    Collect ind. files from all plates in directory, concatenate into single dataframe.

    Inputs:
        suffix: filename ending (without '.csv') shared by the files to collect
        import_dir: folder (with trailing slash) searched for '*<suffix>.csv'
        save_desc: optional text added after the suffix in the saved filename
        save_dir: path prefix for the saved file (save_dir+suffix+save_desc+'.csv')

    Returns:
        the concatenated dataframe; reset_index() turns the row labels read
        from each file into a regular column

    Raises:
        FileNotFoundError if no file matches the pattern
    '''
    pattern = import_dir+'*'+suffix+'.csv'

    # glob returns files in filesystem order; sort so the row order is reproducible
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError('no files found matching ' + pattern)

    dfs = [pd.read_csv(f, index_col=0) for f in files]

    concat_df = pd.concat(dfs).reset_index()
    concat_df.to_csv(save_dir+suffix+save_desc+'.csv')

    return concat_df

In [None]:
batch_kinetic = concatenate_files('summarized_all_combos_kinetic')

In [None]:
batch_nk = concatenate_files('summarized_all_combos_nonkinetic')

In [None]:
# drop rows that have no combo label (NaN in Combo_simple), then renumber the rows
batch_kinetic = batch_kinetic.dropna(subset=['Combo_simple']).reset_index(drop=True)
batch_nk = batch_nk.dropna(subset=['Combo_simple']).reset_index(drop=True)

# replace the concatenated batch files with the cleaned versions
batch_kinetic.to_csv(batch_path+'summarized_all_combos_kinetic.csv')
batch_nk.to_csv(batch_path+'summarized_all_combos_nonkinetic.csv')

In [None]:
batch_kinetic

In [None]:
batch_nk

## alpha-sort labels

In [None]:
batch_kinetic = pd.read_csv(batch_path+'summarized_all_combos_kinetic.csv',index_col=0)
batch_nk = pd.read_csv(batch_path+'summarized_all_combos_nonkinetic.csv',index_col=0)

In [None]:
def alpha_sort_labels(df):
    '''
    Alpha-sort Combo_simple, coculture, and all the mono columns.

    The two isolate names in each Combo_simple label ('<isolate>_<isolate>') are
    sorted alphabetically. On rows where that changes the label, every
    mono1*/mono2* column pair is swapped and the coculture label is rebuilt as
    'co/<new mono1>/<new mono2>' so the whole row matches the sorted label.

    Inputs:
        df: dataframe with Combo_simple, coculture and the mono1*/mono2* column
            pairs listed in paired_suffixes below

    Returns:
        the same dataframe (modified in place) with the sorted Combo_simple moved
        to column position 1 and the untouched labels kept in 'original_simple'
    '''
    # column pairs mono1<suffix>/mono2<suffix> that must follow the label order
    paired_suffixes = ['', '_s', '_RFU', '_err', '_AUC', '_adj', '_AUC_adj', '_inoculum']

    # copy so the original labels survive the Combo_simple overwrite below
    original_simple = df.Combo_simple.copy()

    new_simple = pd.Series(['_'.join(sorted(label.split('_')[:2])) for label in original_simple],
                           index=df.index, dtype=object)

    # rows whose label changed are the rows whose other columns have to be switched too
    swap = new_simple != original_simple

    # rebuild the coculture label first, while mono1/mono2 still hold the pre-swap values
    df.loc[swap, 'coculture'] = 'co/' + df.loc[swap, 'mono2'] + '/' + df.loc[swap, 'mono1']

    for suffix in paired_suffixes:
        col1, col2 = 'mono1'+suffix, 'mono2'+suffix
        left, right = df[col1].copy(), df[col2].copy()
        df[col1] = left.mask(swap, right)
        df[col2] = right.mask(swap, left)

    df['original_simple'] = original_simple
    df['Combo_simple'] = new_simple
    df.insert(1, 'Combo_simple', df.pop('Combo_simple'))

    return df

In [None]:
batch_kinetic_sorted = alpha_sort_labels(batch_kinetic)

In [None]:
batch_nk_sorted = alpha_sort_labels(batch_nk)

In [None]:
batch_nk_sorted

In [None]:
batch_kinetic_sorted.to_csv(batch_path+'summarized_all_combos_kinetic_alpha.csv')
batch_nk_sorted.to_csv(batch_path+'summarized_all_combos_nonkinetic_alpha.csv')

## add unique culture ID to pull between datasets

In [None]:
batch_kinetic_sorted = pd.read_csv(batch_path+'summarized_all_combos_kinetic_alpha.csv',index_col=0)
batch_nk_sorted = pd.read_csv(batch_path+'summarized_all_combos_nonkinetic_alpha.csv',index_col=0)

In [None]:
def add_cultureID(df):
    ''' Add a Culture_ID column built as '<Plate_ID>_<coculture>' and return df. '''
    plate, culture = df.Plate_ID, df.coculture
    df['Culture_ID'] = plate + '_' + culture
    return df

In [None]:
batch_nk_ = add_cultureID(batch_nk_sorted)
batch_kinetic_ = add_cultureID(batch_kinetic_sorted)

In [None]:
batch_nk_

In [None]:
# replace/update alpha csv
batch_kinetic_.to_csv(batch_path+'summarized_all_combos_kinetic_alpha.csv')
batch_nk_.to_csv(batch_path+'summarized_all_combos_nonkinetic_alpha.csv')

## separate null (mono/mono) & polymicrobial combos

In [None]:
batch_kinetic = pd.read_csv(batch_path+'summarized_all_combos_kinetic_alpha.csv', index_col=0)

In [None]:
# null combos: both sides carry the same simple isolate name (mono/mono)
null_kinetic = batch_kinetic.loc[batch_kinetic.mono1_s.eq(batch_kinetic.mono2_s)].reset_index(drop=True)

# non-kinetic table keeps only the tp == 0 rows
# NOTE(review): presumably the summary scores repeat on every timepoint row - confirm
null_nk = null_kinetic.loc[null_kinetic.tp.eq(0)].reset_index(drop=True)

In [None]:
null_nk

In [None]:
null_nk.to_csv(batch_path+'summarized_null_nonkinetic.csv')
null_kinetic.to_csv(batch_path+'summarized_null_kinetic.csv')

In [None]:
# number of null combos
len(np.unique(null_nk.Combo_simple))

In [None]:
# polymicrobial combos: the two sides carry different simple isolate names
poly_kinetic = batch_kinetic.loc[batch_kinetic.mono1_s.ne(batch_kinetic.mono2_s)]

# non-kinetic table keeps only the tp == 0 rows
poly_nk = poly_kinetic.loc[poly_kinetic.tp.eq(0)]

poly_kinetic.to_csv(batch_path+'summarized_poly_kinetic.csv')
poly_nk.to_csv(batch_path+'summarized_poly_nonkinetic.csv')

In [None]:
poly_nk

# Significance 
- null population = mono/mono "cocultures" as opposed to
- exp population = polymicrobial cocultures

## right-tailed test for p-values
- bootstrap from the null scores (using plain dAUC scores rather than standardized dAUC/SE, as there is no SE)
    - where dAUC scores = error-adjusted dAUC scores* (adjustment derived @ RFU level)
    - originally, bootstrapped from all the droplet reps (co and mono) to calculate new dAUC scores
    - but there are only 4, and that is excessive for plate validation
    - so just bootstrap the null (10,000) and run a right-tailed test with constant C=1
- FDR-correct

In [None]:
poly_nk = pd.read_csv(batch_path+'summarized_poly_nonkinetic.csv', index_col=0)
null_nk = pd.read_csv(batch_path+'summarized_null_nonkinetic.csv', index_col=0)

In [None]:
null_bs = bsz.boot_array(null_nk.dAUC_score_adj, bs_size=10000)

In [None]:
null_bs_all = np.concatenate(null_bs.values)

In [None]:
null_bs_all.shape

In [None]:
def calc_pval_bs(null_array, score, C=1):
    '''
    Right-tailed empirical p-value of score against the null, constant to prevent zero.

    Inputs:
        null_array: array-like of null scores (flattened before use)
        score: observed score to test
        C: constant added to both the count of null values >= score and the
           null size, so the p-value is never 0 and never exceeds 1

    Returns:
        (number of null values >= score + C) / (number of null values + C)

    Raises:
        ValueError if null_array is empty
    '''
    null = np.asarray(null_array).ravel()
    if null.size == 0:
        raise ValueError('null_array is empty; cannot compute a p-value')

    n_extreme = int(np.count_nonzero(null >= score))

    # C goes in the denominator too: adding it to the numerator alone gives p > 1
    # whenever every null value is >= score
    return (n_extreme + C)/(null.size + C)

def get_all_pvals_bs(df, null_array, score_col='dAUC_score_adj'):
    '''
    Returns df with right-tailed empirical p-values against the bootstrapped null,
    FDR-corrected p-values and neg/neg_log10 transformed FDR-corrected p-values.

    Inputs:
        df: dataframe holding the scores to test (modified in place)
        null_array: null scores passed on to calc_pval_bs
        score_col: column of df with the scores to test

    Columns added:
        pval: calc_pval_bs(null_array, score) for every score
        pval_fdr: pval corrected with multipletests(method='fdr_bh')
        neg_pval_fdr: -pval_fdr
        neglog_pval: -log10(pval_fdr)
    '''
    df['pval'] = [calc_pval_bs(null_array, score) for score in df[score_col]]
    df['pval_fdr'] = multipletests(df.pval, method='fdr_bh')[1]
    df['neg_pval_fdr'] = -df.pval_fdr
    # base-10 log, as documented (np.log would give the natural log)
    df['neglog_pval'] = -np.log10(df.pval_fdr)

    return df

In [None]:
bs_sig = get_all_pvals_bs(poly_nk, null_bs_all)

In [None]:
bs_sig

In [None]:
bs_sig.to_csv(batch_path+'summarized_poly_nonkinetic_pvals.csv')

# Threshold Calling
- by pval and adjusted dAUC_score

In [None]:
batch_kinetic = pd.read_csv(batch_path+'summarized_all_combos_kinetic_alpha.csv', index_col=0)
sig_nk = pd.read_csv(batch_path+'summarized_poly_nonkinetic_pvals.csv', index_col=0)

In [None]:
sig_nk

## cutoffs & export .csv's of final hits

In [None]:
def apply_hit_cutoff(df, dAUC_cut, pval_cut,
                     dAUC_col='dAUC_score_adj',
                     pval_col='pval_fdr'):
    ''' Keep the rows scoring at least dAUC_cut with a p-value of at most pval_cut. '''
    strong_enough = df[dAUC_col].ge(dAUC_cut)
    significant = df[pval_col].le(pval_cut)

    return df[strong_enough & significant]

In [None]:
pass_nk = apply_hit_cutoff(sig_nk, dAUC_cut=0.1, pval_cut=0.05)

In [None]:
# number of hits
len(np.unique(pass_nk.Combo_simple))

In [None]:
pass_nk.to_csv(batch_path+'summarized_threshold_passed_nonkinetic.csv')

In [None]:
# labels to then pull from kinetic data
pass_labels = pass_nk[['Combo_simple', 'Plate_ID', 'Culture_ID', 'dAUC_score', 'dAUC_score_adj', 
                       'mono1', 'mono2', 'mono1_s', 'mono2_s', 'mono1_inoculum', 'mono2_inoculum']]
pass_labels.to_csv(batch_path+'summarized_threshold_passed_labels.csv')

In [None]:
def pull_from_kinetic(labels_df, source_df, save_dir=batch_path, save_desc=''):
    '''
    Subset the kinetic dataset to the cultures named in labels_df, save it as .csv, return it.

    Inputs:
        labels_df/source_df: dataframes that both carry a Culture_ID column
        save_dir/save_desc: the subset is written to save_dir+save_desc+'.csv'
    '''
    wanted = source_df.Culture_ID.isin(labels_df.Culture_ID)
    sub_df = source_df.loc[wanted].reset_index(drop=True)

    sub_df.to_csv(save_dir+save_desc+'.csv')

    return sub_df

In [None]:
# the filename goes in save_desc (the third positional slot is save_dir); it is named
# '_kinetic' so this kinetic subset does not overwrite the non-kinetic hits file
pass_kinetic = pull_from_kinetic(pass_labels, batch_kinetic, save_desc='summarized_threshold_passed_kinetic')

In [None]:
pass_kinetic

## best scores for each unique coculture

In [None]:
pass_nk = pd.read_csv(batch_path+'summarized_threshold_passed_nonkinetic.csv',index_col=0)

In [None]:
def call_best_hit(df, score_col='dAUC_score_adj'):
    '''
    Saves highest scores and rankings between ratios for all cocultures.

    Inputs:
        df: dataframe of hits with Combo_simple ('<isolate>_<isolate>'), coculture
            ('co/<isolate>_<ratio>/<isolate>_<ratio>'), Culture_ID, Plate_ID,
            mono1_inoculum, mono2_inoculum, peak_fold_adj, peak_fold_tp_adj and score_col
        score_col: column used to pick the best row and to rank the rows

    Returns:
        dataframe with one row per unique Combo_simple (in order of first appearance)
        describing its best-scoring row, plus the cocultures and scores ranked from
        highest to lowest score; empty (columns only) when df has no rows
    '''
    columns = ['Combo_simple', 'mono1_simple', 'mono2_simple', 'max_score', 'max_ratio',
               'mono1_inoculum', 'mono2_inoculum', 'max_ID', 'max_plate', 'max_co',
               'max_peakfold', 'max_peakfold_tp', 'ratio_ranking', 'score_ranking']
    rows = []

    for combo in df.Combo_simple.unique():
        # exact match: a substring match would also pull in e.g. 'A_B2' rows for 'A_B'
        subdf = df[df.Combo_simple == combo].reset_index(drop=True)

        # highest score
        max_score = subdf[score_col].max()
        max_index = subdf[score_col].idxmax(axis=0)
        max_co = subdf.coculture[max_index]
        co_parts = max_co.split('/')
        max_ratio = '_'.join([co_parts[1].split('_')[1], co_parts[2].split('_')[1]])

        # rank ratios' scores
        sorted_df = subdf.sort_values(score_col, ascending=False)

        isolates = combo.split('_')
        rows.append({'Combo_simple': combo,
                     'mono1_simple': isolates[0],
                     'mono2_simple': isolates[1],
                     'max_score': max_score,
                     'max_ratio': max_ratio,
                     'mono1_inoculum': subdf.mono1_inoculum[max_index],
                     'mono2_inoculum': subdf.mono2_inoculum[max_index],
                     'max_ID': subdf.Culture_ID[max_index],
                     'max_plate': subdf.Plate_ID[max_index],
                     'max_co': max_co,
                     'max_peakfold': subdf.peak_fold_adj[max_index],
                     'max_peakfold_tp': subdf.peak_fold_tp_adj[max_index],
                     'ratio_ranking': list(sorted_df.coculture),
                     # rank with the same column that picked the best row
                     'score_ranking': list(sorted_df[score_col])})

    return pd.DataFrame(rows, columns=columns)

In [None]:
best_scores = call_best_hit(pass_nk)
best_scores

In [None]:
best_scores.to_csv(batch_path+'ranked_scores_per_coculture.csv')
# pickle keeps the list-valued ranking columns as lists (the .csv stores them as text)
best_scores.to_pickle(batch_path+'ranked_scores_per_coculture.pkl')

# Plot Final Hits

In [None]:
batch_kinetic = pd.read_csv(batch_path+'summarized_all_combos_kinetic_alpha.csv', index_col=0)
pass_nk = pd.read_csv(batch_path+'summarized_threshold_passed_nonkinetic.csv',index_col=0)

In [None]:
def _plot_rfu_panel(ax, time, sub_df, monosum_col, rfu_lim, title):
    ''' Draw coculture, mono sum and both mono RFU curves (with error bars) on one axis. '''
    # plotted in the same order as the figure legend labels
    ax.errorbar(time, y=sub_df.co_RFU, yerr=sub_df.co_err, alpha=0.5)
    ax.errorbar(time, y=sub_df[monosum_col], yerr=sub_df.mono_sum_err, alpha=0.5)
    ax.errorbar(time, y=sub_df.mono1_RFU, yerr=sub_df.mono1_err, alpha=0.5)
    ax.errorbar(time, y=sub_df.mono2_RFU, yerr=sub_df.mono2_err, alpha=0.5)
    ax.set_ylim(0, rfu_lim)
    ax.set_title(title)


def plot_all_cocurves(labels_df, source_df,
                      plots_per_page=45, plots_y=9, plots_x=5, fig_w=20, fig_h=25, 
                      rfu_lim_high=50000, rfu_lim_low=10000, fold_lim=20,
                      save_dir=batch_path, filename='cocultures_coplots', 
                      split_by='_', multi=False, position1=1, position2=3,
                      monosum_col = 'mono_sum_adj'):
    '''
    Separate onto many pages (as larger screen will have more 
    hits that expand beyond what fits on a page reasonably).
    
    On a single 20x25 page, fit 5x9 plots reasonably so divide by 45.
    
    Mono sum line has been error-adjusted (each mono+error).

    Every group of plots_per_page hits produces three PDF pages: RFU curves with
    the high y-limit, the same curves with the low y-limit, and the fold-changes.

    Inputs:
        labels_df: hits to plot; needs Culture_ID and coculture columns
        source_df: kinetic data; needs Culture_ID, tp, the RFU/err columns for
                   co, mono1 and mono2, monosum_col, mono_sum_err, fold_change
                   and fold_change_adj
        plots_per_page, plots_y, plots_x: hits per page and subplot grid (rows, columns)
        fig_w, fig_h: page size passed to plt.figure
        rfu_lim_high, rfu_lim_low, fold_lim: upper y-limits of the three page types
        save_dir, filename: the PDF is written to save_dir+filename+'.pdf'
        split_by, multi, position1, position2: not used in this function
        monosum_col: source_df column plotted as the mono sum line
    '''
    rfu_labels = ['coculture', 'mono sum', 'left', 'right']

    # save all to master PDF
    with PdfPages(save_dir+filename+'.pdf') as pdf:

        # separate all hits across pages
        for ind in range(0, labels_df.shape[0], plots_per_page):
            fig = plt.figure(figsize=(fig_w, fig_h))
            fig2 = plt.figure(figsize=(fig_w, fig_h))
            fig3 = plt.figure(figsize=(fig_w, fig_h))
            page_figs = [fig, fig2, fig3]

            try:
                for page_fig in page_figs:
                    page_fig.subplots_adjust(hspace=0.4, wspace=0.4)

                # plot each coculture
                for i, n in enumerate(labels_df.iloc[ind:ind+plots_per_page].index):
                    # pull all data for single coculture
                    sub_df = source_df[source_df.Culture_ID == labels_df.Culture_ID[n]]
                    time = sub_df.tp
                    title = labels_df.coculture[n]

                    # coplot RFUs - high lim
                    ax = fig.add_subplot(plots_y, plots_x, i+1)
                    _plot_rfu_panel(ax, time, sub_df, monosum_col, rfu_lim_high, title)

                    # coplot RFUs - low lim
                    ax2 = fig2.add_subplot(plots_y, plots_x, i+1)
                    _plot_rfu_panel(ax2, time, sub_df, monosum_col, rfu_lim_low, title)

                    # respective fold-changes on next page
                    ax3 = fig3.add_subplot(plots_y, plots_x, i+1)
                    ax3.plot(time, sub_df.fold_change, alpha=0.5)
                    ax3.plot(time, sub_df.fold_change_adj, alpha=0.5)
                    ax3.set_ylim(0, fold_lim)
                    ax3.set_title(title)

                fig.legend(rfu_labels, loc='right')
                fig2.legend(rfu_labels, loc='right')
                fig3.legend(['fold-change', 'error-adjusted fold-change'], loc='right')

                fig.text(0.5, 0.08, 'time (h)', ha='center', fontsize=20)
                fig.text(0.08, 0.5,'RFU', va='center', rotation='vertical', fontsize=20)
                fig2.text(0.5, 0.08, 'time (h)', ha='center', fontsize=20)
                fig2.text(0.08, 0.5,'RFU', va='center', rotation='vertical', fontsize=20)
                fig3.text(0.5, 0.08, 'time (h)', ha='center', fontsize=20)
                fig3.text(0.08, 0.5,'fold-change (experimental/expected)', va='center', rotation='vertical', fontsize=20)

                for page_fig in page_figs:
                    pdf.savefig(page_fig)
            finally:
                # close all three figures of this page so they do not pile up in memory
                for page_fig in page_figs:
                    plt.close(page_fig)
    
    return

In [None]:
# hits to plot = the threshold-passed table loaded above (labels_df is not defined at this level)
plot_all_cocurves(pass_nk, batch_kinetic, rfu_lim_high=5000, rfu_lim_low=2000)

In [None]:
# same hits (threshold-passed table), with a tighter fold-change axis and its own PDF
plot_all_cocurves(pass_nk, batch_kinetic, rfu_lim_high=5000, rfu_lim_low=2000, fold_lim=5, filename='cocultures_coplots_foldlim5')