In [None]:
import os
import glob
import re
import numpy as np
import pandas as pd
import string

from cell_cycle_gating import manual_gating as mg
from cell_cycle_gating import dead_cell_filter_ldrint as dcf_int

import patchworklib as pw
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from matplotlib.gridspec import GridSpec

import time
import multiprocessing

import plotnine
from plotnine import *

from scipy.signal import find_peaks
from cell_cycle_gating import findpeaks as fp

In [None]:
## Get the names of wells, valid wells (excluding outer two rows/columns), etc.
## input:
##    var: one of "all_wells", "valid_wells", "all_well_rows", "all_well_cols", "valid_well_rows", or "valid_well_cols"
## output: a list with the corresponding info, e.g. ["A01", "A02", ... , "P24"] for all_wells
def get_well_names(var):
    """Return one of the plate-layout name collections, selected by key.

    Rows are the letters A-P and columns are the zero-padded labels
    '01'-'24'. The "valid" variants drop the first two and last two
    rows/columns (rows C-N, columns '03'-'22').

    Args:
        var: one of 'all_wells', 'valid_wells', 'all_well_rows',
            'all_well_cols', 'valid_well_rows', 'valid_well_cols'.

    Returns:
        Sorted list of well names for the '*_wells' keys, a list of column
        labels for the '*_cols' keys, and a string of row letters for the
        '*_rows' keys.

    Raises:
        KeyError: if `var` is not one of the keys above.
    """
    row_letters = string.ascii_uppercase[:16]
    col_labels = ["%02d" % number for number in range(1, 25)]
    inner_rows = row_letters[2:14]
    inner_cols = col_labels[2:22]

    def _wells(rows, cols):
        # every row/column combination, in sorted order
        return sorted(row + col for row in rows for col in cols)

    lookup = {
        'all_wells': _wells(row_letters, col_labels),
        'valid_wells': _wells(inner_rows, inner_cols),
        'all_well_rows': row_letters,
        'all_well_cols': col_labels,
        'valid_well_rows': inner_rows,
        'valid_well_cols': inner_cols,
    }
    return lookup[var]

## Define a dictionary with the local folder name for experiments on each date
def define_folder_dict(name='folder_dict'):
    """Publish the experiment-date -> local-folder mapping as a module global.

    Args:
        name: name of the module-level variable that receives the dict
            (keys are dates in YYYY-MM-DD format, values are folder paths
            relative to the data base directory).

    Returns:
        None; the mapping is stored in `globals()[name]`.
    """
    date_to_folder = dict([
        ('2020-11-17', 'rep3'),
        ('2021-02-19', 'rep4'),
        ('2021-02-26', 'rep5'),
        ('2021-03-02', 'rep6'),
        ('2021-04-06', 'rep7'),
        ('2021-04-23', 'rep8'),
        ('2021-05-18', 'rep9'),
        ('2021-05-21', 'rep10'),
        ('2021-06-11', 'rep11'),
        ('2021-07-27', '2_rep1/210727-combo-rep1'),
        ('2021-07-30', '2_rep2/210730_combo_rep2'),
        ('2021-08-06', '2_rep3/210806_combo_rep3'),
        ('2021-10-05', 'redo_rep1_and_2'),
        ('2021-10-15', 'redo_rep1_and_2/redo_rep2'),
        ('2021-10-29', 'redo_rep3'),
    ])
    globals()[name] = date_to_folder
    return None

def get_base_dir():
    """Return the first known mount point of the LINCS-combinations data that exists.

    The paths use linux/unix conventions (a /mnt/y mount first, then a
    /Volumes mount); Windows paths are not handled.

    Raises:
        Exception: if neither candidate directory exists.
    """
    candidates = (
        "/mnt/y/lsp-analysis/LINCS-combinations",
        "/Volumes/hits/lsp-analysis/LINCS-combinations",
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise Exception("Base directory not found -- need to mount research.files and supply its path, e.g. '/mnt/y/lsp-analysis/LINCS-combinations'")

def get_plates_to_regate():
    """Load the lists of plates and wells flagged for re-gating.

    Reads the time-zero and end-time-control csv files from
    <base_dir>/re_gating/plates_to_regate, tags each table with a 'time'
    column ("time_zero" or "end_time_control") and stacks them.

    Returns:
        Tuple (df_plates, df_wells) of concatenated data frames.
    """
    regate_dir = os.path.join(get_base_dir(), "re_gating", "plates_to_regate")
    ### (plates csv, wells csv, value written to the 'time' column)
    sources = (
        ("time_zero_regate_plates.csv", "time_zero_regate_wells.csv", "time_zero"),
        ("ctrl_end_time_regate_plates.csv", "ctrl_end_time_regate_wells.csv", "end_time_control"),
    )
    plate_frames = []
    well_frames = []
    for plates_csv, wells_csv, time_label in sources:
        plates = pd.read_csv(os.path.join(regate_dir, plates_csv))
        wells = pd.read_csv(os.path.join(regate_dir, wells_csv))
        wells['time'] = time_label
        plates['time'] = time_label
        plate_frames.append(plates)
        well_frames.append(wells)
    return (pd.concat(plate_frames), pd.concat(well_frames))

## Get a list of plate barcodes for a given date
## input:
##    date: e.g. '2021-10-15'
## output: list of barcodes e.g. '211015_combo_173'
def get_barcodes(date):
    """List the plate barcodes found in the data folder of one experiment date.

    A barcode is any sub-directory of the date's data directory whose name
    starts with '<YYMMDD>_combo'.

    Args:
        date: experiment date in YYYY-MM-DD format, e.g. '2021-10-15'.

    Returns:
        List of barcode directory names, e.g. ['211015_combo_173', ...].
    """
    prefix_pattern = date_format_switch(date) + "_combo"
    date_dir = get_data_dir(date = date)
    barcodes = []
    for entry in os.listdir(date_dir):
        if not os.path.isdir(os.path.join(date_dir, entry)):
            continue
        ### re.match anchors at the start of the directory name
        if re.match(prefix_pattern, entry):
            barcodes.append(entry)
    return barcodes

## Switch the format of a date from YYYY-MM-DD to YYMMDD
## input:
##    date: e.g. '2021-02-19'
## output: e.g. '210219'
def date_format_switch(date):
    """Convert a 'YYYY-MM-DD' date string to 'YYMMDD' by fixed-position slicing."""
    year = date[2:4]
    month = date[5:7]
    day = date[8:10]
    return year + month + day

## Get the date in YYYY-MM-DD format from a plate barcode
## input:
##    barcode: '210406_combo_71'
## output: e.g. '2021-04-06'
def date_from_barcode(barcode):
    """Build a 'YYYY-MM-DD' date from the six leading 'YYMMDD' characters of a barcode.

    The century is assumed to be 20xx.
    """
    year = '20' + barcode[0:2]
    month = barcode[2:4]
    day = barcode[4:6]
    return '-'.join((year, month, day))

## Get the well-level data directory for a given date or barcode
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    date: a date in YYYY-MM-DD format, e.g. '2021-04-06'
##    base_dir: the full path of the data folder, e.g. "/mnt/y/lsp-analysis/LINCS-combinations/"
## output:
##    returns the directory of the well-level data for a barcode
##    if a date is given and no barcode, returns the directory of all data for the date
##    if no date or barcode is given, returns the base directory of all data
def get_data_dir(barcode=None, date=None, base_dir = "/mnt/y/lsp-analysis/LINCS-combinations/"):
    """Resolve the data directory for a plate, a date, or the whole data set.

    Args:
        barcode: plate barcode, e.g. '210406_combo_71' (optional).
        date: experiment date in YYYY-MM-DD format (optional; derived from
            the barcode when omitted).
        base_dir: root data folder; when it does not exist the /Volumes
            (osx) mount path is used instead.

    Returns:
        The plate directory when a barcode is given, the date's directory
        (with a trailing separator) when only a date is given, and the root
        directory when neither is given.

    Raises:
        KeyError: if the date is not a key of the module-level folder_dict.
    """
    ### note: unix folder conventions only -- would need re-writing for Windows
    if os.path.exists(base_dir):
        root = base_dir
    else:
        root = "/Volumes/hits/lsp-analysis/LINCS-combinations/"
    if barcode is None and date is None:
        return root
    experiment_date = date_from_barcode(barcode) if date is None else date
    plate_subdir = '' if barcode is None else barcode
    ### the date -> folder mapping is created lazily as a module global
    if 'folder_dict' not in globals():
        define_folder_dict('folder_dict')
    return os.path.join(root, folder_dict[experiment_date], plate_subdir)

## Get the filename for well-level intensities for a given barcode and well
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    well: a well of interest, e.g. 'D06'
## output:
##    full path/filename of the well-level data
def get_well_file(barcode, well):
    """Return the full path of the well-level csv for one barcode and well.

    The file is expected to be named '<barcode>.result.<well>[test].csv'
    inside the plate's data directory.

    Args:
        barcode: plate barcode, e.g. '210406_combo_71'.
        well: well name, e.g. 'D06'.

    Returns:
        Full path of the csv file.

    Raises:
        FileNotFoundError: if the csv does not exist. (Previously the
            function printed a message and then failed with an
            UnboundLocalError on the return statement.)
    """
    data_dir = get_data_dir(barcode)
    ### example file style: 210406_combo_71.result.D06[test].csv
    file_name = barcode+".result."+well+"[test].csv"
    well_file = os.path.join(data_dir, file_name)
    ### check the one file directly instead of listing the whole directory
    if not os.path.exists(well_file):
        raise FileNotFoundError("well csv not found! " + well_file)
    return well_file

## Read the well-level data for a given well and barcode
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    well: a well of interest, e.g. 'D06'
## output:
##    a pandas dataframe of dye intensities for individual cells
def read_well_data(barcode, well):
    """Read the well-level csv for a barcode/well into a pandas data frame.

    Column names are left exactly as they appear in the csv.
    """
    return pd.read_csv(get_well_file(barcode, well))

## Re-name columns of well-level dataframe for LDR, DNA, EDU, etc.
## input: original data frame read from csv
## output: data frame with re-named columns
def rename_df_columns(df, silent=True, hoechst_as_dna=False):
    """Rename raw well-level columns to the short names used in this file.

    Renames 'Well Name' -> 'well', 'ldrint' -> 'ldr', one DNA-like column
    -> 'dna', and the EdU raw/background columns -> 'edu_raw'/'edu_bg'.
    When both EdU columns are present an 'edu' column (raw minus
    background) is added.

    Args:
        df: data frame as read from the well-level csv.
        silent: when True, suppress all progress/diagnostic printing
            (including the column dump on the "not found" paths, which used
            to be printed regardless). The "Two ... columns" warnings are
            always printed.
        hoechst_as_dna: when True prefer a HoechstINT column for 'dna' and
            fall back to DNAcontent; when False the preference is reversed.

    Returns:
        A renamed copy of `df`; or `df` itself, unchanged, when the
        'Well Name' or 'ldrint' column is missing.
    """
    def _log(message):
        if not silent: print(message)

    columns = df.columns
    col_dict = {}
    ### required columns: return the input untouched if either one is missing
    for raw_name, new_name in (('Well Name', 'well'), ('ldrint', 'ldr')):
        if raw_name not in columns:
            _log(columns)
            _log(raw_name + ' column not found')
            return df
        _log("'" + raw_name + "' column found -- re-naming as '" + new_name + "'")
        col_dict[raw_name] = new_name

    ### check for DNAcontent/Hoechst: (label, DD column, DDD column)
    dna_family = ('DNAcontent', 'Cell: DNAcontent (DD-bckgrnd)', 'Cell: DNAcontent (DDD-bckgrnd)')
    hoechst_family = ('HoechstINT', 'Cell: HoechstINT (DD-bckgrnd)', 'Cell: HoechstINT (DDD-bckgrnd)')
    ### the preferred family is searched first, the other one is the fallback
    if hoechst_as_dna:
        families = (hoechst_family, dna_family)
    else:
        families = (dna_family, hoechst_family)
    dna_col = None
    for label, dd_col, ddd_col in families:
        has_dd = dd_col in columns
        has_ddd = ddd_col in columns
        if has_dd and has_ddd:
            print('Warning: Two ' + label + ' columns -- using ' + "'" + ddd_col + "'")
        ### within a family the DDD column wins over the DD column
        if has_ddd:
            dna_col = ddd_col
        elif has_dd:
            dna_col = dd_col
        if dna_col is not None:
            break
    if dna_col is None:
        _log(columns)
        _log('DNA column not found')
    else:
        _log("'"+dna_col+"'"+" column found -- re-naming as 'dna'")
        col_dict[dna_col] = 'dna'

    ### check for EdU (raw) and EdU (background): DDD preferred over DD
    edu_sources = (
        ('Cell: EdUrawINT (DDD-bckgrnd)', 'Cell: EdUrawINT (DD-bckgrnd)', 'edu_raw'),
        ('Cell: EdUbackground (DDD-bckgrnd)', 'Cell: EdUbackground (DD-bckgrnd)', 'edu_bg'),
    )
    for ddd_col, dd_col, new_name in edu_sources:
        for raw_name in (ddd_col, dd_col):
            if raw_name in columns:
                _log("'" + raw_name + "' column found -- re-naming as '" + new_name + "'")
                col_dict[raw_name] = new_name
                break

    ### re-name data-frame columns (rename returns a new frame, so the
    ### caller's df is not modified by the 'edu' assignment below)
    df = df.rename(columns=col_dict)
    if 'edu_raw' in df.columns and 'edu_bg' in df.columns:
        df['edu'] = df.edu_raw - df.edu_bg
    return df

def read_and_rename_well_data(barcode, well, silent = False, hoechst_as_dna=False):
    """Read one well's csv and return it with columns renamed by rename_df_columns.

    Note that `silent` defaults to False here, so renaming messages are
    printed unless the caller passes silent=True.
    """
    raw = read_well_data(barcode, well)
    return rename_df_columns(raw, silent = silent, hoechst_as_dna=hoechst_as_dna)


## Get all wells with data for a given plate
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
## output:
##    A list of wells, e.g. ['C03', 'C04', ... , 'N22']
def get_all_wells(barcode):
    """Return the sorted list of wells that have a csv file on a plate.

    Every well name from get_well_names("all_wells") is probed for a file
    called '<barcode>.result.<well>[test].csv' in the plate's data directory.
    """
    plate_dir = get_data_dir(barcode)
    found = [
        well
        for well in get_well_names("all_wells")
        if os.path.exists(os.path.join(plate_dir, barcode+".result."+well+"[test].csv"))
    ]
    return sorted(found)

### maybe not necessary for LDR intensity data?
#def read_all_wells(barcode):
#    wells = get_all_wells(barcode)
#    df_list = [read_well_data(barcode, well) for well in wells]
#    return(df_list)

def get_ldr_cutoff(barcode, well, peak_loc = 1.2, silent=False, hoechst_as_dna=False):
    """Compute the LDR cutoff for a single well.

    Reads and renames the well data, runs dcf_int.get_ldrgates on its 'ldr'
    column and returns the element at index 1 of the returned gates (the
    value used as the LDR cutoff throughout this file).
    """
    cells = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
    gates, _lims = dcf_int.get_ldrgates(ldrint = cells['ldr'], peak_loc=peak_loc) ## 1.2 is default
    return gates[1]

def get_ldr_cutoff_many(barcode, wells, peak_loc = 1.2, silent=True, hoechst_as_dna=False):
    """Return the list of LDR cutoffs for `wells`, in the same order as `wells`."""
    cutoffs = []
    for well in wells:
        cutoff = get_ldr_cutoff(barcode, well, peak_loc = peak_loc, silent=silent,
                                hoechst_as_dna=hoechst_as_dna)
        cutoffs.append(cutoff)
    return cutoffs

### note: file location hard-coded, only for wsl on my desktop right now
def load_well_metadata(name = 'meta', folder=None, file='single_timepoint_cleaned_from_raw_2023-06-08.parquet'):
    """Read the well-metadata parquet file and store it as a module global.

    Args:
        name: name of the module-level variable that receives the data frame.
        folder: directory containing the parquet file; defaults to a
            hard-coded local (WSL) path.
        file: parquet file name.
    """
    source_dir = "/mnt/c/Users/NC168/git/LINCS_combos/data/cleaned/" if folder is None else folder
    globals()[name] = pd.read_parquet(os.path.join(source_dir, file))
    
def get_wells(barcode, cell_line):
    """Return the sorted wells of one cell line on one plate, per the metadata table.

    The module-level `meta` data frame is loaded on first use.
    """
    if 'meta' not in globals():
        load_well_metadata()
    ### e.g. " cell_line == 'SUM1315' & barcode == '201117_combo_33' "
    query = "".join(["cell_line == '", cell_line, "' & barcode == '", barcode, "'"])
    return sorted(meta.query(query).well)

def get_cell_lines_on_plate(barcode):
    """Return the unique cell lines recorded in the metadata table for a plate.

    The module-level `meta` data frame is loaded on first use. The result is
    whatever `Series.unique()` returns (values in order of first appearance).
    """
    if 'meta' not in globals():
        load_well_metadata()
    plate_rows = meta.query("".join(["barcode == '", barcode, "'"]))
    return plate_rows.cell_line.unique()

def get_ldr_cutoffs_plate(barcode, peak_loc = 1.2, silent = True):
    """Compute LDR cutoffs for every well of every cell line on a plate.

    Returns the data frame built by get_ldr_cutoffs_cell_line_and_barcode
    for all cell lines listed in the metadata for `barcode`.
    """
    return get_ldr_cutoffs_cell_line_and_barcode(
        barcode,
        get_cell_lines_on_plate(barcode),
        peak_loc=peak_loc,
        silent=silent,
    )

def get_ldr_cutoffs_cell_line_and_barcode(barcode, cell_lines, peak_loc=1.2, silent = True):
    """Build a table of per-well LDR cutoffs for the given cell lines on a plate.

    Args:
        barcode: plate barcode.
        cell_lines: iterable of cell-line names present on the plate.
        peak_loc: passed through to the LDR gating.
        silent: when False, print each cell-line name as it is processed.

    Returns:
        Data frame with columns 'well', 'ldr_cutoff', 'barcode', 'cell_line'
        (one block of rows per cell line, concatenated).
    """
    per_line_frames = []
    for cell_line in cell_lines:
        if not silent:
            print(cell_line)
        line_wells = get_wells(barcode, cell_line)
        cutoffs = get_ldr_cutoff_many(barcode, line_wells, peak_loc=peak_loc)
        per_line_frames.append(pd.DataFrame(data={
            'well': line_wells,
            'ldr_cutoff': cutoffs,
            'barcode': barcode,
            'cell_line': cell_line,
        }))
    return pd.concat(per_line_frames)

## x_lims: tuple of x limits for the plot
## y_lims: tuple of y limits for the plot
def plot_ldr(df, peak_loc = 1.2, scatter = True, silent = True, show_fig = True, 
             fig=None, outer=None, i=None, title = "", x_lims=None, y_lims=None, add_ldr_line = None):
    """Plot the LDR gating for one well-level data frame.

    Args:
        df: renamed well data with 'ldr' and 'dna' columns. The caller's
            frame is NOT modified (a copy is clamped for plotting).
        peak_loc: passed to dcf_int.get_ldrgates.
        scatter: True for the log10(dna) vs log10(ldr) scatter plot, False
            for the 1-D LDR gating plot.
        silent: accepted for interface compatibility; not used here.
        show_fig, fig, outer, i, title, x_lims, y_lims, add_ldr_line:
            forwarded unchanged to mg.plot_ldr_dna_scatter (scatter only).

    Returns:
        Whatever the underlying mg plotting function returns.
    """
    ### the cutoff is computed on the raw (un-clamped) ldr values
    ldr_gates, ldr_lims = dcf_int.get_ldrgates(ldrint = df['ldr'], peak_loc=peak_loc)
    ldr_cutoff = ldr_gates[1]
    ### log10 is undefined for values <= 0: clamp those to the smallest
    ### positive value of the column, on a copy so the input stays untouched
    df = df.copy()
    for col in ('ldr', 'dna'):
        positive = df[col] > 0
        df[col] = df[col].where(positive, df.loc[positive, col].min())
    if scatter:
        fig = mg.plot_ldr_dna_scatter(np.log10(df.dna), np.log10(df.ldr), ldr_cutoff, 
                                      dna_gates=None, plot_ldr_log10=True, is_ldrint=True,
                                      show_fig=show_fig, fig = fig, outer=outer, i=i,
                                      title=title, x_lims=x_lims, y_lims=y_lims, add_ldr_line = add_ldr_line)
    else:
        fig = mg.ldr_gating(np.log10(df.ldr), ldr_cutoff, nbins = 20)
    return fig

def plot_ldr_well(barcode, well, peak_loc = 1.2, scatter = True, silent = True, 
                  show_fig = True, fig=None, outer=None, i=None, title="", x_lims=None, 
                  y_lims=None, hoechst_as_dna=False, add_ldr_line=None):
    """Read one well and plot its LDR gating via plot_ldr.

    All plotting options are forwarded to plot_ldr unchanged; handling of
    non-positive ldr/dna values is left to plot_ldr.

    Returns:
        The figure object returned by plot_ldr.
    """
    well_df = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
    plot_options = dict(
        peak_loc = peak_loc, scatter = scatter, silent = silent,
        show_fig = show_fig, fig = fig, outer = outer, i = i, title = title,
        x_lims = x_lims, y_lims = y_lims, add_ldr_line = add_ldr_line,
    )
    return plot_ldr(well_df, **plot_options)

def plot_ldr_many(barcode, wells, peak_loc = 1.2, scatter = False, silent = True, hoechst_as_dna=False):
    """Plot the LDR gating of each well in `wells`, one plot_ldr call per well.

    Nothing is returned; the figures are produced as a side effect of plot_ldr.
    """
    for well in wells:
        well_df = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
        plot_ldr(well_df, peak_loc = peak_loc, scatter = scatter)

def plot_ldr_pdf(barcode, wells, peak_loc = 1.2, figname = "test_ldr.pdf", scatter = True, 
                 silent = True, show_fig = True, hoechst_as_dna=False):
    """Write one LDR gating figure per well to a multi-page pdf.

    Each page is titled with the well name and the treatment/concentration
    pair taken from the well metadata. Only cells with ldr > 0 are plotted.

    Args:
        barcode: plate barcode.
        wells: wells to plot, in page order.
        peak_loc: passed to dcf_int.get_ldrgates.
        figname: path of the pdf file to write.
        scatter: True for dna-vs-ldr scatter plots, False for 1-D gating plots.
        silent: passed to the well reader.
        show_fig: forwarded to mg.plot_ldr_dna_scatter.
        hoechst_as_dna: passed to the well reader.

    Returns:
        List of the figure objects written (already closed in pyplot).
    """
    fig_list = []
    ### the context manager finalises the pdf even if a well fails part-way
    with PdfPages(figname) as pdf_pages:
        for well in wells:
            well_meta = get_well_meta(barcode, well)
            trt1 = list(well_meta.agent1)[0]
            trt2 = list(well_meta.agent2)[0]
            conc1 = list(well_meta.concentration1_chr)[0]
            conc2 = list(well_meta.concentration2_chr)[0]
            df = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
            ### cutoff uses all cells; non-positive ldr is dropped only for the log plot
            ldr_gates, ldr_lims = dcf_int.get_ldrgates(ldrint = df['ldr'], peak_loc=peak_loc)
            ldr_cutoff = ldr_gates[1]
            df = df.query("ldr > 0")
            if scatter:
                fig = mg.plot_ldr_dna_scatter(np.log10(df.dna), np.log10(df.ldr), ldr_cutoff, dna_gates=None, 
                                              plot_ldr_log10=True, is_ldrint=True, show_fig = show_fig)
            else:
                fig = mg.ldr_gating(np.log10(df.ldr), ldr_cutoff, nbins = 20)
            fig_title = str(trt1) + ": "+ str(conc1) + " uM, " + str(trt2) + ": " + str(conc2) + " uM"
            fig.suptitle(well + "\n" + fig_title, fontsize=12)
            fig_list.append(fig)
            ### save the page first, then release the pyplot figure
            pdf_pages.savefig(fig)
            plt.close()
    return fig_list

def test_regate(barcode, cell_line, peak_loc = 1.2, figname = "test_figure", scatter = True, silent = True, test = True, 
                show_fig = False, hoechst_as_dna=False):
    """Trial re-gating of one cell line on one plate, written under ./temp_regating.

    Writes the per-well gating plots to temp_regating/pdf/<figname>.pdf and
    the per-well counts (from dcf_int.get_counts_df) to
    temp_regating/csv/<figname>.csv. The `test` argument is not used.

    Returns:
        Tuple (df_out, plot_list): concatenated counts and the list of figures.
    """
    csv_dir = os.path.join('temp_regating', 'csv')
    pdf_dir = os.path.join('temp_regating', 'pdf')
    for out_dir in (csv_dir, pdf_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
    wells = get_wells(barcode, cell_line)
    csv_file = os.path.join(csv_dir, figname+'.csv')
    pdf_file = os.path.join(pdf_dir, figname+'.pdf')
    plot_list = plot_ldr_pdf(barcode, wells, peak_loc, figname=pdf_file, scatter=scatter, silent=silent, 
                             show_fig=show_fig, hoechst_as_dna=hoechst_as_dna)
    print('figures written to: ' + pdf_file)
    count_frames = []
    for well in wells:
        well_df = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
        count_frames.append(
            dcf_int.get_counts_df(df=well_df, barcode=barcode, well=well, peak_loc = peak_loc))
    df_out = pd.concat(count_frames)
    df_out.to_csv(csv_file)
    return (df_out, plot_list)

def plot_wells_ldr(barcode, cell_line, peak_loc=1.2, scatter = True, silent=True,
                   figname = None, output_dir="default_gating", hoechst_as_dna=False):
    """Plot LDR gating for every well of a cell line on a plate (6 plots per pdf page).

    All plots share axis limits computed from the pooled positive ldr/dna
    values of the plate. Also writes the per-well counts to csv.

    Outputs (inside `output_dir`, which is created if missing):
        all_wells_<figname>.csv          per-well counts from dcf_int.get_counts_df
        all_wells_scatter_<figname>.pdf  gating plots, 3 x 2 grid per page

    Args:
        barcode: plate barcode.
        cell_line: cell line whose wells are plotted.
        peak_loc: passed to the LDR gating.
        scatter: True for dna-vs-ldr scatter plots.
        silent: suppress progress printing (including the y-limits printout).
        figname: file-name stem; defaults to '<barcode>_<cell_line>_peak_loc_<peak_loc>'.
        output_dir: directory for the csv and pdf.
        hoechst_as_dna: passed to the well reader.

    Returns:
        [df2, fig_list]: the counts data frame and the list of objects
        returned by plot_ldr_well (one per well).
    """
    if figname is None: figname=barcode+'_'+cell_line+'_'+'peak_loc_'+str(peak_loc)
    if not 'meta' in globals(): load_well_metadata()
    wells = get_wells(barcode, cell_line)
    df_list = []
    df_full_list = []
    for well in wells:
        df = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
        df_list.append(dcf_int.get_counts_df(df=df, barcode=barcode, well=well, peak_loc = peak_loc))
        df_full_list.append(df)
    df2 = pd.concat(df_list)
    df_full = pd.concat(df_full_list)
    ### shared axis limits from the pooled positive values (log10 scale)
    y_log = np.log10(df_full.query("ldr > 0").ldr)
    y_lims = (min(y_log), max(y_log))
    if not silent: print(y_lims)
    x_log = np.log10(df_full.query("dna > 0").dna)
    x_lims = (min(x_log)-0.2, max(x_log)+0.2)
    ### save counts data frame to csv
    os.makedirs(output_dir, exist_ok=True)
    csv1_full = os.path.join(output_dir, "all_wells_" + figname + ".csv")
    df2.to_csv(csv1_full)
    ### plot all wells, several per page
    fig_list = []
    pdf_full = os.path.join(output_dir, "all_wells_scatter_" + figname + ".pdf")
    nb_plots = len(wells)
    plots_per_page = 6
    ### the context manager finalises the pdf even if a well fails part-way
    with PdfPages(pdf_full) as pdf_pages:
        for i, well in enumerate(wells):
            i_page = i % plots_per_page
            if i_page == 0:
                ### start a new letter-size page with a 3 x 2 grid
                fig = plt.figure(figsize=(8.5, 11))
                outer = GridSpec(3, 2, wspace=0.2, hspace=0.5)
            ### get well metadata for the plot title
            well_meta = get_well_meta(barcode, well)
            trt1 = list(well_meta.agent1)[0]
            trt2 = list(well_meta.agent2)[0]
            conc1 = list(well_meta.concentration1_chr)[0]
            conc2 = list(well_meta.concentration2_chr)[0]
            fig_title = str(trt1) + ": "+ str(conc1) + " uM, " + str(trt2) + ": " + str(conc2) + " uM"
            fig_title = well+", peak_loc = "+str(peak_loc)+"\n"+fig_title
            ### plot_ldr_well reads the well data itself, so no separate read here
            fig_tmp = plot_ldr_well(barcode, well, peak_loc = peak_loc, scatter = scatter, 
                                    silent = silent, show_fig = False, fig = fig, outer = outer, i = i_page,
                                    title = fig_title, x_lims=x_lims, y_lims=y_lims, hoechst_as_dna=hoechst_as_dna)
            fig_list.append(fig_tmp)
            ### flush the page when it is full or after the last well
            if (i + 1) % plots_per_page == 0 or (i + 1) == nb_plots:
                plt.tight_layout()
                pdf_pages.savefig()
                plt.close('all')
    return [df2, fig_list]

def get_well_meta(barcode, well):
    """Return the metadata rows for one well of one plate.

    The module-level `meta` data frame is loaded on first use, matching
    get_wells and get_cell_lines_on_plate (previously a direct call before
    the metadata was loaded failed with a NameError).

    Args:
        barcode: plate barcode, e.g. '210406_combo_71'.
        well: well name, e.g. 'D06'.

    Returns:
        The subset of `meta` matching the barcode and well.
    """
    if 'meta' not in globals(): load_well_metadata()
    query = "barcode == '"+ barcode+ "' & well == '"+ well + "'"
    return meta.query(query)

def plot_ldr_cutoff_change(barcode, cell_line, peak_loc, scatter = True, silent=True, 
                           default_peak_loc = 1.2, figname = None,
                           ### alternative: "/mnt/y/lsp-analysis/LINCS-combinations/re_gating/new_gating"
                           output_dir="temp_regating",
                           hoechst_as_dna=False):
    """Compare LDR gating with a new peak_loc against the default peak_loc.

    For every well of `cell_line` on `barcode`, counts are computed with
    both peak_loc values and joined with the well metadata.

    Outputs (inside `output_dir`, which is created if missing):
        <figname>_summary.pdf        new-vs-original scatter plots of the
                                     LDR cutoff, dead count and cell count
        <figname>_wells_changed.pdf  original (left) and new (right) gating
                                     plots for each well whose cutoff
                                     changed, 3 wells per page
        <figname>_all_wells.csv      joined counts for all wells
        <figname>_wells_changed.csv  joined counts for changed wells only

    Args:
        barcode: plate barcode.
        cell_line: cell line whose wells are compared.
        peak_loc: new peak_loc to evaluate.
        scatter: True for dna-vs-ldr scatter plots in the per-well pdf.
        silent: suppress progress printing.
        default_peak_loc: the baseline peak_loc.
        figname: file-name stem; defaults to '<barcode>_<cell_line>_peak_loc_<peak_loc>'.
        output_dir: directory for all outputs.
        hoechst_as_dna: passed to the well reader.

    Returns:
        [df2, df2_sub, gg, fig_list_orig, fig_list_new]
    """
    if figname is None: figname=barcode+'_'+cell_line+'_'+'peak_loc_'+str(peak_loc)
    if not 'meta' in globals(): load_well_metadata()
    wells = get_wells(barcode, cell_line)
    df_list_orig = []
    df_list_new = []
    for well in wells:
        df = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
        df_list_new.append(dcf_int.get_counts_df(df=df, barcode=barcode, well=well, peak_loc = peak_loc))
        df_list_orig.append(dcf_int.get_counts_df(df=df, barcode=barcode, well=well, peak_loc = default_peak_loc))
    df_orig = pd.concat(df_list_orig)
    df_new = pd.concat(df_list_new)
    ### add suffixes to measured columns in each data frame
    df_orig2 = df_orig.rename(columns={c: c+'_orig' for c in df_orig.columns if c not in ['barcode', 'well']})
    df_new2 = df_new.rename(columns={c: c+'_new' for c in df_new.columns if c not in ['barcode', 'well']})
    ### join the data frames
    df2 = df_orig2.merge(df_new2, on = ['barcode', 'well'], how = 'inner')
    meta_select = meta[['barcode', 'cell_line', 'well', 'agent1', 'concentration1_chr', 'agent2', 'concentration2_chr', 'timepoint']]
    df2 = df2.merge(meta_select, on = ['barcode', 'well'], how = 'left')
    ### put the key comparison columns first, everything else after
    cols = ['barcode', 'well', 'cell_count__dead_orig', 'cell_count__dead_new', 'cell_count_orig', 'cell_count_new', 
            'ldr_cutoff_orig', 'ldr_cutoff_new', 'cell_line','agent1', 'concentration1_chr', 'agent2', 
            'concentration2_chr', 'timepoint']
    last_cols = [ x for x in df2.columns if x not in cols ]
    cols.extend(last_cols)
    df2 = df2[cols]
    ### label only the wells whose cutoff changed
    df2['label'] = df2.apply(lambda row: row.well if row.ldr_cutoff_orig != row.ldr_cutoff_new else "", axis=1)
    ### fresh 0..n-1 index, without modifying a query result in place
    df2_sub = df2.query("ldr_cutoff_orig != ldr_cutoff_new").reset_index(drop=True)
    os.makedirs(output_dir, exist_ok=True)
    ### plot new vs. old cutoffs, live/dead counts
    gg1 = pw.load_ggplot(ggplot(df2, aes(x = "ldr_cutoff_orig", y = "ldr_cutoff_new")) +\
        geom_point(alpha = 0.5) +\
        geom_label(aes(label="label"), alpha = 0.5, nudge_x = 0.05, nudge_y = 0.05), figsize=(3,3))
    gg2 = pw.load_ggplot(ggplot(df2, aes(x = "cell_count__dead_orig", y = "cell_count__dead_new")) +\
        geom_point(alpha = 0.5) +\
        geom_label(aes(label="label"), alpha = 0.5, nudge_x = 20, nudge_y = 5), figsize=(3,3))
    gg3 = pw.load_ggplot(ggplot(df2, aes(x = "cell_count_orig", y = "cell_count_new")) +\
        geom_point(alpha = 0.5) +\
        geom_label(aes(label="label"), alpha = 0.5, nudge_x = 100, nudge_y = 100), figsize=(3,3))

    gg = (gg1|gg2|gg3)
    pdf1 = figname + str("_summary.pdf")
    pdf1_full = os.path.join(output_dir, pdf1)
    gg.savefig(pdf1_full)

    ### plot wells that changed
    fig_list_orig = []
    fig_list_new = []
    pdf2 = figname + str("_wells_changed.pdf")
    pdf2_full = os.path.join(output_dir, pdf2)
    changed_wells = list(df2_sub.well)
    nb_rows = len(changed_wells)
    rows_per_page = 3
    ### the context manager finalises the pdf even if a well fails part-way
    with PdfPages(pdf2_full) as pdf_pages:
        for i, well in enumerate(changed_wells):
            i_page = i % rows_per_page
            if i_page == 0:
                ### new letter-size page: one row per well, original | new
                fig = plt.figure(figsize=(8.5, 11))
                outer = GridSpec(3, 2, wspace=0.2, hspace=0.5)
            ### get well metadata for the plot titles
            well_meta = get_well_meta(barcode, well)
            trt1 = list(well_meta.agent1)[0]
            trt2 = list(well_meta.agent2)[0]
            conc1 = list(well_meta.concentration1_chr)[0]
            conc2 = list(well_meta.concentration2_chr)[0]
            fig_title = str(trt1) + ": "+ str(conc1) + " uM, " + str(trt2) + ": " + str(conc2) + " uM"
            fig_title_orig = well+", peak_loc = "+str(default_peak_loc)+" (default)"+"\n"+fig_title
            fig_title_new = well+", peak_loc = "+str(peak_loc)+"\n"+fig_title
            ### make figures for old and new peak_loc values; plot_ldr_well
            ### reads the well data itself, so no separate read is needed here
            fig_orig = plot_ldr_well(barcode, well, peak_loc = default_peak_loc, scatter = scatter, 
                                     silent = silent, show_fig = False, fig = fig, outer = outer, i = 2*i_page,
                                     title = fig_title_orig, hoechst_as_dna=hoechst_as_dna)
            fig_new = plot_ldr_well(barcode, well, peak_loc = peak_loc, scatter = scatter, 
                                    silent = silent, show_fig = False, fig = fig, outer = outer, i = 2*i_page+1,
                                    title = fig_title_new, hoechst_as_dna=hoechst_as_dna)
            fig_list_orig.append(fig_orig)
            fig_list_new.append(fig_new)
            ### flush the page when it is full or after the last well
            if (i + 1) % rows_per_page == 0 or (i + 1) == nb_rows:
                plt.tight_layout()
                pdf_pages.savefig()
                plt.close('all')
    ### write data frames to csv files
    # write cell counts for all wells
    csv1 = figname + str("_all_wells.csv")
    csv1_full = os.path.join(output_dir, csv1)
    df2.to_csv(csv1_full)
    # write cell counts for only wells where counts changed
    csv2 = figname + str("_wells_changed.csv")
    csv2_full = os.path.join(output_dir, csv2)
    df2_sub.to_csv(csv2_full)

    return [df2, df2_sub, gg, fig_list_orig, fig_list_new]

### plot LDR cutoffs
def plot_flagged_wells_ldr(barcode, cell_line, well_df, figname = None, peak_loc = 1.2, output_dir="default_gating", write_pdf=True,
                          hoechst_as_dna=False):
    """Box-plot the LDR cutoffs of flagged vs. un-flagged wells for one plate/cell line.

    Outputs (inside `output_dir`, which is created if missing):
        flagged_wells_<figname>.csv       the rows of `well_df` for this
                                          plate and cell line
        boxplot_ldr_cutoff_<figname>.pdf  the box plot (only if write_pdf)

    Args:
        barcode: plate barcode.
        cell_line: cell line of interest.
        well_df: data frame of flagged wells with at least 'cell_line',
            'barcode' and 'well' columns.
        figname: file-name stem; defaults to '<barcode>_<cell_line>_peak_loc_<peak_loc>'.
        peak_loc: passed to dcf_int.get_counts_df.
        output_dir: directory for the csv and pdf.
        write_pdf: whether to save the box plot.
        hoechst_as_dna: passed to the well reader.

    Returns:
        Tuple (df, gg): per-well counts with a 'flagged' column, and the plot.
    """
    if figname is None: figname=barcode+'_'+cell_line+'_'+'peak_loc_'+str(peak_loc)
    query = "cell_line == '"+cell_line+"' & barcode == '"+barcode+"'"
    well_df = well_df.query(query)
    wells = get_wells(barcode, cell_line)
    df_list = []
    for well in wells:
        df = read_and_rename_well_data(barcode, well, silent=True,hoechst_as_dna=hoechst_as_dna)
        df_list.append(dcf_int.get_counts_df(df=df, barcode=barcode, well=well, peak_loc = peak_loc))
    df = pd.concat(df_list)
    ### build the lookup once instead of re-listing well_df.well for every row
    flagged_wells = set(well_df.well)
    df['flagged'] = ["flagged" if x in flagged_wells else "not_flagged" for x in df.well]
    gg = ggplot(df, aes(x = 'flagged', y = 'ldr_cutoff')) + geom_boxplot() + geom_jitter()

    os.makedirs(output_dir, exist_ok=True)
    csv_full = os.path.join(output_dir, "flagged_wells_" + figname + ".csv")
    pdf_full = os.path.join(output_dir, "boxplot_ldr_cutoff_" + figname + ".pdf")

    if write_pdf: gg.save(pdf_full, format = "pdf", width = 2.5, height = 3)
    well_df.to_csv(csv_full)
    return (df, gg)
    
def plot_problem_plate(barcode, cell_line, peak_loc=1.2, df_wells=None, scatter=True, silent=True, output_dir="default_gating",
                      hoechst_as_dna=False):
    """Write the default-gating diagnostics for one problem plate/cell line.

    Everything goes to <output_dir>/<cell_line>_<barcode>/ (created if
    needed): the all-wells gating plots and counts from plot_wells_ldr and,
    when `df_wells` is supplied, the flagged-vs-unflagged cutoff box plot
    from plot_flagged_wells_ldr.
    """
    plate_dir = os.path.join(output_dir, cell_line + "_" + barcode)
    if not os.path.exists(plate_dir):
        os.makedirs(plate_dir)
    ### ldr vs. dna scatterplots for all wells
    plot_wells_ldr(barcode, cell_line, peak_loc=peak_loc, scatter = scatter, silent=silent,
                   figname = None, output_dir=plate_dir, hoechst_as_dna=hoechst_as_dna)
    if df_wells is None:
        return
    ### ldr cutoffs for flagged vs. unflagged wells
    plot_flagged_wells_ldr(barcode, cell_line, df_wells, output_dir = plate_dir, write_pdf=True,
                           hoechst_as_dna=hoechst_as_dna)

def plot_all_problem_plates(peak_loc=1.2, scatter=True, silent=True, output_dir = "default_gating", hoechst_as_dna=False):
    """Run plot_problem_plate for every plate/cell line listed for re-gating.

    The plate and flagged-well lists come from get_plates_to_regate; a
    progress line is printed for each plate.
    """
    df_plates, df_wells = get_plates_to_regate()
    barcodes = list(df_plates.barcode)
    cell_lines = list(df_plates.cell_line)
    print("plotting LDR for " + str(len(barcodes)) + " cell lines/plates")
    for idx, (barcode, cell_line) in enumerate(zip(barcodes, cell_lines)):
        print("Plate " + str(idx) + ": "+ cell_line + " " + barcode)
        plot_problem_plate(barcode, cell_line, peak_loc=peak_loc, df_wells=df_wells, scatter=scatter,
                           silent=silent, output_dir=output_dir, hoechst_as_dna=hoechst_as_dna)
        

### Todo:
## 1) def plot_problem_plate(): 
        ## steps:
        ## 1) call plot_wells_ldr to create plots of all wells for a problem plate-- write to pdf
        ## 2) load problem well metadata, create box plot of LDR cutoffs for "flagged wells" vs. "unflagged wells" -- print to pdf
        ## 3) save both (plus a csv, already written by the plot_wells_ldr function) to "default_gating/"
                               
## 2) loop over all bad plates/cell lines and plot LDR gating with default options
    ## for each, look at box plots and well-level scatter plots and pick a cutoff in-between the "flagged" and "non-flagged" ldr cutoffs.
    ## put new cutoffs manually into a dict/dataframe

## 3) call plot_LDR_cutoff_change with the new LDR cutoffs to write new plots and data to files

## Define the re-gating settings for each problem plate/cell line (barcode, cell_line, peak_loc, hoechst_as_dna)
def define_regating_df(name = 'regate_df'):
    """Build the table of plates to re-gate and store it as a module-level global.

    One row per plate with columns `barcode`, `cell_line`, `peak_loc` and
    `hoechst_as_dna`. The DataFrame is written to globals() under `name`;
    nothing is returned.
    """
    columns = ('barcode', 'cell_line', 'peak_loc', 'hoechst_as_dna')
    plates = [
        ### Time zero plates
        ## note: plate 62 gating is fine with default peak_loc=1.2 on all but one well -- not sure why it was bad before
        ('210406_combo_62', 'SUM1315', 1.2, False),
        ('210423_combo_78', 'SUM185PE', 2, False),
        ### End-time plates
        ### HCC1937 plates: using the Hoechst column as dna fixes the dead count
        ### note: wells I06 and J07 -- almost 500 dead cells dead sub-g1, only ~50 LDR positive -- dna gating issue, not LDR gating?
        ('210226_combo_51', 'HCC1937', 1.2, True),
        ### note: only well I11 -- almost 500 dead cells dead sub-g1, only ~50 LDR positive -- dna gating issue, not LDR gating?
        ### note: well I18 -- high subg1 dead cells
        ('210226_combo_52', 'HCC1937', 1.2, True),
        ('210226_combo_53', 'HCC1937', 1.2, True),
        ('210226_combo_54', 'HCC1937', 1.2, True),
        ('210226_combo_55', 'HCC1937', 1.2, True),
        ('210226_combo_56', 'HCC1937', 1.2, True),
        ('210226_combo_57', 'HCC1937', 1.2, True),
        ('210302_combo_59', 'HCC1937', 1.2, True),
        ('210302_combo_60', 'HCC1937', 1.2, True),
        ('210302_combo_61', 'HCC1937', 1.2, True),
        ### SUM1315 plates 69-77: regating w/ default values gives low dead count
        ('210406_combo_69', 'SUM1315', 1.2, False),
        ('210406_combo_70', 'SUM1315', 1.2, False),
        ('210406_combo_71', 'SUM1315', 1.2, False),
        ### plate 72: E11 is the only control well that looks bad after regating -- will be solved by trimmed mean
        ### notes: a few non-control wells look wrong -- E05, possibly E07, F05, F19, etc.
        ### notes: a wide range of LDR cutoffs -- from 2 to 4 -- good cutoff looks like around 3 to 3.25
        ('210406_combo_72', 'SUM1315', 1.2, False),
        ('210406_combo_73', 'SUM1315', 1.2, False),
        ('210406_combo_74', 'SUM1315', 1.2, False),
        ### plate 75: E11 is the only control well that looks bad -- will be solved by trimmed mean
        ('210406_combo_75', 'SUM1315', 1.2, False),
        ('210406_combo_76', 'SUM1315', 1.2, False),
        ('210406_combo_77', 'SUM1315', 1.2, False),
        ('211005_combo_158', 'SUM1315', 2.75, False),
        ('211005_combo_160', 'SUM1315', 2.75, False),
        ('211005_combo_161', 'SUM1315', 2.75, False),
        ('211005_combo_162', 'SUM1315', 2.75, False),
        ('211005_combo_163', 'SUM1315', 2.75, False),
        ('211005_combo_164', 'SUM1315', 2.75, False),
        ('211005_combo_165', 'SUM1315', 2.75, False),
        ('211005_combo_166', 'SUM1315', 2.75, False),
        ('211015_combo_168', 'SUM1315', 2.75, False),
        ('211015_combo_169', 'SUM1315', 2.75, False),
        ('211015_combo_170', 'SUM1315', 2.75, False),
        ('211015_combo_171', 'SUM1315', 2.75, False),
        ('211015_combo_172', 'SUM1315', 2.75, False),
        ('211015_combo_173', 'SUM1315', 2.75, False),
        ('211015_combo_174', 'SUM1315', 2.75, False),
        ('211015_combo_175', 'SUM1315', 2.75, False),
        ('211015_combo_176', 'SUM1315', 2.75, False),
    ]
    # Expand each row tuple into a record keyed by column name.
    records = [dict(zip(columns, plate)) for plate in plates]
    globals()[name] = pd.DataFrame(records)

def gate_well(barcode, well, peak_loc=1.2, silent=False, hoechst_as_dna=False):
    """Read one well's data and return the counts table computed for it.

    The well is loaded with read_and_rename_well_data() (forwarding `silent`
    and `hoechst_as_dna`) and passed to dcf_int.get_counts_df() together with
    `barcode`, `well` and `peak_loc`; that function's result is returned as is.
    """
    well_data = read_and_rename_well_data(barcode, well, silent, hoechst_as_dna=hoechst_as_dna)
    return dcf_int.get_counts_df(df=well_data, barcode=barcode, well=well, peak_loc=peak_loc)

def regate_wells(silent=True):
    """Re-gate every well of every plate listed in the global `regate_df`.

    For each row of `regate_df` (columns `barcode`, `cell_line`, `peak_loc`,
    `hoechst_as_dna`) the plate's wells are obtained from get_wells() and each
    well is gated with gate_well() using that row's settings.

    Args:
        silent: forwarded to gate_well().

    Returns:
        DataFrame: row-wise concatenation of the per-well frames, plate by plate.

    Raises:
        ValueError: raised by pd.concat when `regate_df` is empty or a plate
            has no wells (unchanged from the previous behaviour).
    """
    # The metadata check is loop-invariant, so do it once up front.
    if 'meta' not in globals(): load_well_metadata()
    # Iterate over the columns positionally: label lookups such as
    # regate_df['barcode'][i] raise KeyError as soon as regate_df has been
    # filtered or re-ordered and no longer carries a 0..n-1 index.
    plate_rows = zip(
        regate_df['barcode'],
        regate_df['cell_line'],
        regate_df['peak_loc'],
        regate_df['hoechst_as_dna'],
    )
    plate_frames = []
    for i, (barcode, cell_line, peak_loc, hoechst_as_dna) in enumerate(plate_rows):
        print(i)
        well_frames = [
            gate_well(barcode, well, peak_loc=peak_loc, silent=silent, hoechst_as_dna=hoechst_as_dna)
            for well in get_wells(barcode, cell_line)
        ]
        plate_frames.append(pd.concat(well_frames))
    return pd.concat(plate_frames)

def get_ldr_cutoffs_all(peak_loc = 1.2):
    """Collect LDR cutoffs for every plate of every date in `folder_dict`.

    For each date key of the global `folder_dict`, the plate barcodes come
    from get_barcodes(date) and each plate's cutoffs from
    get_ldr_cutoffs_plate(plate, peak_loc=peak_loc). Dates and plates are
    printed as they are processed. Returns the concatenation of all
    per-plate frames (grouped per date first, then across dates).
    """
    if 'folder_dict' not in globals(): define_folder_dict('folder_dict')
    frames_by_date = []
    for date in folder_dict:
        print(date)
        frames_for_date = []
        for plate in get_barcodes(date):
            print(plate)
            frames_for_date.append(get_ldr_cutoffs_plate(plate, peak_loc = peak_loc))
        frames_by_date.append(pd.concat(frames_for_date))
    return pd.concat(frames_by_date)

def get_ldr_cutoffs_fast(peak_loc=1.2):
    """Compute the LDR cutoff of every well in the global `meta`, serially.

    Args:
        peak_loc: forwarded to get_ldr_cutoff() for every well. (Previously
            the argument was accepted but ignored and 1.2 was always used.)

    Returns:
        list: one get_ldr_cutoff() result per row of `meta`, in row order.
    """
    if 'meta' not in globals(): load_well_metadata()
    cutoffs = []
    # zip over the columns is positional, so it does not depend on `meta`
    # having a default 0..n-1 index.
    for i, (barcode, well) in enumerate(zip(meta.barcode, meta.well)):
        if i % 1000 == 0: print(i)  # coarse progress indicator
        cutoffs.append(get_ldr_cutoff(barcode, well, peak_loc = peak_loc, silent=True))
    return cutoffs

def get_ldr_cutoff_i(i, peak_loc=1.2):
    """Return the LDR cutoff for the well in row position `i` of the global `meta`.

    Thin wrapper around get_ldr_cutoff() taking a row position, so it can be
    mapped over a range of integers (see get_ldr_cutoffs_parallel).

    Args:
        i: 0-based row position in `meta`.
        peak_loc: forwarded to get_ldr_cutoff(). (Previously ignored: 1.2 was
            hard-coded.)
    """
    # .iloc keeps `i` positional even if `meta` has a non-default index.
    return get_ldr_cutoff(meta.barcode.iloc[i], meta.well.iloc[i], peak_loc = peak_loc, silent=True)

def get_ldr_cutoffs_parallel(peak_loc=1.2, nproc = 10, batch = 1000):
    """Compute the LDR cutoff of every well in the global `meta` using worker processes.

    Rows of `meta` are processed in consecutive batches; the batch number and
    the elapsed seconds per batch are printed as progress output.

    Args:
        peak_loc: forwarded to get_ldr_cutoff_i() in every worker.
            (Previously it was never passed on, so workers always used the
            default.)
        nproc: number of worker processes per pool.
        batch: number of rows of `meta` handled per batch.

    Returns:
        list: one cutoff per row of `meta`, in row order.
    """
    from functools import partial

    if 'meta' not in globals(): load_well_metadata()
    n_total = meta.shape[0]
    # Bind peak_loc so pool.map can still call the worker with a single index.
    worker = partial(get_ldr_cutoff_i, peak_loc=peak_loc)
    cutoffs = []
    for i, start_batch in enumerate(range(0, n_total, batch)):
        print(i)
        tic = time.time()
        end_batch = min(start_batch + batch, n_total)
        # One pool per batch, as before, but the context manager now shuts
        # the workers down instead of leaving a pool behind for every batch.
        with multiprocessing.Pool(nproc) as pool:
            cutoffs.extend(pool.map(worker, range(start_batch, end_batch)))
        toc = time.time()
        print(str(toc-tic))
    return cutoffs

# Populate the module-level globals that the functions above look up by name:
# `meta` (well metadata), `regate_df` (plates to re-gate) and `folder_dict`.
load_well_metadata('meta')
define_regating_df('regate_df')
define_folder_dict('folder_dict')

In [None]:
# Compute one LDR cutoff per row of `meta` and attach it as a new column.
cutoffs = get_ldr_cutoffs_parallel()
# Work on a copy: a plain `meta_ldr = meta` only aliases the shared metadata
# table, so adding the column would silently modify `meta` as well.
meta_ldr = meta.copy()
meta_ldr['ldr_cutoff'] = cutoffs
# Un-comment to (re)generate the file read by the mis-gating check below.
#meta_ldr.to_csv("meta_with_ldr.csv")

In [None]:
### find possibly mis-gated wells
# A well is flagged when its LDR cutoff lies more than 3 standard deviations
# above (df_high) or below (df_low) the median cutoff of its own
# (cell_line, barcode) group.
df = pd.read_csv('meta_with_ldr.csv')
# Per-plate median and standard deviation. The string name 'std' selects the
# same pandas groupby std that passing np.std was mapped to, without relying
# on the deprecated callable-to-method translation.
df_join = df.groupby(['cell_line', 'barcode'])['ldr_cutoff'].agg(['median', 'std'])
df_join['high_cutoff'] = df_join['median'] + 3*df_join['std']
df_join['low_cutoff'] = df_join['median'] - 3*df_join['std']
# Attach the per-plate statistics to every well (df_join is indexed by
# cell_line/barcode, which merge matches against the columns of df).
df2 = df.merge(df_join, on = ['cell_line', 'barcode'], how = 'left')
df_high = df2.query('ldr_cutoff > high_cutoff')
df_low = df2.query('ldr_cutoff < low_cutoff')
# Display only; df_low itself keeps its original index.
df_low.reset_index()

In [None]:
# Write one LDR plot per flagged well in df_high to a multi-page PDF,
# six wells (3 x 2 grid) per page.
df = df_high
output_dir = ""
pdf = "possible_misgated_test.pdf"
pdf_full = os.path.join(output_dir, pdf)
nb_plots = len(df.well)
plots_per_page = 6
print(nb_plots)
# The context manager closes (finalises) the PDF even if plotting a well raises.
with PdfPages(pdf_full) as pdf_pages:
    for i in range(nb_plots):
        print(i)
        row = df.iloc[i]  # positional lookup, done once per well
        barcode = row.barcode
        cell_line = row.cell_line
        well = row.well
        median_ldr = row['median']
        ldr_high = row.high_cutoff
        agent1 = row.agent1
        agent2 = row.agent2
        conc1 = row.concentration1_chr
        conc2 = row.concentration2_chr
        fig_title = (
            f"{well!s} {barcode!s} {cell_line!s}\n"
            f"{agent1!s} {conc1!s} uM {agent2!s} {conc2!s} uM \n"
            f"Median LDR: {round(median_ldr, 2)!s} LDR cutoff high: {round(ldr_high, 2)!s}"
        )
        # Start a new page (figure + 3x2 grid) every `plots_per_page` wells.
        if i % plots_per_page == 0:
            fig = plt.figure(figsize=(8.5, 11))
            outer = GridSpec(3, 2, wspace=0.2, hspace=0.5)
        i_page = i % plots_per_page
        ### make figures
        plot_ldr_well(barcode, well, add_ldr_line=median_ldr, fig=fig, outer=outer, i=i_page,
                      silent=True, show_fig=False, scatter=True, title=fig_title)
        # Flush the page when it is full or when this was the last well.
        if (i + 1) % plots_per_page == 0 or (i + 1) == nb_plots:
            plt.tight_layout()
            pdf_pages.savefig()
            plt.close('all')

In [None]:
# Display the barcode left in the notebook namespace by the previously run cell.
barcode

In [None]:
#ggplot(meta_ldr, aes(x = 'date', y = 'ldr_cutoff')) + geom_boxplot() + coord_flip() + facet_wrap('cell_line', scales = "free_y") +  theme(figure_size = (10, 10))

In [None]:
#ggplot(meta_ldr.query('cell_line == "BT20"'), aes(x = 'barcode', y = 'ldr_cutoff')) + geom_boxplot() + geom_jitter() + coord_flip() + theme(figure_size = (5, 15))

In [None]:
# Plot every plate returned by get_plates_to_regate() with the default settings
# (peak_loc=1.2, output_dir="default_gating").
plot_all_problem_plates()

In [None]:
# Re-gate all plates listed in `regate_df` and save the resulting table to CSV.
df_new_gating = regate_wells()
df_new_gating.to_csv('regating_counts_07_10_2023.csv')

In [None]:
# LDR cutoffs for a single plate, using the default peak location.
df1 = get_ldr_cutoffs_plate("211029_combo_180", peak_loc = 1.2)

In [None]:
# Box plot (with jittered points) of the plate's LDR cutoffs, one box per cell line.
ggplot(df1, aes(x = 'cell_line', y = 'ldr_cutoff')) + geom_boxplot() + geom_jitter()

In [None]:
# Scratch comparison on a single well: where do the local minima of the
# log10(LDR) density fall (a) on the positive values as read, and (b) after
# non-positive LDR/DNA values have been replaced by the smallest positive value?
barcode = "211029_combo_180"
cell_line = "SUM1315"
well = "G06"
#plot_wells_ldr(barcode, cell_line, output_dir = "")
#plot_ldr_well(barcode, well)

######## plot peaks w/ normal algorithm
df = read_and_rename_well_data(barcode, well, silent=True)
ldrint = df['ldr']
ldrint = ldrint[ldrint > 0]  # keep positive values only before taking log10
logint = np.log10(ldrint)

import seaborn as sns
fig, ax = plt.subplots()
# Draw the KDE and read the plotted curve back as (x, y) arrays.
x, y = sns.kdeplot(logint, ax=ax).get_lines()[0].get_data()

# Peaks of -y are the local minima (valleys) of the density curve.
peak_locs, _ = find_peaks(-y)
#print(peak_locs)
cc = x[peak_locs]  # x positions of the minima
print(cc)

###### plot peaks w/ negative ldr values mapped to minimum positive ldr
df = read_and_rename_well_data(barcode, well, silent=True)

# Smallest positive value of each channel, used as the replacement value.
df_pos1 = df.query("ldr > 0")
min_ldr = np.min(df_pos1.ldr)
df_pos2 = df.query("dna > 0")
min_dna = np.min(df_pos2.dna)
# Every value that is not > 0 is replaced by the channel's smallest positive value.
df['ldr'] = [x if x>0 else min_ldr for x in df.ldr]
df['dna'] = [x if x>0 else min_dna for x in df.dna]

ldrint = df['ldr']
ldrint = ldrint[ldrint > 0]
logint = np.log10(ldrint)

import seaborn as sns
fig, ax = plt.subplots()
x, y = sns.kdeplot(logint, ax=ax).get_lines()[0].get_data()

# Same minima search as above, now on the value-replaced data.
peak_locs, _ = find_peaks(-y)
#print(peak_locs)
cc = x[peak_locs]
print(cc)

In [None]:
# Effect of KDE smoothing on the LDR cutoff of one well: the same
# minima-based cutoff is computed and drawn for three bandwidth adjustments.
barcode = "211029_combo_180"
cell_line = "SUM1315"
peak_loc = 1.2
well = "C20"  # other wells looked at with this cell: "G06", "C19"


######## plot peaks w/ smoothing
df = read_and_rename_well_data(barcode, well, silent=True)
ldrint = df['ldr']
ldrint = ldrint[ldrint > 0]
logint = np.log10(ldrint)

import seaborn as sns
for bw_adjust in (1, 1.5, 2):
    fig, ax = plt.subplots()
    kde_line = sns.kdeplot(logint, ax=ax, bw_adjust=bw_adjust).get_lines()[0]
    x, y = kde_line.get_data()
    # Peaks of -y are the minima of the density; `test` holds the peak
    # properties (prominences, widths) reported by find_peaks.
    peak_locs, test = find_peaks(-y, prominence=0, width=0)
    cc = x[peak_locs]
    minima_above_peak = cc[cc > peak_loc]
    try:
        # First density minimum to the right of peak_loc ...
        ldr_cutoff = minima_above_peak[0]
    except IndexError:
        # ... or the 99th percentile when there is no such minimum.
        ldr_cutoff = np.quantile(logint, 0.99)
    plt.axvline(x=ldr_cutoff, ls = "--", color = "red")
    print(cc)
    print(test)
    print(ldr_cutoff)

In [None]:
# Plot the LDR gating of one well and display the counts table computed for it.
barcode = "211029_combo_180"
cell_line = "SUM1315"
peak_loc = 1.2
well = "C19"
df_test = read_and_rename_well_data(barcode, well)
test = plot_ldr(df_test, peak_loc = peak_loc)
# Last expression of the cell: its value is what the notebook displays.
dcf_int.get_counts_df(df=df_test, barcode=barcode, well=well, peak_loc = peak_loc)

In [None]:
# Density minima of log10(LDR) for one well after replacing non-positive
# LDR/DNA values, keeping only minima with a prominence of at least 0.2.
barcode = "211029_combo_180"
cell_line = "SUM1315"
well = "G06"

###### plot peaks w/ negative ldr values mapped to minimum positive ldr
df = read_and_rename_well_data(barcode, well, silent=True)

# Smallest positive value of each channel, used as the replacement value.
df_pos1 = df.query("ldr > 0")
min_ldr = np.min(df_pos1.ldr)
df_pos2 = df.query("dna > 0")
min_dna = np.min(df_pos2.dna)
# Every value that is not > 0 is replaced by the channel's smallest positive value.
df['ldr'] = [x if x>0 else min_ldr for x in df.ldr]
df['dna'] = [x if x>0 else min_dna for x in df.dna]

ldrint = df['ldr']
ldrint = ldrint[ldrint > 0]
logint = np.log10(ldrint)

import seaborn as sns
fig, ax = plt.subplots()
# Draw the KDE and read the plotted curve back as (x, y) arrays.
x, y = sns.kdeplot(logint, ax=ax).get_lines()[0].get_data()

# Peaks of -y are minima of the density; prominence=0.2 drops shallow ones.
peak_locs, _ = find_peaks(-y, prominence=0.2)
#print(peak_locs)
cc = x[peak_locs]
print(cc)

In [None]:
# Density minima of log10(LDR) for one well, filtered by both prominence and
# width. Only the last `well` assignment takes effect.
barcode = "211029_combo_180"
well = "G06" ### example of bad gating
well = "C07" ### example of good separation, large peak

######## plot peaks w/ peak_prominence
df = read_and_rename_well_data(barcode, well, silent=True)
ldrint = df['ldr']
ldrint = ldrint[ldrint > 0]  # keep positive values only before taking log10
logint = np.log10(ldrint)

import seaborn as sns
fig, ax = plt.subplots()
# Draw the KDE and read the plotted curve back as (x, y) arrays.
x, y = sns.kdeplot(logint, ax=ax).get_lines()[0].get_data()

# `test` receives the peak-properties dict returned by find_peaks.
peak_locs, test = find_peaks(-y, prominence=0.1, width = 30)
#print(peak_locs)
cc = x[peak_locs]
print(cc)

In [None]:
# Display whatever the previously run cell stored in `test`.
test

In [None]:
######## plot peaks w/ normal algorithm
# NOTE: relies on `barcode` and `well` already being set by an earlier cell.
df = read_and_rename_well_data(barcode, well, silent=True)
ldrint = df['ldr']
ldrint = ldrint[ldrint > 0]  # keep positive values only before taking log10
logint = np.log10(ldrint)

import seaborn as sns
fig, ax = plt.subplots()
# Draw the KDE and read the plotted curve back as (x, y) arrays.
x, y = sns.kdeplot(logint, ax=ax).get_lines()[0].get_data()

# Peaks of -y are the local minima (valleys) of the density curve.
peak_locs, _ = find_peaks(-y)
#print(peak_locs)
cc = x[peak_locs]
print(cc)

fig,ax = plt.subplots()
# plot the data
# (the negated density, i.e. the curve that find_peaks was run on)
ax.plot(x,-y)

In [None]:
#### test viewing a few plates, looking for badly gated wells
# Only the last barcode/cell_line pair is actually plotted: the plot calls for
# the first two pairs are commented out and their assignments are overwritten.
barcode = "211029_combo_180"
cell_line = "BT20"
#plot_wells_ldr(barcode, cell_line, output_dir = "")

barcode = "211029_combo_182"
cell_line = "SUM159"
#plot_wells_ldr(barcode, cell_line, output_dir = "")


barcode = "211029_combo_180"
cell_line = "SUM1315"
plot_wells_ldr(barcode, cell_line, output_dir = "")

#well = "D12"
#plot_ldr_well(barcode, well)
#df = read_and_rename_well_data(barcode, well)
#df

In [None]:
# Print the row positions of cells whose LDR value is zero or negative.
# NOTE: uses whatever `df` the previously run cell left in the namespace.
# A vectorised mask replaces the per-row label lookup `df.ldr[i]`, which is
# slow and raises KeyError when `df` does not have a default 0..n-1 index.
for i in np.flatnonzero((df.ldr <= 0).to_numpy()):
    print(i)

In [None]:
#meta_sub = meta.filter(meta.columns[0:9])
#df_new = df_new_gating.merge(meta_sub, on = ['barcode', 'well'], how = 'left')
#df_new.sort_values(by=['cell_count__dead'])

In [None]:
#df_new.query('agent1 == ""').sort_values(by=['cell_count__dead'])

In [None]:
# Wells looked at by hand on the re-gated plates: barcode -> (cell_line, wells).
# Kept as a lookup table instead of a long run of overwritten
# `barcode = ...` / `well = ...` assignments, so the one plate/well that is
# actually plotted below is stated exactly once.
wells_to_inspect = {
    '210226_combo_51': ('HCC1937', ["I06", "J07"]),
    '210226_combo_52': ('HCC1937', ["I11"]),
    '210226_combo_53': ('HCC1937', ["I18"]),
    '210226_combo_54': ('HCC1937', ["I03", "I11", "J04", "J07"]),
    '210226_combo_55': ('HCC1937', ["I15"]),
    '210226_combo_56': ('HCC1937', ["I11", "I14", "I18", "I21"]),
    '210226_combo_57': ('HCC1937', ["I03", "I06", "I11", "J04", "J07"]),
    '210302_combo_59': ('HCC1937', ["I15", "J20"]),
    '210302_combo_60': ('HCC1937', ["I11", "I18"]),
    '210302_combo_61': ('HCC1937', ["I06", "J07"]),
    '210406_combo_69': ('SUM1315', ["E15", "F20"]),
    '210406_combo_70': ('SUM1315', ["E18", "E11"]),
    '210406_combo_71': ('SUM1315', ["E06", "F07"]),
    '210406_combo_72': ('SUM1315', ["E11", "E12", "E14", "E15", "F20"]),
    '210406_combo_73': ('SUM1315', ["E11", "E14", "E18", "E21", "F05"]),
    '210406_combo_74': ('SUM1315', ["E03", "E06", "E11", "F04", "F07"]),
    '210406_combo_75': ('SUM1315', ["E11", "E12", "E14", "E15", "F20"]),
    '210406_combo_76': ('SUM1315', ["E11", "E14", "E18", "E21", "F05"]),
    '210406_combo_77': ('SUM1315', ["E03", "E06", "E11", "F04", "F07"]),
    '211005_combo_158': ('SUM1315', ["G11", "G12", "G14"]),
    '211005_combo_160': ('SUM1315', ["G11", "G12", "G14"]),
    '211005_combo_164': ('SUM1315', ["G11", "G14", "H20"]),
    '211005_combo_165': ('SUM1315', ["G11", "G14", "H20"]),
    '211005_combo_166': ('SUM1315', ["G11", "H04", "H07"]),
}

### plate / well to inspect -- edit these two lines to look at another well
barcode = '211005_combo_166'
well = "H07"
cell_line = wells_to_inspect[barcode][0]

### using DNAcontent column as dna -- same as before
df_test = read_and_rename_well_data(barcode, well)
### using HoechstINT column as dna
### note: using Hoechst gives lower dead count -- only 12 dead_subg1 vs. ~500 for DNAContent
#df_test = read_and_rename_well_data(barcode, well, hoechst_as_dna=True)
peak_loc = 1.2
test = plot_ldr(df_test, peak_loc = peak_loc)
# Last expression of the cell: its value is what the notebook displays.
dcf_int.get_counts_df(df=df_test, barcode=barcode, well=well, peak_loc = peak_loc)

In [None]:
### testing wells that legitimately do have high dead counts

# Only the last `well` assignment (K11) takes effect; K21 is kept as a note.
barcode = "210423_combo_84"
well = "K21" ### Alp. + Tram -- cutoff at 1.5 ?? -- similar for K19
well = "K11" ### ctrl well -- low dead cells -- cutoff at ~3.25

### using DNAcontent column as dna -- same as before
df_test = read_and_rename_well_data(barcode, well)
### using HoechstINT column as dna
### note: using Hoechst gives lower dead count -- only 12 dead_subg1 vs. ~500 for DNAContent
#df_test = read_and_rename_well_data(barcode, well, hoechst_as_dna=True)
peak_loc = 1.2
test = plot_ldr(df_test, peak_loc = peak_loc)
# Last expression of the cell: its value is what the notebook displays.
dcf_int.get_counts_df(df=df_test, barcode=barcode, well=well, peak_loc = peak_loc)

In [None]:
# Check one well of a time-zero plate with the default peak_loc of 1.2:
# plot it, compute its counts table, and display element [0] of its LDR gates.
barcode = '210406_combo_62'
cell_line = 'SUM1315'
well = "E03"

#barcode = "210423_combo_78"
#well = 'K04'
#cell_line = "SUM185PE"

#test = plot_problem_plate(barcode, cell_line)
plot_ldr_well(barcode, well, peak_loc = 1.2)
df_test = read_and_rename_well_data(barcode, well)
df_tmp = dcf_int.get_counts_df(df=df_test, barcode=barcode, well=well, peak_loc = 1.2)
#plot_wells_ldr(barcode, cell_line)
#df_tmp

# Last expression of the cell: its value is what the notebook displays.
dcf_int.get_ldrgates(df_test.ldr, peak_loc=1.2)[0]
#df_tmp