In [None]:
import os
import glob
import re
import numpy as np
import pandas as pd
import string

from cell_cycle_gating import manual_gating as mg
from cell_cycle_gating import dead_cell_filter_ldrint as dcf_int

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import plotnine
from plotnine import *

In [None]:
## Get the names of wells, valid wells (excluding outer two rows/columns), etc.
## input:
##    var: either "all_wells", "valid_wells", "all_well_rows", or "all_well_cols"
## output: a list with the corresponding info, e.g. ["A01", "A02", ... , "P24"] for all_wells
def get_well_names(var):
    """Return one naming collection for a 16-row x 24-column plate.

    input:
        var: one of 'all_wells', 'valid_wells', 'all_well_rows',
             'all_well_cols', 'valid_well_rows', 'valid_well_cols'
    output:
        the requested collection. Well and column collections are lists of
        strings (e.g. 'A01', '01'); row collections are plain strings of
        letters (e.g. 'ABCDEFGHIJKLMNOP'). An unknown key raises KeyError.
    """
    rows = string.ascii_uppercase[:16]                    # 'A' .. 'P'
    cols = ['{:02d}'.format(n) for n in range(1, 25)]     # '01' .. '24'
    ### "valid" excludes the two outermost rows/columns on every side
    inner_rows = rows[2:14]
    inner_cols = cols[2:22]

    def _grid(row_labels, col_labels):
        # every row/column combination, in sorted order
        return sorted(r + c for r in row_labels for c in col_labels)

    lookup = {
        'all_wells': _grid(rows, cols),
        'valid_wells': _grid(inner_rows, inner_cols),
        'all_well_rows': rows,
        'all_well_cols': cols,
        'valid_well_rows': inner_rows,
        'valid_well_cols': inner_cols,
    }
    return lookup[var]

## Define a dictionary with the local folder name for experiments on each date
def define_folder_dict():
    """Return a dict mapping each experiment date (YYYY-MM-DD) to its local folder name.

    The folder names are joined onto the data base directory by get_data_dir.
    """
    date_folder_pairs = [
        ('2020-11-17', 'rep3'),
        ('2021-02-19', 'rep4'),
        ('2021-02-26', 'rep5'),
        ('2021-03-02', 'rep6'),
        ('2021-04-06', 'rep7'),
        ('2021-04-23', 'rep8'),
        ('2021-05-18', 'rep9'),
        ('2021-05-21', 'rep10'),
        ('2021-06-11', 'rep11'),
        ('2021-07-27', '2_rep1/210727-combo-rep1'),
        ('2021-07-30', '2_rep2/210730_combo_rep2'),
        ('2021-08-06', '2_rep3/210806_combo_rep3'),
        ('2021-10-05', 'redo_rep1_and_2'),
        ('2021-10-15', 'redo_rep1_and_2/redo_rep2'),
        ('2021-10-29', 'redo_rep3'),
    ]
    return dict(date_folder_pairs)

## Define the data frame of plate/cell-line combinations to re-gate (stored in the global `regate_df`)
def define_regating_df():
    """Build the table of plate/cell-line pairs to re-gate and store it as the global `regate_df`.

    Each row has a 'barcode', a 'cell_line' and a proposed 'peak_loc'
    (None where no value has been chosen yet).

    output: None -- the data frame is published via globals()['regate_df']
    """
    ### one record per plate/cell-line; the records must be comma-separated
    records = [
        {'barcode': '211015_combo_174', 'cell_line': 'SUM1315', 'peak_loc': None},
        {'barcode': '211015_combo_175', 'cell_line': 'SUM1315', 'peak_loc': 3},
        {'barcode': '211015_combo_176', 'cell_line': 'SUM1315', 'peak_loc': 3},
    ]
    globals()['regate_df'] = pd.DataFrame(records)
    
## Get a list of plate barcodes for a given date
## input:
##    date: e.g. '2021-10-15'
## output: list of barcodes e.g. '211015_combo_173'
def get_barcodes(date):
    """List the plate-barcode sub-directories found in the data folder of a date.

    input:
        date: date in YYYY-MM-DD format, e.g. '2021-10-15'
    output:
        names of the sub-directories whose name starts with '<YYMMDD>_combo'
        (in os.listdir order, i.e. not sorted)
    """
    prefix = date_format_switch(date) + "_combo"
    date_dir = get_data_dir(date = date)
    barcodes = []
    for entry in os.listdir(date_dir):
        ### keep directories only
        if not os.path.isdir(os.path.join(date_dir, entry)):
            continue
        ### re.match anchors at the start of the directory name
        if bool(re.match(prefix, entry)):
            barcodes.append(entry)
    return barcodes

## Switch the format of a date from YYYY-MM-DD to YYMMDD
## input:
##    date: e.g. '2021-02-19'
## output: e.g. '210219'
def date_format_switch(date):
    """Convert 'YYYY-MM-DD' to 'YYMMDD' by fixed-position slicing (the input is not validated)."""
    yy, mm, dd = date[2:4], date[5:7], date[8:10]
    return yy + mm + dd

## Get the date in YYYY-MM-DD format from a plate barcode
## input:
##    barcode: '210406_combo_71'
## output: e.g. '2021-04-06'
def date_from_barcode(barcode):
    """Build a 'YYYY-MM-DD' date from the first six characters (YYMMDD) of a barcode.

    The century is always taken to be '20'; the barcode is not validated.
    """
    pieces = ['20' + barcode[0:2], barcode[2:4], barcode[4:6]]
    return '-'.join(pieces)

## Get the well-level data directory for a given date or barcode
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    date: a date in YYYY-MM-DD format, e.g. '2021-04-06'
##    base_dir: the full path of the data folder, e.g. "/mnt/y/lsp-analysis/LINCS-combinations/"
## output:
##    returns the directory of the well-level data for a barcode
##    if a date is given and no barcode, returns the directory of all data for the date
##    if no date or barcode is given, returns the base directory of all data
def get_data_dir(barcode=None, date=None, base_dir = "/mnt/y/lsp-analysis/LINCS-combinations/"):
    """Resolve the data directory for a barcode and/or date.

    input:
        barcode: plate barcode, e.g. '210406_combo_71' (optional)
        date: date in YYYY-MM-DD format (optional; derived from the barcode if omitted)
        base_dir: root of the data folder
    output:
        - neither barcode nor date: the base directory
        - date only: the folder for that date (joined with '', so it ends in a separator)
        - barcode given: the plate folder inside the date folder
    A date that is not a key of define_folder_dict() raises KeyError.
    """
    ### unix-style paths only; if base_dir does not exist, fall back to the
    ### macOS mount point (this also replaces a caller-supplied base_dir)
    if not os.path.exists(base_dir):
        base_dir = "/Volumes/hits/lsp-analysis/LINCS-combinations/"
    if barcode is None and date is None:
        return base_dir
    experiment_date = date_from_barcode(barcode) if date is None else date
    plate_dir = '' if barcode is None else barcode
    date_folder = define_folder_dict()[experiment_date]
    return os.path.join(base_dir, date_folder, plate_dir)

## Get the filename for well-level intensities for a given barcode and well
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    well: a well of interest, e.g. 'D06'
## output:
##    full path/filename of the well-level data
def get_well_file(barcode, well):
    """Return the full path of the well-level csv for a barcode and well.

    input:
        barcode: a plate barcode, e.g. '210406_combo_71'
        well: a well of interest, e.g. 'D06'
    output:
        full path of '<barcode>.result.<well>[test].csv' inside the plate's data directory
    raises:
        FileNotFoundError if that file does not exist. (Previously the function
        printed a message and then failed with UnboundLocalError on return.)
    """
    data_dir = get_data_dir(barcode)
    ### example file style: 210406_combo_71.result.D06[test].csv
    file_name = barcode + ".result." + well + "[test].csv"
    well_file = os.path.join(data_dir, file_name)
    if not os.path.isfile(well_file):
        raise FileNotFoundError("well csv not found: " + well_file)
    return well_file

## Read the well-level data for a given well and barcode
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    well: a well of interest, e.g. 'D06'
## output:
##    a pandas dataframe of dye intensities for individual cells
def read_well_data(barcode, well):
    """Load the well-level csv located by get_well_file(barcode, well) into a pandas data frame."""
    return pd.read_csv(get_well_file(barcode, well))

## Re-name columns of well-level dataframe for LDR, DNA, EDU, etc.
## input: original data frame read from csv
## output: data frame with re-named columns
def rename_df_columns(df, silent=True):
    """Give the well-level columns short names and add a background-subtracted 'edu' column.

    input:
        df: data frame as read from the well-level csv
        silent: if True print nothing; if False report each column found or missing
    output:
        - if 'Well Name' or 'ldrint' is missing: the input frame itself, unchanged
        - otherwise a renamed copy with 'well' and 'ldr', plus 'dna', 'edu_raw'
          and 'edu_bg' when a matching source column exists, and
          'edu' = edu_raw - edu_bg when both of those are present
    """
    def _log(message):
        # all diagnostics go through here so that `silent` is always honoured
        if not silent:
            print(message)

    col_dict = {}

    ### required columns: give up (returning df untouched) if one is missing
    for source, short in (('Well Name', 'well'), ('ldrint', 'ldr')):
        if source not in df.columns:
            _log(df.columns)
            _log(source + ' column not found')
            return df
        _log("'" + source + "' column found -- re-naming as '" + short + "'")
        col_dict[source] = short

    ### optional columns: the first candidate present in df wins
    optional = (
        ('dna',
         ('Cell: DNAcontent (DD-bckgrnd)', 'Cell: HoechstINT (DDD-bckgrnd)'),
         'DNA column not found'),
        ('edu_raw',
         ('Cell: EdUrawINT (DDD-bckgrnd)', 'Cell: EdUrawINT (DD-bckgrnd)'),
         None),
        ('edu_bg',
         ('Cell: EdUbackground (DDD-bckgrnd)', 'Cell: EdUbackground (DD-bckgrnd)'),
         None),
    )
    for short, candidates, missing_message in optional:
        source = next((name for name in candidates if name in df.columns), None)
        if source is None:
            if missing_message is not None:
                _log(df.columns)
                _log(missing_message)
            continue
        _log("'" + source + "' column found -- re-naming as '" + short + "'")
        col_dict[source] = short

    ### re-name data-frame columns (rename returns a new frame; the input is not modified)
    df = df.rename(columns=col_dict)
    if 'edu_raw' in df.columns and 'edu_bg' in df.columns:
        df['edu'] = df['edu_raw'] - df['edu_bg']
    return df

## Get all wells with data for a given plate
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
## output:
##    A list of wells, e.g. ['C03', 'C04', ... , 'N22']
def get_all_wells(barcode):
    """Return the sorted list of wells that have a well-level csv for the given plate.

    A well counts as present when '<barcode>.result.<well>[test].csv' exists in
    the plate's data directory.
    """
    plate_dir = get_data_dir(barcode)
    present = [
        well
        for well in get_well_names("all_wells")
        if os.path.exists(os.path.join(plate_dir, barcode + ".result." + well + "[test].csv"))
    ]
    return sorted(present)

### maybe not necessary for LDR intensity data?
#def read_all_wells(barcode):
#    wells = get_all_wells(barcode)
#    df_list = [read_well_data(barcode, well) for well in wells]
#    return(df_list)

def get_ldr_cutoff(barcode, well, peak_loc = 1.2):
    """Compute the LDR cutoff for a single well.

    Reads and renames the well data, passes its 'ldr' column to
    dcf_int.get_ldrgates and returns element [1] of the returned gates
    (presumably the upper gate -- confirm against cell_cycle_gating).

    input:
        barcode, well: identify the well-level csv
        peak_loc: forwarded to dcf_int.get_ldrgates (1.2 is the default used here)
    """
    well_df = rename_df_columns(read_well_data(barcode, well))
    gates, _lims = dcf_int.get_ldrgates(ldrint = well_df['ldr'], peak_loc=peak_loc)
    return gates[1]

def get_ldr_cutoff_many(barcode, wells, peak_loc = 1.2):
    """Return a list with the get_ldr_cutoff result for each well, in the order given."""
    cutoffs = []
    for well in wells:
        cutoffs.append(get_ldr_cutoff(barcode, well, peak_loc = peak_loc))
    return cutoffs

### note: file location hard-coded, only for wsl on my desktop right now
def load_well_metadata(folder=None, file='single_timepoint_cleaned_from_raw_2023-06-08.parquet'):
    """Read the well-metadata parquet file and store it as the global `meta`.

    input:
        folder: directory holding the file; None selects a hard-coded local (WSL) path
        file: parquet file name
    output: None -- the data frame is published via globals()['meta']
    """
    metadata_dir = "/mnt/c/Users/NC168/git/LINCS_combos/data/cleaned/" if folder is None else folder
    ### read parquet file w/ all metadata
    globals()['meta'] = pd.read_parquet(os.path.join(metadata_dir, file))
    
def get_wells(barcode, cell_line):
    """Return the sorted wells of one cell line on one plate, according to the global `meta`.

    `meta` is loaded with load_well_metadata() on first use if it is not defined yet.
    """
    if 'meta' not in globals():
        load_well_metadata()
    ### e.g. " cell_line == 'SUM1315' & barcode == '201117_combo_33' "
    selector = "cell_line == '" + cell_line + "' & barcode == '" + barcode + "'"
    return sorted(meta.query(selector).well)

def get_cell_lines_on_plate(barcode):
    """Return the unique cell lines (array, in order of first appearance) recorded in `meta` for a plate.

    `meta` is loaded with load_well_metadata() on first use if it is not defined yet.
    """
    if 'meta' not in globals():
        load_well_metadata()
    plate_rows = meta.query("barcode == '" + barcode + "'")
    return plate_rows['cell_line'].unique()

def get_ldr_cutoffs_plate(barcode, peak_loc = 1.2):
    """Return the per-well LDR cutoffs for every cell line found on a plate.

    Thin wrapper: looks up the plate's cell lines and delegates to
    get_ldr_cutoffs_cell_line_and_barcode.
    """
    return get_ldr_cutoffs_cell_line_and_barcode(
        barcode,
        get_cell_lines_on_plate(barcode),
        peak_loc=peak_loc,
    )

def get_ldr_cutoffs_cell_line_and_barcode(barcode, cell_lines, peak_loc=1.2, silent = False):
    """Collect the per-well LDR cutoffs for one or more cell lines on a plate.

    input:
        barcode: plate barcode
        cell_lines: iterable of cell-line names; a single name given as a str
                    is treated as a one-element list (instead of being iterated
                    character by character)
        peak_loc: forwarded to get_ldr_cutoff_many
        silent: if False, print each cell line as it is processed
    output:
        data frame with columns 'well', 'ldr_cutoff', 'barcode', 'cell_line'
        (the per-cell-line frames are concatenated without resetting the index);
        an empty frame with those columns when cell_lines is empty
    """
    columns = ['well', 'ldr_cutoff', 'barcode', 'cell_line']
    if isinstance(cell_lines, str):
        cell_lines = [cell_lines]
    frames = []
    for cell_line in cell_lines:
        if not silent: print(cell_line)
        wells = get_wells(barcode, cell_line)
        ldrs = get_ldr_cutoff_many(barcode, wells, peak_loc=peak_loc)
        d = {'well':wells, 'ldr_cutoff': ldrs, 'barcode':barcode, 'cell_line': cell_line}
        frames.append(pd.DataFrame(data=d))
    ### pd.concat raises on an empty list, so handle "no cell lines" explicitly
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)

def read_and_rename_well_data(barcode, well, silent = False):
    """Read one well's csv and return it with columns renamed by rename_df_columns.

    Note that `silent` defaults to False here, i.e. renaming is reported by default.
    """
    return rename_df_columns(read_well_data(barcode, well), silent = silent)

def plot_ldr(df, peak_loc = 1.2, scatter = False):
    """Plot the LDR gating of one well.

    The cutoff is element [1] of the gates returned by dcf_int.get_ldrgates on
    the full df['ldr'] column; rows with ldr <= 0 are dropped afterwards so
    that log10 can be taken.

    input:
        df: renamed well-level data frame with an 'ldr' column (and 'dna' when scatter=True)
        peak_loc: forwarded to dcf_int.get_ldrgates
        scatter: True -> mg.plot_ldr_dna_scatter (log10 DNA vs log10 LDR),
                 False -> mg.ldr_gating (20 bins)
    output:
        (fig, axes). axes is None for the mg.ldr_gating plot, and also when
        the scatter helper returns a single object rather than a tuple.
    """
    ldr_gates, _ldr_lims = dcf_int.get_ldrgates(ldrint = df['ldr'], peak_loc=peak_loc)
    ldr_cutoff = ldr_gates[1]
    df = df.query("ldr > 0")
    if scatter:
        result = mg.plot_ldr_dna_scatter(np.log10(df.dna), np.log10(df.ldr), ldr_cutoff, dna_gates=None, plot_ldr_log10=True, is_ldrint=True)
        ### NOTE(review): plot_ldr_pdf treats this helper's return value as a
        ### single figure while this function unpacked a pair -- accept both
        ### until the helper's return type is confirmed.
        fig, axes = result if isinstance(result, tuple) else (result, None)
    else:
        fig = mg.ldr_gating(np.log10(df.ldr), ldr_cutoff, nbins = 20)
        ### the original left `axes` unbound on this path (UnboundLocalError on return)
        axes = None
    return (fig, axes)


def plot_ldr_many(barcode, wells, peak_loc = 1.2, scatter = False, silent = True):
    """Call plot_ldr once per well, in order; the returned figures are discarded and nothing is returned."""
    for well in wells:
        plot_ldr(
            read_and_rename_well_data(barcode, well, silent),
            peak_loc = peak_loc,
            scatter = scatter,
        )

def plot_ldr_pdf(barcode, wells, peak_loc = 1.2, figname = "test_ldr.pdf", scatter = True, silent = True, show_fig = True):
    """Write one LDR gating figure per well to a multi-page pdf.

    Each page is titled with the well name and its treatment, taken from the
    well metadata (agent1/agent2 and concentration1_chr/concentration2_chr).

    input:
        barcode: plate barcode
        wells: wells to plot, one pdf page each, in the order given
        peak_loc: forwarded to dcf_int.get_ldrgates
        figname: path of the pdf to write
        scatter: True -> mg.plot_ldr_dna_scatter, False -> mg.ldr_gating
        silent: forwarded to read_and_rename_well_data
        show_fig: forwarded to mg.plot_ldr_dna_scatter (scatter plots only)
    output:
        list of the figures written (each has been closed in pyplot)
    """
    fig_list = []
    ### the context manager closes (finalizes) the pdf even if a well fails part-way
    with PdfPages(figname) as pdf_pages:
        for well in wells:
            well_meta = get_well_meta(barcode, well)
            trt1 = list(well_meta.agent1)[0]
            trt2 = list(well_meta.agent2)[0]
            conc1 = list(well_meta.concentration1_chr)[0]
            conc2 = list(well_meta.concentration2_chr)[0]
            df = read_and_rename_well_data(barcode, well, silent)
            ### the cutoff is computed on all rows, before non-positive LDR values are dropped
            ldr_gates, _ldr_lims = dcf_int.get_ldrgates(ldrint = df['ldr'], peak_loc=peak_loc)
            ldr_cutoff = ldr_gates[1]
            df = df.query("ldr > 0")
            if scatter:
                fig = mg.plot_ldr_dna_scatter(np.log10(df.dna), np.log10(df.ldr), ldr_cutoff, dna_gates=None, plot_ldr_log10=True, is_ldrint=True, show_fig = show_fig)
                ### NOTE(review): plot_ldr unpacks this helper's result as
                ### (fig, axes); accept that shape too until the return type is confirmed.
                if isinstance(fig, tuple):
                    fig = fig[0]
            else:
                fig = mg.ldr_gating(np.log10(df.ldr), ldr_cutoff, nbins = 20)
            fig_title = str(trt1) + ": "+ str(conc1) + " uM, " + str(trt2) + ": " + str(conc2) + " uM"
            fig.suptitle(well + "\n" + fig_title, fontsize=12)
            pdf_pages.savefig(fig)
            ### close this specific figure (not just "the current one") once it is saved
            plt.close(fig)
            fig_list.append(fig)
    return(fig_list)

def test_regate(barcode, cell_line, peak_loc = 1.2, figname = "test_figure", scatter = True, silent = True, test = True, show_fig = False):
    """Gate every well of one cell line on one plate with `peak_loc`, saving plots and counts.

    Writes 'temp_regating/pdf/<figname>.pdf' (via plot_ldr_pdf) and
    'temp_regating/csv/<figname>.csv' (concatenated dcf_int.get_counts_df
    results), creating the two folders if needed.

    input:
        barcode, cell_line: select the wells (via get_wells)
        peak_loc: forwarded to plot_ldr_pdf and dcf_int.get_counts_df
        figname: base name (no extension) of both output files
        scatter, silent, show_fig: forwarded to plot_ldr_pdf / read_and_rename_well_data
        test: currently unused
    output:
        (counts data frame, list of figures)
    """
    csv_dir = os.path.join('temp_regating', 'csv')
    pdf_dir = os.path.join('temp_regating', 'pdf')
    for out_dir in (csv_dir, pdf_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
    wells = get_wells(barcode, cell_line)
    csv_file = os.path.join(csv_dir, figname+'.csv')
    pdf_file = os.path.join(pdf_dir, figname+'.pdf')
    ### figures first ...
    plot_list = plot_ldr_pdf(barcode, wells, peak_loc, figname=pdf_file, scatter=scatter,
                             silent=silent, show_fig=show_fig)
    print('figures written to: ' + pdf_file)
    ### ... then the per-well counts
    counts = [
        dcf_int.get_counts_df(
            df=read_and_rename_well_data(barcode, well, silent),
            barcode=barcode, well=well, peak_loc = peak_loc)
        for well in wells
    ]
    df_out = pd.concat(counts)
    df_out.to_csv(csv_file)
    return df_out, plot_list

def get_well_meta(barcode, well):
    """Return the rows of the global `meta` table for one well on one plate.

    input:
        barcode: plate barcode, e.g. '211015_combo_176'
        well: well name, e.g. 'G05'
    output:
        the matching sub-frame of `meta` (empty if nothing matches)
    """
    ### load the metadata on first use, like get_wells / get_cell_lines_on_plate do
    if 'meta' not in globals():
        load_well_metadata()
    query = "barcode == '"+ barcode+ "' & well == '"+ well + "'"
    df_sub = meta.query(query)
    return(df_sub)

### Todo:
## 1) define all problem plates/cell-lines (from R script) -- add to regate_df above

## 2) def plot_problem_plates(): loop over all bad plates/cell lines and plot LDR gating with default options

## 3) for each plate/cell-line, come up with a reasonable peak_loc value, add to regate_df

## 4) for each plate/cell-line, plot the new LDR cutoffs and compare

## 5) def plot_LDR_cutoffs(barcode, cell_line, peak_loc):
##       ## get LDR cutoffs and counts for the default (peak_loc = 1.2) and for the proposed peak_loc value
##       ## plot 1: scatterplot of LDR cutoffs (x: default, y: proposed)
##       ## plot 2: scatterplot of dead counts (x: default, y: proposed) -- control counts in different color
##       ## plot 3: scatterplot of live counts (x: default, y: proposed) -- control counts in different color
##       ## data frame output: df1: all data, df2: only wells where LDR changed
##       ## same plots for only wells that changed, with wells labeled

### Populate the globals `meta` (well metadata) and `regate_df` (plates to re-gate)
### that the functions above and the cells below read.
load_well_metadata()
define_regating_df()

In [None]:
### Scratch check: metadata rows for one well, then show the re-gating table.
test = get_well_meta('211015_combo_176', 'G05')
### NOTE: not the last expression of the cell, so this value is not displayed
list(test.cell_line)[0]
regate_df

In [None]:
### Write LDR/DNA scatter pages for four wells to "test_many.pdf" with peak_loc=3.
### NOTE: `fig` receives the list of figures returned by plot_ldr_pdf.
fig = plot_ldr_pdf(barcode = '211015_combo_176', wells = ['G03','G04', 'G05','G06'], scatter = True, peak_loc=3, show_fig = True, figname="test_many.pdf")

In [None]:
### Re-gate the plate/cell line in one row of regate_df twice: with the default
### peak_loc and with peak_loc = 3, writing a pdf + csv for each run.
row_num = 0
barcode = list(regate_df.barcode)[row_num]
cell_line = list(regate_df.cell_line)[row_num]
print(barcode); print(cell_line)
peak_loc = 1.2 ##default
df_old, plots_old = test_regate(barcode, cell_line, peak_loc=peak_loc, figname=barcode+'_'+cell_line+'_'+'peak_loc_'+str(peak_loc), show_fig = False)
### NOTE: the proposed value is hard-coded here rather than read from regate_df.peak_loc
peak_loc = 3
df_new, plots_new = test_regate(barcode, cell_line, peak_loc=peak_loc, figname=barcode+'_'+cell_line+'_'+'peak_loc_'+str(peak_loc), show_fig = False)

In [None]:
### (Re)load the metadata into the global `meta` and preview its first rows.
load_well_metadata()
meta.head()

In [None]:
### Gate the BT20 wells of plate 211015_combo_176 with the default peak_loc (1.2).
barcode = '211015_combo_176'
cell_line = 'BT20'
df_old, plots_old = test_regate(barcode, cell_line, peak_loc=1.2, figname=barcode+'_'+cell_line+'_'+'peak_loc_1.2', show_fig = False)

In [None]:
### Gate the SUM1315 wells of plate 211015_combo_176 with peak_loc 1.2 (old) and 3 (new).
barcode = '211015_combo_176'
cell_line = 'SUM1315'
df_old, plots_old = test_regate(barcode, cell_line, peak_loc=1.2, figname=barcode+'_'+cell_line+'_'+'peak_loc_1.2', show_fig = False)
df_new, plots_new = test_regate(barcode, cell_line, peak_loc=3, figname=barcode+'_'+cell_line+'_'+'peak_loc_3.0', show_fig = False)

In [None]:
### Scatter-plot the LDR gating of well G03.
### NOTE: `barcode` is not set in this cell -- it must come from an earlier cell.
df = read_and_rename_well_data(barcode, well="G03")
test = plot_ldr(df, scatter = True)

In [None]:
### Display `test` (assigned from plot_ldr in the cell above).
test

In [None]:
### NOTE: the first assignment is immediately overwritten; only "210406_combo_62" is used.
barcode = "210423_combo_78" ### bad SUM185PE plate
barcode = "210406_combo_62" ### bad SUM1315 plate
### LDR cutoff of every well on the plate, drawn as box plot + jittered points per cell line
df = get_ldr_cutoffs_plate(barcode)
ggplot(df) + geom_boxplot(aes(x="cell_line", y = "ldr_cutoff")) + geom_jitter(aes(x="cell_line", y = "ldr_cutoff"))

In [None]:

### LDR cutoffs for all cell lines on plate 201117_combo_33.
#test = meta.query(" cell_line == 'SUM1315' & barcode == '201117_combo_33' ")
#get_wells('201117_combo_33', 'SUM1315')
barcode = '201117_combo_33'
### NOTE: the two hard-coded lists below are overwritten by the metadata lookup that follows
cell_lines = ['SUM1315']
cell_lines = ['SUM1315', 'SUM149']
cell_lines = get_cell_lines_on_plate(barcode)
print(cell_lines)
df = get_ldr_cutoffs_cell_line_and_barcode(barcode, cell_lines)
df

In [None]:
### Scratch check: len() of a bare string counts characters (7 here), not cell lines.
cell_lines = 'SUM1315'
len(cell_lines)

In [None]:
### LDR cutoffs of three wells on plate 210423_combo_78 with peak_loc = 1.2.
#get_ldr_cutoff('210423_combo_78', 'K04', peak_loc = 2)
test = get_ldr_cutoff_many('210423_combo_78', ['K03', 'K04', 'K05'], peak_loc = 1.2)


In [None]:
### Display `test` (assigned from get_ldr_cutoff_many in the cell above).
test

In [None]:
### Read well G03 of plate 211015_combo_176 and scatter-plot its LDR gating (default peak_loc).
#df = read_well_data('211015_combo_173', 'C05')
#df = rename_df_columns(df)
df = read_and_rename_well_data('211015_combo_176', 'G03')
#df
plot_ldr(df, scatter = True)

In [None]:
### Scatter-plot the LDR gating of five wells with peak_loc = 3.
plot_ldr_many(barcode = '211015_combo_176', wells = ['G03', 'G06', 'G11', 'H04', 'H07'], scatter = True, peak_loc=3)

In [None]:
### Scratch check of date_from_barcode; yields '2021-04-06' for this barcode.
date_from_barcode('210406_combo_71')
#get_all_wells('2021-04-06', '210406_combo_71')

In [None]:
### Scratch check of get_data_dir with both a barcode and a date supplied.
#get_data_dir()
#get_data_dir("211015_combo_176")
#get_data_dir(date = '2021-02-19')
get_data_dir("211015_combo_176", '2021-10-15')

In [None]:
### Scratch check of date_format_switch; yields '210219' for this date.
date_format_switch("2021-02-19")

In [None]:
### Scratch check of re.search: "234" occurs in the string, so a match object is returned.
re.search("234", "abdsf234")

In [None]:
### List the plate-barcode folders found for 2021-10-15.
get_barcodes('2021-10-15')

In [None]:
### get column names for each date
### (reads well D06 of the alphabetically first plate of every date in folder_dict)
folder_dict = define_folder_dict()
for date in folder_dict.keys():
    print(date)
    barcodes = get_barcodes(date)
    barcodes.sort()
    ### NOTE: raises IndexError if no plate folder was found for this date
    test_plate = barcodes[0]
    df = read_well_data(test_plate, 'D06')
    print(df.columns)

In [None]:
### For every date, print the date and then each plate that lacks the
### well-level csv for at least one "valid" (inner) well.
folder_dict = define_folder_dict()
valid_wells = get_well_names("valid_wells")
for date in list(folder_dict.keys()):
    print(date)
    barcodes = sorted(get_barcodes(date))
    for barcode in barcodes:
        data_dir = get_data_dir(barcode)
        ### example file style: <barcode>.result.<well>[test].csv
        missing = any(
            not os.path.exists(os.path.join(data_dir, barcode+".result."+well+"[test].csv"))
            for well in valid_wells
        )
        if missing:
            print(barcode)

In [None]:
### Scratch check: the first date key of the folder dictionary.
folder_dict = define_folder_dict()
test = list(folder_dict.keys())
test[0]