In [None]:
import os
import glob
import re
import pandas as pd
import string

from cell_cycle_gating import manual_gating as mg
from cell_cycle_gating import dead_cell_filter_ldrint as dcf_int
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [None]:
## Get the names of wells, valid wells (excluding outer two rows/columns), etc.
## input:
##    var: one of "all_wells", "valid_wells", "all_well_rows", "all_well_cols", "valid_well_rows", or "valid_well_cols"
## output: the corresponding info, e.g. the list ["A01", "A02", ... , "P24"] for all_wells (the row collections are strings of letters)
def get_well_names(var):
    """Look up one collection of well / row / column names for a 16 x 24 plate.

    Args:
        var: key naming the collection to return; one of 'all_wells',
            'valid_wells', 'all_well_rows', 'all_well_cols',
            'valid_well_rows' or 'valid_well_cols'.

    Returns:
        The well collections are sorted lists such as ['A01', 'A02', ...].
        The column collections are lists of zero-padded strings.
        The row collections are strings of letters (e.g. 'ABCDEFGHIJKLMNOP').

    Raises:
        KeyError: if `var` is not one of the keys listed above.
    """
    row_letters = string.ascii_uppercase[:16]
    col_labels = ["%02d" % number for number in range(1, 25)]
    ### "valid" drops the two outermost rows / columns on every side
    inner_rows = row_letters[2:14]
    inner_cols = col_labels[2:22]

    def _cross(rows, cols):
        ### every row/column combination, in sorted order
        return sorted(r + c for r in rows for c in cols)

    lookup = {
        'all_wells': _cross(row_letters, col_labels),
        'valid_wells': _cross(inner_rows, inner_cols),
        'all_well_rows': row_letters,
        'all_well_cols': col_labels,
        'valid_well_rows': inner_rows,
        'valid_well_cols': inner_cols,
    }
    return lookup[var]

## Define a dictionary with the local folder name for experiments on each date
def define_folder_dict():
    """Build the lookup from experiment date to local folder name.

    Returns:
        dict mapping a date string in YYYY-MM-DD format to the folder
        (possibly a nested path such as '2_rep1/210727-combo-rep1') that
        holds the experiments for that date.
    """
    date_folder_pairs = (
        ('2020-11-17', 'rep3'),
        ('2021-02-19', 'rep4'),
        ('2021-02-26', 'rep5'),
        ('2021-03-02', 'rep6'),
        ('2021-04-06', 'rep7'),
        ('2021-04-23', 'rep8'),
        ('2021-05-18', 'rep9'),
        ('2021-05-21', 'rep10'),
        ('2021-06-11', 'rep11'),
        ('2021-07-27', '2_rep1/210727-combo-rep1'),
        ('2021-07-30', '2_rep2/210730_combo_rep2'),
        ('2021-08-06', '2_rep3/210806_combo_rep3'),
        ('2021-10-05', 'redo_rep1_and_2'),
        ('2021-10-15', 'redo_rep1_and_2/redo_rep2'),
        ('2021-10-29', 'redo_rep3'),
    )
    return dict(date_folder_pairs)

## Get a list of plate barcodes for a given date
## input:
##    date: e.g. '2021-10-15'
## output: list of barcodes e.g. '211015_combo_173'
def get_barcodes(date):
    """List the plate barcodes found on disk for one experiment date.

    Args:
        date: experiment date in YYYY-MM-DD format, e.g. '2021-10-15'.

    Returns:
        Names of the sub-directories of the date's data directory that start
        with '<YYMMDD>_combo', in the order os.listdir reports them (not sorted).
    """
    prefix_pattern = date_format_switch(date) + "_combo"
    plate_root = get_data_dir(date = date)
    barcodes = []
    for entry in os.listdir(plate_root):
        ### only sub-directories can be plate folders
        if not os.path.isdir(os.path.join(plate_root, entry)):
            continue
        ### re.match anchors at the start of the name
        if re.match(prefix_pattern, entry) is not None:
            barcodes.append(entry)
    return barcodes

## Switch the format of a date from YYYY-MM-DD to YYMMDD
## input:
##    date: e.g. '2021-02-19'
## output: e.g. '210219'
def date_format_switch(date):
    """Convert a date string from YYYY-MM-DD to YYMMDD by fixed-position slicing.

    Args:
        date: e.g. '2021-02-19'.

    Returns:
        The two-digit year, month and day run together, e.g. '210219'.
    """
    yy, mm, dd = date[2:4], date[5:7], date[8:10]
    return yy + mm + dd

## Get the date in YYYY-MM-DD format from a plate barcode
## input:
##    barcode: '210406_combo_71'
## output: e.g. '2021-04-06'
def date_from_barcode(barcode):
    """Recover the YYYY-MM-DD date from the first six characters of a barcode.

    Args:
        barcode: plate barcode starting with YYMMDD, e.g. '210406_combo_71'.

    Returns:
        The date with a '20' century prefix, e.g. '2021-04-06'.
    """
    yy, mm, dd = (barcode[start:start + 2] for start in (0, 2, 4))
    return "-".join(('20' + yy, mm, dd))

## Get the well-level data directory for a given date or barcode
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    date: a date in YYYY-MM-DD format, e.g. '2021-04-06'
##    base_dir: the full path of the data folder, e.g. "/mnt/y/lsp-analysis/LINCS-combinations/"
## output:
##    returns the directory of the well-level data for a barcode
##    if a date is given and no barcode, returns the directory of all data for the date
##    if no date or barcode is given, returns the base directory of all data
def get_data_dir(barcode=None, date=None, base_dir = "/mnt/y/lsp-analysis/LINCS-combinations/"):
    """Resolve the data directory for a plate, a date, or the whole data set.

    Args:
        barcode: optional plate barcode, e.g. '210406_combo_71'.
        date: optional date in YYYY-MM-DD format; derived from `barcode`
            when omitted.
        base_dir: root of the data folder. If this path does not exist the
            osx mount point "/Volumes/hits/lsp-analysis/LINCS-combinations/"
            is used instead.

    Returns:
        The base directory when neither barcode nor date is given.
        Otherwise base directory + the date's folder + the barcode.
        The barcode part is empty when only a date is given.

    Raises:
        KeyError: if the date is not a key of define_folder_dict().
    """
    ### note: using unix folder conventions -- would need to re-write for Windows
    ### fall back to the osx location when the requested base is not present
    root = base_dir if os.path.exists(base_dir) else "/Volumes/hits/lsp-analysis/LINCS-combinations/"
    if barcode is None and date is None:
        return root
    experiment_date = date_from_barcode(barcode) if date is None else date
    plate_part = '' if barcode is None else barcode
    date_folder = define_folder_dict()[experiment_date]
    return os.path.join(root, date_folder, plate_part)

## Get the filename for well-level intensities for a given barcode and well
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    well: a well of interest, e.g. 'D06'
## output:
##    full path/filename of the well-level data
def get_well_file(barcode, well):
    """Return the full path of the well-level csv for one well of one plate.

    Args:
        barcode: a plate barcode, e.g. '210406_combo_71'.
        well: a well of interest, e.g. 'D06'.

    Returns:
        Full path of '<barcode>.result.<well>[test].csv' inside the plate's
        data directory.

    Raises:
        FileNotFoundError: if that csv does not exist. Previously this case
            printed a message and then failed with an UnboundLocalError.
    """
    data_dir = get_data_dir(barcode)
    ### example file style
    f1 = barcode+".result."+well+"[test].csv"
    well_file = os.path.join(data_dir, f1)
    ### direct existence check, same as get_all_wells, instead of listing the whole folder
    if not os.path.exists(well_file):
        raise FileNotFoundError("well csv not found: " + well_file)
    return well_file

## Read the well-level data for a given well and barcode
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
##    well: a well of interest, e.g. 'D06'
## output:
##    a pandas dataframe of dye intensities for individual cells
def read_well_data(barcode, well):
    """Load the well-level csv for one well of one plate.

    Args:
        barcode: a plate barcode, e.g. '210406_combo_71'.
        well: a well of interest, e.g. 'D06'.

    Returns:
        pandas DataFrame read from the csv located by get_well_file.
    """
    return pd.read_csv(get_well_file(barcode, well))

## Re-name columns of well-level dataframe for LDR, DNA, EDU, etc.
## input: original data frame read from csv
## output: data frame with re-named columns
def rename_df_columns(df, silent=True):
    """Rename the well, LDR and DNA columns of a well-level data frame.

    'Well Name' becomes 'well' and 'ldrint' becomes 'ldr'. The first of
    'Cell: DNAcontent (DD-bckgrnd)' / 'Cell: HoechstINT (DDD-bckgrnd)' that
    is present becomes 'dna'.

    Args:
        df: data frame as read from the well-level csv.
        silent: when True (default) nothing is printed; when False, progress
            messages and, on a missing column, the column index are printed.

    Returns:
        A renamed copy of `df`. If 'Well Name' or 'ldrint' is missing, the
        original `df` is returned unchanged (no column is renamed). A missing
        DNA column is tolerated: only 'well' and 'ldr' are renamed.
    """
    def _log(message):
        ### every diagnostic goes through here so `silent` is always honoured
        if not silent:
            print(message)

    renames = {}
    ### both of these columns are required; bail out untouched if one is absent
    for old_name, new_name in (('Well Name', 'well'), ('ldrint', 'ldr')):
        if old_name not in df.columns:
            _log(df.columns)
            _log(old_name + ' column not found')
            return df
        _log("'" + old_name + "' column found -- re-naming as '" + new_name + "'")
        renames[old_name] = new_name

    ### the DNA column goes by one of two names; the first listed wins
    for dna_col in ('Cell: DNAcontent (DD-bckgrnd)', 'Cell: HoechstINT (DDD-bckgrnd)'):
        if dna_col in df.columns:
            _log("'" + dna_col + "' column found -- re-naming as 'dna'")
            renames[dna_col] = 'dna'
            break
    else:
        _log(df.columns)
        _log('DNA column not found')
    return df.rename(columns=renames)

## Get all wells with data for a given plate
## input:
##    barcode: a plate barcode, e.g. '210406_combo_71'
## output:
##    A list of wells, e.g. ['C03', 'C04', ... , 'N22']
def get_all_wells(barcode):
    """List the wells of a plate that have a well-level csv on disk.

    Args:
        barcode: a plate barcode, e.g. '210406_combo_71'.

    Returns:
        Sorted list of well names, e.g. ['C03', 'C04', ... , 'N22'], for
        which '<barcode>.result.<well>[test].csv' exists in the plate's
        data directory.
    """
    plate_dir = get_data_dir(barcode)
    found = [
        well
        for well in get_well_names("all_wells")
        ### example file style: <barcode>.result.<well>[test].csv
        if os.path.exists(os.path.join(plate_dir, barcode+".result."+well+"[test].csv"))
    ]
    return sorted(found)

### maybe not necessary for LDR intensity data?
#def read_all_wells(barcode):
#    wells = get_all_wells(barcode)
#    df_list = [read_well_data(barcode, well) for well in wells]
#    return(df_list)

def get_ldr_cutoff(barcode, well, peak_loc = 1.2):
    """Compute the LDR cutoff for a single well.

    Reads the well's data, renames its columns, and passes the 'ldr' column
    to dcf_int.get_ldrgates.

    Args:
        barcode: a plate barcode, e.g. '210423_combo_78'.
        well: a well of interest, e.g. 'K04'.
        peak_loc: forwarded to dcf_int.get_ldrgates (1.2 is default).

    Returns:
        The element at index 1 of the gates returned by get_ldrgates
        (presumably the upper LDR gate -- confirm against cell_cycle_gating).
    """
    cells = rename_df_columns(read_well_data(barcode, well))
    ### get_ldrgates returns (gates, limits); only the gates are used here
    gates, _limits = dcf_int.get_ldrgates(ldrint = cells['ldr'], peak_loc=peak_loc)
    return gates[1]

def get_ldr_cutoff_many(barcode, wells, peak_loc = 1.2):
    """Compute the LDR cutoff for several wells of one plate.

    Args:
        barcode: a plate barcode, e.g. '210423_combo_78'.
        wells: iterable of well names, e.g. ['K03', 'K04', 'K05'].
        peak_loc: forwarded to get_ldr_cutoff for every well.

    Returns:
        List of cutoffs, one per well, in the same order as `wells`.
    """
    cutoffs = []
    for well in wells:
        cutoffs.append(get_ldr_cutoff(barcode, well, peak_loc = peak_loc))
    return cutoffs

### note: file location hard-coded, only for wsl on my desktop right now
def get_well_metadata(folder=None, file='single_timepoint_cleaned_from_raw_2023-06-08.parquet'):
    """Read the parquet file holding the metadata for all wells.

    Args:
        folder: directory containing the parquet file; when None a
            hard-coded local (wsl) path is used.
        file: name of the parquet file inside `folder`.

    Returns:
        pandas DataFrame with the contents of the parquet file.
    """
    ### default location is machine-specific; pass `folder` on any other machine
    metadata_dir = "/mnt/c/Users/NC168/git/LINCS_combos/data/cleaned/" if folder is None else folder
    return pd.read_parquet(os.path.join(metadata_dir, file))
    
def get_wells(barcode, cell_line):
    """Get the wells holding a given cell line on a given plate.

    Args:
        barcode: a plate barcode, e.g. '201117_combo_33'.
        cell_line: cell line name as stored in the metadata, e.g. 'SUM1315'.

    Returns:
        Sorted list of the 'well' values of the metadata rows whose
        'cell_line' and 'barcode' match the arguments.
    """
    meta = get_well_metadata()
    ### '@name' refers to the local variable, the bare name to the column;
    ### passing values this way keeps the query valid even when a value
    ### contains a quote character (string concatenation did not)
    meta_sub = meta.query("cell_line == @cell_line & barcode == @barcode")
    return sorted(meta_sub.well)

def get_ldr_cutoffs_cell_line_and_barcode(barcode, cell_line, peak_loc=1.2):
    """Tabulate the LDR cutoff of every well of one cell line on one plate.

    Args:
        barcode: a plate barcode, e.g. '201117_combo_33'.
        cell_line: cell line name, e.g. 'SUM1315'.
        peak_loc: forwarded to get_ldr_cutoff_many.

    Returns:
        pandas DataFrame with one row per well and the columns 'well' and
        'ldr_cutoff'.
    """
    well_names = get_wells(barcode, cell_line)
    cutoffs = get_ldr_cutoff_many(barcode, well_names, peak_loc=peak_loc)
    return pd.DataFrame(data={'well': well_names, 'ldr_cutoff': cutoffs})

In [None]:
### example: table of LDR cutoffs for every SUM1315 well on plate 201117_combo_33
### (the commented lines are earlier manual checks of the metadata lookup)
#meta = get_well_metadata()
#test = meta.query(" cell_line == 'SUM1315' & barcode == '201117_combo_33' ")
#get_wells('201117_combo_33', 'SUM1315')
### note: these assignments create notebook-level globals named barcode / cell_line
barcode = '201117_combo_33'
cell_line = 'SUM1315'
get_ldr_cutoffs_cell_line_and_barcode(barcode, cell_line)

In [None]:
### example: LDR cutoffs for three wells of plate 210423_combo_78
#get_ldr_cutoff('210423_combo_78', 'K04', peak_loc = 2)
get_ldr_cutoff_many('210423_combo_78', ['K03', 'K04', 'K05'], peak_loc = 1.2)

In [None]:
### example: read one well (C05 of plate 211015_combo_173), rename its columns, and display it
df = read_well_data('211015_combo_173', 'C05')
df = rename_df_columns(df)
df

In [None]:
### display whatever data frame is currently bound to the global `df`
df

In [None]:
### example: recover the date from a barcode
date_from_barcode('210406_combo_71')
### (get_all_wells takes only the barcode)
#get_all_wells('210406_combo_71')

In [None]:
### examples of the call forms of get_data_dir: no arguments, barcode only,
### date only, and (the one left active) barcode plus date
#get_data_dir()
#get_data_dir("211015_combo_176")
#get_data_dir(date = '2021-02-19')
get_data_dir("211015_combo_176", '2021-10-15')

In [None]:
### example: YYYY-MM-DD -> YYMMDD
date_format_switch("2021-02-19")

In [None]:
### scratch check of re.search: unlike re.match it finds the pattern anywhere in the string
re.search("234", "abdsf234")

In [None]:
### example: plate barcodes found on disk for one date
get_barcodes('2021-10-15')

In [None]:
### get column names for each date
### (reads well D06 of the first plate, in sorted order, for every date)
folder_dict = define_folder_dict()
for date in folder_dict.keys():
    print(date)
    barcodes = get_barcodes(date)
    barcodes.sort()
    ### a date without any plate folder would otherwise fail on barcodes[0]
    ### and abort the survey of the remaining dates
    if not barcodes:
        print("no plate folders found")
        continue
    test_plate = barcodes[0]
    df = read_well_data(test_plate, 'D06')
    print(df.columns)

In [None]:
### print, under each date, the barcode of every plate that is missing the
### csv for at least one valid well
folder_dict = define_folder_dict()
### the valid wells are the same for every plate, so look them up once
valid_wells = get_well_names("valid_wells")
for date in list(folder_dict.keys()):
    print(date)
    barcodes = get_barcodes(date)
    barcodes.sort()
    for barcode in barcodes:
        data_dir = get_data_dir(barcode)
        missing = False
        for well in valid_wells:
            ### example file style
            f1 = barcode+".result."+well+"[test].csv"
            f1_full = os.path.join(data_dir, f1)
            if not os.path.exists( f1_full ):
                ### one missing well is enough to flag the plate
                missing = True
                break
        if missing:
            print(barcode)

In [None]:
### scratch: show the first date key of the folder dictionary
folder_dict = define_folder_dict()
test = list(folder_dict.keys())
test[0]