Combine all outputs across multiple imaging batches into a single file and produce segmentation projections for manual evaluation

In [1]:
#### ADD CUSTOM PATH ###
# Absolute path to a local checkout of the confocalQuant repository. It is
# appended to sys.path in the import cell so that the `confocalQuant` package
# can be imported; edit it to match your own machine.
# NOTE(review): the literal starts with '//' while parent_path further down
# starts with a single '/' -- presumably unintentional; confirm before changing.
path_to_quant = '//home/gridsan/djuna/homer/github/confocalQuant'

In [2]:
import sys
import os
from tqdm import tqdm 
from aicsimageio import AICSImage
from cellpose import models
import numpy as np
import pandas as pd
import numpy as np
import glob
from skimage.segmentation import find_boundaries
from PIL import Image

sys.path.append(path_to_quant)
from confocalQuant.segmentation import get_czi_files, hide_masks, gamma_correct_image, extract_channels, float_to_int
from confocalQuant.plotting import get_out_files, add_metadata, is_string_present, return_results, filter_data, get_id_data
from confocalQuant.quantification import concatenate_Y
from confocalQuant.image import save_mean_proj



### 1. Create metadata file

In [3]:
# Root of the confocalQuant checkout; input images are read from
# parent_path + 'data/' and job outputs from parent_path + 'outs/'.
parent_path = '/home/gridsan/djuna/homer/github/confocalQuant/'

# Every entry of the parent directory whose name contains 'neuron' is treated
# as one batch output directory.
# NOTE(review): os.listdir returns entries in arbitrary order, and the row
# order of the metadata table (hence the seeded image IDs assigned later)
# follows this order.
dirs = np.array(os.listdir('../'))
is_neuron_entry = np.array(['neuron' in entry for entry in dirs], dtype=bool)
out_dirs = dirs[is_neuron_entry]

# The matching input directory name is the part before '_out'.
in_dirs = [entry.split('_out')[0] for entry in out_dirs]

In [20]:
# combine all file info
# Build one metadata row per input image, batch by batch, then stack the
# per-batch tables into a single DataFrame (`im_data`).
IM_DATA_COLUMNS = ['filename', 'fileID', 'batch', 'well_name', 'slurm_file', 'treatment', 'line']
all_data = []

for in_dir, out_dir in tqdm(list(zip(in_dirs, out_dirs))):
    im_path_root = parent_path + 'data/' + in_dir
    out_path_root = parent_path + 'outs/' + out_dir

    in_file_names = get_czi_files(im_path_root)
    out_file_names = get_out_files(out_path_root)

    # File name without extension; the well name is the part before the first '-'.
    df = pd.DataFrame({'filename': [x.split('.')[0] for x in in_file_names]})
    # The per-batch ID is the position of the image in the file listing.
    df['ID'] = range(len(in_file_names))
    df['batch'] = in_dir
    df['well_name'] = [x.split('-')[0] for x in df['filename']]

    # Map the integer parsed from each SLURM log name ('<prefix>_<idx>.out')
    # to that log file, then look up the log belonging to each image ID.
    slurm_by_id = {int(x.split('_')[1].split('.')[0]): x for x in out_file_names}
    missing = [file_id for file_id in df['ID'] if file_id not in slurm_by_id]
    if missing:
        raise KeyError(f"no SLURM output file in {out_path_root} for file IDs {missing}")
    df['slurm_name'] = [slurm_by_id[file_id] for file_id in df['ID']]

    # add_metadata extends df in place (its return value is not used); the
    # two extra columns end up named 'treatment' and 'line' below.
    add_metadata(df, im_path_root+'/temp.csv')

    # Positional rename per batch, so every table shares the same labels
    # before concatenation.
    df.columns = IM_DATA_COLUMNS
    all_data.append(df)

# pd.concat keeps each column's dtype; stacking the frames with np.vstack
# coerced every column to a generic object column.
im_data = pd.concat(all_data, ignore_index=True)

100%|██████████| 12/12 [00:00<00:00, 94.82it/s]


In [5]:
# Preview the first rows of the combined metadata table.
im_data.head()

Unnamed: 0,filename,fileID,batch,well_name,slurm_file,treatment,line
0,C10,0,neuronbatch10132023_cellrox,C10,slurm-24409544_0.out,DGATi,Y622
1,C2,1,neuronbatch10132023_cellrox,C2,slurm-24409544_1.out,vehicle,Y622
2,C3,2,neuronbatch10132023_cellrox,C3,slurm-24409544_2.out,vehicle,Y622
3,C4,3,neuronbatch10132023_cellrox,C4,slurm-24409544_3.out,CDP-choline,Y622
4,C5,4,neuronbatch10132023_cellrox,C5,slurm-24409544_4.out,CDP-choline,Y622


In [6]:
# add info on whether job finished running
# For every image, is_string_present is asked whether 'done' occurs in the
# SLURM log of the job that processed it.
res = [
    is_string_present('../' + batch_name + '_out/' + slurm_file, 'done')
    for batch_name, slurm_file in tqdm(
        zip(im_data['batch'], im_data['slurm_file']),
        total=im_data.shape[0],
    )
]

im_data['job_done'] = res

100%|██████████| 529/529 [00:00<00:00, 1571.12it/s]


In [7]:
# add unique image ID
# The generator is seeded so the same IDs are regenerated on every run. Each
# row gets one draw from [10**9, 10**10), i.e. a 10-digit integer. IDs are
# assigned in row order, so an image keeps its ID only while its row position
# in im_data is unchanged. The draw itself does not guarantee uniqueness.
np.random.seed(5)
im_data['unique_image_ID'] = [np.random.randint(10**9, 10**10) for x in range(im_data.shape[0])]

In [8]:
# a few checks
# 1) Images whose job did not finish. This is reported with print because a
#    bare expression that is not the last line of a cell is never displayed.
unfinished_jobs = im_data[np.invert(im_data['job_done'])]
print(f"{unfinished_jobs.shape[0]} image(s) with unfinished jobs")

# 2) The randomly drawn IDs must not collide (displayed as the cell output).
random_10_digit_number = im_data['unique_image_ID']
len(np.unique(random_10_digit_number))==len(random_10_digit_number)

True

In [9]:
# add 'keep' column to populate later
# (placeholder: every row starts as NaN)
im_data['keep'] = np.nan

In [10]:
# Tally the job_done values; a single True entry means every job finished.
np.unique(im_data['job_done'], return_counts=True)

(array([ True]), array([529]))

In [11]:
# List the images whose job did not finish (an empty table means none).
im_data[im_data['job_done']==False]

Unnamed: 0,filename,fileID,batch,well_name,slurm_file,treatment,line,job_done,unique_image_ID,keep


In [12]:
# check that these IDs are consistent with the previous im_data file and that the new IDs are different
im_data_prev = pd.read_csv('./im_data.csv')

# Number of images already present in the previous table, taken from the file
# itself rather than hard-coded.
n_prev = len(im_data_prev)
prev_ids = np.array(im_data_prev['unique_image_ID'])
prev_id_set = set(prev_ids)

# The first n_prev rows must carry exactly the IDs stored previously ...
old_ids_unchanged = np.array_equal(np.array(im_data['unique_image_ID'].iloc[:n_prev]), prev_ids)
# ... and none of the remaining (new) rows may reuse a previous ID.
new_ids_are_fresh = not any(x in prev_id_set for x in im_data['unique_image_ID'].iloc[n_prev:])

# Both results are reported: a bare expression that is not the last line of a
# cell is never displayed, so the second check is printed and folded into the
# cell output.
print(f"old IDs unchanged: {old_ids_unchanged}; new IDs are fresh: {new_ids_are_fresh}")
old_ids_unchanged and new_ids_are_fresh

True

In [13]:
# Save the metadata table for all batches. The DataFrame index is written as
# the first, unnamed CSV column (pandas default).
im_data.to_csv('./im_data_with1108.csv')

  values = values.astype(str)


In [14]:
# Keep only the rows belonging to the two '11082023' batches. The original
# row labels are retained (the index is not reset).
batches_1108 = ['neuronbatch11082023_mitohealth', 'neuronbatch11082023_cellrox_bodipy']
im_data = im_data[im_data['batch'].isin(batches_1108)]

In [15]:
# Number of rows (images) and columns left after the batch filter.
im_data.shape

(140, 10)

### 2. Add the per-cell Y data to this data table

In [16]:
# make dictionaries
# Number of images per batch, keyed by batch name.
batches, n_images_per_batch = np.unique(im_data['batch'], return_counts=True)
Nfiles_dict = dict(zip(batches, n_images_per_batch))

# Column names per batch. Each entry of the 'colnames' column is a
# comma-separated string of single-quoted names, so each name is the text
# between the first pair of quotes of its comma-separated piece.
colnames = pd.read_csv('colnames.csv')
colnames_dict = {
    batch_name: [piece.split("'")[1] for piece in raw.split(',')]
    for batch_name, raw in zip(colnames['batch'], colnames['colnames'])
}

# Per batch: mapping from the per-batch fileID to the global unique_image_ID.
id_dicts_per_batch = {
    batch_name: dict(zip(group['fileID'], group['unique_image_ID']))
    for batch_name, group in im_data.groupby('batch')
}

# concat per cell Y info
res = []

for batch in batches:

    # load data per batch
    directory = '../' + batch +'_out/'
    sbatch_files = glob.glob(directory+'*.sbatch')
    if not sbatch_files:
        # Fail with the offending directory instead of a bare IndexError.
        raise FileNotFoundError(f"no .sbatch file found in {directory}")
    path_to_sbatch_file = sbatch_files[0]
    mat, masks, Y, Ncells, Nzi, cells_per_job, zi_per_job = return_results(path_to_sbatch_file,  '../../')

    # extract Y per batch
    Nfiles = Nfiles_dict[batch]
    # Separate name so the `colnames` table read above is not overwritten.
    batch_colnames = colnames_dict[batch]
    Y_extracted = concatenate_Y(Nfiles, Y, cells_per_job, Ncells, batch_colnames)
    id_d = id_dicts_per_batch[batch]
    Y_extracted['unique_image_ID'] = [id_d[int(x)] for x in Y_extracted['ID']]
    # Normalise the NeuN and DAPI columns by the Npoints column.
    Y_extracted['NeuN_per_point'] = Y_extracted['NeuN']/Y_extracted['Npoints']
    Y_extracted['DAPI_per_point'] = Y_extracted['DAPI']/Y_extracted['Npoints']

    # filter and append Y per batch
    Y_filtered = filter_data(Y_extracted, 'NeuN_per_point', 'DAPI_per_point', 'cellvolume', 'wellname', lower_thresh_vol = 200, upper_thresh_vol = 7000, C_nuc=2, C_soma=1.25, C_nuc_upper=3, plot=False)
    # A cell is kept when its row label survived the filter. Index.isin does
    # one vectorised membership test instead of rebuilding a set per row.
    Y_extracted['keep_cell'] = Y_extracted.index.isin(Y_filtered.index)
    res.append(Y_extracted)

    # plot image per ID per batch
    # Iterate over the file IDs actually present for this batch; counting
    # 0..N-1 only matches them when no image of the batch is missing.
    # NOTE(review): Nfiles above still presumes the batch is complete.
    for ID in tqdm(sorted(int(file_id) for file_id in id_d)):
        save_mean_proj(ID, zi_per_job, Nzi, mat, masks, Y_extracted, Y_filtered, id_d)


# concatenate across all batches
cell_data = pd.concat(res, ignore_index=True)


100%|██████████| 68/68 [05:06<00:00,  4.51s/it]
100%|██████████| 72/72 [05:28<00:00,  4.56s/it]


In [17]:
# Save the concatenated per-cell table. The DataFrame index is written as the
# first, unnamed CSV column (pandas default).
cell_data.to_csv('./cell_data_only1108.csv')

  values = values.astype(str)


In [None]:
# upload images to labelbox.com to determine which images to keep 


In [None]:
%%bash
# Collect the files under ./segmentations modified after 2023-12-01 into one
# tar archive.
# This is a shell command, so the cell is run through the %%bash cell magic.
# The former extra test `! -newermt 2023-01-02` contradicted the lower bound
# (nothing can be newer than 2023-12-01 and not newer than 2023-01-02), so no
# file was ever selected; it has been dropped.
# NOTE(review): `tar -r` appends, so re-running adds the files again, and the
# archive is not gzip-compressed despite its '.tar.gz' name.
find ./segmentations -type f -newermt 2023-12-01 -exec tar -rvf output_files_created_on_2023-01-01.tar.gz {} +
