In [61]:
import numpy as np
import pandas as pd

#get all the files from your experiment
import glob

#deal with the .fcs file format
import fcsparser

#for writing info back to excel ID sheet
import openpyxl


## Use glob to get all the fcs files you want to deal with

### functions

In [62]:
def write_file_assoc_to_xlsx (ids_frame_w_fnames, ids_xlsx_path):
    """Write the 'file' column of the ids frame into column 4 of the ids workbook.

    Row 1 of column 4 receives the header "file"; each entry of
    ids_frame_w_fnames['file'] is then written to rows 2, 3, ... in order.
    The workbook is saved back to ids_xlsx_path, overwriting the original.

    Whether the workbook already holds filenames is NOT checked here; that
    check lives in the caller (assoc_fname_well).

    Parameters
    ----------
    ids_frame_w_fnames : object indexable by 'file' (e.g. a DataFrame) giving the filenames to write
    ids_xlsx_path : path of the xlsx file to read and overwrite

    Returns
    -------
    None
    """
    workbook = openpyxl.load_workbook(ids_xlsx_path)

    #assumes the default sheet naming, where the single sheet in the book is called 'Sheet1'
    worksheet = workbook["Sheet1"]

    #header goes in the first row of the file column
    worksheet.cell(row=1, column=4).value = "file"

    #filenames fill the same column, starting on the row right below the header
    for row_number, fcs_path in enumerate(ids_frame_w_fnames['file'], start=2):
        worksheet.cell(row=row_number, column=4).value = fcs_path

    #saving to the same path replaces the old workbook with the updated one
    workbook.save(ids_xlsx_path)

    print ("filename associations are written to the ids xlsx file")

    return None

In [63]:
def _fcs_basename (fcs_path):
    """Return the filename part of a path that may mix '/' and '\\' separators."""
    #glob on Windows hands back paths like 'dir/raw\\name.fcs', so normalise before splitting
    return str(fcs_path).replace('\\', '/').rsplit('/', 1)[-1]


def _fcs_num_matches (fcs_num, fcs_basename):
    """Decide whether a bare filename belongs to the given fcs number (both are strings)."""
    #a name such as 'RDM2019-02-15.0001.fcs' splits into [stem, '0001', 'fcs']
    pieces = fcs_basename.rsplit('.', 2)
    sequence = pieces[1] if len(pieces) == 3 else ''

    #when both sides are plain numbers compare them as integers, so '1' and '0001' agree
    #while '1' no longer matches '0010' or the digits of a date in the name
    if fcs_num.isdecimal() and sequence.isdecimal():
        return int(fcs_num) == int(sequence)

    #otherwise fall back to a substring test, restricted to the filename so that
    #digits in the directory names can never produce a match
    return fcs_num in fcs_basename


def assoc_fname_well (ids_frame, fcs_filename_list, ids_xlsx_path):
    """Add a 'file' column linking every 'fcs num' row to its fcs file path.

    Works on a copy, so ids_frame itself is never modified.

    If the frame has no 'file' column, each value of ids['fcs num'] is matched
    against the filenames (directory part ignored) in fcs_filename_list:
      * exactly one matching path -> that path
      * no match, or a missing/blank fcs num -> 'NO MATCH'
      * several matches -> '> 1 MATCH'
    A summary of the outcome is printed and the result is written to the xlsx
    through write_file_assoc_to_xlsx.

    If the frame already has a 'file' column nothing is matched or written.

    Parameters
    ----------
    ids_frame : DataFrame with a 'fcs num' column
    fcs_filename_list : list of fcs file paths to search
    ids_xlsx_path : path of the xlsx file the frame was loaded from

    Returns
    -------
    DataFrame : copy of ids_frame, with the 'file' column added when it was missing
    """
    #make copy so you aren't editing the original id frame in the function
    ids = ids_frame.copy()

    #if the ids dataframe already has the filename associations, do nothing and return it as is
    if 'file' in ids.columns:
        print ("the id dataframe and its parent xlsx file already contain a column called 'file' that has the fcs filename associations, the id dataframe and its parent file have not been modified")
        return ids

    #pair every candidate path with its bare filename once, instead of once per row
    candidates = [(path, _fcs_basename(path)) for path in fcs_filename_list]

    #collect the assignments by position and attach them in one go: this works for any index
    #(non-continuous or duplicated labels) and still creates the column when the frame is empty
    assigned = []
    for num in ids['fcs num']:

        #an empty cell comes back from read_excel as NaN, which can't be searched for
        if pd.isna(num) or str(num).strip() == '':
            assigned.append('NO MATCH')
            continue

        key = str(num).strip()
        hits = [path for path, name in candidates if _fcs_num_matches(key, name)]

        if len(hits) == 1:
            assigned.append(hits[0])
        elif len(hits) == 0:
            assigned.append('NO MATCH')
        else:
            assigned.append('> 1 MATCH')

    ids['file'] = assigned

    #check how the filename assignment went; both problems are reported when both occur
    has_no_match = bool((ids['file'] == 'NO MATCH').any())
    has_multi_match = bool((ids['file'] == '> 1 MATCH').any())

    if has_no_match:
        print("there's a non-assignment of at least one well and filename!!!")

    if has_multi_match:
        print("there's an assignment of multiple filenames to at least one well!!!")

    if not (has_no_match or has_multi_match):
        print("assignment completed without issue, all wells have a single filaname assignment")

    #write the new IDs frame with file associations to the original ids xlsx file
    #NOTE(review): placeholder values are written too; once the xlsx has a 'file' column a later
    #call takes the early-return branch above, so a bad match has to be cleared in the xlsx by hand
    write_file_assoc_to_xlsx(ids, ids_xlsx_path)

    #return the frame with the new filename associations
    return ids

### work

In [64]:
#folder that contains the per-run sub-folders (presumably one per timepoint -- confirm)
upper_dir = 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/'

#name of the sub-folder to process
tpt = '18'


#the raw .fcs files are read from <upper_dir><tpt>/raw/
dir_with_fcs_files_path = '{}{}/raw/'.format(upper_dir, tpt)

#get all the .fcs files sitting directly in that directory (glob does not walk down the dir tree)
allfcs = glob.glob('{}/*.fcs'.format(dir_with_fcs_files_path))

In [65]:
#peek at the first three paths glob returned
allfcs[:3]

['Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\bfp-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\blank-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\blankno-RDM2019-02-15.0001.fcs']

In [66]:
#get just the endings, which are the actual filenames, separate from the directory tree
#get just the endings, which are the actual filenames, separate from the directory tree
#the glob output above mixes '/' and '\\', so normalise the separator and keep what follows the last one
#(splitting on '\\' and taking element [1] only works when the path holds exactly one backslash)
fnames = [x.replace('\\', '/').rsplit('/', 1)[-1] for x in allfcs]

#get the ones that don't have anything at the front, those are the experimental ones
expnames = [x for x in fnames if x.startswith('RDM')]

#keep the full paths exactly as glob returned them, rather than re-attaching a hand-built directory
#string that has to equal glob's output character for character
expfcs = [path for path, name in zip(allfcs, fnames) if name.startswith('RDM')]


#every other file is a control; filtering on the filename avoids a list-membership scan per path

ctrlfcs = [path for path, name in zip(allfcs, fnames) if not name.startswith('RDM')]

#controls with 'final' in the filename (the directory part is ignored so a folder name can't match)
finalctrlfcs = [path for path, name in zip(allfcs, fnames) if not name.startswith('RDM') and 'final' in name]

In [67]:
#display every control file path (all .fcs paths that are not in expfcs)
ctrlfcs

['Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\bfp-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\blank-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\blankno-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\blch1-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\blch2-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\cal-RDM2019-02-15.0001.fcs',
 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/flow/18/raw\\di1-RDM2019-02-15.

In [69]:
#I created a csv file that correlates fcs file number to well and volume flowed
#xlsx sheet that maps each fcs file number to its well and the volume flowed
ids_xlsx_path = '{}/{}-fcs num to well ID.xlsx'.format(dir_with_fcs_files_path, tpt)

#dtype=str loads every column as text
ids = pd.read_excel(ids_xlsx_path, dtype=str)

In [70]:
#preview the first rows of the id sheet as loaded from the xlsx
ids.head(11)

Unnamed: 0,fcs num,well,vol,Unnamed: 3,sampling_dil,flow_dil
0,1,a1-BAD,25,,,
1,2,A1,10,,4.0,40.0
2,3,B1,7,,4.0,40.0
3,4,C1,10,,4.0,40.0
4,5,D1,15,,4.0,40.0
5,6,E1,6,,4.0,40.0
6,7,F1,10,,4.0,40.0
7,8,G1,10,,4.0,40.0
8,9,H1,10,,4.0,40.0
9,10,A2,10,,4.0,40.0


In [71]:
#match each 'fcs num' row to an experimental fcs path; this adds a 'file' column and writes it back to the xlsx
#NOTE: ids is rebound here, so re-running this cell takes the "already contains a 'file' column" branch
ids = assoc_fname_well (ids, expfcs, ids_xlsx_path)

assignment completed without issue, all wells have a single filaname assignment
filename associations are written to the ids xlsx file


### Now create some csv files

In [72]:
def get_dataframe_from_fcs (desired_well, ids_frame):
    """Parse the fcs file associated with one well.

    Parameters
    ----------
    desired_well : value to look for in ids_frame['well']
    ids_frame : DataFrame with 'well' and 'file' columns

    Returns
    -------
    None when no row matches the well, or when the matching row has no usable
    file (missing value, or one of the 'NO MATCH' / '> 1 MATCH' placeholders
    written by assoc_fname_well). Otherwise the (meta, data) pair unpacked
    from fcsparser.parse for the first matching row.
    """
    #boolean mask of the rows belonging to the well you want
    well_mask = ids_frame['well'] == desired_well

    #if no well matches, return None
    if not well_mask.any():
        return None

    #gotta get values, which is an array, hence the [0], to get the actual string inside the array
    #because fcsparser only takes string input, can't deal with dataframe slices or arrays
    path = ids_frame.loc[well_mask, 'file'].values[0]

    #placeholders and empty cells are not paths; handing them to the parser would abort the
    #whole run, so report the well and signal "no data" the same way as an unmatched well
    if pd.isna(path) or path in ('NO MATCH', '> 1 MATCH'):
        print("well " + str(desired_well) + " has no usable fcs file association, skipping it")
        return None

    #use fcsparser to unpack fcs file to dataframe, get both the metadata and dataframe in case you want both
    meta, data = fcsparser.parse(path, meta_data_only=False, reformat_meta=True)

    return meta, data

In [73]:
def save_as_csv (well_data_dict, dir_to_save):
    """Write the data of every matched well to '<dir_to_save>/<well>.csv'.

    Parameters
    ----------
    well_data_dict : dict mapping a well name to either None (well had no match)
        or a tuple holding the meta data in [0] and the data in [1]
    dir_to_save : directory path the csv files are written into

    Returns
    -------
    None
    """
    for well_name, parsed in well_data_dict.items():

        #entries that weren't matched have nothing to save
        if parsed is None:
            continue

        #only the data part of the tuple is saved, without the index column
        events = parsed[1]
        events.to_csv(dir_to_save + '/' + well_name + '.csv', index=False)

    return None

In [74]:
#all 96 well names, row letter first then column number: A1..A12, B1..B12, ..., H1..H12
wells = [row + str(col) for row in 'ABCDEFGH' for col in range(1, 13)]

In [75]:
#look up every well name in the ids frame; each value is whatever get_dataframe_from_fcs returns for that well
well_data_dictionary = dict((well, get_dataframe_from_fcs(well, ids)) for well in wells)

In [76]:
#write one <well>.csv per well into the fcs directory; wells whose dictionary entry is None are skipped
save_as_csv(well_data_dictionary, dir_with_fcs_files_path)