In [56]:
#workhorses
import numpy as np
import pandas as pd

#for writing to excel files
from openpyxl import load_workbook

## Get the data

In [57]:
directory = 'C:/Users/geeze/Box/biocircuits/Reed/projects/DARPA_biocon/Task 1.1/A+pTet-ccdA/20200917 capnresc 4 fresh trans full96 set 1/'

filename = '20200917 capnresc 4 fresh trans full96 set 1.xlsx'

In [58]:
data_dict = pd.read_excel(directory + filename, sheet_name=None)

In [59]:
data_dict.keys()

dict_keys(['1', '2', '3', '4', '5', 'IDs', 'Exp', 'OD700,_tidy'])

## do you have multiple plates you need to concat?

In [60]:
many_plates = True

## Concat multiple plates over time

In [61]:
plate_keys = [x for x in data_dict.keys() if x not in ['Exp', 'IDs']]
print(plate_keys)

['1', '2', '3', '4', '5', 'OD700,_tidy']


In [62]:
if many_plates:
    plate_data = []

    for k in keys:
        plate_data.append(data_dict[k])
        
    full_data = pd.concat(plate_data)

In [63]:
# look for the temperature column, it contains the channel information
channel_label = [x for x in full_data.columns if ('°' in x)]

#make sure it's unique
if len(channel_label) == 1:
    channel_label = channel_label[0]
else:
    raise ValueError('you have not identified the unique Temp column that contains the channel info')
    
# get the channel info
channel_label = channel_label.lstrip('T° ')

#colons are not allowed, neither are brackets
channel_label = channel_label.replace(':', '')
channel_label = channel_label.replace('[', '')
channel_label = channel_label.replace(']', '')

In [65]:
path = directory + filename

book = load_workbook(path)
writer = pd.ExcelWriter(path, engine = 'openpyxl')
writer.book = book

full_data.to_excel(writer, sheet_name = channel_label + '_concat', index=False)


writer.save()
writer.close()

## Munge the data

### Functions

In [112]:
def get_sec (obj):
    
    #pandas seems to read the biotek time column as strings sometimes, sometimes as datetime objects
    if isinstance(obj, str):
        hour, minute, sec = [int(x) for x in obj.split(':')]
        total = (hour * 60 * 60) + (minute * 60) + sec
    else:
        #if it's read as datetime
        total = (obj.hour * 60 + obj.minute) * 60 + obj.second
    
    return total

In [113]:
def replace_time (data_sheet):
    
    df = data_sheet.copy()
    
    df['Time'] = df['Time'].apply(get_sec).divide(3600)
    
    return df

In [114]:
def replace_time_sequential (data_sheet):
    
    df = data_sheet.copy()
    
    df['Time'] = df['Time'].apply(get_sec).divide(3600)
    
    first = df.loc[0, 'Time']
    second = df.loc[1, 'Time']
    
    diff = second - first
    
    too_long = np.arange(first, 1000, diff)
    
    #should just put the len(df) part above, but whatever this already does the job
    #20201009 actually that doesn't work! the second argument to arange is the actual time value, not the length in entries
    #so keep it this way
    proper_length = too_long[:(len(df['Time']))]
    
    df['Time'] = proper_length
    
    return df

In [131]:
def replace_temp (data_sheet):
    
    df = data_sheet.copy()
    
    c = ['Temp C' if '°' in x else x for x in df.columns]
    
    df.columns = c
    
    return df

In [124]:
def add_id_info (data_sheet_melted, df_id):
    
    #make a copy of the dataframe so you can return the new one and set whatever name you want
    df = data_sheet_melted.copy()

    #initialize the columns to be used to store ID info
    all_id_columns = [col for col in df_id.columns if col not in 'well']

    for c in all_id_columns:
        df[c] = np.nan

    #get the list of all the wells you want to annotate
    wells = np.unique(df_id['well'])

    #loop over these wells and add the information to the initialized columns
    for w in wells:
        #get the ID information for this well
        append_this = df_id.loc[df_id['well'] == w, :]

        #loop over each column containing a unique piece of ID info
        for c in all_id_columns:

            #set the value for that column for that well
            df.loc[df['well'] == w, c] = append_this[c].values
                                                                #have to use .values otherwise index carries along and only
                                                                #sets the value for the data table index equal to the
                                                                #id table index

    return df

## Do the munging

In [146]:
# reload the data since you may have just edited it above. This is necessary rather then passing the data
# through from above because not every use of this code will concat multiple plate sheets together
# in which case you need to find the right sheet right here

data_dict = pd.read_excel(directory + filename, sheet_name=None)

In [147]:
data_dict.keys()

dict_keys(['1', '2', '3', '4', '5', 'OD700_concat', 'IDs', 'Exp'])

In [148]:
keys_to_munge = [x for x in data_dict.keys() if '_' in x]

In [149]:
keys_to_munge

['OD700_concat']

In [150]:
#replace time and temp by overwriting the original data
for key in keys_to_munge:
    
    if many_plates:
        # the reason you need this sequential replace time is because you sometimes concat multiple plates
        # so if that value is true you know you need to do it. There aren't any other circumstances for using this
        # function I don't think
        data_dict[key] = replace_time_sequential(data_dict[key])
    else:
        data_dict[key] = replace_time(data_dict[key])

    data_dict[key] = replace_temp(data_dict[key])

### Assign well IDs

In [151]:
#get the well identifying information
ids = data_dict['IDs']

In [152]:
#independently melt the data so you can check and control the ID vars and value vars since that might differ per expt
melted_dict = {key : pd.melt(data_dict[key], id_vars=['Time', 'Temp C'], var_name=['well'])
              for key in keys_to_munge}

In [153]:
#then assign all the well information
final_dict = {key : add_id_info(melted_dict[key], ids)
             for key in keys_to_munge}

## Write the data to the file

In [155]:
for key in final_dict:
    path = directory + filename

    book = load_workbook(path)
    writer = pd.ExcelWriter(path, engine = 'openpyxl')
    writer.book = book


    final_dict[key].to_excel(writer, sheet_name = key + '_tidy', index=False)


    writer.save()
    writer.close()