In [108]:
#workhorses
import numpy as np
import pandas as pd


## Get the data

In [109]:
#folder holding the plate-reader workbook; paths below are built by plain string concatenation
#(directory + filename), so the trailing slash is required
#NOTE(review): absolute, machine-specific path -- adjust before running on another machine
directory = 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20181128 A=B cfp yfp screen 1/'

#Excel workbook inside `directory` that is read into data_dict
filename = '20181128 cfp yfp A=B screen 1.xlsx'

In [110]:
#sheet_name=None makes read_excel load every sheet and return a dict of {sheet name: DataFrame}
data_dict = pd.read_excel(directory + filename, sheet_name=None)

In [111]:
#show which sheets were loaded from the workbook
data_dict.keys()

odict_keys(['OD700', 'CFP61', 'CFP100', 'YFP61', 'YFP100', 'Exp', 'IDs'])

## Concat multiple plates over time

In [10]:
def fix_columns (dataframe):
    """Promote the first row of ``dataframe`` to be its column labels.

    The row carrying the smallest index label is treated as the header:
    every existing column is renamed to the value that row holds in that
    column, and the row itself is then removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose lowest-index row contains the real column names.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns and without the header row.
        The input frame is not modified.
    """
    header_label = min(dataframe.index)

    #map each current column label to the entry the header row holds for it
    new_names = {}
    for old_name in dataframe.columns:
        new_names[old_name] = dataframe.loc[header_label, old_name]

    relabelled = dataframe.rename(columns = new_names)

    #the header row is metadata rather than data, so drop it
    return relabelled.drop(header_label)

In [30]:
def get_plate_dfs (multiplate_df):
    """Stitch several vertically stacked plate tables into one table.

    The sheet is expected to hold one table per plate, stacked on top of each
    other. The first table's header is already the DataFrame's column labels;
    every later table starts with its own header row, recognised by the
    literal string 'Time' in the 'Time' column.

    Parameters
    ----------
    multiplate_df : pandas.DataFrame
        Composite sheet with a 'Time' column. Any row index is accepted; it
        is replaced by a fresh 0..n-1 index internally.

    Returns
    -------
    pandas.DataFrame
        All plate tables concatenated in their original order, with the
        embedded header rows removed and a fresh 0..n-1 index. Columns are
        matched by name and kept in first-seen order.
    """
    #work on a 0..n-1 index so that index labels and row positions coincide;
    #the header locations found below are labels but are used positionally with .iloc
    stacked = multiplate_df.reset_index(drop=True)

    #gets the indices where the row says "Time", which is the header row for that plate's table
    header_rows = stacked.loc[stacked['Time'] == 'Time'].index

    #since the first table has its header as the column titles already, start it at row 0;
    #len() (rather than max(index) + 1) also copes with an empty sheet
    boundaries = [0] + list(header_rows) + [len(stacked)]

    #each table runs from one boundary up to (not including) the next
    plates = [stacked.iloc[start:stop] for start, stop in zip(boundaries[:-1], boundaries[1:])]

    #the first set has proper column labels, the others have the column labels as the first row
    #replace the existing columns with the first row entries to get proper column labels
    relabelled = [plates[0]] + [fix_columns(plate) for plate in plates[1:]]

    #concat joins on matching column names, so the column order within each plate doesn't matter
    #sort=False keeps the columns in first-seen order on every pandas version and avoids the
    #FutureWarning older pandas raises when the plates' columns are not aligned
    output = pd.concat(relabelled, sort=False).reset_index(drop=True)

    return output

In [31]:
#sheets to condense: every sheet except 'Exp' and 'IDs'
keys = [x for x in data_dict.keys() if x not in ['Exp', 'IDs']]

In [33]:
#condense each selected sheet with get_plate_dfs and write it into `directory`
#as '<sheet name>_condensed.csv' (index=False leaves the row index out of the file)
for k in keys:
    get_plate_dfs(data_dict[k]).to_csv(directory + k + '_condensed.csv', index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




## Munge the data

### Functions

In [112]:
def get_sec (datetimeTime_obj):
    """Convert a clock time into seconds elapsed since midnight.

    Parameters
    ----------
    datetimeTime_obj : object with ``hour``, ``minute`` and ``second``
        attributes (e.g. ``datetime.time``). Only those three attributes are
        read, so any sub-second part is ignored.

    Returns
    -------
    Number of seconds: ``3600 * hour + 60 * minute + second``.
    """
    hours_in_sec = 3600 * datetimeTime_obj.hour
    minutes_in_sec = 60 * datetimeTime_obj.minute

    return hours_in_sec + minutes_in_sec + datetimeTime_obj.second

In [113]:
def replace_time (data_sheet):
    """Return a copy of ``data_sheet`` with 'Time' expressed in hours.

    Each entry of the 'Time' column is converted to seconds with ``get_sec``
    and then divided by 3600.

    Parameters
    ----------
    data_sheet : pandas.DataFrame
        Table with a 'Time' column whose entries ``get_sec`` can convert.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the 'Time' column replaced; the input itself
        is not modified.
    """
    #convert first, then write the result into a fresh copy
    hours = data_sheet['Time'].apply(get_sec).divide(3600)

    converted = data_sheet.copy()
    converted['Time'] = hours

    return converted

In [130]:
def replace_time_sequential (data_sheet):
    """Return a copy of ``data_sheet`` with 'Time' rebuilt as evenly spaced hours.

    The 'Time' entries are converted to hours with ``get_sec``. The first
    value and the spacing between the first two values are then used to
    regenerate the whole column as ``first + i * spacing``. The later entries
    of the original column are therefore ignored.

    Parameters
    ----------
    data_sheet : pandas.DataFrame
        Table with a 'Time' column whose entries ``get_sec`` can convert.
        Rows are assumed to be in acquisition order with a constant
        sampling interval shorter than 24 h.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the rebuilt 'Time' column (hours). With fewer
        than two rows there is no interval to extrapolate, so the converted
        times are returned unchanged.

    Raises
    ------
    ValueError
        If the first two time stamps are identical, since no sampling
        interval can be derived from them.
    """
    df = data_sheet.copy()

    hours = df['Time'].apply(get_sec).divide(3600)

    #nothing to extrapolate from with fewer than two reads
    if len(hours) < 2:
        df['Time'] = hours
        return df

    #positional access so a non-default row index still works
    first = hours.iloc[0]
    diff = hours.iloc[1] - first

    #get_sec yields time of day, so a negative step means the clock passed midnight
    #between the first two reads
    if diff < 0:
        diff += 24

    if diff == 0:
        raise ValueError("cannot infer the sampling interval: the first two 'Time' values are identical")

    #build exactly len(df) time points; no arbitrary upper bound on the run length
    df['Time'] = first + diff * np.arange(len(df))

    return df

In [115]:
def replace_temp (data_sheet):
    """Return a copy of ``data_sheet`` with the temperature column named 'Temp C'.

    Any column whose label is a string containing the degree sign is renamed
    to 'Temp C'. This covers both a correctly decoded label and the mis-decoded
    two-character form in which a stray character precedes the degree sign,
    because that form still contains the degree sign itself.

    Parameters
    ----------
    data_sheet : pandas.DataFrame
        Table whose columns may include a temperature column labelled with a
        degree sign.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with renamed columns; the input is not modified.
        Non-string column labels are left as they are.
    """
    #written as an escape so the check does not depend on this file's encoding
    degree_sign = '\u00b0'

    df = data_sheet.copy()

    #isinstance guard: `in` on a non-string label (e.g. an int) would raise TypeError
    df.columns = ['Temp C' if isinstance(x, str) and degree_sign in x else x for x in df.columns]

    return df

In [116]:
def add_id_info (data_sheet_melted, df_id):
    """Annotate a melted data table with per-well identifying information.

    Parameters
    ----------
    data_sheet_melted : pandas.DataFrame
        Long-format table with a 'well' column.
    df_id : pandas.DataFrame
        One row per well: a 'well' column plus any number of ID columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_sheet_melted`` with one extra column per ID column of
        ``df_id`` (every column except the one named exactly 'well'). Rows
        whose well is absent from ``df_id`` get NaN in those columns. The
        row index of the input is kept, and ID values keep the dtype they
        have in ``df_id`` wherever every row finds a match.

    Raises
    ------
    ValueError
        If a well appears more than once in ``df_id``, because the
        annotation would be ambiguous.
    """
    #make a copy of the dataframe so you can return the new one and set whatever name you want
    df = data_sheet_melted.copy()

    #every column except 'well' carries ID info
    #exact comparison: `col not in 'well'` would be a substring test and would also
    #drop columns named e.g. 'l', 'we' or 'ell'
    all_id_columns = [col for col in df_id.columns if col != 'well']

    if df_id['well'].duplicated().any():
        raise ValueError("df_id lists the same well more than once; ID info per well must be unique")

    #index the ID table by well so each ID column becomes a well -> value lookup
    lookup = df_id.set_index('well')

    for c in all_id_columns:
        #map matches on the well value itself, so the ID table's own row index never interferes;
        #wells missing from the ID table come out as NaN
        df[c] = df['well'].map(lookup[c])

    return df

## Do the munging

In [131]:
#sheets holding measurements: every sheet except 'Exp' and 'IDs'
keys_to_munge = [x for x in data_dict.keys() if x not in ['Exp', 'IDs']]

#replace time and temp by overwriting the original data
#NOTE(review): data_dict is overwritten in place, so this cell is not safe to run twice --
#presumably a second pass would hand already-converted hours to get_sec; reload data_dict before re-running
for key in keys_to_munge:
    
#     data_dict[key] = replace_time(data_dict[key])   #alternative: convert each time stamp as-is, without rebuilding the sequence
    data_dict[key] = replace_time_sequential(data_dict[key])
    data_dict[key] = replace_temp(data_dict[key])

130
130
130
130
130


### Assign well IDs

In [133]:
#get the well identifying information (the 'IDs' sheet of the workbook)
ids = data_dict['IDs']

#get the data you want: one wide table per measurement sheet
#these are the same objects stored in data_dict, not copies
od_raw = data_dict['OD700']
cfp61_raw = data_dict['CFP61']
cfp100_raw = data_dict['CFP100']
yfp61_raw = data_dict['YFP61']
yfp100_raw = data_dict['YFP100']

In [134]:
#independently melt the data so you can check and control the ID vars and value vars since that might differ per expt
#every column other than 'Time' and 'Temp C' is treated as a well: its label goes into 'well', its readings into 'value'
#var_name is passed as a plain string: pandas documents it as a scalar, and newer releases
#reject a list when the columns are not a MultiIndex
od = pd.melt(od_raw, id_vars=['Time', 'Temp C'], var_name='well')

cfp61 = pd.melt(cfp61_raw, id_vars=['Time', 'Temp C'], var_name='well')

cfp100 = pd.melt(cfp100_raw, id_vars=['Time', 'Temp C'], var_name='well')

yfp61 = pd.melt(yfp61_raw, id_vars=['Time', 'Temp C'], var_name='well')

yfp100 = pd.melt(yfp100_raw, id_vars=['Time', 'Temp C'], var_name='well')

In [135]:
#attach the per-well ID columns from `ids` to every melted table
#NOTE: the OD result gets a new name (od_final) while the fluorescence tables overwrite their melted versions
od_final = add_id_info(od, ids)

cfp61 = add_id_info(cfp61, ids)

cfp100 = add_id_info(cfp100, ids)

yfp61 = add_id_info(yfp61, ids)

yfp100 = add_id_info(yfp100, ids)

## Write the data to a new file

In [136]:
#tables to export and the file-name stem for each
#the two lists are paired by position, so keep them in the same order
to_write = [od_final, cfp61, cfp100, yfp61, yfp100]
names = ['OD700_final', 'CFP61', 'CFP100', 'YFP61', 'YFP100']

In [137]:
#write each annotated table into `directory` as '<name>_tidy.csv' (index=False leaves the row index out)
for df, name in zip(to_write, names):
    df.to_csv(directory + name + '_tidy.csv', index=False)