In [12]:
#workhorses
import numpy as np
import pandas as pd


## Get the data

In [13]:
#folder that holds the input workbook; the trailing '/' matters because file paths in this notebook are built by
#plain string concatenation (directory + name)
#NOTE(review): absolute path on a mapped Z: drive - the notebook only runs where that drive is mounted
directory = 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A+pTet-ccdA/20200107 capnresc 4 count/'

#name of the Excel workbook inside `directory`
filename = '20200107 capnresc mwild 4 count.xlsx'

In [14]:
#sheet_name=None makes read_excel load every sheet and return a dict of DataFrames keyed by sheet name
data_dict = pd.read_excel(directory + filename, sheet_name=None)

In [15]:
#show which sheets were read from the workbook
data_dict.keys()

odict_keys(['Plate 1_raw', 'OD700', 'IDs', 'Exp'])

## Concat multiple plates over time

In [8]:
def fix_columns (dataframe):
    """Promote the lowest-indexed row of a table to be its column labels.

    The row carrying the smallest index label is treated as the header: each
    current column name is renamed to the value that row holds in that
    column, and the row itself is then removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose lowest-indexed row contains the real column names.

    Returns
    -------
    pandas.DataFrame
        New frame with relabelled columns and without the header row; the
        remaining rows keep their original index labels.
    """
    header_label = min(dataframe.index)

    #map every existing column name onto the value stored in the header row
    new_names = {}
    for old_name in dataframe.columns:
        new_names[old_name] = dataframe.loc[header_label, old_name]

    relabelled = dataframe.rename(columns=new_names)

    #the header row is now redundant, so drop it
    return relabelled.drop(header_label)

In [9]:
def get_plate_dfs (multiplate_df):
    """Stack several plate tables that were stored one under another in a single sheet.

    The sheet is expected to hold the first table directly under the column
    titles, followed by further tables that each start with their own header
    row (recognised by the string 'Time' in the 'Time' column). Each later
    table gets its header row promoted to column labels via `fix_columns`,
    and all tables are concatenated on matching column names.

    Table boundaries are computed as row positions, so the result does not
    depend on the frame carrying a default 0..n-1 index.

    Parameters
    ----------
    multiplate_df : pandas.DataFrame
        Sheet containing one or more stacked tables; must have a 'Time' column.

    Returns
    -------
    pandas.DataFrame
        All tables concatenated in their original order, with a fresh
        0..n-1 index.
    """
    #row positions where the 'Time' cell says "Time", which is the header row for that plate's table
    header_positions = np.flatnonzero((multiplate_df['Time'] == 'Time').to_numpy())

    #since the first table has its header as the column titles already, add position 0 to capture it;
    #the end of the sheet closes the last table
    boundaries = [0] + header_positions.tolist() + [len(multiplate_df)]

    #each pair of consecutive boundaries is the (start, end) of one table; select it from the big composite set
    tables = [multiplate_df.iloc[start:end] for start, end in zip(boundaries[:-1], boundaries[1:])]

    #the first set has proper column labels, the others have the column labels as the first row
    #replace the existing columns with the first row entries to get proper column labels
    alls = [tables[0]] + [fix_columns(table) for table in tables[1:]]

    #put them all together in order to get the final df
    #concat joins them on matching columns, so order of columns doesn't matter, just that they are present in all dfs
    output = pd.concat(alls).reset_index(drop=True)

    return output

In [10]:
#start from every sheet name except the 'Exp' and 'IDs' sheets
keys = [sheet for sheet in data_dict if sheet not in ('Exp', 'IDs')]

In [11]:
#drop any sheet whose name contains 'Plate'
keys = list(filter(lambda sheet: 'Plate' not in sheet, keys))

In [12]:
#display the sheet names that will be condensed
keys

['OD700_raw']

In [13]:
#condense each selected sheet into one table and save it next to the workbook as <sheet>_condensed.csv
for k in keys:
    condensed = get_plate_dfs(data_dict[k])
    condensed_path = directory + k + '_condensed.csv'
    condensed.to_csv(condensed_path, index=False)

## Munge the data

### Functions

In [16]:
def get_sec (datetimeTime_obj):
    """Return the number of whole seconds represented by a time-like object.

    Only the `hour`, `minute` and `second` attributes of the argument are
    read; nothing else on the object is taken into account.

    Parameters
    ----------
    datetimeTime_obj : object with `hour`, `minute` and `second` attributes
        For example a `datetime.time`.

    Returns
    -------
    int
        hour * 3600 + minute * 60 + second.
    """
    hours_as_sec = datetimeTime_obj.hour * 3600
    minutes_as_sec = datetimeTime_obj.minute * 60

    return hours_as_sec + minutes_as_sec + datetimeTime_obj.second

In [17]:
def replace_time (data_sheet):
    """Return a copy of the sheet with its 'Time' column expressed in hours.

    Every entry of 'Time' is converted to seconds with `get_sec` and then
    divided by 3600. The input frame is not modified.

    Parameters
    ----------
    data_sheet : pandas.DataFrame
        Table with a 'Time' column whose entries `get_sec` can read.

    Returns
    -------
    pandas.DataFrame
        Copy of `data_sheet` with 'Time' replaced by hours.
    """
    converted = data_sheet.copy()

    seconds = converted['Time'].apply(get_sec)
    converted['Time'] = seconds / 3600

    return converted

In [18]:
def replace_time_sequential (data_sheet):
    """Return a copy of the sheet whose 'Time' column is an evenly spaced series in hours.

    The recorded times are converted to hours with `get_sec`; the new column
    then starts at the first reading and advances by the interval between
    the first two readings, one step per row. All later recorded times are
    discarded in favour of this regular grid.

    The grid is built directly at the length of the frame, so there is no
    upper limit on the run duration and a zero or negative first interval
    no longer makes the assignment fail. Frames with fewer than two rows
    have no interval to extrapolate and simply get their times converted
    to hours.

    Parameters
    ----------
    data_sheet : pandas.DataFrame
        Table with a 'Time' column whose entries `get_sec` can read.

    Returns
    -------
    pandas.DataFrame
        Copy of `data_sheet` with 'Time' replaced; the input is not modified.
    """
    df = data_sheet.copy()

    hours = df['Time'].apply(get_sec).divide(3600)

    #with fewer than two readings there is no interval to extrapolate from
    if len(hours) < 2:
        df['Time'] = hours
        return df

    #take the first two readings by position so the frame's index labels do not matter
    first = hours.iloc[0]
    step = hours.iloc[1] - first

    #first, first + step, first + 2*step, ... with exactly one entry per row
    df['Time'] = first + step * np.arange(len(hours))

    return df

In [19]:
def replace_temp (data_sheet):
    """Return a copy of the sheet with the temperature column renamed to 'Temp C'.

    Any column whose name contains the degree sign is relabelled 'Temp C';
    all other column names are kept as they are. The input frame is not
    modified.

    Parameters
    ----------
    data_sheet : pandas.DataFrame
        Table whose column names are strings.

    Returns
    -------
    pandas.DataFrame
        Copy of `data_sheet` with the relabelled columns.
    """
    relabelled = data_sheet.copy()

    new_labels = []
    for label in relabelled.columns:
        if '°' in label:
            new_labels.append('Temp C')
        else:
            new_labels.append(label)

    relabelled.columns = new_labels

    return relabelled

In [20]:
def add_id_info (data_sheet_melted, df_id):
    """Annotate every row of a melted data table with the ID information of its well.

    For each column of `df_id` other than 'well', a column of the same name
    is added to (or overwritten in) the returned table and filled with the
    value `df_id` lists for that row's well. Rows whose well does not appear
    in `df_id` get NaN.

    Parameters
    ----------
    data_sheet_melted : pandas.DataFrame
        Long-format data with a 'well' column. Not modified.
    df_id : pandas.DataFrame
        One row per well: a 'well' column plus any number of ID columns.

    Returns
    -------
    pandas.DataFrame
        Copy of `data_sheet_melted` with the ID columns appended; row order
        and index are unchanged.

    Raises
    ------
    ValueError
        If `df_id` lists the same well more than once, since the annotation
        for that well would be ambiguous.
    """
    #make a copy of the dataframe so you can return the new one and set whatever name you want
    df = data_sheet_melted.copy()

    #every column except the key is a piece of ID info
    #the name has to be compared exactly: `col not in 'well'` is a substring test and would also
    #drop columns called e.g. 'w', 'e', 'l' or 'we'
    all_id_columns = [col for col in df_id.columns if col != 'well']

    #ID rows without a well name cannot annotate anything
    id_table = df_id.dropna(subset=['well'])

    repeated = id_table.loc[id_table['well'].duplicated(), 'well'].unique()
    if len(repeated) > 0:
        raise ValueError('df_id lists these wells more than once: ' + ', '.join(str(w) for w in repeated))

    #index the ID table by well so each column can be looked up by well name
    lookup = id_table.set_index('well')

    for c in all_id_columns:
        #look up the value for each row's well; wells missing from the ID table become NaN
        annotation = df['well'].map(lookup[c])

        #integer ID columns are stored as float so the column has the same dtype whether or not
        #every well was found (a missing well forces NaN, and therefore float, anyway)
        if isinstance(annotation.dtype, np.dtype) and annotation.dtype.kind in 'iu':
            annotation = annotation.astype(float)

        df[c] = annotation

    return df

## Do the munging

In [21]:
#keep only the sheets to munge: skip the 'Exp' and 'IDs' sheets and
#remove the sheets that have the non-concatenated data (name contains 'raw')
keys_to_munge = [sheet for sheet in data_dict
                 if sheet not in ('Exp', 'IDs') and 'raw' not in sheet]

In [22]:
#display the sheet names that will be munged
keys_to_munge

['OD700']

In [23]:
#replace time and temp by overwriting the original data
#NOTE: because data_dict is overwritten in place this cell is not re-runnable - once 'Time' holds hours as
#numbers, replace_time can no longer read hour/minute/second from it; reload the workbook before running again
for key in keys_to_munge:
    
    #convert the recorded 'Time' entries to hours
    data_dict[key] = replace_time(data_dict[key])
    #alternative kept for reference: rebuild 'Time' as an evenly spaced series instead of using the recorded times
#     data_dict[key] = replace_time_sequential(data_dict[key])
    #rename the temperature column to 'Temp C'
    data_dict[key] = replace_temp(data_dict[key])

### Assign well IDs

In [24]:
#get the well identifying information: the 'IDs' sheet of the workbook, used later as the ID table
#that is matched to the data through its 'well' column
ids = data_dict['IDs']

In [25]:
#independently melt the data so you can check and control the ID vars and value vars since that might differ per expt
#every column other than 'Time' and 'Temp C' is treated as a well and stacked into 'well' / 'value' columns
#var_name is given as a plain string: pd.melt documents it as a scalar, and newer pandas versions may reject a
#list when the frame has ordinary (non-MultiIndex) columns
melted_dict = {key : pd.melt(data_dict[key], id_vars=['Time', 'Temp C'], var_name='well')
              for key in keys_to_munge}

In [26]:
#then assign all the well information: one annotated table per munged sheet
final_dict = {}
for key in keys_to_munge:
    final_dict[key] = add_id_info(melted_dict[key], ids)

## Write the data to a new file

In [27]:
#save each annotated table next to the workbook as <sheet>_tidy.csv, without the row index
for key in final_dict:
    tidy_path = directory + key + '_tidy.csv'
    final_dict[key].to_csv(tidy_path, index=False)