In [511]:
import pandas as pd
import numpy as np

import itertools

## open data

In [512]:
# folder and file name of the workbook to tidy
directory = "Z:/Reed/Projects/lab misc/echo_cfu_count/"
filename = "20200123 1536 echu cfu test.xlsx"

# sheet_name=None loads every sheet; d maps sheet name -> DataFrame
d = pd.read_excel(f"{directory}{filename}", sheet_name=None)

In [513]:
# list the sheet names that were loaded from the workbook
d.keys()

odict_keys(['Plate 1', 'Plate 2', 'Plate 3', 'Plate 4', 't0 seal', 't0', 'te', 'tef', 'IDs'])

## get columns and rows named correctly

In [514]:
#for more than a few plates: keep every sheet whose name contains a '-',
#leaving out any sheet whose name contains 'tidy'
plates = [x for x in d.keys() if '-' in x and 'tidy' not in x]

In [515]:
# explicit list of sheets to process; this replaces any earlier value of `plates`
plates = ['t0', 'te', 'tef']

In [516]:
#choose the sheet to tidy and pull it out of the workbook dict
plate = plates[2]

#reset_index moves the letter index into an ordinary column (named 'index'),
#leaving a plain numerical index for reference
df = d[plate].reset_index()

#collect the string column names containing "Unnamed" so this stays generic
#across different plate types
rename = [
    col
    for col in df.columns.tolist()
    if isinstance(col, str) and "Unnamed" in col
]

#clearer names: first "Unnamed" column -> 'channel', old index -> 'row'
df = df.rename(columns={rename[0]: 'channel', 'index': 'row'})

#rotate the column order so the last column comes first
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

In [517]:
# rows aren't all named with the row letter, so carry the last seen letter
# down over the NaNs. Series.ffill() is the supported spelling of
# fillna(method='ffill'), which newer pandas releases deprecate.
df['row'] = df['row'].ffill()

# make the channel names easier: labels look like "RFP:579,616", keep only
# the part before the ':'
split_channel_names = df['channel'].str.split(':', expand=True)

df['ch'] = split_channel_names[0]

# put the channel names where they're easy to see (rotate the last column,
# the new 'ch', to the front)
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]

df = df[cols]

# if there are overflow values (the text 'OVFLW'), replace them with a
# numeric sentinel so the column can be treated as numbers
df = df.replace({'OVFLW': 99999})

## break it into long form

In [518]:
# preview the relabelled table before it is reshaped
df.head()

Unnamed: 0,ch,channel,row,1,2,3,4,5,6,7,...,39,40,41,42,43,44,45,46,47,48
0,OD700,OD700:700,A,0.341,0.314,0.31,0.352,0.359,0.333,0.349,...,0.337,0.32,0.34,0.332,0.444,0.325,0.329,0.33,0.334,0.514
1,RFP,"RFP:579,616",A,104.0,150.0,148.0,96.0,80.0,111.0,131.0,...,99.0,119.0,96.0,96.0,93.0,109.0,112.0,108.0,123.0,96.0
2,CFP,"CFP:430,491",A,455.0,533.0,564.0,427.0,276.0,460.0,457.0,...,458.0,459.0,474.0,460.0,1982.0,489.0,500.0,466.0,495.0,2498.0
3,YFP,"YFP:500,541",A,65.0,75.0,79.0,57.0,38.0,63.0,64.0,...,64.0,64.0,67.0,64.0,73.0,68.0,69.0,66.0,70.0,71.0
4,OD700,OD700:700,B,0.367,0.343,0.361,0.48,0.367,0.488,0.366,...,0.495,0.315,0.611,0.328,0.353,0.473,0.35,0.326,0.358,0.324


In [519]:
# wide -> long: one line per (channel, row, column) reading, ordered by well
df = (
    df.drop(columns='channel')
    .melt(id_vars=['ch', 'row'], var_name='col')
    .sort_values(by=['row', 'col'])
)

In [520]:
#couldn't figure out how to get each column to be a measurement with just row and col to be the indices using melt
#and pivot so I just hack it here

#one {channel name: value} record per reading, in the same order as df
records = [{name: value} for name, value in zip(df['ch'], df['value'])]

#make the ugliest dataframe ever with tons of nans: one column per channel, each line holding only its own
#reading. Building it in a single constructor call (instead of concatenating one tiny DataFrame per reading)
#keeps the same index and layout without creating thousands of intermediate frames
stick_on = pd.DataFrame(records, index=df.index)

In [521]:
#add the ugly dataframe to the existing one and backfill values. This will result in the FIRST row of each row/col
#pair having the correct set of data since measurement entries after this row will fill across it and measurement
#entries in this row will stay there. This only puts the right data in the FIRST row of each row/col pair.
#We will use the lowest index later to get this row
#
#.bfill() is the supported spelling of fillna(method='bfill'), which newer pandas releases deprecate.
#NOTE(review): back-filling cannot tell a placeholder NaN from a reading that is genuinely missing, so a missing
#reading would silently take the next available value below it - confirm the data has no missing readings

new = pd.concat([df, stick_on], axis='columns').bfill()

In [522]:
#every combination of the row letters and column numbers found in the table
letters = new['row'].unique()
nums = new['col'].unique()
combs = list(itertools.product(letters, nums))

#for each row/col pair keep the line with the lowest index label (the one that
#ends up holding all of the channel readings after the backfill)
good_rows = [
    new.loc[min(new[(new['row'] == let) & (new['col'] == num)].index)]
    for let, num in combs
]

#stack the selected lines back into a table and drop the reshaping helpers
df = pd.concat(good_rows, axis='columns').T.drop(columns=['ch', 'value'])

In [523]:
# df.to_csv(directory + '{}_tidy.csv'.format(plate), index=False)

# STOP: move the generated csv files into the master data file, then delete them. Also add any easy-to-generate information you need to those files.

## make a single table from all tidy ones

In [538]:
#reopen file after the sheets have been added, so the newly added sheets are
#present in d (sheet_name=None loads every sheet into a dict keyed by name)
d = pd.read_excel(directory + filename, sheet_name=None)

In [539]:
# names of every sheet whose name contains 'tidy'
tidy = [x for x in d.keys() if 'tidy' in x]

In [540]:
# gather the tidy sheets, in the order their names appear in `tidy`
data_list = [d[name] for name in tidy]

In [542]:
# pd.concat(data_list).to_csv(directory + 'tall_tidy.csv', index=False)

# STOP, move the generated csv file into the master data file, then delete it

## associate the IDs

In [543]:
#reopen file after the sheets have been added, so the newly added sheets are
#present in d (sheet_name=None loads every sheet into a dict keyed by name)
d = pd.read_excel(directory + filename, sheet_name=None)

In [544]:
# the combined long-format table of all plates
df = d['tall_tidy']

# the ID sheet whose columns get attached to df further down
ids = d['IDs']

In [545]:
# preview the combined table before the IDs are attached
df.head()

Unnamed: 0,row,col,OD700,RFP,CFP,YFP,time
0,A,1,0.455,82,1299,71,0
1,A,2,0.061,28,70,6,0
2,A,3,0.447,64,1053,41,0
3,A,4,0.523,122,2299,94,0
4,A,5,0.543,7984,2500,111,0


In [546]:
# preview the ID sheet
ids.head()

Unnamed: 0,row,cell,dil0,dile
0,A,2c1r,10,10000
1,B,2c1r,10,10000
2,C,2c1r,100,100000
3,D,2c1r,100,100000
4,E,2c1r,1000,1000000


In [547]:
# columns copied over from the ID sheet, and the key column(s) used to decide
# which lines of df each ID entry applies to
stuff_to_add = ['cell', 'dil0', 'dile']
stuff_to_check = ['row']

for id_label in ids.index:
    # key value(s) and the values to copy for this ID entry
    check = ids.loc[id_label, stuff_to_check]
    add = ids.loc[id_label, stuff_to_add]

    # lines of df that agree with this entry on EVERY key column; works for any
    # number of keys and keeps the mask separate from the loop variable
    matches = pd.Series(True, index=df.index)
    for key in stuff_to_check:
        matches &= (df[key] == check[key])

    # write this entry's values onto the matching lines (the columns are
    # created on first assignment)
    for a in stuff_to_add:
        df.loc[matches, a] = add[a]

In [548]:
# check that the ID columns were attached
df.head()

Unnamed: 0,row,col,OD700,RFP,CFP,YFP,time,cell,dil0,dile
0,A,1,0.455,82,1299,71,0,2c1r,10.0,10000.0
1,A,2,0.061,28,70,6,0,2c1r,10.0,10000.0
2,A,3,0.447,64,1053,41,0,2c1r,10.0,10000.0
3,A,4,0.523,122,2299,94,0,2c1r,10.0,10000.0
4,A,5,0.543,7984,2500,111,0,2c1r,10.0,10000.0


In [549]:
# pick the dilution that applies to each line in one vectorised step:
# dil0 where time == 0, dile for every other line (same rule as looping over
# the lines one at a time, without the per-line .loc lookups)
df['dil'] = np.where(df['time'] == 0, df['dil0'], df['dile'])

In [550]:
# dil0 and dile are no longer needed once 'dil' has been filled in
df = df.drop(columns=['dil0', 'dile'])

In [551]:
# df.to_csv(directory + 'tall_tidy_master.csv'.format(plate), index=False)

# You're done. You can delete the existing tall_tidy sheet and replace it with this master sheet, then rename it back to tall_tidy if you like.