In [2]:
import numpy as np
import pandas as pd

import os
import glob

In [3]:
#plotting things: plotting imports plus global seaborn/matplotlib styling.
#NOTE: every call below writes to matplotlib's global rcParams, so the order of the calls matters.

#%matplotlib qt5  -- disabled IPython magic; it would switch matplotlib to the interactive Qt5 backend
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from cycler import cycler


#All of Anandh's customized seaborn/matplotlib settings

#"talk" context with fonts scaled 1.5x and a 1.5 pt default line width
sns.set_context("talk", font_scale=1.5, rc={"lines.linewidth": 1.5})
#"ticks" preset, then tick marks pointing into the axes
sns.set_style("ticks")
sns.set_style({"xtick.direction": "in","ytick.direction": "in"})

#disabled: would make the inline backend render figures as svg
#%config InlineBackend.figure_formats=['svg']

#default color order for successive lines: red, black, blue, green, yellow, magenta, cyan
mpl.rc('axes', prop_cycle=(cycler('color', ['r', 'k', 'b','g','y','m','c']) ))

#fonttype 42 = TrueType fonts in pdf/ps output (matplotlib's default is Type 3)
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

#disabled LaTeX text rendering setup (sans-serif Helvetica for text and math), kept for reference
#mpl.rc('text', usetex=False)
#mpl.rc('text.latex', preamble=r'\usepackage{helvet}
#\renewcommand\familydefault{\sfdefault}\usepackage{sansmath}\sansmath')

#If you want to use a different font (disabled)
# mpl.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica'], 
#                  'serif': ['Helvetica']})

#tw = width shared by all major/minor tick marks
tw = 1.5
sns.set_style({"xtick.major.size": 3, "ytick.major.size": 3,
               "xtick.minor.size": 2, "ytick.minor.size": 2,
               'axes.labelsize': 16, 'axes.titlesize': 16,
               'xtick.major.width': tw, 'xtick.minor.width': tw,
               'ytick.major.width': tw, 'ytick.minor.width': tw})

#explicit tick-label / legend font sizes, axes frame width and default figure size (inches)
mpl.rc('xtick', labelsize=14) 
mpl.rc('ytick', labelsize=14)
mpl.rc('axes', linewidth=1.5)
mpl.rc('legend', fontsize=14)
mpl.rc('figure', figsize=(9,8))

## Get data

In [62]:
# Folder holding this experiment's workbook. The trailing '/' is required because the
# cells below build paths by plain string concatenation (directory + filename).
# NOTE(review): absolute path on a Z: drive -- presumably only resolves on the author's machine.
directory = 'Z:/Reed/Projects/micro_consortia/DARPA_biocon/Task 1.1/A=B/20190214 A=B mar cfp yfp small screen 1/'

# Excel workbook to load from that folder
filename = '20190214 A=B mar 1.xlsx'

In [63]:
# sheet_name=None makes read_excel load every sheet and return them keyed by sheet name
dd = pd.read_excel(directory + filename, sheet_name=None)

# show which sheets were loaded
dd.keys()

odict_keys(['OD700_raw', 'CFP_raw', 'YFP_raw', 'OD700', 'CFP', 'YFP', 'OD700_tidy', 'CFP_tidy', 'YFP_tidy', 'dense_norm_cumsum_tidy', 'IDs', 'Exp'])

## Create condensed data

In [8]:
def join_fluor_and_od700 (dict_of_data):
    """
    joins every '_tidy' sheet of a workbook into one dataframe that uses the time
    values of the OD700 sheet as its single time index.

    The other sheets carry their own, differing time values. For each of them the sorted
    unique time values are paired, in order, with the sorted unique OD700 time values and
    replaced by them, so the i-th read of a sheet is labelled with the i-th OD700 time.
    The 'value' column of every sheet is renamed after its sheet ('OD700_tidy' -> 'od700'),
    the sheets are placed side by side, columns with a repeated name are dropped (the first
    one is kept) and "OVRFLW" entries are replaced with 99999.

    The dataframes in dict_of_data are left unmodified.

    ---Params---

    dict_of_data : dict of sheet name -> dataframe, e.g. from pd.read_excel(..., sheet_name=None).
                    Every sheet whose name contains '_tidy' is joined, exactly one of them
                    must contain 'OD700', and each of them needs a 'Time' column.

    ---Returns---

    a new dataframe holding all the joined sheets

    ---Raises---

    ValueError : if there is not exactly one '_tidy' sheet with 'OD700' in its name
    """

    #get the data sheets you want to join (from the argument, not from a notebook global)
    useful_sheets = [x for x in dict_of_data.keys() if '_tidy' in x]

    #find the od sheet, which will be the master one
    od_sheets = [x for x in useful_sheets if 'OD700' in x]

    if len(od_sheets) != 1:
        raise ValueError("expected exactly one '_tidy' data sheet with 'OD700' in its name, "
                         "found {}: {}".format(len(od_sheets), od_sheets))
    od_sheet = od_sheets[0]

    #get the od time column; np.unique already returns the values sorted
    ot = np.unique(dict_of_data[od_sheet]['Time'])

    dataframes = []
    for sheet in useful_sheets:
        #work on a copy so the caller's dataframe is not changed
        sheet_data = dict_of_data[sheet].copy()

        if sheet != od_sheet:
            #get this one's time values
            ft = np.unique(sheet_data['Time'])

            #every time you see (this sheet's time entry), replace it with (od time entry).
            #zip stops at the shorter list, so time values without an od partner become NaN
            time_replacement_dict = dict(zip(ft, ot))
            sheet_data['Time'] = sheet_data['Time'].map(time_replacement_dict)

        #name the measurement column after its sheet, e.g. 'CFP_tidy' -> 'cfp'
        sheet_data = sheet_data.rename({'value' : sheet.lower().replace('_tidy', '')}, axis='columns')

        dataframes.append(sheet_data)

    all_data_joined = pd.concat(dataframes, axis='columns')

    #the sheets share their identifier columns, keep only the first copy of each
    all_data_joined = all_data_joined.loc[:,~all_data_joined.columns.duplicated()]

    #fix overflow errors
    all_data_joined = all_data_joined.replace("OVRFLW", 99999)

    return all_data_joined

In [9]:
# Join the '_tidy' sheets of the workbook into one dataframe on the OD700 time values.
# NOTE(review): the dd.keys() output above lists 'dense_norm_cumsum_tidy', whose name also
# contains '_tidy', so it would be picked up by the join as well if this cell is re-run.
dense = join_fluor_and_od700(dd)

In [10]:
# display the joined dataframe
dense

Unnamed: 0,Time,Temp C,well,od700,iptg,sal,a,b,dil,cfp,yfp,cfp_norm,yfp_norm,cfp_sum,yfp_sum,cfp_norm_sum,yfp_norm_sum
0,0.152222,37.0,A1,0.084,0,0,aa6,b7,0,7310,590,87023.809524,7023.809524,7310,590,8.702381e+04,7.023810e+03
1,0.318889,37.0,A1,0.085,0,0,aa6,b7,0,7320,609,86117.647059,7164.705882,14630,1199,1.731415e+05,1.418852e+04
2,0.485556,37.0,A1,0.086,0,0,aa6,b7,0,6886,632,80069.767442,7348.837209,21516,1831,2.532112e+05,2.153735e+04
3,0.652222,37.0,A1,0.088,0,0,aa6,b7,0,7165,674,81420.454545,7659.090909,28681,2505,3.346317e+05,2.919644e+04
4,0.818889,37.0,A1,0.090,0,0,aa6,b7,0,7044,695,78266.666667,7722.222222,35725,3200,4.128983e+05,3.691867e+04
5,0.985556,37.0,A1,0.093,0,0,aa6,b7,0,6936,721,74580.645161,7752.688172,42661,3921,4.874790e+05,4.467135e+04
6,1.152222,37.0,A1,0.097,0,0,aa6,b7,0,7194,802,74164.948454,8268.041237,49855,4723,5.616439e+05,5.293940e+04
7,1.318889,37.0,A1,0.104,0,0,aa6,b7,0,7401,841,71163.461538,8086.538462,57256,5564,6.328074e+05,6.102593e+04
8,1.485556,37.0,A1,0.112,0,0,aa6,b7,0,7306,962,65232.142857,8589.285714,64562,6526,6.980395e+05,6.961522e+04
9,1.652222,37.0,A1,0.123,0,0,aa6,b7,0,7536,1067,61268.292683,8674.796748,72098,7593,7.593078e+05,7.829002e+04


## Create OD-normalized fluorescence

In [102]:
#od normalized fluor data: each raw fluor column divided row by row by od700,
#added to a copy so that dense itself stays as it was

dense_n = dense.assign(
    cfp_norm=dense['cfp'] / dense['od700'],
    yfp_norm=dense['yfp'] / dense['od700'],
)

## Create cumulative fluorescence

In [103]:
def cumsum_fluor (df):
    """
    append running (cumulative) totals of the fluorescence columns to the big datasheet.

    Every column whose name contains 'fp' is summed, which covers raw and od normalized
    fluorescence -- and any other column that happens to contain 'fp'. The totals are built
    separately for each well, in order of increasing 'Time', and are returned next to the
    original columns as '<column>_sum'.
    """

    #columns to accumulate
    fluor_cols = [name for name in df.columns if 'fp' in name]

    def running_total (one_well):
        #order by time first so the total follows the time course of the well
        totals = one_well.sort_values('Time')[fluor_cols].cumsum()
        totals.columns = totals.columns + '_sum'
        return totals

    per_well = [running_total(df.loc[df['well'] == w]) for w in np.unique(df['well'])]

    #stack the wells back on top of each other, then line the sums up with the
    #original rows through the shared index
    all_totals = pd.concat(per_well, axis='index')

    return pd.concat([df, all_totals], axis='columns')

In [104]:
# add per-well cumulative sums ('<column>_sum') for every column containing 'fp'
dense_n_sum = cumsum_fluor(dense_n)

['cfp', 'yfp', 'cfp_norm', 'yfp_norm']


In [107]:
# save the condensed, normalized, cumulative-summed table in the same folder as the workbook
dense_n_sum.to_csv(directory + 'condensed_normed_cumsum_data.csv')

## Create a two-stage, non-cumulative CFP/YFP trace

In [64]:
#already had the above analysis done:
#start from the 'dense_norm_cumsum_tidy' sheet stored in the workbook instead of recomputing it.
#.copy() keeps the sheet held in dd untouched
df = dd['dense_norm_cumsum_tidy'].copy()

In [111]:
#scratch check of the column suffixing used in recenter_at_dil.
#fp_not_sum is only a local variable inside that function, so build it here from df;
#without this the cell raises a NameError on a fresh kernel
fp_not_sum = [x for x in df.columns if 'fp' in x and '_sum' not in x]
[x+'_aaa' for x in fp_not_sum]

['cfp_aaa', 'yfp_aaa', 'cfp_norm_aaa', 'yfp_norm_aaa']

In [136]:
def recenter_at_dil (dense_data):
    """
    recenters the fluorescence data at the first dilution of a plate, only supports one dilution.

    Initial growth phase is the 0th dilution (rows with dil == 0), then first dilution is the
    only dilution (every row with dil != 0). For each well, the jump between the last
    0th dilution reading and the first 1st dilution reading is subtracted from all 1st dilution
    readings, so the recentered 1st dilution starts at the value the 0th dilution ended on.

    ---Params---

    dense_data : a dataframe in which all the different measurement modalities (YFP, OD, CFP)
                    with differing time indices have been condensed into this sheet which has one single
                    time index. Needs the columns 'well', 'dil' and 'Time'; fluorescence columns
                    are recognised by 'fp' in their name.

    ---Returns---

    a copy of dense_data with one extra '<column>_recntr' column for every fluorescence column
    whose name does not contain '_sum'. 0th dilution rows hold a copy of the original values.
    A well without any 1st dilution rows just gets that copy; a well without any 0th dilution
    rows has nothing to be recentered on and keeps NaN in its '_recntr' columns.
    """

    df = dense_data.copy()

    #I only want to do this recentering on raw and normalized fluor, not summed stuff
    fp_not_sum = [x for x in df.columns if 'fp' in x and '_sum' not in x]
    recntr_cols = [x + '_recntr' for x in fp_not_sum]

    #preallocate these columns to fill
    for col in recntr_cols:
        df[col] = np.nan

    #this is where the function restricts its support to just one dilution.
    #the mask does not depend on the well, so it is built once outside the loop
    idil = df['dil'] == 0

    #gotta loop over all the wells
    for well in df['well'].dropna().unique():

        iwell = df['well'] == well
        i_zeroth = iwell & idil
        i_first = iwell & ~idil   #note the NOT operator here to get the 1st dil

        #backfilling the 0th dilution values in the recentered data with
        #a copy of the original values (they don't change)
        for recntr_col, col in zip(recntr_cols, fp_not_sum):
            df.loc[i_zeroth, recntr_col] = df.loc[i_zeroth, col]

        #a well that lacks one of the two stages has no dilution step to recenter on
        if not (i_zeroth.any() and i_first.any()):
            continue

        zeroth = df.loc[i_zeroth]
        first = df.loc[i_first]

        #pick the boundary rows by position: always exactly one row each, even when a
        #time value is repeated, and no float equality test on 'Time' is needed
        last_of_zeroth = zeroth[fp_not_sum].iloc[np.nanargmax(zeroth['Time'].to_numpy(dtype=float))]
        first_of_first = first[fp_not_sum].iloc[np.nanargmin(first['Time'].to_numpy(dtype=float))]

        #size of the jump at the dilution step, one value per fluor column
        sub_from_first = first_of_first - last_of_zeroth

        #just modifying the entries in the 1st dilution to center them on the last values of the 0th dil
        for recntr_col, col in zip(recntr_cols, fp_not_sum):
            df.loc[i_first, recntr_col] = df.loc[i_first, col] - sub_from_first[col]

    return df

In [138]:
#NOTE(review): the author observed that the NaNs preallocated in the '_recntr' columns were not
#being overwritten, i.e. the assignment in the function's final loop did not seem to take effect.
#It is unclear whether that still applies -- check the '_recntr' columns for NaN before using the result.
dense_norm_sum_recntr = recenter_at_dil(df)

In [139]:
# save the recentered table in the same folder as the workbook
dense_norm_sum_recntr.to_csv(directory + 'condensed_normed_cumsum_recentered_data.csv')