In [30]:
'''
plot-growth-Fig2.py by Rohan Maddamsetti.

This script loads the well labels, and the raw time series data
from the plate reader.

It then puts these data into 'tidy' format in a data frame.
'tidy' is the format that Hadley Wickham uses.

The analysis follows what Zack has done:
1) average blank measurement at every time point.
2) subtract average blank measurement from each time point.
3) log2 transform the data.
4) plot the mean of the log-transformed data.
5) plot facets using seaborn.
'''

%matplotlib inline

from math import log2
from os.path import join
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt


In [2]:
def read_plate_labels(f):
    ''' return a tidy pandas dataframe with the well layout.'''
    fh = open(f)
    cols = { 'Name':[], 'Well':[] }
    for i,l in enumerate(fh):
        l = l.strip()
        if i == 0:
            continue
        data = l.split(',')
        row = data[0]
        names = data[1:]
        column = [str(i) for i in [1,2,3,4,5,6,7,8,9,10,11,12]]
        well = [row+x for x in column]
        assert len(well) == len(names)
        for i in range(len(well)):
            cols['Well'].append(well[i])
            cols['Name'].append(names[i])
    return pd.DataFrame(cols)

In [3]:
def read_growth_data(f):
    ''' return a tidy pandas dataframe of the growth curve time series.'''
    cols = {'Time':[], 'Temperature':[],'Well':[],'OD420':[]}
    fh = open(f)
    well = []
    for i,l in enumerate(fh):
        l = l.strip()
        if i == 0:
            well = l.split(',')[2:]
            continue
        data = l.split(',')
        time = data[0]
        temp = data[1]
        OD420 = data[2:]
        assert len(well) == len(OD420)
        for i in range(len(well)):            
            cols['Time'].append(pd.to_datetime(time))
            cols['Temperature'].append(temp)
            cols['Well'].append(well[i])
            cols['OD420'].append(float(OD420[i]))
    return pd.DataFrame(cols)

In [56]:
def process_data(df):
    '''
    1) average blank at each time point.
    2) subtract blank.
    3) log2 transform OD420.
    4) get average log2(OD420) for each strain
    then return the dataframe.
    '''

    blank_data = df[df.Name=='Blank']
    blank_time_avg = blank_data.groupby('Time').mean()
    ##print(blank_time_avg)
    ##print(blank_time_avg.dtypes)
    ##df = df.sort_values('Time')
    ##blank_subtract = pd.concat([blank_time_avg]*96)#.sort('Time')
    ##print(blank_subtract.dtypes)
    ##quit()
    ## This next line is really slow (~7 seconds).
    df['subtracted_OD420'] = df.apply(lambda x: x['OD420'] - blank_time_avg.loc[x['Time']],axis=1)
    ##df['subtracted_OD420'] = df.apply(lambda x: x['OD420'] - blank_time_avg.loc[x['Time']] + 1,axis=1)
    ## This next line is not too slow.
    df['log2_OD420'] = df['subtracted_OD420'].apply(lambda x: log2(x) if x > 0 else -np.inf)
    df2 = df.groupby(['Time','Name']).mean().reset_index()
    ## filter out Blank rows.
    df2 = df2[df2.Name != 'Blank']
    return df2

In [57]:
proj_dir = "/Users/Rohandinho/Dropbox (HMS)/DM-zero-evolution/"
data_dir = "data/rohan-formatted/"

wellfile = join(proj_dir,data_dir,"growth-plate-layout.csv")
plate_labels = read_plate_labels(wellfile)
popfile = join(proj_dir,data_dir,"populations-and-clones.csv")
pop_labels = pd.read_csv(popfile)

raw_DM0data = join(proj_dir,data_dir,"DM0-evolved-DM0-growth-4-18-13.csv")
tidy_DM0_data = read_growth_data(raw_DM0data)
full_DM0_data = pd.merge(tidy_DM0_data,plate_labels,how='outer',on='Well')

raw_DM25data = join(proj_dir,data_dir,"DM0-evolved-DM25-growth-4-15-13.csv")
tidy_DM25_data = read_growth_data(raw_DM25data)
full_DM25_data = pd.merge(tidy_DM25_data,plate_labels,how='outer',on='Well')
    
avgDM0values = process_data(full_DM0_data)
labeled_avgDM0values = pd.merge(avgDM0values,pop_labels,how='left',on='Name')

avgDM25values = process_data(full_DM25_data)
labeled_avgDM25values = pd.merge(avgDM25values,pop_labels,how='left',on='Name')

TypeError: mean() got an unexpected keyword argument 'skipna'

In [None]:
''' now make Figure 2.'''
outfile="/Users/Rohandinho/Desktop/test1.pdf"
sns.set(style="ticks")

#ax = sns.tsplot(data=labeled_avgDM0values, time="Time", unit="Name",condition="Name", value="log2_OD420")
ax.set(ylim=(10, 40))
g = sns.FacetGrid(labeled_avgDM0values, col="Founder")
## datetime does not play nicely with scatterplot!
#g.map_dataframe(plt.scatter,"Time","log2_OD420", c="Name") 
g.map_dataframe(sns.tsplot, time="Time",unit="Name",
                condition="Name",value="log2_OD420") 
g.savefig(outfile)
print(set(labeled_avgDM0values.Name))

In [None]:
outfile="/Users/Rohandinho/Desktop/test2.pdf"
sns.set(style="ticks")

#ax = sns.tsplot(data=labeled_avgDM0values, time="Time", unit="Name",condition="Name", value="log2_OD420")
ax.set(ylim=(10, 40))
g = sns.FacetGrid(labeled_avgDM25values, col="Founder")
## datetime does not play nicely with scatterplot!
#g.map_dataframe(plt.scatter,"Time","log2_OD420", c="Name") 
g.map_dataframe(sns.tsplot, time="Time",unit="Name",
                condition="Name",value="log2_OD420") 
g.savefig(outfile)