## Data input and cleanup 

Data were collected in Google Docs as a table with the columns `well`, `rate`, and `mutant`, then exported to CSV (`raw_data.csv`), which is the input file for this notebook.

In [88]:
%matplotlib inline
import matplotlib.pyplot as plt 
import pandas
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit 
from numpy import log, exp, linspace, sqrt, diag, array, nan
from numpy import concatenate 

In [78]:
df = pandas.read_csv( 'raw_data.csv', index_col='mutant' )
raw_len = len( df )  # row count before any filtering; reported by the diagnostics below

# remove the rows labelled 'WT' / 'WT1', then every row that has a missing value
df.drop( ['WT', 'WT1'], inplace=True )
df.dropna( inplace=True )

# allowed mutant names: for each line longer than one character,
# keep the text before the first '.', upper-cased
mutants = []
with open( '../../reference/allowed_mutants.txt' ) as handle:
    for line in handle.readlines():
        if len( line ) > 1:
            mutants.append( line.split('.')[0].upper() )
    
def is_good( name ):
    """Tell whether `name` is one of the entries of the notebook-level `mutants` list."""
    if name in mutants:
        return True
    return False

# Rewrite the index of `df` so that every row carries an allowed mutant name.
# A name that is not allowed is retried with its residue number lowered by three;
# names that still do not match are collected in `not_allowed` and their rows removed.
not_allowed = []
fixed_names = []
for name in df.index:
    if is_good( name ):
        fixed_names.append( name )
        continue

    # suspected off by three: shift the number between the first and last character
    try:
        new = name[0] + str( int( name[1:-1] ) - 3 ) + name[-1]
    except ( TypeError, ValueError, IndexError ):
        # not a <letter><number><letter> string, so it cannot be renumbered;
        # treat it like any other unusable name instead of aborting the cell
        new = None

    if new is not None and is_good( new ):
        fixed_names.append( new )
    else:
        fixed_names.append( 'drop_me' )
        not_allowed.append( name )

# the corrected names become both the index and the `mutant` column
df.index = df['mutant'] = fixed_names

# drop() raises when the label is absent, which happens when every name was usable
if 'drop_me' in df.index:
    df.drop( ['drop_me'], inplace=True )

# Attach an assay temperature to every rate: the first character of `well` is looked up
# in the Well -> Celsius mapping read from the temperature key file.
temp_key = pandas.read_csv( '../../reference/temperature_key.csv' )
temp_key.index = temp_key.Well
df['temp'] = df.well.str[0].map( temp_key.Celsius.to_dict() )
df.dropna( inplace=True )  # rows whose well has no entry in the key end up with a NaN temp
pos_and_neg = len( df )

# diagnostics
# Nothing in this cell filters on the sign of the rate, so the report gives the total
# number of rows removed since loading and, separately, how many of the retained rates
# are not positive (those rows are still written to assay_data.csv).
n_dropped = raw_len - pos_and_neg
n_not_positive = int( ( df.rate <= 0 ).sum() )
summary = (
    'Input has {} rates. {} will be used, {} were dropped '
    '(WT rows, missing values, unusable mutant names, or wells without a temperature). '
    '{} of the retained rates are not positive.'
).format( raw_len, pos_and_neg, n_dropped, n_not_positive )
unfixable = 'Samples {} were dropped because the given native residue does not match the BglB sequence and is not off by three'.format( set( not_allowed ) )

# a single parenthesised argument prints the same text under Python 2 and Python 3
print( summary + ' ' + unfixable )

df.to_csv( 'assay_data.csv' )

Input has 3048 rates. 3000 will be used, 48 will be dropped because of mutant name, 0 because negative). Samples set(['Q384R']) were dropped because the given native residue does not match the BglB sequence and is not off by three


In [79]:
# define logistic equation 
def f(x, x0, k):
    """Logistic function 1 / (1 + exp(-k * (x - x0))), evaluated at x."""
    exponent = -k * ( x - x0 )
    return 1 / ( 1 + exp( exponent ) )

# util function to fit a mutant 
def fit( df ):
    """
    Fit the logistic curve `f` to the rate-versus-temperature data of one mutant.

    Written to be passed to `groupby( ... ).apply`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single mutant; the columns `mutant`, `temp` and `rate` are read.
        The frame is not modified.

    Returns
    -------
    pandas.Series
        Indexed by 'tm', 'k', 'err_tm', 'err_k': the fitted midpoint and slope
        followed by the square roots of the diagonal of the covariance matrix.
        All four entries are NaN when fitting raises or when the result falls
        outside the acceptance limits checked below.
    """
    # defined before the try block so that the failure path can use it as well
    my_index = [ 'tm', 'k', 'err_tm', 'err_k' ]
    empty = pandas.Series( array( [ nan ] * 4 ), index=my_index )
    name = df.mutant.unique()[0]

    # normalise to the largest rate in a local Series rather than writing back into
    # the group handed over by groupby
    rate = df.rate / df.rate.max()
    temp = df.temp

    try:
        # linear fit gets us sensible starting params for the logistic fit;
        # sklearn wants a 2-D array, hence the reshape of the underlying values
        reg = LinearRegression()
        reg.fit( temp.values.reshape( -1, 1 ), rate )
        slope = reg.coef_[0]

        # try fitting to logistic eqn using approximate params from linear fit
        p0 = ( temp.mean(), slope )
        popt, pcov = curve_fit( f, temp, rate, p0=p0 )
        perr = sqrt( diag( pcov ) )
    except Exception as e:
        # report which mutant failed and carry on with the remaining groups
        print( '{} {}'.format( name, e ) )
        return empty

    # biological assay limits, and make sure k is the right sign
    if 20 < popt[0] < 60 and popt[1] < 0 and perr[0] < 1:
        return pandas.Series( concatenate( [ popt, perr ] ), index=my_index )
    return empty

In [80]:
# run the per-mutant fit; `est` holds one row of fit results per mutant
grouped = df.groupby( 'mutant' )
est = grouped.apply( fit )

# rows without any NaN count as successfully estimated
n_fitted = len( est.dropna() )
n_total = len( grouped )
print( 'Tm estimated for {} of {} samples'.format( n_fitted, n_total ) )

Tm estimated for 68 of 117 samples


In [81]:
# show the fit table rounded to one decimal place
rounded = est.round( 1 )
print( rounded )

          tm    k  err_tm  err_k
mutant                          
A192S   39.1 -0.9     0.3    0.3
A356A   39.9 -1.1     0.1    0.1
A357A   39.1 -0.3     0.5    0.1
A408A   40.0 -1.3     0.1    0.1
C167A   39.7 -1.4     0.5    0.6
C167Q   38.5 -0.5     0.2    0.0
D403A    NaN  NaN     NaN    NaN
E154D   38.7 -0.7     0.3    0.2
E164A    NaN  NaN     NaN    NaN
E164G    NaN  NaN     NaN    NaN
E164R    NaN  NaN     NaN    NaN
E177A   37.3 -0.5     0.2    0.1
E177K   36.6 -0.7     0.3    0.1
E177L   39.2 -0.5     0.5    0.1
E222A   36.7 -0.7     0.1    0.0
E222H   34.7 -0.6     0.1    0.1
E222K   38.5 -0.5     0.5    0.1
E222Q   39.4 -0.8     0.4    0.2
E222R   39.0 -0.8     0.2    0.1
E222Y   36.9 -0.8     0.5    0.3
E353A    NaN  NaN     NaN    NaN
E406A   39.9 -1.5     0.7    0.7
E406D   40.5 -1.0     0.3    0.2
E426S   39.5 -1.4     0.1    0.2
F415A    NaN  NaN     NaN    NaN
F415N    NaN  NaN     NaN    NaN
F72H    38.6 -0.4     0.4    0.1
G355A    NaN  NaN     NaN    NaN
H101R   40

In [89]:
# I looked at all the plots

# mutants whose fitted parameters are blanked out by hand
zero_list = [ 
    'E353A', 'H315E', 'M261D', 'P329N', 
    'Q313R', 'R76A', 'S16A', 'S16N', 
    'W325H', 'W325L', 'W407G', 'Y294F', 
]

# Neither `fits` nor `empty` is created by an earlier cell, so both are defined here
# to let the notebook run top to bottom on a fresh kernel.
# NOTE(review): `fits` is assumed to be the per-mutant fit table `est` -- confirm.
fits = est.copy()
empty = array( [ nan ] * len( fits.columns ) )

for zero in zero_list:
    fits.loc[ zero ] = empty

In [90]:
# Write one PDF per mutant with a fitted Tm: positive rates scaled to their maximum,
# overlaid with the logistic curve drawn from the estimated parameters.
for index, dat in grouped:

    dat = dat[ ( dat.rate > 0 ) ]
    rate = dat.rate / dat.rate.max()
    my_params = est.loc[ index ]

    # NaN never compares equal to anything, so `tm != nan` is True for every value;
    # test for a missing estimate explicitly and skip mutants without a fit
    if pandas.isnull( my_params.tm ):
        continue

    plt.figure( figsize=(4,3) )
    plt.scatter( dat.temp, rate, color='k', lw=0, marker='.', label=index )
    x = linspace( dat.temp.min(), dat.temp.max(), 50 )
    plt.plot( x, f( x, my_params.tm, my_params.k ), color='purple', label='est. params' )
    plt.xlabel( 'T (C)' )
    plt.ylabel( 'Normalized rate')
    plt.xticks( [ 30, 40, 50 ] )
    plt.yticks( [ 0, 0.25, .5, 0.75, 1 ] )
    plt.legend( loc='lower left' )
    plt.tight_layout()
    plt.savefig( 'plots/%s.pdf' % index, format='pdf' )
    plt.close()  # release the figure before moving on to the next mutant

In [48]:
# add metadata parsed from the mutant name stored in the index
name_parts = fits.index.str
fits['native'] = name_parts[0]    # first character of the name
fits['designed'] = name_parts[-1]  # last character of the name
fits['position'] = name_parts[1:-1].astype( int )  # everything in between, as an integer

In [49]:
# production and purification data
pp = pandas.read_csv( 'raw/production.csv', index_col='mutant' )

# this is a sanity-checking routine for "off-by-threes"
# let's just rename all the tubes
pp_not_allowed = []  # names that are neither allowed nor fixable by the off-by-three shift
pp_names = []

# drop() raises for labels that are absent, so only request the ones actually present
present = [ label for label in [ 'WT', 'nan' ] if label in pp.index ]
if present:
    pp.drop( present, axis=0, inplace=True )

for name in pp.index:
    if isinstance( name, float ):
        # a float in the index is treated as a missing name
        pp_names.append( 'drop_me' )
        continue

    if is_good( name ):
        pp_names.append( name )
        continue

    try:
        new = name[0] + str( int( name[1:-1] ) - 3 ) + name[-1]
    except ( TypeError, ValueError, IndexError ):
        # not a <letter><number><letter> string, so it cannot be renumbered
        new = None

    if new is not None and is_good( new ):
        pp_names.append( new )
    else:
        pp_names.append( 'drop_me' )
        # recorded in this cell's own list rather than in the assay cell's `not_allowed`
        pp_not_allowed.append( name )

# NOTE(review): unlike the assay table, rows renamed to 'drop_me' are kept here and
# take part in the join below -- confirm that this is intended.
pp.index = pp_names
joined = pp.join( fits )

In [50]:
# NOTE(review): absolute, machine-specific path -- the notebook cannot run elsewhere
# until this file is available at a configurable or relative location.
plos_path = '/Users/alex/Documents/bagel-data/clean_data/clean_for_pandas.csv'
plos = pandas.read_csv( plos_path, index_col='name' ) 

# join on the index; column names that clash get the suffix 'PLOS' on the `plos` side
plos_join = joined.join( plos, rsuffix='PLOS' ) 
plos_join.to_csv( 'raw/join.csv' ) 