## Data input and cleanup 

Data were collected in Google Docs as a table with the columns `well`, `rate`, and `mutant`, then exported to CSV (`raw_data.csv`), which is the input file read below.

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt 
import pandas
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit 
from numpy import log, exp, linspace, sqrt, diag, array, nan
from numpy import concatenate 

In [8]:
# raw assay table, one row per measured rate, indexed by mutant name
df = pandas.read_csv( 'raw_data.csv', index_col='mutant' )
raw_len = len( df )

# remove the rows labelled 'WT' / 'WT1' and any row with a missing value
df.drop( ['WT', 'WT1'], inplace=True )
df.dropna( inplace=True )

# allowed mutant names: text before the first '.' on each non-blank line, upper-cased
with open( '../../reference/allowed_mutants.txt' ) as fn:
    mutants = []
    for line in fn:
        if len( line ) > 1:
            mutants.append( line.split('.')[0].upper() )
    
def is_good( name ):
    """Return True when `name` appears in the `mutants` list of allowed names."""
    listed = name in mutants
    return listed

# Normalise the mutant names in the index.
#   * names that are allowed (or the literal 'BglB') are kept as they are
#   * otherwise the residue number is shifted down by three and re-checked
#   * anything else is tagged 'drop_me', remembered in `not_allowed`, and removed
not_allowed = []
cleaned_names = []
for name in df.index:
    if is_good( name ) or name == 'BglB':
        cleaned_names.append( name )
        continue

    # A name that is not <letter><number><letter> (too short, non-numeric
    # middle, or a NaN index entry) cannot be shifted; treat it as not allowed
    # instead of letting the exception abort the whole cell.
    try:
        new = name[0] + str( int( name[1:-1] ) - 3 ) + name[-1]
    except ( TypeError, ValueError, IndexError ):
        new = None

    if new is not None and is_good( new ):
        cleaned_names.append( new )
    else:
        cleaned_names.append( 'drop_me' )
        not_allowed.append( name )

# same assignment order as the chained form: index first, then the column
df.index = cleaned_names
df['mutant'] = cleaned_names

# drop() raises when the label is absent, so only call it if something was flagged
if 'drop_me' in df.index:
    df.drop( ['drop_me'], inplace=True )

# Row count after the mutant-name cleanup. It has to be taken BEFORE the
# temperature lookup below, otherwise the two differences reported in the
# diagnostics cannot be told apart (the second one would always be zero).
pos_and_neg = len( df )

# the temperature key maps the first character of the well ID to degrees Celsius
temp_key = pandas.read_csv( '../../reference/temperature_key.csv' )
temp_key.index = temp_key.Well
df['temp'] = df.well.str[0].map( temp_key.Celsius.to_dict() )
# wells without an entry in the key get NaN from map() and are removed here
df.dropna( inplace=True )

# diagnostics
summary = 'Input has {} rates. {} will be used, {} were dropped as wild type, incomplete, or with an unrecognized mutant name, {} because no temperature is known for the well.'.format(
    raw_len, len( df ), raw_len - pos_and_neg, pos_and_neg - len( df ) )
name_note = 'Samples {} were dropped because the given native residue does not match the BglB sequence and is not off by three'.format( set( not_allowed ) )
# one call keeps both sentences on a single output line
print( summary + ' ' + name_note )

df.to_csv( 'assay_data.csv' )

Input has 2951 rates. 2951 will be used, 0 will be dropped because of mutant name, 0 because negative). Samples set([]) were dropped because the given native residue does not match the BglB sequence and is not off by three


In [11]:
# define logistic equation 
def f(x, x0, k):
    """Logistic function of `x` with midpoint `x0` and steepness `k`.

    Evaluates 1 / (1 + exp(-k * (x - x0))); `x` may be a scalar or an array.
    """
    decay = exp( -k * ( x - x0 ) )
    return 1 / ( 1 + decay )

# four NaNs: placeholder values for (tm, k, err_tm, err_k) when there is no usable fit
empty = array( [ nan, nan, nan, nan ] )
# util function to fit a mutant 
def fit( df ):
    """Fit the logistic curve `f` to one mutant's rate-vs-temperature data.

    The rates are divided by the mean of the rates that lie above the group's
    median, a linear regression of scaled rate on temperature supplies the
    starting slope, and `curve_fit` then fits `f` starting from
    (mean temperature, slope).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single mutant, with the columns `mutant`, `rate`
        and `temp`. The frame is left untouched.

    Returns
    -------
    pandas.Series
        Indexed by 'tm', 'k', 'err_tm', 'err_k' (fitted midpoint, steepness and
        their standard errors). All four values are NaN when the fit raises,
        or when the result fails the acceptance check
        (20 < tm < 60, k < 0, err_tm < 1).
    """
    my_index = ['tm', 'k', 'err_tm', 'err_k' ]
    no_fit = pandas.Series( empty.copy(), index=my_index )
    name = df.mutant.unique()[0]

    try:
        # Scale factor: mean of the rates above the median. The selection is
        # empty (mean is NaN) when no rate exceeds the median; NaN is truthy,
        # so `... or 1` would not catch it and it has to be tested explicitly.
        mean_highest_rates = df[ df.rate > df.rate.quantile() ].rate.mean()
        if pandas.isnull( mean_highest_rates ) or mean_highest_rates == 0:
            mean_highest_rates = 1

        # Work on local arrays: writing the scaled values back into the group
        # handed over by groupby.apply would modify the caller's data.
        rate = ( df.rate / mean_highest_rates ).values
        temp = df.temp.values

        # linear fit gets us sensible starting params for the logistic fit
        # (the regression lives inside the try so that one bad group cannot
        # abort the whole apply)
        reg = LinearRegression()
        reg.fit( temp.reshape( -1, 1 ), rate )
        slope = reg.coef_[0]

        # try fitting to logistic eqn using approximate params from linear fit
        p0 = ( temp.mean(), slope )
        popt, pcov = curve_fit( f, temp, rate, p0=p0 )
        perr = sqrt( diag( pcov ) )
    except Exception as e:
        # best effort: report which mutant failed and carry on with NaNs
        print( '{} {}'.format( name, e ) )
        return no_fit

    # biological assay limits, and make sure k is the right sign
    if 20 < popt[0] < 60 and popt[1] < 0 and perr[0] < 1:
        return pandas.Series( concatenate( [ popt, perr ] ), index=my_index )
    return no_fit

In [12]:
# run fit() once per mutant
grouped = df.groupby( by='mutant' )
est = grouped.apply( fit )

n_with_tm = len( est.dropna() )
n_groups = len( grouped )
print( 'Tm estimated for {} of {} samples'.format( n_with_tm, n_groups ) )

mean of highest rates 0.142833333333
max rate 0.182
mean of highest rates 1.0
max rate 1.27421236873
mean of highest rates 0.407541666667
max rate 0.455
mean of highest rates 0.0908583333333
max rate 0.112
mean of highest rates 0.0061625
max rate 0.00781
mean of highest rates 2.3e-05
max rate 0.0001
mean of highest rates 0.02285
max rate 0.0294
mean of highest rates 7.7325e-05
max rate 0.000158
mean of highest rates 0.000131666666667
max rate 0.00021
mean of highest rates 0.00022285
max rate 0.00319
mean of highest rates 0.144641666667
max rate 0.198
mean of highest rates 0.00354916666667
max rate 0.00526
mean of highest rates 0.00824916666667
max rate 0.0103
mean of highest rates 0.005205
max rate 0.00752
mean of highest rates 0.0026175
max rate 0.00487
mean of highest rates 0.176533333333
max rate 0.231
mean of highest rates 0.134333333333
max rate 0.175
mean of highest rates 0.00025775
max rate 0.000302
mean of highest rates 0.000126916666667
max rate 0.000184
mean of highest rates 

In [13]:
# fitted parameters, rounded to two decimals for display
rounded_est = est.round( 2 )
print( rounded_est )

           tm     k  err_tm  err_k
mutant                            
A192S   39.54 -1.92    0.45   0.87
BglB    40.24 -1.45    0.09   0.09
C167A     NaN   NaN     NaN    NaN
C167Q   39.50 -0.99    0.22   0.17
D403A     NaN   NaN     NaN    NaN
E154D   39.45 -1.21    0.34   0.38
E164A     NaN   NaN     NaN    NaN
E164G     NaN   NaN     NaN    NaN
E164R     NaN   NaN     NaN    NaN
E177A   38.91 -1.11    0.34   0.53
E177K   38.05 -1.28    0.55   0.94
E177L   40.36 -1.46    0.41   0.48
E222A   38.09 -1.22    0.46   0.75
E222H     NaN   NaN     NaN    NaN
E222K   39.90 -1.16    0.41   0.34
E222Q   40.31 -1.49    0.26   0.30
E222R   39.53 -1.01    0.30   0.23
E222Y     NaN   NaN     NaN    NaN
E353A     NaN   NaN     NaN    NaN
E406A     NaN   NaN     NaN    NaN
E406D   40.86 -1.34    0.28   0.49
E426S   39.68 -1.67    0.13   0.18
F415A     NaN   NaN     NaN    NaN
F415N     NaN   NaN     NaN    NaN
F72H    40.15 -0.84    0.36   0.20
G355A     NaN   NaN     NaN    NaN
H101R   40.41 -1.17 

In [14]:
# I looked at all the plots; the estimates for the mutants listed here are
# overwritten with NaN (presumably because their fits did not look trustworthy)

zero_list = [
    'E353A', 'H315E', 'M261D',
    'P329N', 'Q313R', 'R76A',
    'S16A', 'S16N', 'W325H',
    'W325L', 'W407G', 'Y294F',
]

for rejected in zero_list:
    est.loc[ rejected ] = empty

In [15]:
# One PDF per mutant: measured rates against temperature, with the fitted
# logistic curve on top when an estimate exists.
for index, dat in grouped:

    # Scale the points with the same factor fit() applies (mean of the rates
    # above the group's median, falling back to 1) so that data and curve share
    # one y-axis. Dividing by the maximum rate instead puts the points on a
    # different scale from the curve that was fitted to them.
    scale = dat[ dat.rate > dat.rate.quantile() ].rate.mean()
    if pandas.isnull( scale ) or scale == 0:
        scale = 1

    # only positive rates are drawn
    dat = dat[ ( dat.rate > 0 ) ]
    rate = dat.rate / scale
    my_params = est.loc[ index ]

    plt.figure( figsize=(4,3) )
    plt.scatter( dat.temp, rate, color='k', lw=0, marker='.', label=index )

    # NaN is truthy, so a bare `if my_params.tm:` never skips an unfitted
    # mutant; test for a real value explicitly. The data are still plotted for
    # every mutant, the curve only where parameters were estimated.
    if pandas.notnull( my_params.tm ):
        x = linspace( dat.temp.min(), dat.temp.max(), 50 )
        plt.plot( x, f( x, my_params.tm, my_params.k ), color='purple', label='est. params' )

    plt.xlabel( 'T (C)' )
    plt.ylabel( 'Normalized rate')
    plt.xticks( [ 30, 40, 50 ] )
    plt.yticks( [ 0, 0.25, .5, 0.75, 1 ] )
    plt.legend( loc='lower left' )
    plt.tight_layout()
    plt.savefig( 'plots/%s.pdf' % index, format='pdf' )
    # close each figure so they do not accumulate in memory
    plt.close()

tm        39.541484
k         -1.917727
err_tm     0.453912
err_k      0.872830
Name: A192S, dtype: float64
tm        40.242588
k         -1.452577
err_tm     0.086724
err_k      0.093796
Name: BglB, dtype: float64
tm       NaN
k        NaN
err_tm   NaN
err_k    NaN
Name: C167A, dtype: float64
tm        39.497344
k         -0.986042
err_tm     0.220236
err_k      0.169818
Name: C167Q, dtype: float64
tm       NaN
k        NaN
err_tm   NaN
err_k    NaN
Name: D403A, dtype: float64
tm        39.447051
k         -1.211123
err_tm     0.344618
err_k      0.384009
Name: E154D, dtype: float64
tm       NaN
k        NaN
err_tm   NaN
err_k    NaN
Name: E164A, dtype: float64
tm       NaN
k        NaN
err_tm   NaN
err_k    NaN
Name: E164G, dtype: float64
tm       NaN
k        NaN
err_tm   NaN
err_k    NaN
Name: E164R, dtype: float64
tm        38.908691
k         -1.110711
err_tm     0.341797
err_k      0.531367
Name: E177A, dtype: float64
tm        38.049692
k         -1.280071
err_tm     0.549593
e

KeyboardInterrupt: 

Error in callback <function post_execute at 0x105dc27d0> (for post_execute):


KeyError: (22.0, 0.5, u'Normalized rate', u'k', u'bottom', u'center', 3534391420781609850, u'vertical', u'anchor', 80.0, 4838981008)

KeyError: (22.0, 0.5, u'Normalized rate', u'k', u'bottom', u'center', 3534391420781609850, u'vertical', u'anchor', 80.0, 4838981008)

<matplotlib.figure.Figure at 0x12047a2d0>

In [None]:
# diagnostics from June 21

# should have Tm but don't 
# S16A, W325L, Y294F