## Fits and plots 

Clean assay data (`assay_data.csv`) is used as an input file 

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt 
import pandas

from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit 
from numpy import log, exp, linspace, sqrt, diag, array, nan
from numpy import concatenate 

In [15]:
df = pandas.read_csv( 'assay_data.csv', index_col=0 ) 

In [22]:
# empty four-mer array 
empty = array( [ nan ] * 4 ) 

# define logistic equation 
def f(x, x0, k): 
    return 1/(1+exp(-k*(x-x0)))

# util function to fit a mutant 
def fit( df ):
  
    #name = df.mutant.unique()[0]
    df.rate = df.rate / df.rate.max()
    
    # linear fit gets us sensible starting params for the logistic fit 
    reg = LinearRegression()
    reg.fit( df.temp.reshape(-1, 1), df.rate )
    slope = reg.coef_[0]
    
    # try fitting to logistic eqn using approximate params from linear fit
    try:
        p0 = ( df.temp.mean(), slope )
        popt, pcov = curve_fit( f, df.temp, df.rate, p0=p0 )
        perr = sqrt( diag( pcov ) ) 
        
        # error checking 
        my_index = ['tm', 'k', 'err_tm', 'err_k' ]
        my_params = concatenate( [ popt, perr ] ) 
        if 20 < popt[0] < 60 and popt[1] < 0 and perr[0] < 1: 
        # biological assay limits, 
        # make sure k is the right sign
        # less than 100% error 
            return pandas.Series( my_params, index=my_index ) 
        else:
            return pandas.Series( empty, index=my_index )
    except Exception as e:
        print name, e 
        return pandas.Series( empty, index=my_index )
    # done with error checking 

In [23]:
# apply the function to the data set 
grouped = df.groupby( by='mutant' )
est = grouped.apply( fit )

print 'Tm estimated for {} of {} samples'.format( len( est.dropna() ), len( grouped ) )

Tm estimated for 66 of 115 samples


In [24]:
print est.round( 2 )

           tm     k  err_tm  err_k
mutant                            
A192S   39.08 -0.91    0.33   0.27
BglB    39.86 -1.15    0.11   0.09
C167A   39.74 -1.39    0.54   0.58
C167Q   38.47 -0.53    0.18   0.05
D403A     NaN   NaN     NaN    NaN
E154D   38.70 -0.70    0.35   0.16
E164A     NaN   NaN     NaN    NaN
E164G     NaN   NaN     NaN    NaN
E164R     NaN   NaN     NaN    NaN
E177A   37.31 -0.49    0.25   0.05
E177K   36.65 -0.67    0.25   0.10
E177L   39.22 -0.52    0.51   0.12
E222A   36.74 -0.66    0.13   0.05
E222H   34.71 -0.64    0.15   0.05
E222K   38.53 -0.47    0.51   0.10
E222Q   39.42 -0.82    0.39   0.22
E222R   39.04 -0.83    0.21   0.14
E222Y   36.88 -0.85    0.50   0.28
E353A     NaN   NaN     NaN    NaN
E406A   39.93 -1.52    0.66   0.71
E406D   40.55 -1.02    0.27   0.22
E426S   39.46 -1.45    0.11   0.15
F415A     NaN   NaN     NaN    NaN
F415N     NaN   NaN     NaN    NaN
F72H    38.61 -0.42    0.43   0.07
G355A     NaN   NaN     NaN    NaN
H101R   40.02 -0.91 

In [29]:
for index, dat in grouped:
    
    dat = dat[ ( dat.rate > 0 ) ] 
    rate = dat.rate / dat.rate.max() 
    my_params = est.loc[ index ]

    if my_params.tm > 20:
            
        plt.figure( figsize=(4,3) )
        plt.scatter( dat.temp, rate, color='k', lw=0, marker='.', label=index )
        x = linspace( dat.temp.min(), dat.temp.max(), 50 )
        plt.plot( x, f( x, *my_params[0:2] ), color='purple', label=index )
        plt.plot( x, f( x, 39.85, -1.1 ), color='green', label='BglB' )
        plt.xlabel( 'T (C)' )
        plt.ylabel( 'Normalized rate')
        plt.xticks( [ 30, 40, 50 ] )
        plt.yticks( [ 0, 0.25, .5, 0.75, 1 ] )
        plt.legend( loc='lower left' )
        plt.tight_layout()
        plt.savefig( 'plots/%s.pdf' % index, format='pdf' )
        plt.close()

In [27]:
plos = pandas.read_csv( '/Users/alex/Documents/bagel-benchmark/data_sets/experimental/plos2015.csv', index_col='name' )
plos['in_plos_paper'] = 1 
joined = est.join( plos )

# add metadata
joined['native'] = joined.index.str[0]
joined['designed'] = joined.index.str[-1]
joined['position'] = joined.index.str[1:-1].astype( int )
#joined['in_plos_paper'] = joined.in_plos_paper.astype( bool ).astype( int )

joined

Unnamed: 0_level_0,tm,k,err_tm,err_k,y,km,ekm,kcat,ekcat,ki,eki,kcatkm,ekcatkm,in_plos_paper,native,designed,position
mutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
A192S,39.082598,-0.906300,0.334125,0.267097,1.17,5.09,0.18,946.0,10.0,,,185848.0,6994.0,1.0,A,S,192
BglB,39.859553,-1.145112,0.108344,0.087975,1.20,5.00,0.20,880.0,10.0,,,176000.0,8000.0,1.0,B,B,4591149604126578442
C167A,39.735604,-1.388438,0.540130,0.578609,0.48,14.56,1.27,479.0,14.0,,,32884.0,3026.0,1.0,C,A,167
C167Q,38.471618,-0.532181,0.179317,0.045004,0.94,4.92,0.19,504.0,6.0,590.71,86.56,102415.0,4149.0,1.0,C,Q,167
D403A,,,,,,,,,,,,,,1.0,D,A,403
E154D,38.698871,-0.699229,0.349911,0.158566,1.42,3.46,0.76,878.0,47.0,,,254004.0,57175.0,1.0,E,D,154
E164A,,,,,0.42,1.01,0.17,0.0,0.0,,,190.0,33.0,1.0,E,A,164
E164G,,,,,,,,,,,,,,,E,G,164
E164R,,,,,,,,,,,,,,,E,R,164
E177A,37.307311,-0.487548,0.249692,0.052367,0.96,5.98,0.22,986.0,10.0,,,164804.0,6408.0,1.0,E,A,177


In [63]:
# diagnostics from June 21

# should have Tm but don't 
# S16A, W325L, Y294F