## Data input and cleanup 

Data were collected into a table with the columns `well`, `rate`, and `mutant` using Google Docs and exported to CSV, which is the input file here. 

In [38]:
import pandas
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit 
from numpy import log, exp, linspace, sqrt, diag, array

df = pandas.read_csv( 'raw/assay_data.csv', index_col='mutant' )
raw_len = len( df )

df.drop( ['WT', 'WT1'], inplace=True )
df.dropna( inplace=True ) 

with open( 'lit/allowed_mutants.txt' ) as fn:
    mutants = [ i.split('.')[0].upper() for i in fn.readlines() if len( i ) > 1 ]
    
def is_good( name ):
    return name in mutants 

not_allowed = []
l = []
for name in df.index:
    if is_good( name ):
        l.append( name )
    else:
        new = name[0] + str( int( name[1:-1] ) - 3 ) + name[-1]
        if is_good( new ):
            l.append( new )
        else:
            l.append( 'drop_me' )
            not_allowed.append( name )
            
df.index = df['mutant'] = l
df.drop( ['drop_me'], inplace=True )

# temperature keys as CSV files in data/
temp_key = pandas.read_csv( 'lit/temperature_key.csv' )
temp_key.index = temp_key.Well
df['temp'] = df.well.str[0].map( temp_key.Celsius.to_dict() ) 
df.dropna( inplace=True )
pos_and_neg = len( df )
#df = df[( df.rate > 0 )]

# diagnostics 
print 'Input has {} rates. {} will be used, {} will be dropped because of mutant name, {} because negative).'.format( raw_len, len( df ), raw_len - pos_and_neg , pos_and_neg - len( df ) ), 
print 'Samples {} were dropped because the given native residue does not match the BglB sequence and is not off by three'.format( set( not_allowed ) )

df.to_csv( 'clean/assay.csv' ) 

Input has 3313 rates. 3145 will be used, 168 will be dropped because of mutant name, 0 because negative). Samples set(['E17S', 'E53A', 'Q384R', 'W299A']) were dropped because the given native residue does not match the BglB sequence and is not off by three


In [39]:
# define logistic equation 
def f(x, x0, k): 
    return 1/(1+exp(-k*(x-x0)))

# util function to fit a mutant 
def fit( df ):
    empty_pair = array( [ nan, nan ] ) 
    name = df.mutant.unique()[0]
    df.rate = df.rate / df.rate.max()
    
    # linear fit gets us sensible starting params for the logistic fit 
    reg = LinearRegression()
    reg.fit( df.temp.reshape(-1, 1), df.rate )
    slope = reg.coef_[0]
    
    # try fitting to logistic eqn using approximate params from linear fit
    try:
        p0 = ( df.temp.mean(), slope )
        popt, pcov = curve_fit( f, df.temp, df.rate, p0=p0 )
        perr = sqrt( diag( pcov ) ) 
        
        # error checking 
        if 20 < popt[0] < 60 and popt[1] < 0: # biological assay limits, and make sure k is the right sign
            return pandas.Series( popt, index=['tm', 'k'] )
        else:
            return pandas.Series( empty_pair, index=['tm', 'k'] )
    except Exception as e:
        print name, e 
        return pandas.Series( empty_pair, index=['tm', 'k'] )
    
grouped = df.groupby( by='mutant' )
fits = grouped.apply( fit )

print '{} of {} samples fit to the logistic equation'.format( len( fits.dropna() ), len( grouped ) )

86 of 113 samples fit to the logistic equation


In [42]:
%matplotlib inline
import matplotlib.pyplot as plt 

#!mkdir plots 
for index, df in grouped:
    name = df.mutant.unique()[0]
    rate = df.rate / df.rate.max() 
    plt.figure( figsize=(2,2) )
    plt.scatter( df.temp, rate, alpha=0.7, color='black', marker='.' )
    popt = fits.loc[ name ]
    if popt.size == 2:
        x_space = linspace( df.temp.min(), df.temp.max(), 100 )
        plt.plot( x_space, f( x_space, *popt ), alpha=0.8, color='purple' )
    plt.xlabel( 'T (C)' )
    plt.ylabel( 'Normalized rate')
    plt.xticks( [ 30, 40, 50 ] )
    plt.yticks( [ 0, 0.5, 1 ] )
    plt.title( name )
    plt.tight_layout()
    plt.savefig( 'plots/%s.pdf' % name, format='pdf' )
    plt.close()

In [43]:
# I looked at all the plots 

zero_list = [ 
    'E353A', 'H315E', 'M261D', 'P329N', 'Q313R', 'R76A', 'S16A', 
    'W325H', 'W325L', 'W407G', 'Y294F', 
]

from numpy import nan 

for zero in zero_list:
    fits.loc[ zero ] = ( nan, nan ) 

ValueError: cannot set a row with mismatched columns

In [None]:
# add metadata
fits['native'] = fits.index.str[0]
fits['designed'] = fits.index.str[-1]
fits['position'] = fits.index.str[1:-1].astype( int )

In [None]:
# production and purification data 
pp = pandas.read_csv( 'raw/production.csv', index_col='mutant' )
#print pp.index.value_counts()

# this is a sanity-checking routine for "off-by-threes"
# let's just rename all the tubes
pp_not_allowed = []
l = []
pp.drop( [ 'WT', 'nan' ], axis=0, inplace=True ) 

from numpy import nan 

for name in pp.index: 
    if type( name ) == float:
        l.append( 'drop_me' ) 
    elif is_good( name ):
        l.append( name )
    else:
        new = name[0] + str( int( name[1:-1] ) - 3 ) + name[-1]
        if is_good( new ):
            l.append( new )
        else:
            l.append( 'drop_me' )
            not_allowed.append( name )

pp.index = l
joined = pp.join( fits )

In [None]:
plos = pandas.read_csv( '/Users/alex/Documents/bagel-data/clean_data/clean_for_pandas.csv', index_col='name' ) 
plos_join = joined.join( plos, rsuffix='PLOS' ) 
plos_join.to_csv( 'raw/join.csv' ) 