## Data input and cleanup 

Data were collected in Google Docs as a table with the columns `well`, `rate`, and `mutant`, then exported to CSV. That CSV file (`raw_data.csv`) is the input read below.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt 
import pandas
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit 
from numpy import log, exp, linspace, sqrt, diag, array, nan
from numpy import concatenate 

In [4]:
# Load the raw assay table, using the `mutant` column as the row index.
df = pandas.read_csv( 'raw_data.csv', index_col='mutant' )
raw_len = len( df )  # row count before any filtering, reported by the diagnostics

# Remove the 'WT' and 'WT1' rows (presumably wild-type controls -- confirm).
# DataFrame.drop raises when asked to drop a label that is not in the index,
# so only the labels actually present are passed; an input without one of
# them no longer aborts the cell.
controls_present = [ label for label in ( 'WT', 'WT1' ) if label in df.index ]
df.drop( controls_present, inplace=True )

# Discard every row that still has a missing value in any column.
df.dropna( inplace=True ) 

# Read the allowed mutant names, one per line. Each name is the text before
# the first '.', upper-cased. The name is stripped of surrounding whitespace
# so that a line without a '.' does not keep its trailing newline (which
# would make it impossible to match); blank lines are skipped.
with open( '../../reference/allowed_mutants.txt' ) as handle:
    mutants = [ line.split('.')[0].strip().upper() for line in handle if line.strip() ]

def is_good( name ):
    """Return True when `name` is one of the entries loaded into `mutants`."""
    return name in mutants 

# Reconcile every index label with the allowed-mutant list.
#   * labels that are allowed (or equal to 'BglB') are kept unchanged;
#   * otherwise the residue number is shifted down by three
#     (e.g. 'A195S' -> 'A192S') and the shifted label is used if it is allowed;
#   * anything else is relabelled 'drop_me' and recorded in `not_allowed`
#     (one entry per affected row).
not_allowed = []
checked_names = []
for name in df.index:
    if is_good( name ) or name == 'BglB':
        checked_names.append( name )
        continue

    try:
        shifted = name[0] + str( int( name[1:-1] ) - 3 ) + name[-1]
    except ( TypeError, ValueError, IndexError ):
        # The label is not of the form <letter><number><letter> (or is not a
        # string at all), so the off-by-three correction cannot be applied.
        # Treat it as unfixable instead of aborting the whole cell.
        shifted = None

    if shifted is not None and is_good( shifted ):
        checked_names.append( shifted )
    else:
        checked_names.append( 'drop_me' )
        not_allowed.append( name )

# NOTE(review): the chained assignment stores the *reconciled* names in both
# the index and the 'mutant_old' column; despite its name, the column does not
# hold the original labels. Left as-is because later code may read it -- confirm.
df.index = df['mutant_old'] = checked_names

# drop() raises if the label is absent, which is the case whenever every
# name was valid or fixable, so only drop when there is something to drop.
if 'drop_me' in df.index:
    df.drop( ['drop_me'], inplace=True )

# Attach a temperature to every row: the key file maps a `Well` value to a
# `Celsius` value, and the lookup uses the first character of each row's
# `well` label.
temp_key = pandas.read_csv( '../../reference/temperature_key.csv' )
temp_key.index = temp_key['Well']
celsius_by_well = temp_key['Celsius'].to_dict()
well_prefix = df['well'].str[0]
df['temp'] = well_prefix.map( celsius_by_well )

# Rows whose prefix has no entry in the key get NaN above and are removed here.
df.dropna( inplace=True )
pos_and_neg = len( df )

# diagnostics
# `not_allowed` holds one entry per row that was discarded because its mutant
# name could not be matched, so its length is the number of name-related drops.
# Every other removed row (dropped 'WT'/'WT1' labels, rows with missing values,
# rows with no temperature for the well) is reported separately. No step in
# this cell filters on the sign of the rate, so the former "because negative"
# figure (which was always 0) is no longer reported.
n_bad_name = len( not_allowed )
n_other = raw_len - len( df ) - n_bad_name

summary = 'Input has {} rates. {} will be used, {} were dropped because of mutant name, {} for other reasons (WT rows, missing values, or no temperature for the well).'.format( raw_len, len( df ), n_bad_name, n_other )
detail = 'Samples {} were dropped because the given native residue does not match the BglB sequence and is not off by three'.format( set( not_allowed ) )

# A single parenthesised argument prints both messages on one line, as before.
print( summary + ' ' + detail )

df.to_csv( 'assay_data.csv' ) 

Input has 2975 rates. 2975 will be used, 0 will be dropped because of mutant name, 0 because negative). Samples set([]) were dropped because the given native residue does not match the BglB sequence and is not off by three
