# `write_paper`

An incredibly rough draft of Python scripts to generate scientific literature.

In [1]:
import pandas

In [2]:
# Load the master data sheet: the first CSV column becomes the row index and
# cells containing the literal text '<10' are read as missing values (NaN).
df = pandas.read_csv(
    'data_set/master_sheet.csv',
    na_values=['<10'],
    index_col=0,
)

In [3]:
# Summarise the expression outcomes as a manuscript-ready sentence.
# The sum of the `expression` column is used as the number of expressing rows.
n_mutants = df.shape[0]
n_expressed = df['expression'].sum()
n_not_expressed = n_mutants - n_expressed
percent_expressed = 100 * ( n_expressed / n_mutants )

expression_sentence = 'Out of all {} mutants that were produced, purified, and assayed for kinetic parameters and functional melting temperature, {} ({:0.0f}%) were determined via SDS-PAGE to express as soluble monomeric protein in *Escherichia coli* BLR. The remaining {} mutants were found to not express after at least two independent production attempts.'
print( expression_sentence.format( n_mutants, n_expressed, percent_expressed, n_not_expressed ) )

Out of all 129 mutants that were produced, purified, and assayed for kinetic parameters and functional melting temperature, 92 (71%) were determined via SDS-PAGE to express as soluble monomeric protein in *Escherichia coli* BLR. The remaining 37 mutants were found to not express after at least two independent production attempts.


In [5]:
# Index labels of every row whose `expression` value equals 0.
df.loc[ df['expression'] == 0 ].index

Index(['G12N', 'S16N', 'Q19P', 'S32L', 'W34A', 'R76A', 'H119E', 'W120A',
       'D121F', 'N163E', 'N163K', 'E164G', 'E164R', 'Y166P', 'H178R', 'A236E',
       'R240E', 'A249E', 'M261D', 'N293D', 'N293K', 'Y294L', 'T296A', 'T296E',
       'H315N', 'M323K', 'W325G', 'P329N', 'F343S', 'G355A', 'H379T', 'D403A',
       'W407G', 'W407K', 'W407Q', 'W407R', 'W407Y'],
      dtype='object', name='mutant')

In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()



Unnamed: 0,sequence_pos,expression,tm,k,err_tm,err_k,kcat,err_kcat,km,err_km,kcatkm,err_kcatkm,ki,ki_percent_err,ki_err,gel_number
count,128.0,129.0,79.0,79.0,79.0,79.0,76.0,76.0,76.0,76.0,79.0,78.0,8.0,8.0,8.0,128.0
mean,241.53125,0.713178,39.463291,-0.838354,0.268734,0.156456,539.436842,13.068553,11.117895,1.000526,92361.15,5464.252564,227.7725,35.595,111.18625,10.390625
std,122.181737,0.454041,1.645206,0.374917,0.156683,0.15614,1263.384644,31.36236,13.437381,1.186895,191573.5,9229.112911,167.270111,22.658287,165.916493,5.749679
min,12.0,0.0,34.91,-1.73,0.04,0.02,0.2,0.01,0.4,0.02,11.0,1.0,95.24,13.7,13.05,1.0
25%,,0.0,,,,,,,,,,,,,,
50%,,1.0,,,,,,,,,,,,,,
75%,,1.0,,,,,,,,,,,,,,
max,423.0,1.0,45.99,-0.26,0.79,0.86,11011.0,258.0,89.18,5.89,1570000.0,57175.0,590.71,86.56,511.32,21.0


In [7]:
# Print one summary paragraph per measured constant (km, kcat, kcatkm, tm).
#
# Each tuple is (column name in df, name used in the text, unit string).
# Format slots: {0} column name (unused by the template), {1} display name,
# {2} unit, {3} number of measurements, {4}/{5} min/max, {6} range,
# {7}/{8} mean/std, {9}/{10} native value and its error, {11}/{12} row labels
# of the lowest/highest value.
#
# NOTE(review): the native 'BglB' row is read from the same frame the
# statistics are computed over, so unless its value is NaN it is included in
# the count/min/max/mean reported for the "mutants" -- confirm this is intended.
summary_template = '\nThe parameter {1} was measured for {3} mutants. The lowest {1} observed was found to be {4} {2}, while the highest {1} observed was {5} {2} (a range of {6:2.2f} {2}). The average {1} was found to be {7:2.2f}±{8:2.2f} {2}, compared to the native value of {9:2.2f}±{10:2.2f} {2}. The mutation in this data found to have the lowest {1} was {11}. The mutation found to have the highest {1} was {12}.'

for cst_short_name, cst_long_name, cst_unit in [
    ('km', 'km', 'mM' ),
    ('kcat', 'kcat', 'min-1' ),
    ('kcatkm', 'kcatkm', 'M-1min-1' ),
    ('tm', 'tm', '˚C' )]:

    # Select the column once instead of re-indexing df for every statistic.
    values = df[ cst_short_name ]
    lowest, highest = values.min(), values.max()

    print( summary_template.format(

        #0,1,2
        cst_short_name, cst_long_name, cst_unit,

        #3,4,5,6
        # count() is the number of non-missing values; the previous `> 0`
        # filter would also have dropped any zero or negative measurement.
        values.count(), lowest, highest, highest - lowest,

        #7,8
        values.mean(), values.std(),

        #9,10
        df.loc[ 'BglB', cst_short_name ], df.loc[ 'BglB', 'err_{}'.format( cst_short_name ) ],

        #11,12
        # idxmin/idxmax return the label of the first occurrence of the extreme,
        # the same row the boolean-mask-plus-index[0] lookup selected.
        values.idxmin(), values.idxmax(),
    ) )


The parameter km was measured for 76 mutants. The lowest km observed was found to be 0.4 mM, while the highest km observed was 89.18 mM (a range of 88.78 mM). The average km was found to be 11.12±13.44 mM, compared to the native value of 5.00±0.20 mM. The mutation in this data found to have the lowest km was N220Y. The mutation found to have the highest km was W120H.

The parameter kcat was measured for 76 mutants. The lowest kcat observed was found to be 0.2 min-1, while the highest kcat observed was 11011.0 min-1 (a range of 11010.80 min-1). The average kcat was found to be 539.44±1263.38 min-1, compared to the native value of 880.00±10.00 min-1. The mutation in this data found to have the lowest kcat was W399R. The mutation found to have the highest kcat was R240A.

The parameter kcatkm was measured for 79 mutants. The lowest kcatkm observed was found to be 11.0 M-1min-1, while the highest kcatkm observed was 1570000.0 M-1min-1 (a range of 1569989.00 M-1min-1). The average kcatkm w

In [12]:
# Rows whose kcat differs from 880 by strictly less than 200.
# 880 is presumably the native (BglB) kcat in min-1 -- TODO confirm.
kcat_offset = ( df['kcat'] - 880 ).abs()
df[ kcat_offset < 200 ]

Unnamed: 0_level_0,sequence_pos,expression,tm,k,err_tm,err_k,kcat,err_kcat,km,err_km,kcatkm,err_kcatkm,ki,ki_percent_err,ki_err,gel_number
mutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
S17A,17.0,1,41.66,-1.31,0.14,0.44,848.0,76.0,18.45,3.72,45978.0,10135.0,,,,9.0
V52G,52.0,1,39.28,-1.13,0.12,0.13,687.0,13.0,8.25,0.54,83371.0,5707.0,,,,12.0
I91E,91.0,1,39.66,-0.55,0.32,0.08,846.0,35.0,6.71,0.79,126071.0,15714.0,,,,10.0
H101R,101.0,1,40.09,-0.95,0.1,0.07,1059.0,16.0,10.62,0.53,99708.0,5225.0,,,,12.0
E154D,154.0,1,38.83,-0.77,0.32,0.18,878.0,47.0,3.46,0.76,254004.0,57175.0,,,,14.0
L171A,171.0,1,38.8,-0.66,0.53,0.21,807.0,9.0,11.09,0.42,72719.0,2851.0,,,,11.0
T175R,175.0,1,38.06,-0.54,0.11,0.03,801.0,8.0,3.59,0.15,223033.0,9663.0,,,,7.0
E177A,177.0,1,37.54,-0.52,0.24,0.06,986.0,10.0,5.98,0.22,164804.0,6408.0,,,,14.0
A192S,192.0,1,39.24,-1.29,0.29,0.46,946.0,10.0,5.09,0.18,185848.0,6994.0,,,,14.0
R240K,240.0,1,39.33,-1.47,0.19,0.33,898.0,59.0,17.67,3.32,50829.0,10102.0,,,,3.0


In [6]:
# Classify every measured Tm relative to a reference Tm and print the tallies
# in the order: within, lower, higher.
WT_TM = 39.9     # reference Tm in ˚C; presumably the wild-type value -- TODO confirm against df.loc['BglB', 'tm']
TM_WINDOW = 1    # half-width in ˚C of the "unchanged" band around WT_TM

# Rows without a Tm (NaN) are excluded explicitly rather than by a `tm > 0` test.
tm_shift = df.tm.dropna() - WT_TM

# Strictly inside the window counts as "within", as before.
within = int( ( tm_shift.abs() < TM_WINDOW ).sum() )

# A Tm exactly TM_WINDOW away from the reference previously matched none of the
# three branches and was silently dropped; it is now counted as lower/higher,
# so within + lower + higher always equals the number of measured Tm values.
higher = int( ( tm_shift >= TM_WINDOW ).sum() )
lower = int( ( tm_shift <= -TM_WINDOW ).sum() )

print( within, lower, higher )

43 26 10


In [8]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()



Unnamed: 0,sequence_pos,expression,tm,k,err_tm,err_k,kcat,err_kcat,km,err_km,kcatkm,err_kcatkm,ki,ki_percent_err,ki_err,gel_number
count,128.0,129.0,79.0,79.0,79.0,79.0,76.0,76.0,76.0,76.0,79.0,78.0,8.0,8.0,8.0,128.0
mean,241.53125,0.713178,39.463291,-0.838354,0.268734,0.156456,539.436842,13.068553,11.117895,1.000526,92361.15,5464.252564,227.7725,35.595,111.18625,10.390625
std,122.181737,0.454041,1.645206,0.374917,0.156683,0.15614,1263.384644,31.36236,13.437381,1.186895,191573.5,9229.112911,167.270111,22.658287,165.916493,5.749679
min,12.0,0.0,34.91,-1.73,0.04,0.02,0.2,0.01,0.4,0.02,11.0,1.0,95.24,13.7,13.05,1.0
25%,,0.0,,,,,,,,,,,,,,
50%,,1.0,,,,,,,,,,,,,,
75%,,1.0,,,,,,,,,,,,,,
max,423.0,1.0,45.99,-0.26,0.79,0.86,11011.0,258.0,89.18,5.89,1570000.0,57175.0,590.71,86.56,511.32,21.0


In [9]:
# Show the single row with the lowest tm: sort ascending, keep the first row.
tm_ascending = df.sort_values( by='tm', ascending=True )
tm_ascending.iloc[ :1 ]

Unnamed: 0_level_0,sequence_pos,expression,tm,k,err_tm,err_k,kcat,err_kcat,km,err_km,kcatkm,err_kcatkm,ki,ki_percent_err,ki_err,gel_number
mutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
E222H,222.0,1,34.91,-0.7,0.15,0.07,160.0,3.0,8.54,0.53,18695.0,1212.0,,,,2.0


In [None]:
# Build the Tm summary sentence from df instead of hard-coding the figures, so
# the text cannot drift out of sync with the data sheet.
wt_tm = 39.9    # reference Tm in ˚C; presumably the wild-type value -- TODO confirm against df.loc['BglB', 'tm']

# NOTE(review): the native 'BglB' row is dropped here so the sentence counts
# mutants only -- confirm this matches the intended wording.
tm_mutants = df.tm.drop( 'BglB', errors='ignore' ).dropna()
mutant_tm_shift = tm_mutants - wt_tm

n_tm = len( tm_mutants )
n_within = int( ( mutant_tm_shift.abs() < 1 ).sum() )
# Values exactly 1 ˚C away are assigned to lower/higher so the three counts sum to n_tm.
n_lower = int( ( mutant_tm_shift <= -1 ).sum() )
n_higher = int( ( mutant_tm_shift >= 1 ).sum() )

highest_mutant = tm_mutants.idxmax()
lowest_mutant = tm_mutants.idxmin()

# Signed shift relative to wt_tm; the ASCII minus is swapped for an en dash to
# keep the typography of the original sentence.
highest_shift = '{:+0.2f}'.format( mutant_tm_shift[ highest_mutant ] ).replace( '-', '–' )
lowest_shift = '{:+0.2f}'.format( mutant_tm_shift[ lowest_mutant ] ).replace( '-', '–' )

text_str = 'Of {n_tm} mutants for which Tm was determined, {n_within} mutants ({pct_within:0.0f}%) have a Tm that falls within 1 ˚C of the wild type Tm. Of the remaining {n_remaining} Tm values, {n_lower} exhibited a lower melting temperature and {n_higher} displayed a higher melting temperature. The highest Tm observed in this data set is for the mutation {highest_mutant}, which increased the Tm to {highest_tm:0.2f} ˚C ({highest_shift} ˚C), while the lowest Tm observed was for mutant {lowest_mutant}, which had a Tm of {lowest_tm:0.2f} ˚C ({lowest_shift} ˚C).'.format(
    n_tm=n_tm,
    n_within=n_within,
    pct_within=100.0 * n_within / n_tm,
    n_remaining=n_tm - n_within,
    n_lower=n_lower,
    n_higher=n_higher,
    highest_mutant=highest_mutant,
    highest_tm=tm_mutants[ highest_mutant ],
    highest_shift=highest_shift,
    lowest_mutant=lowest_mutant,
    lowest_tm=tm_mutants[ lowest_mutant ],
    lowest_shift=lowest_shift,
)
