# `sanity_checks`

Data consistency checks for the BglB data set.

In [1]:
from skbio import Protein 
import pandas
import numpy as np 

Collect data sets

In [3]:
# Table iterated by run_tests(); its first CSV column becomes the index,
# which run_tests() treats as the mutant name.
df = pandas.read_csv( 'data_set/targets.csv', index_col=0 )
# NOTE(review): absolute, machine-specific path -- this read fails on any
# other machine. `plos` is not used in the cells visible here; confirm it is
# still needed, and if so load it via a path relative to the repository.
plos = pandas.read_csv( '/Users/alex/Documents/bagel-benchmark/data_sets/experimental/plos_2016.csv', index_col=0 )

In [5]:
# Assay tables consulted by the has_*_data checks; both must provide a
# `mutant` column.
# NOTE(review): the preview output of the next cell shows `Unnamed: 0.1` and
# `Unnamed: 0` columns, which suggests previously saved index columns are
# being read back as data -- confirm whether `index_col` should be set here.
kinetics = pandas.read_csv( 'data_set/assay_data/pub/kinetics.csv' )
thermal = pandas.read_csv( 'data_set/assay_data/pub/thermal.csv' )

In [6]:
# A notebook cell renders only its last expression, so the kinetics preview
# is computed but not displayed; the table shown below has a `temp` column
# and appears to be the thermal one.
kinetics.head()
thermal.head()

Unnamed: 0.1,Unnamed: 0,well,rate,mutant,temp
0,0,A4,0.00013,A192S,50.0
1,1,B4,8e-05,A192S,47.15
2,2,C4,0.00021,A192S,44.3
3,3,D4,0.00407,A192S,41.45
4,4,E4,0.114,A192S,38.6


### Basic sanity checks 

First, we check that the named residue is in fact the native BglB residue. 

In [13]:
#collector
#func_list = []
#def sanity_check( func ):
#    func_list.append( func )
    
#sanity check functions 

# Location of the native BglB sequence used by check_name.
_BGLB_REFERENCE_PATH = 'data_set/reference/bglb.pep'

# path -> parsed sequence. check_name is called once per mutant, so caching
# keeps the reference file from being re-read and re-parsed on every call.
_native_sequence_cache = {}

def _native_bglb_sequence( path=_BGLB_REFERENCE_PATH ):
    '''Return the sequence stored at `path`, reading the file on first use only.'''
    if path not in _native_sequence_cache:
        _native_sequence_cache[ path ] = Protein.read( path )
    return _native_sequence_cache[ path ]

#@sanity_check 
def check_name( dat ):
    '''
    Check that the native residue named in a mutant label matches the
    native BglB sequence.

    Input: 
        `dat`
        object whose `.name` is a mutant label of the form
        <native residue><1-based position><new residue>, e.g. 'W409Y'
        (run_tests passes the rows of `df`, i.e. pandas Series)

    Returns:
        True 
        if the first character of the label equals the residue found at
        that position in the reference sequence 

        False
        if the residues differ, or if the position lies outside the
        reference sequence (position 0 would otherwise be looked up with
        index -1 and could wrap around to the last residue)

    Raises:
        ValueError
        if the middle of the label is not an integer 
    '''

    pos = int( dat.name[1:-1] )
    protein = _native_bglb_sequence()

    # Positions are 1-based; reject anything that does not address a residue.
    if not 1 <= pos <= len( protein ):
        return False

    return str( protein[pos-1] ) == dat.name[0]

In [14]:
# Spot-check, on a single mutant name, the membership test that
# has_kinetic_data relies on.
'W409Y' in kinetics.mutant.values

True

In [15]:
def has_kinetic_data( dat ):
    '''
    Report whether the mutant named by `dat.name` is listed in the
    `mutant` column of the module-level `kinetics` table.

    Returns:
        True if the name is present, False otherwise 
    '''
    return dat.name in kinetics.mutant.values
    
def has_thermal_data( dat ):
    '''
    Report whether the mutant named by `dat.name` is listed in the
    `mutant` column of the module-level `thermal` table.

    Returns:
        True if the name is present, False otherwise 
    '''
    return dat.name in thermal.mutant.values
    
def thermal_fit_matches_data( dat ):
    '''
    Report whether the mutant named by `dat.name` is listed in the
    `mutant` column of the module-level `thermal` table.

    NOTE(review): despite the name, no fit is compared against the data
    here; the test is the same membership check as has_thermal_data, so
    this looks like a placeholder -- confirm the intended check.

    Returns:
        True if the name is present, False otherwise 
    '''
    thermal_mutants = thermal.mutant.values
    return dat.name in thermal_mutants

In [16]:
# Checks applied to every mutant by run_tests(), in column order; each entry
# is called with one row of `df` and its result is printed as-is.
# NOTE(review): thermal_fit_matches_data is defined in the previous cell but
# is not registered here -- confirm whether that is intentional.
func_list = [
    #true_check, 
    #false_check, 
    check_name,
    has_kinetic_data,
    has_thermal_data, 
]

In [17]:
def run_tests():
    '''
    Print a tab-separated report of every check in `func_list` applied to
    every row of `df`.

    The first line is a header ('Mutant' followed by the check function
    names). Each following line holds a row label and the value returned
    by each check, every field followed by a tab. The row labelled 'BglB'
    is skipped.
    '''
    column_names = ( check.__name__ for check in func_list )
    print( 'Mutant\t' + '\t'.join( column_names ) )

    for label, row in df.iterrows():
        if label == 'BglB':
            continue
        # Fields are printed one at a time so a failing check still leaves
        # the partial row visible.
        print( label, end='\t' )
        for check in func_list:
            print( check( row ), end='\t' )
        print()

### Results of sanity checks

### Discussion 

Wilson's idea: automate Justin's thought process/questions as an algorithm.

In [18]:
run_tests()

Mutant	check_name	has_kinetic_data	has_thermal_data
G12N	True	False	False	
S14A	True	False	True	
T15A	True	True	True	
S16A	True	False	True	
S16N	True	False	False	
S17A	True	False	True	
S17E	True	False	True	
Y18A	True	False	True	
Q19S	True	False	True	
Q19A	True	False	False	
Q19C	True	False	False	
Q19P	True	False	False	
S32L	True	False	False	
W34A	True	False	False	
V52G	True	False	True	
F72A	True	False	True	
R76A	True	False	False	
I91E	True	False	True	
H101R	True	False	True	
H119N	True	False	True	
H119A	True	False	True	
H119E	True	False	False	
W120H	True	False	True	
W120F	True	False	True	
W120A	True	False	False	
D121F	True	False	False	
E154D	True	False	True	
N163A	True	False	True	
N163D	True	False	True	
N163C	True	False	True	
N163E	True	False	False	
N163K	True	False	False	
E164A	True	False	True	
E164G	True	False	False	
E164R	True	False	False	
Y166P	True	False	False	
C167A	True	False	True	
C167Q	True	False	True	
L171A	True	False	True	
L171R	True	False	True	
T175R	True	False	True	
E177L	Tr