In [330]:
# Let's try to re-weight beta_cst to give us better performance on a benchmark, any benchmark
# for now, we appoximate good protocol with repack single residue only 

In [347]:
import pyrosetta 
import rosetta 
import pandas 
import numpy as np 

from sklearn.linear_model import SGDRegressor
from sklearn import linear_model, cross_validation, metrics

In [332]:
pyrosetta.init( '-beta -extra_res_fa reference_files/pNPG.params' )

Found rosetta database at: /usr/local/bin/anaconda3/lib/python3.5/site-packages/pyrosetta-4.0-py3.5.egg/database; using it....
PyRosetta-4 2016 [Rosetta 2016 unknown:7862c3a14bb5e8d6b2a4c3e0a497795bc06e9e4b 2016-12-13 14:39:52 -0500] retrieved from: git@github.com:RosettaCommons/main.git
(C) Copyright Rosetta Commons Member Institutions.
Created in JHU by Sergey Lyskov and PyRosetta Team.



In [333]:
fa_elec = rosetta.core.scoring.score_type_from_name( 'fa_elec' ) 
s.get_weight( fa_elec ) 

1.0

In [334]:
s = pyrosetta.create_score_function( 'beta_cst' )
s.set_weight( fa_sol, 7 ) 

In [335]:
fmt = dict( zip( 'ANDRCQEGHILKMPFSTWYV', [
    'ALA','ASN','ASP','ARG','CYS','GLN','GLU',
    'GLY','HIS','ILE','LEU','LYS','MET','PRO','PHE','SER',
    'THR','TRP','TYR','VAL' ] ) )

In [336]:
score_types = s.get_nonzero_weighted_scoretypes()

In [337]:
score_types

vector1_core_scoring_ScoreType[fa_atr, fa_rep, fa_sol, fa_intra_atr_xover4, fa_intra_rep_xover4, fa_intra_sol_xover4, lk_ball, lk_ball_iso, lk_ball_bridge, lk_ball_bridge_uncpl, fa_elec, fa_intra_elec, pro_close, hbond_sr_bb, hbond_lr_bb, hbond_bb_sc, hbond_sc, dslf_fa13, atom_pair_constraint, coordinate_constraint, angle_constraint, dihedral_constraint, omega, fa_dun_dev, fa_dun_rot, fa_dun_semi, p_aa_pp, hxl_tors, ref, chainbreak, rama_prepro, res_type_constraint]

In [339]:
def generate_features( param_vector ):
    
    p = pyrosetta.pose_from_file( 'reference_files/bglb.pdb' ) 
    s = pyrosetta.create_score_function( 'beta_cst' ) 
    
    # update values using values from params_vector  
    fa_sol = rosetta.core.scoring.score_type_from_name( 'fa_sol' ) 
    fa_elec = rosetta.core.scoring.score_type_from_name( 'fa_elec' ) 
    atom_pair_constraint = rosetta.core.scoring.score_type_from_name( 'atom_pair_constraint' ) 
    s.set_weight( fa_sol, param_vector[ 0 ] ) 
    s.set_weight( fa_elec, param_vector[ 1 ] ) 
    s.set_weight( atom_pair_constraint, param_vector[ 2 ] ) 
    
    # capture initial pose params 
    orig_score = s( p ) 

    # run through all mutants for which we have data to save processing time 
    with open( 'reference_files/mutant_list.txt' ) as fn:
        mutant_list = fn.read().split()
        
    mutant_features = []
    for mutant in mutant_list:
        
        # create and sample the mutant 
        my_pose = p.clone()
        mutate = rosetta.protocols.simple_moves.MutateResidue( int(mutant[1:-1]), fmt[ mutant[ -1 ] ] ) 
        mutate.apply( my_pose )
        # sampling? 
        
        # calculate features 
        my_features = []
        my_features.append( s( my_pose ) ) # total_score 
        
        # return package 
        mutant_features.append( [mutant] + my_features )         
        
    return mutant_features 

In [340]:
initial_param_vector = ( 1, 1, 1 ) 
m_feats = generate_features( initial_param_vector )

In [341]:
def score_function( mutant_features_list ): 
    target_name = 'target_tm' 
    targets = pandas.read_csv( '/Users/alex/Documents/bagel-thermal/data_set/targets.csv', index_col=0 )[[target_name]] 
    df = pandas.DataFrame( mutant_features_list )
    df.set_index( 0, inplace=True ) 
    J = targets.join( df ) 
    return J.corr()[ target_name ].drop( target_name ).values[0]

In [342]:
def ml_score_function( mutant_features_list ): 
    target_name = 'target_tm' 
    targets = pandas.read_csv( '/Users/alex/Documents/bagel-thermal/data_set/targets.csv', index_col=0 )[[target_name]] 
    df = pandas.DataFrame( mutant_features_list )
    df.set_index( 0, inplace=True ) 
    J = targets.join( df ).dropna() 
    y = J.ix[:, 0].ravel()
    X = J.ix[:, 1:]
    clf = linear_model.SGDRegressor()
    mean_score = cross_validation.cross_val_score( clf, X, y ).mean()
    return mean_score 

In [None]:
#Score function (or loss function) with signature score_func(y, y_pred, **kwargs)
def score_func( y, y_pred, **kwargs ):
    
    return score 
    
scorer = metrics.make_scorer( score_func, greater_is_better=True, needs_proba=False, needs_threshold=False )

In [343]:
score = ml_score_function( m_feats )

In [344]:
score

-5.3039238421345818e+28

In [345]:
# now, implement the Monte Carlo search for "good params", or params that minimize the score function 

In [346]:
def search_for_params( initial_param_vector ):
    
    low_score = 1e9 # a billion! 
    trials = 10 
    param_vector = np.array(initial_param_vector)
    trial_params = param_vector
    
    for i in range( trials ): 
        
        # generate features and score
        m_feats = generate_features( trial_params )
        score = ml_score_function( m_feats )
        print( 'trial: {}\nscore: {}\nparams: {}'.format( i, score, trial_params ) ) 
        
        # no temperature here yet, just finds low score 
        if score < low_score: 
            print( 'Monte Carlo: accept' ) 
            low_score = score 
            param_vector = trial_params 
        else:
            print( 'Monte Carlo: reject' ) 

        # now randomly preturb the params? 
        preturb_vec = np.random.normal( loc=0.0, scale=1.0, size=len(param_vector) )
        trial_params = param_vector + preturb_vec        
            
    # at end of simulation, produce results 
    result = low_score, param_vector 
    return result

result = search_for_params( (1,1,1) )
result 

trial: 0
score: -9.856978179347566e+29
params: [1 1 1]
Monte Carlo: accept
trial: 1
score: -1.5032375971598514e+30
params: [ 1.15320257 -0.06085274  1.01684054]
Monte Carlo: accept
trial: 2
score: -1.2322171982871256e+32
params: [ 2.52095711 -0.23910619  1.19759233]
Monte Carlo: accept
trial: 3
score: -3.7094430689191845e+30
params: [ 2.09858477 -0.53420776  0.98095366]
Monte Carlo: reject
trial: 4
score: -2.1874411513558305e+31
params: [ 0.96708889 -0.79622965  0.63706753]
Monte Carlo: reject
trial: 5
score: -1.324252042798274e+30
params: [ 0.77927606  0.12237184  1.63228932]
Monte Carlo: reject
trial: 6
score: -4.667372629765229e+31
params: [ 3.33512766  0.88159889  0.53311386]
Monte Carlo: reject
trial: 7
score: -7.1984900164010775e+31
params: [ 2.82670176 -0.4283028   0.16218927]
Monte Carlo: reject
trial: 8
score: -3.3073068865058767e+30
params: [ 1.09008247 -0.16093653  1.39063741]
Monte Carlo: reject
trial: 9
score: -2.493638008452948e+31
params: [ 1.91113292 -2.39977756  1.7268

(-1.2322171982871256e+32, array([ 2.52095711, -0.23910619,  1.19759233]))