# Blind systematic steps in the sequence-function landscape to obtain more diverse starting sequences for protein design [do they lead to better models?]

## Description

Here, we systematically mutate each residue in the active site to each of the 20 amino acids (note: one of the 20 is the wild type), then run 100 design simulations for each point mutant. We'll compare the results to a stock protocol.

In [1]:
! ls 

dehydrogenase.enzdes.cst  list_of_pos.txt	   mdh.pdb
design_run		  list_of_res.txt	   MEL.params
enzdes_protocol.xml	  log.txt		   NAD.params
FE2.params		  make-list.py		   out
flags			  mdh_A26V_A31V_A169V.pdb  sub.sh
force_list		  mdh_a26v_a31v.pdb	   Untitled.ipynb
list			  mdh_a26v.pdb


In [3]:
# Read the residue names and the target positions, one entry per line.
# Blank lines are skipped so they cannot produce malformed flag lines
# such as `target= new_res=ALA`.
with open( 'list_of_res.txt' ) as handle:
    residues = [ line.strip() for line in handle if line.strip() ]

with open( 'list_of_pos.txt' ) as handle:
    list_of_pos = [ line.strip() for line in handle if line.strip() ]

# number of runs written for every (position, residue) pair
nstruct = 100

# Write one line of extra command-line flags per run to the file 'list'.
# Loop order is kept as: run index outermost, then residue, then position.
with open( 'list', 'w' ) as handle:
    for k in range( nstruct ):
        for j in residues:
            for i in list_of_pos:
                handle.write(
                    '-parser:script_vars target={} new_res={} -suffix _{}_{}_{:04d} \n'.format( i, j, i, j, k )
                )

In [4]:
! head list 

-parser:script_vars target=46 new_res=ALA -suffix _46_ALA_0000 
-parser:script_vars target=48 new_res=ALA -suffix _48_ALA_0000 
-parser:script_vars target=106 new_res=ALA -suffix _106_ALA_0000 
-parser:script_vars target=109 new_res=ALA -suffix _109_ALA_0000 
-parser:script_vars target=146 new_res=ALA -suffix _146_ALA_0000 
-parser:script_vars target=149 new_res=ALA -suffix _149_ALA_0000 
-parser:script_vars target=151 new_res=ALA -suffix _151_ALA_0000 
-parser:script_vars target=156 new_res=ALA -suffix _156_ALA_0000 
-parser:script_vars target=160 new_res=ALA -suffix _160_ALA_0000 
-parser:script_vars target=190 new_res=ALA -suffix _190_ALA_0000 


In [5]:
! cat sub.sh

#!/bin/sh
#
#SBATCH --job-name=forced 
#SBATCH --output=log.txt 

EXTRA_FLAGS=$( head -${SLURM_ARRAY_TASK_ID} list | tail -1 )
/share/work/alex/rosetta/source/bin/rosetta_scripts.linuxgccrelease @flags $EXTRA_FLAGS 


In [7]:
! sbatch --array=1-20 sub.sh

Submitted batch job 481753


## Results

In [14]:
import numpy 
import pandas
from glob import glob 
from collections import Counter
from Bio.PDB.Polypeptide import PPBuilder
from Bio.PDB import PDBParser
from subprocess import call 

# function that will return the sequence of PDB from disk
def util_function( pdb ):
    """Return the sequence of the first polypeptide built from a PDB file.

    pdb -- path to the PDB file. The path minus its last four characters
           is passed to the parser as the structure id.

    Returns the value of `get_sequence()` for the first peptide that
    PPBuilder builds from the structure.

    Raises ValueError if no peptide can be built, instead of silently
    returning None and failing later with an unrelated error.
    """
    parser = PDBParser()
    structure = parser.get_structure( pdb[:-4], pdb )
    peptides = PPBuilder().build_peptides( structure )
    if not peptides:
        raise ValueError( 'no polypeptide found in {}'.format( pdb ) )
    # NOTE(review): only the first peptide is returned, as before. If the
    # builder splits the structure into several peptides, the remaining ones
    # are ignored -- confirm this is what callers expect.
    return peptides[ 0 ].get_sequence()

# collect scorefiles
# every file in `outfile` whose name ends in 'sc' is read as a
# whitespace-delimited table and the tables are stacked into one frame
outfile = 'wt_out/'
scorefiles = glob( '{}/*sc'.format( outfile ) )
dfs = [ pandas.read_csv( sf, sep=r'\s+' ) for sf in scorefiles ]
df = pandas.concat( dfs )

# filter (this gets lowest 100 by total_score)
# DataFrame.sort was removed from pandas; sort_values is the replacement
low = df.sort_values( 'total_score' ).head( 100 )
# path of the PDB file that corresponds to each selected row
path_to_low = [ '{}/{}.pdb'.format( outfile, i ) for i in low.description ]

In [22]:
# spot-check ten randomly sampled rows, showing only the description and total_score columns
df.sample( 10 )[[ 'description', 'total_score' ]]

Unnamed: 0,description,total_score
41,mdh_194_PRO_0042,-240.57
62,mdh_146_PRO_0063,-1503.69
52,mdh_370_LYS_0053,-1507.41
60,mdh_109_LEU_0061,-1507.83
33,mdh_48_TRP_0034,-1360.77
66,mdh_370_TRP_0067,-1498.46
21,mdh_151_MET_0022,-1512.29
90,mdh_106_ARG_0091,-1514.51
10,mdh_370_LEU_0011,-1505.01
94,mdh_370_VAL_0095,-1512.26


In [23]:
# this builds sequences of each PDB and takes 0.1 seconds per PDB
import os

# Build every sequence before opening the output file, so that a failure
# while reading a PDB does not leave a truncated alignment.fasta behind.
seqs = [ util_function( pdb ) for pdb in path_to_low ]

# one list of single-letter residues per sequence; this is later turned
# into a numpy array for the per-position tally
l = [ list( record ) for record in seqs ]

with open( 'alignment.fasta', 'w' ) as fn:
    for pdb, record in zip( path_to_low, seqs ):
        # name each record after its PDB file (without directory or extension)
        # so that the records can be told apart
        name = os.path.splitext( os.path.basename( pdb ) )[ 0 ]
        fn.write( '>{}\n{}\n'.format( name, record ) )

In [24]:
# this tars them all together so you can download and look at them
#cmd = [ 'tar', '--create', '--verbose', '--file', 'low_100.tar' ] + path_to_low
#call( cmd ) # creates a file called 'low_100.tar' 

In [25]:
# build a numpy array that contains positions that were mutated and what they were mutated to
# supa fast

not_allowed = [ 158, 109, 106, 149, 146 ]
# mutations to these residues should be rejected
# NOTE(review): the positions visible in the `head list` output include 156
# but not 158 -- confirm that 158 is the intended position.

native_seq = util_function( 'mdh.pdb' )
a = numpy.array( l )
# Counting column by column only makes sense if every sequence has the same
# length, i.e. the array is two-dimensional.
if l and a.ndim != 2:
    raise ValueError( 'sequences differ in length; cannot compare them column by column' )

pos = []
for n, column in enumerate( a.T ):
    c = Counter( column )
    # 1-based position of this column
    # NOTE(review): assumes the sequence index maps directly onto the residue
    # numbering (starts at 1, no gaps) -- TODO confirm
    resi = n + 1
    if len( c ) > 1 and resi not in not_allowed: # limits us to spots where mutations have been designed
        pos.append( str( resi ) )
        # native residue + position, followed by one (residue, count) pair per
        # residue observed at this position
        fields = [ '{}{}'.format( native_seq[ n ], resi ) ]
        fields += [ "('{}', {})".format( aa, count ) for aa, count in c.items() ]
        # single-argument print() runs on both Python 2 and Python 3
        print( ' '.join( fields ) )

print( 'create muts, resi ' + '+'.join( pos ) )

G48 ('A', 8) ('G', 92)
T160 ('F', 3) ('I', 2) ('R', 2) ('T', 79) ('W', 2) ('V', 1) ('Y', 11)
K167 ('A', 17) ('D', 75) ('G', 1) ('N', 3) ('P', 2) ('V', 2)
T204 ('A', 99) ('S', 1)
F261 ('A', 1) ('F', 99)
L266 ('L', 99) ('G', 1)
A273 ('A', 99) ('G', 1)
N288 ('A', 6) ('S', 1) ('T', 1) ('G', 1) ('N', 91)
M370 ('A', 4) ('D', 1) ('M', 85) ('N', 4) ('S', 2) ('T', 2) ('V', 2)
create muts, resi 48+160+167+204+261+266+273+288+370


Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
Exception ignored.
Some atoms or residues may be missing in the data structure.
