## Sequence Alignment and get canonical numbering of the residues and pdb id

In [12]:
from Bio import AlignIO
from pathlib import Path
from Bio import SeqIO
from asapdiscovery.data.backend.plip import get_interactions_plip
from asapdiscovery.data.backend.openeye import (
    load_openeye_sdf,
    load_openeye_pdb
)
from openeye import oechem

In [2]:
local_path = Path('/home/pengs/fold_zika')
align = AlignIO.read(local_path/"output.fasta", "fasta")

In [3]:
# Shove in the alignIO sequences from mafft and get the new number for each amino acid in the sequence
def get_new_renumber(align):
    res_num = dict()
    
    # Reference is the first sequence
    ref = align[0]
    # Initialize annotation for reference sequence, assume first residue is #1
    ref.letter_annotations["resnum"] = range(1,len(ref)+1)
    # Save the residue number
    res_num[ref.id] = list(ref.letter_annotations["resnum"])
    
    # Get a list of all other sequences to map back to the reference
    to_renumber = align[1:]
    for rec in to_renumber:
        # Initiate a new annotation for the sequences
        rec.letter_annotations["resnum"]=[None]*len(rec)
        # Set new residue numbers in rec based on alignment
        reslist = [[i,ref.letter_annotations["resnum"][i]] for i in range(len(ref)) if rec[i] != '-']
        for [i,r] in reslist:
            rec.letter_annotations["resnum"][i]=r
        # Set new residue numbers in the structure
        newresnums = [i for i in rec.letter_annotations["resnum"][:] if i != None]
        # Save to dict
        res_num[rec.id] = newresnums
    # Return the residue numbers of the aligned files
    # Dictionary with sequence id as the key and the residue numbers in a list as the value
    return res_num

align_num = get_new_renumber(align)

In [5]:
align_num

{'ref|zika_ns2b3|': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  15

## Plip returns the pdb id of the active residue

In [13]:
# https://stackoverflow.com/questions/42851010/getting-residue-number-and-residue-name-in-biopython-pdb-module
from Bio import *
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder

pdb_file = '/home/pengs/fold_zika/test_prep/zikv_ns2b3.pdb' 

In [None]:
# get id of Pdb residues from the pdb file and save in dictionary by the sequence number
def get_pdb_id(pdb_file):
    structure = PDBParser().get_structure('prot', pdb_file)   
    pdb_id = dict()
    count = 1
    for model in structure:
        for chain in model:
            chain_id = chain.get_full_id()[2]
            for i in chain.get_residues():
                resname = i.resname
                resnum = i.get_full_id()[3][1]
                id = (resname, resnum, chain_id)
                pdb_id[count] = id
                count += 1
    return pdb_id

In [42]:
pdb_id = get_pdb_id(pdb_file)
pdb_id_flip = {v:k for k,v in pdb_id.items()}



## get the openeye residue id required for running protein prep

In [47]:
#### Iterate over the residues of oeGraphMol and get the openeye residue ids
def find_oe_res_id(protein):
    #### Get the residues from the molecule
    residues = oechem.OEGetResidues(protein)
    # Getting the residue ids in the protein file
    oe_id = dict()
    count = 1
    for residue in residues:
        #oe_thing[(residue.GetName(),residue.GetResidueNumber(),residue.GetExtChainID())] = count
        # Since did not get the chain ID before, would just go with A now
        oe_id[count] = (residue.GetName(),residue.GetResidueNumber(),'A')
        count += 1
    return oe_id

In [14]:
zikv_oe = load_openeye_pdb(pdb_file)

In [49]:
find_oe_res_id(zikv_oe)

{1: ('ASP', 50, 'A'),
 2: ('MET', 51, 'A'),
 3: ('TYR', 52, 'A'),
 4: ('ILE', 53, 'A'),
 5: ('GLU', 54, 'A'),
 6: ('ARG', 55, 'A'),
 7: ('ALA', 56, 'A'),
 8: ('GLY', 57, 'A'),
 9: ('ASP', 58, 'A'),
 10: ('ILE', 59, 'A'),
 11: ('THR', 60, 'A'),
 12: ('TRP', 61, 'A'),
 13: ('GLU', 62, 'A'),
 14: ('LYS', 63, 'A'),
 15: ('ASP', 64, 'A'),
 16: ('ALA', 65, 'A'),
 17: ('GLU', 66, 'A'),
 18: ('VAL', 67, 'A'),
 19: ('THR', 68, 'A'),
 20: ('GLY', 69, 'A'),
 21: ('ASN', 70, 'A'),
 22: ('SER', 71, 'A'),
 23: ('PRO', 72, 'A'),
 24: ('ARG', 73, 'A'),
 25: ('LEU', 74, 'A'),
 26: ('ASP', 75, 'A'),
 27: ('VAL', 76, 'A'),
 28: ('ALA', 77, 'A'),
 29: ('LEU', 78, 'A'),
 30: ('ASP', 79, 'A'),
 31: ('GLU', 80, 'A'),
 32: ('SER', 81, 'A'),
 33: ('GLY', 82, 'A'),
 34: ('ASP', 83, 'A'),
 35: ('PHE', 84, 'A'),
 36: ('SER', 85, 'A'),
 37: ('LEU', 86, 'A'),
 38: ('VAL', 87, 'A'),
 39: ('GLU', 88, 'A'),
 40: ('GLY', 16, 'A'),
 41: ('GLU', 17, 'A'),
 42: ('THR', 18, 'A'),
 43: ('THR', 19, 'A'),
 44: ('ASP', 20, 'A'

In [50]:
pdb_id

{1: ('ASP', 50, 'A'),
 2: ('MET', 51, 'A'),
 3: ('TYR', 52, 'A'),
 4: ('ILE', 53, 'A'),
 5: ('GLU', 54, 'A'),
 6: ('ARG', 55, 'A'),
 7: ('ALA', 56, 'A'),
 8: ('GLY', 57, 'A'),
 9: ('ASP', 58, 'A'),
 10: ('ILE', 59, 'A'),
 11: ('THR', 60, 'A'),
 12: ('TRP', 61, 'A'),
 13: ('GLU', 62, 'A'),
 14: ('LYS', 63, 'A'),
 15: ('ASP', 64, 'A'),
 16: ('ALA', 65, 'A'),
 17: ('GLU', 66, 'A'),
 18: ('VAL', 67, 'A'),
 19: ('THR', 68, 'A'),
 20: ('GLY', 69, 'A'),
 21: ('ASN', 70, 'A'),
 22: ('SER', 71, 'A'),
 23: ('PRO', 72, 'A'),
 24: ('ARG', 73, 'A'),
 25: ('LEU', 74, 'A'),
 26: ('ASP', 75, 'A'),
 27: ('VAL', 76, 'A'),
 28: ('ALA', 77, 'A'),
 29: ('LEU', 78, 'A'),
 30: ('ASP', 79, 'A'),
 31: ('GLU', 80, 'A'),
 32: ('SER', 81, 'A'),
 33: ('GLY', 82, 'A'),
 34: ('ASP', 83, 'A'),
 35: ('PHE', 84, 'A'),
 36: ('SER', 85, 'A'),
 37: ('LEU', 86, 'A'),
 38: ('VAL', 87, 'A'),
 39: ('GLU', 88, 'A'),
 40: ('GLY', 16, 'B'),
 41: ('GLU', 17, 'B'),
 42: ('THR', 18, 'B'),
 43: ('THR', 19, 'B'),
 44: ('ASP', 20, 'B'

## The Openeye perceived Molecule residues have the same numbering as the pdb ids

## Trying to make a design unit using the different ways online sources defined residues in OEGraphMol molecules

In [5]:
from asapdiscovery.modeling.modeling import make_design_unit,get_oe_prep_opts,get_oe_structure_metadata_from_sequence
from asapdiscovery.data.backend.openeye import (
    oechem,
    oedocking,
    oegrid,
    oespruce,
    openeye_perceive_residues,
)
from asapdiscovery.data.backend.openeye import (
    load_openeye_sdf,
    load_openeye_pdb
)

In [3]:
opts = get_oe_prep_opts()
opts
#make_design_unit(zikv_oe, 'ASP:129::B')

<oespruce.OEMakeDesignUnitOptions; proxy of <Swig Object of type 'OESpruce::OEMakeDesignUnitOptions *' at 0x11b02d890> >

In [4]:
metadata = oespruce.OEStructureMetadata()

In [7]:
prot_file = '/Users/choderalab/temp_storage/separate/output/lig_9_protein.pdb'
prot_lig9 = load_openeye_pdb(prot_file)
prot = load_openeye_pdb('/Users/choderalab/temp_storage/separate/pdb_intermediates/protein.pdb')
lig = load_openeye_sdf('/Users/choderalab/temp_storage/separate/lig_sdfs/lig_9.sdf')

https://openkinome.org/kinoml/notebooks/OpenEye_structural_featurizer.html

In [60]:
du = oespruce.OEMakeDesignUnits(prot, metadata, opts, 'TYR-161')

Processing BU # 1 with title: , chains AB


In [64]:
du = oespruce.OEMakeDesignUnits(prot, metadata, opts, 'TYR 161  A 1')

Processing BU # 1 with title: , chains AB


## This means something is wrong with the way I am defining residues (most likely wrong syntax)

In [23]:
from simtk.openmm import app



In [48]:
hv = oechem.OEHierView(
            prot, oechem.OEAssumption_BondedResidue +
            oechem.OEAssumption_ResPerceived + oechem.OEAssumption_PDBOrder)
topology = app.Topology()

for chain in hv.GetChains():
    for frag in chain.GetFragments():
        for hres in frag.GetResidues():
            # Get OE residue
            oe_res = hres.GetOEResidue()
            print(str(oe_res))

ASP 50   A 1  
MET 51   A 1  
TYR 52   A 1  
ILE 53   A 1  
GLU 54   A 1  
ARG 55   A 1  
ALA 56   A 1  
GLY 57   A 1  
ASP 58   A 1  
ILE 59   A 1  
THR 60   A 1 A
TRP 61   A 1  
GLU 62   A 1  
LYS 63   A 1  
ASP 64   A 1  
ALA 65   A 1  
GLU 66   A 1  
VAL 67   A 1  
THR 68   A 1  
GLY 69   A 1  
ASN 70   A 1  
SER 71   A 1  
PRO 72   A 1  
ARG 73   A 1  
LEU 74   A 1  
ASP 75   A 1  
VAL 76   A 1  
ALA 77   A 1  
LEU 78   A 1  
ASP 79   A 1  
GLU 80   A 1  
SER 81   A 1  
GLY 82   A 1  
ASP 83   A 1  
PHE 84   A 1  
SER 85   A 1  
LEU 86   A 1  
VAL 87   A 1  
GLU 88   A 1  
GLY 16   B 2  
GLU 17   B 2  
THR 18   B 2 A
THR 19   B 2  
ASP 20   B 2  
GLY 21   B 2  
VAL 22   B 2  
TYR 23   B 2  
ARG 24   B 2  
VAL 25   B 2  
MET 26   B 2  
THR 27   B 2  
ARG 28   B 2  
ARG 29   B 2  
LEU 30   B 2  
LEU 31   B 2  
GLY 32   B 2  
SER 33   B 2  
THR 34   B 2  
GLN 35   B 2  
VAL 36   B 2  
GLY 37   B 2  
VAL 38   B 2  
GLY 39   B 2  
VAL 40   B 2  
MET 41   B 2  
GLN 42   B 2  
GLU 43   B

## OEChem may have defined the residues differently from OEGraphMol?

https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/small-molecule-ligands

In [66]:
du = oespruce.OEMakeDesignUnits(prot, metadata, opts, 'TYR161:B')

Processing BU # 1 with title: , chains AB


## Trying to find out how molecules are being perceived after being loaded in by load_openeye_pdb

In [1]:
from asapdiscovery.data.backend.openeye import (
    oechem,
    oedocking,
    oegrid,
    oespruce,
    openeye_perceive_residues,
)

In [39]:
initial_prot = openeye_perceive_residues(zikv_oe,preserve_all=False)
initial_prot

<oechem.OEGraphMol; proxy of <Swig Object of type 'OEGraphMolWrapper *' at 0x2aba8e49cff0> >

In [30]:
site_residue = 'ASP:129::B'
if site_residue:
    print('ran')
    dus = list(oespruce.OEMakeDesignUnits(initial_prot, metadata, opts, site_residue))

ran


Processing BU # 1 with title: , chains AB


In [34]:
oespruce.OEMakeDesignUnits(initial_prot, metadata, opts, site_residue)

Processing BU # 1 with title: , chains AB


<oedocking.OEDesignUnitIter; proxy of <Swig Object of type 'OESystem::OEIter< OEBio::OEDesignUnit > *' at 0x2aba8ec70d20> >

## I think this is how the openeye object perceived id goes
https://github.com/choderalab/asapdiscovery/blob/main/asapdiscovery-data/asapdiscovery/data/backend/openeye.py#L15

## I don't think I can make the oe_active_residue to work (could not for the life of me find what the oe active residue is in the way openeye has defined the residue within their own sequence)

In [3]:
from openeye import oechem

In [10]:
file = '/home/pengs/fold_zika/test_prep/orig_bind/lig_0_protein.pdb'
comb = load_openeye_pdb(file)
oechem.OEPerceiveInteractionHints(comb)

TypeError: Wrong number or type of arguments for overloaded function 'OEPerceiveInteractionHints'.
  Possible C/C++ prototypes are:
    OEBio::OEPerceiveInteractionHints(OEBio::OEInteractionHintContainer &)
    OEBio::OEPerceiveInteractionHints(OEBio::OEInteractionHintContainer &,OEBio::OEPerceiveInteractionOptions const &)


In [37]:
from rdkit import Chem
sdf = Chem.SDMolSupplier( '/home/pengs/fold_zika/test_dock/combined_ligs.sdf' )
with open('/home/pengs/fold_zika/test_dock/combined_ligs.smi', 'w') as f:
    for mol in sdf:
        smi = Chem.MolToSmiles(mol)
        f.write("{}\n".format(smi))