In [120]:
from Bio import AlignIO,SeqIO,ExPASy,SwissProt
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import os
from Bio import PDB
from Bio import pairwise2
from Bio.SubsMat.MatrixInfo import blosum62


In [121]:
# Base name of the input structure. It is reused as the Biopython structure
# id and as the stem of the PDB file name.
file_name = 'cAbl-imatinib_nowat'
pdb_name = ''.join((file_name, '.pdb'))
id_name = file_name

In [122]:
# Create a PDB parser with PERMISSIVE=1 and load the input file into a
# Biopython structure labelled with id_name.
parser = PDB.PDBParser(PERMISSIVE=1)
structure = parser.get_structure(id=id_name, file=pdb_name)

In [123]:
# Three-letter -> one-letter residue codes.
# The first group is the 20 canonical amino acids. The second group covers
# protonation-state / disulfide variant names that MD force fields commonly
# write into capped, simulation-ready PDB files, so that such structures do
# not fail the lookup below.
oneletter = {
    # canonical residues
    'ASP': 'D', 'GLU': 'E', 'ASN': 'N', 'GLN': 'Q',
    'ARG': 'R', 'LYS': 'K', 'PRO': 'P', 'GLY': 'G',
    'CYS': 'C', 'THR': 'T', 'SER': 'S', 'MET': 'M',
    'TRP': 'W', 'PHE': 'F', 'TYR': 'Y', 'HIS': 'H',
    'ALA': 'A', 'VAL': 'V', 'LEU': 'L', 'ILE': 'I',
    # force-field variant names (AMBER / CHARMM style)
    'HID': 'H', 'HIE': 'H', 'HIP': 'H',
    'HSD': 'H', 'HSE': 'H', 'HSP': 'H',
    'CYX': 'C', 'CYM': 'C',
    'ASH': 'D', 'GLH': 'E', 'LYN': 'K',
}

# Terminal capping groups: present in the structure but not part of the
# protein sequence, so they are left out of the sequence string.
cap_res = ['NME', 'NMA', 'ACE']

pdbseq_list = []
for residue in structure.get_residues():
    three_letter = residue.get_resname()
    if three_letter in cap_res:
        continue
    if three_letter not in oneletter:
        # Fail with an actionable message instead of a bare KeyError; this is
        # typically a ligand, ion or other non-protein residue.
        raise KeyError(
            "No one-letter code for residue %r (id %r); extend `oneletter` "
            "or remove non-protein residues from the structure first"
            % (three_letter, residue.id)
        )
    one_letter_name = oneletter[three_letter]
    pdbseq_list.append(one_letter_name)

# One-letter sequence of all non-cap residues, in structure order.
pdbseq_str = ''.join(pdbseq_list)

In [124]:
# Display the one-letter sequence string built from the structure's residues.
pdbseq_str

'SPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVNAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGK'

In [125]:
# Wrap the structure-derived sequence in a SeqRecord whose id is file_name,
# then write it to "<file_name>.fasta". SeqIO.write is kept as the last
# expression so the cell shows the number of records written.
alnPDBseq = SeqRecord(
    Seq(pdbseq_str, IUPAC.protein),
    id=file_name,
)
SeqIO.write(alnPDBseq, "%s.fasta" % file_name, "fasta")

1

In [126]:
# Retrieve the reference sequence from Swiss-Prot via ExPASy.
# TODO: let the user supply the accession (or read it from the csv file).
accession = "P00519"
handle = ExPASy.get_sprot_raw(accession)
try:
    swissseq = SwissProt.read(handle)
finally:
    # Always release the network handle, even if parsing the record fails.
    handle.close()

# Store the reference as a SeqRecord and write it to "<accession>.fasta".
# SeqIO.write stays last so the cell shows the number of records written.
refseq = SeqRecord(Seq(swissseq.sequence, IUPAC.protein), id=accession)
SeqIO.write(refseq, "%s.fasta" % accession, "fasta")

1

In [127]:
# Read both FASTA records back: seq1 is the reference, seq2 the structure.
seq1 = SeqIO.read("%s.fasta" % accession, "fasta")
seq2 = SeqIO.read("%s.fasta" % file_name, "fasta")

# Local alignment scored with BLOSUM62; the last two arguments are the
# gap-open (-10) and gap-extend (-0.5) penalties.
alignments = pairwise2.align.localds(
    seq1.seq,
    seq2.seq,
    blosum62,
    -10,
    -0.5,
)

# The FASTA files were only intermediates; delete them again.
for fasta_stem in (file_name, accession):
    os.remove("%s.fasta" % fasta_stem)

In [128]:
# From the first returned alignment, take fields 3 and 4: the begin and end
# positions of the aligned region.
alignment_start, alignment_end = alignments[0][3:5]

In [129]:
# pairwise2 reports begin/end as 0-based, end-exclusive slice indices into the
# aligned sequences, while residue numbers are 1-based. Shift the start by one
# so the first aligned residue gets its true position in the reference
# sequence; the end needs no shift because an exclusive 0-based end equals the
# 1-based number of the last aligned residue. The list then holds exactly one
# number per aligned position.
# NOTE(review): this assumes the aligned region contains no gaps and that the
# structure sequence aligns from its first residue -- confirm for new inputs.
new_resnums = list(range(alignment_start + 1, alignment_end + 1))

In [130]:
# Renumber every residue of the structure in place from new_resnums.
#   * ACE cap      -> one before the first sequence number
#   * NME/NMA cap  -> one after the last sequence number
#   * anything else -> the next unused entry of new_resnums
# Sequence residues are counted with their own index rather than derived from
# the loop position, so the numbering stays correct when the structure has no
# leading ACE cap (the old `i - 1` would then index new_resnums[-1]).
seq_index = 0
for residue in structure.get_residues():
    three_letter = residue.get_resname()
    print(three_letter)
    if three_letter == 'ACE':
        new_number = new_resnums[0] - 1
    elif three_letter in ('NME', 'NMA'):
        new_number = new_resnums[-1] + 1
    else:
        new_number = new_resnums[seq_index]
        seq_index += 1
    # residue.id is a tuple whose element 1 is the residue number; rebuild it
    # with the new number and keep the other fields unchanged.
    res_id = list(residue.id)
    res_id[1] = new_number
    residue.id = tuple(res_id)

ACE
SER
PRO
ASN
TYR
ASP
LYS
TRP
GLU
MET
GLU
ARG
THR
ASP
ILE
THR
MET
LYS
HIS
LYS
LEU
GLY
GLY
GLY
GLN
TYR
GLY
GLU
VAL
TYR
GLU
GLY
VAL
TRP
LYS
LYS
TYR
SER
LEU
THR
VAL
ALA
VAL
LYS
THR
LEU
LYS
GLU
ASP
THR
MET
GLU
VAL
GLU
GLU
PHE
LEU
LYS
GLU
ALA
ALA
VAL
MET
LYS
GLU
ILE
LYS
HIS
PRO
ASN
LEU
VAL
GLN
LEU
LEU
GLY
VAL
CYS
THR
ARG
GLU
PRO
PRO
PHE
TYR
ILE
ILE
THR
GLU
PHE
MET
THR
TYR
GLY
ASN
LEU
LEU
ASP
TYR
LEU
ARG
GLU
CYS
ASN
ARG
GLN
GLU
VAL
ASN
ALA
VAL
VAL
LEU
LEU
TYR
MET
ALA
THR
GLN
ILE
SER
SER
ALA
MET
GLU
TYR
LEU
GLU
LYS
LYS
ASN
PHE
ILE
HIS
ARG
ASP
LEU
ALA
ALA
ARG
ASN
CYS
LEU
VAL
GLY
GLU
ASN
HIS
LEU
VAL
LYS
VAL
ALA
ASP
PHE
GLY
LEU
SER
ARG
LEU
MET
THR
GLY
ASP
THR
TYR
THR
ALA
HIS
ALA
GLY
ALA
LYS
PHE
PRO
ILE
LYS
TRP
THR
ALA
PRO
GLU
SER
LEU
ALA
TYR
ASN
LYS
PHE
SER
ILE
LYS
SER
ASP
VAL
TRP
ALA
PHE
GLY
VAL
LEU
LEU
TRP
GLU
ILE
ALA
THR
TYR
GLY
MET
SER
PRO
TYR
PRO
GLY
ILE
ASP
LEU
SER
GLN
VAL
TYR
GLU
LEU
LEU
GLU
LYS
ASP
TYR
ARG
MET
GLU
ARG
PRO
GLU
GLY
CYS
PRO
GLU
LYS
VAL
TYR
GLU
LEU
MET
ARG
ALA
CYS
TRP
GLN


In [131]:
# Create a PDB writer and attach the structure that is to be written out.
pdb_io = PDB.PDBIO()
pdb_io.set_structure(structure)

In [132]:
# Write the attached structure to "<file_name>-renumbered.pdb".
renumbered_pdb_name = file_name + "-renumbered.pdb"
pdb_io.save(renumbered_pdb_name)