In [1]:
from utils import *
from utils2 import *
from utils3 import *
from plotting import *
from gpcrdb_soup import *

In [2]:
import pandas as pd
from tqdm import tqdm
from os import walk
import requests

from Bio.PDB.Polypeptide import *
from Bio.PDB import PDBParser

In [3]:
PATH = 'data/pdb/active/'

In [4]:
class Download():
    def __init__(self, 
                 path='data/',
                 fileformat='pdb'):
        if fileformat='pdb':
            self.path_pdb = path + 'pdb/'
        else:
            self.path_pdb = path + 'mmcif/'
        self.path_alignment = path + 'alignments/'
        self.path_table = path + 'gpcrdb/'
        
        self.fileformat = fileformat
        
        self.table = None
        self.filenames, self.pdb_ids = get_pdb_files(path=self.path_pdb)
    
    # ======================================================================================================================
        
    def download_alignment(self):
        print("Not Implemented! TBD manually.")
    
    def download_pdbs(self, reload=False, update=False):
        pdb_table_ids = self.table['PDB'].tolist()
        missing = [x for x in pdb_table_ids if x not in self.pdb_ids]
        if reload or (len(missing)>0):
            print("Reloading pdb files...")
            print("Missing pdbs:", missing)
            for pdb in tqdm(missing):
                url = get_rcsb_download(pdb, self.fileformat)
                download_pdb(url, folder=self.path_pdb, fileformat=self.fileformat)
        elif update:
            self.update_pdbs()
        self.filenames, self.pdb_ids = get_pdb_files(path=self.path_pdb)
                
    def download_table(self, reload=True, filename='structures.pkl'):
        table_path = self.path_table + filename
        if reload or (not os.path.isfile(self.path_table+filename)):
            self. table = get_table(reload=True, uniprot=False)
            self.table = self.table.drop(columns=['filler', 'filler2', 'receptor family', 
                                                  'Species', 'Method', 'Refined Structure', 'Degree active %', 
                                                  '% of Seq1', 'Family', 'Subtype', 'Name2', 'Fusion', 'Note', 
                                                  '% of Seq2', 'Antibodies', 'Name1', 'Type1', 'Type2', 
                                                  'D2x50 - S3x39', 'Sodium in structure', 'Authors', 'Reference', 
                                                  'PDB date', 'Annotated', 'pdb_link'])
        else:
            self.table = get_table(reload=False, uniprot=False)
            self.table = self.table[self.table['Species'].str.contains('Human')]
        self.table.to_pickle(self.path_table + filename)
        
    # ======================================================================================================================
    
    def update_pdbs(self):
        updatepdbs(self.path_pdb)
    
    # ======================================================================================================================
    
    

In [5]:
D = Download()
D.download_alignment()
D.download_table(reload=True)
D.download_pdbs(reload=True)

Not Implemented! TBD manually.


0it [00:00, ?it/s]

Reloading pdb files...
Missing pdbs: []





In [6]:
D.table

Unnamed: 0,uniprot(gene),Cl.,PDB,Resolution,Preferred Chain,State,Function
0,STE2,D1(Ste2-likefungalpheromone),7AD3,3.5,A,Active,Agonist
1,CCR2,A(Rhodopsin),5T1A,2.8,A,Inactive,AntagonistNAM
2,OPRM,A(Rhodopsin),4DKL,2.8,A,Inactive,Antagonist
3,CNR2,A(Rhodopsin),5ZTY,2.8,A,Inactive,Antagonist
4,5HT1B,A(Rhodopsin),6G79,3.8,S,Active,Agonist
...,...,...,...,...,...,...,...
523,OPSD,A(Rhodopsin),6PH7,2.9,A,Active,unknown
524,NTR1,A(Rhodopsin),7L0S,4.5,C,Active,Agonist
525,AA2AR,A(Rhodopsin),6S0Q,2.7,A,Inactive,Antagonist
526,GLP1R,B1(Secretin),7LCJ,2.8,R,Active,Agonist


In [7]:
# A.alignment

In [8]:
class PDBProcessor():
    def __init__(self,
                 path='data/',
                 limit=5,
                 verbose=1):
        
        self.verbose = verbose
        self.limit = limit
        
        columns = ['Name',
                   'State',
                   'Uniprot', 'Uniprot_Sequence', 
                   'PDB', 'PDB_Sequence', 'PDB_Sequence_padded'
                   'Numbering',
                   'Res_names', 'Res_ids', 'Res_pos',
                   'n_residues', 'Gaps'
                   'Structure', 'Ca','CaC','Psi','Phi','Omega']
        self.data = pd.DataFrame(columns=columns)
        self.path_pdb = path + 'pdb/'
        self.path_table = path + 'gpcrdb/' + 'structures.pkl'
        self.table = pd.read_pickle(self.path_table)
        
        # Filter settings
        self.setting_explained = {0: "Only C-alphas - labelled according to their residue",
                         1: "Hetero-representation - Backbone atoms are labelled according to their element,"\
                             " except C-alpha, which is labelled according to its AA",
                         2: "Hetero-representation - Backbone atoms are labelled according to their element,"\
                             "C-beta is labelled according to its AA",
                         3: "Hetero-representation 3, we include H atoms",
                         4: "Hetero-representation 4, we include H atoms",
                         5: "Full Protein is represented as element-wise pointcloud "\
                             "(drastically reduces embedding space)"}
        
        self.settings = {0: ['CA'],
                         1: ATOM_LIST,
                         2: ATOM_LIST+['CB'],
                         3: ATOM_LIST+['HN'],
                         4: ATOM_LIST+['CB']+['HN'],
                         5: None}
    
    def load_pdbs(self):
        files = self.get_pdb_files()
        p = PDBParser()
        
        for _, file in enumerate(tqdm(files)):
            # gpcrdb table
            table_row = self.table[self.table['PDB']==file[-8:-4]].iloc[0]
            cl = table_row['Cl.']
            state = table_row['State']
            pref_chain = table_row['Preferred Chain']
            function = table_row['Function']
            
            # pdb
            structure = p.get_structure(file[-8:-4], file)
            
            sequence, name, uniprot = None, None, None
            for _, model in enumerate(structure):
                res_names, res_ids, psi, phi, omega, atoms, seq, res_chain = self.parsing_pdb(model, pref_chain)
            seq = ''.join(seq)
            
            # numbering / position assignment
            numbering = self.get_numbering_json(file[-8:-4])   
            
            if res_names == None:
                pass
            else:
                sifts = get_uniprot_pdb_residue_mapping(file[-8:-4], chain_id=pref_chain)
                uniprot = sifts[0]
                if uniprot == None:
                    try: 
                        uniprot = pdbtouniprot(file[-8:-4])
                    except:
                        pass
                if uniprot != None:
                    sequence, name = get_sequence_name(uniprot)
                    gaps = self.map_gaps(res_ids)
                    row = {'Name': name, 
                           'Cl.': cl, 'State': state, 'Preferred Chain': pref_chain, 'Function': function,
                           'Uniprot': uniprot, 'Uniprot_Sequence': sequence,
                           'PDB': file[-8:-4], 'PDB_Sequence': seq, 'PDB_Sequence_padded': self.pad_gaps(gaps, seq),
                           'Numbering': numbering,
                           'Res_names': res_names, 'Res_ids': res_ids, 'Res_pos': None,
                           'n_residues':len(res_ids), 'Gaps': gaps, 'Gapsum': self.gap_sum(gaps),
                           'Psi': psi, 'Phi': phi, 'Omega': omega,
                           'Structure': atoms,
                           'Mapping': sifts}
                    self.data = self.data.append(row, ignore_index=True)

    # ======================================================================================================================
    
    def get_pdb_files(self):
        files = [os.path.join(self.path_pdb, f) for f in os.listdir(self.path_pdb) \
                 if (os.path.isfile(os.path.join(self.path_pdb, f)) and 'pdb' in f)]
        if self.limit != None:
            if self.verbose==1:
                print("Found {} files. Using {} files.".format(len(files), self.limit))
            return files[:self.limit]
        else:
            if self.verbose==1:
                print("Found {} files.".format(len(files)))
            return files
        
    def filter_by_chain(self, res_names, res_ids, psi, phi, omega, atoms, seq, res_chain, chain_id):
        chain_mask = [True if rc == chain_id else False for rc in res_chain]
        res_names = [b for a, b in zip(chain_mask, res_names) if a]
        res_ids = [b for a, b in zip(chain_mask, res_ids) if a]
        psi = [b for a, b in zip(chain_mask, psi) if a]
        phi = [b for a, b in zip(chain_mask, phi) if a]
        omega = [b for a, b in zip(chain_mask, omega) if a]
        atoms = [b for a, b in zip(chain_mask, atoms) if a]
        seq = [b for a, b in zip(chain_mask, seq) if a]
        atoms = [b for a, b in zip(chain_mask, atoms) if a]
        res_chain = [b for a, b in zip(chain_mask, res_chain) if a]
        return res_names, res_ids, psi, phi, omega, atoms, seq, res_chain
        
    def map_gaps(self, res_ids):
        min_num = -100
        gaps = []
        for i, z in enumerate(res_ids):
            new_num = int(z)
            if not abs(new_num - min_num) <= 1:
                gaps.append((i, min_num, new_num))
                min_num = new_num
            else:
                min_num += 1
        return gaps[1:]
    
    def gap_sum(self, gaps):
        sum = 0
        for gap in gaps:
            diff = gap[2] - gap[1]
            if diff > 0:
                sum += diff
        return sum
    
    def pad_gaps(self, gaps, seq):
        _ = 0
        for gap in gaps:
            start = gap[1]+_
            end = gap[2]+_
            length = end-start
            gap_str = (length-1) * '_'
            seq = seq[_:start] + gap_str + seq[start:]
            end = _
        return seq
        
    
    def get_numbering_json(self, pdb_id: str, reload=False):
        files = {
            'pdb_file': ('data/pdb/'+pdb_id+'.pdb', open('data/pdb/'+pdb_id+'.pdb', 'r')),
        }
        if not os.path.isfile('data/numbering/'+pdb_id+'.txt') or reload:
            response = requests.post('https://gpcrdb.org/services/structure/assign_generic_numbers', files=files)
            with open('filler.json', 'w') as f:
                f.write(response.text)
            os.rename(r'filler.json',r'data/numbering/'+pdb_id+'.txt')
        res_dict = {}
        f = open('data/numbering/'+pdb_id+'.txt', 'r')
        for line in f.readlines():
            line_ = list(filter(None, line.split(' ')))
            if 'CA' in line_:
                if len(line_) == 13:
                    if len(line_[-3]) == 4:
                        res_dict.update({int(line_[5]): (line_[3], float(line_[-3]))})
        f.close()
        return res_dict
    
    # TODO: CHECK / FIX PROBLEMS WITH RESIDUE NUMBERS (THEY NEED TO BE IDENTICAL)
    
    def parsing_pdb(self, model, pref_chain, water=False):
        """
        parse the pdb structure information into our dataframe columns
        model: first structure in the pdb file (since model 2 etc may not be formatted correctly)
        water: boolean to specify if water should be included
        """
        # Todo: implement a check of whether water is included
        try:
            model.atom_to_internal_coordinates()
            res_names = []
            res_ids = []
            psi = []
            phi = []
            omega = []
            atoms = []
            seq = []
            res_chain=[]
            chain = 0
            prev_resseq = -1
            skipped = []
            for _, res in enumerate(model.get_residues()):
                name = res.get_resname()
                id_ = str(res.get_id()[1]).zfill(3)
                chain_str = res.get_full_id()[2]
                if chain_str in pref_chain:
                    if name in RESIDUE_LIST:
                        for atom in res:
                            if atom.get_id() in ATOM_LIST:
                                atoms.append([chain_str, id_, atom.get_coord(), atom.get_id()])
                        seq.append(three_to_one(name))
                        res_names.append(name)
                        res_ids.append(id_)
                        res_chain.append(chain)
                        try:
                            psi.append(res.internal_coord.get_angle("psi"))
                        except:
                            psi.append(None)
                        try:
                            phi.append(res.internal_coord.get_angle("phi"))
                        except:
                            phi.append(None)
                        try:
                            omega.append(res.internal_coord.get_angle("omega"))
                        except:
                            omega.append(None)
                else:
                    if chain_str not in skipped:
                        skipped.append(chain_str)
            # print("Selected preferred chain {}, skipped chains {}.".format(pref_chain, skipped))
            return res_names, res_ids, psi, phi, omega, atoms, seq, res_chain
        except:
            return None, None, None, None, None, None, None, None
        
    def update_seq(self):
        """
        Update the sequence based on residues (e.g. after adressing gaps)
        """
        def update_seq_(residues):
            seq = []
            for r in residues:
                seq.append(three_to_one(r))
            return seq
        self.data['']
        
        

In [9]:
pdb = PDBProcessor(limit=96)

In [10]:
pdb.load_pdbs()

  0%|                                                                                                                                                                                                              | 0/96 [00:00<?, ?it/s]

Found 528 files. Using 96 files.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 96/96 [04:49<00:00,  3.02s/it]


In [11]:
pdb.data[['PDB', 'Uniprot', 'PDB_Sequence', 'Res_ids', 'Uniprot_Sequence', 'Mapping', 'Preferred Chain']].iloc[12]

PDB                                                              2R4R
Uniprot                                                        P07550
PDB_Sequence        GIVMSLIVLAIVFGNVLVITAIAFERLQTVTNYFITSLACADLVMG...
Res_ids             [037, 038, 039, 040, 041, 042, 043, 044, 045, ...
Uniprot_Sequence    MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLA...
Mapping                         (P07550, ADRB2_HUMAN, 1, 365, 1, 365)
Preferred Chain                                                     A
Name: 12, dtype: object

In [12]:
pdb.data[['PDB', 'Uniprot', 'PDB_Sequence', 'Res_ids', 'Uniprot_Sequence', 'Mapping', 'Preferred Chain']].iloc[13]

PDB                                                              2R4S
Uniprot                                                        P07550
PDB_Sequence        GIVMSLIVLAIVFGNVLVITAIAFERLQTVTNYFITSLACADLVMG...
Res_ids             [037, 038, 039, 040, 041, 042, 043, 044, 045, ...
Uniprot_Sequence    MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLA...
Mapping                        (P07550, ADRB2_HUMAN, 1, 342, 24, 365)
Preferred Chain                                                     A
Name: 13, dtype: object

If there is no numbering via our base-query we have to query the structure with the pdb id, 
use the entity name to get generic numbers and then use sequence alignment to get the numbers in... 

Sifts seems to not work very well (comparing the two samples above the indices are off)

In [13]:
pdb.data.to_pickle('data.pkl')

In [14]:
select = pdb.data[['Numbering', 'Res_ids', 'Res_names']]

In [15]:
select

Unnamed: 0,Numbering,Res_ids,Res_names
0,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
1,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
2,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
3,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
4,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
...,...,...,...
91,{},"[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
92,{},"[-01, 000, 001, 002, 003, 004, 005, 006, 007, ...","[GLY, THR, ALA, ASP, LEU, GLU, ASP, ASN, TRP, ..."
93,"{115: ('HIS', 1.35), 116: ('TYR', 1.36), 117: ...","[117, 118, 119, 120, 121, 122, 123, 124, 125, ...","[HIS, VAL, ALA, ALA, ILE, ILE, ASN, TYR, LEU, ..."
94,"{131: ('GLN', 1.29), 132: ('LYS', 1.3), 133: (...","[1001, 1002, 1003, 1004, 1005, 1006, 1007, 100...","[ALA, ASP, LEU, GLU, ASP, ASN, TRP, GLU, THR, ..."


In [16]:
res_ids = select['Res_ids'].iloc[0]

In [17]:
res_names = select['Res_names'].iloc[0]

In [18]:
numbering = select['Numbering'].iloc[0]

In [50]:
pdb.data[pdb.data['Cl.'].str.contains('A')]

Unnamed: 0,Name,State,Uniprot,Uniprot_Sequence,PDB,PDB_Sequence,PDB_Sequence_paddedNumbering,Res_names,Res_ids,Res_pos,...,Cl.,Function,Gaps,Gapsum,Mapping,Numbering,PDB_Sequence_padded,Preferred Chain,Structure,Uniprot_trunc
0,sp|P02699|OPSD_BOVIN,Inactive,P02699,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,1F88,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,,"[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...",,...,A(Rhodopsin),Inverseagonist,"[(235, 235, 240), (323, 327, 334)]",12.0,"(P02699, OPSD_BOVIN, 1, 348, 1, 348)","{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,A,"[[A, 001, [43.958, -5.98, -27.758], N], [A, 00...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
1,sp|P02699|OPSD_BOVIN,Inactive,P02699,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,1GZM,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,,"[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...",,...,A(Rhodopsin),Inverseagonist,"[(326, 326, 330)]",4.0,"(P02699, OPSD_BOVIN, 2, 349, 1, 348)","{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,A,"[[A, 001, [18.066, 43.422, 11.514], N], [A, 00...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
2,sp|P02699|OPSD_BOVIN,Inactive,P02699,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,1HZX,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,,"[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...",,...,A(Rhodopsin),Inverseagonist,"[(235, 235, 241), (325, 330, 334)]",10.0,"(P02699, OPSD_BOVIN, 2, 349, 1, 348)","{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,A,"[[A, 001, [44.059, -5.585, -29.335], N], [A, 0...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
3,sp|P02699|OPSD_BOVIN,Inactive,P02699,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,1L9H,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,,"[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...",,...,A(Rhodopsin),Inverseagonist,"[(235, 235, 241), (325, 330, 334)]",10.0,"(P02699, OPSD_BOVIN, 2, 349, 1, 348)","{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,A,"[[A, 001, [42.583, -4.962, 35.923], N], [A, 00...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
4,sp|P02699|OPSD_BOVIN,Inactive,P02699,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,1U19,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,,"[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...",,...,A(Rhodopsin),Inverseagonist,[],0.0,"(P02699, OPSD_BOVIN, 2, 349, 1, 348)","{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,A,"[[A, 001, [53.284, -5.731, 35.67], N], [A, 001...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,sp|P0ABE7|C562_ECOLX,Inactive,P0ABE7,MRKSLLAILAVSSLVFSSASFAADLEDNMETLNDNLKVIEKADNAA...,4IAQ,YIYQDSISLPWKVLLVMLLALITLATTLSNAFVIATVYRTRKLHTP...,,"[TYR, ILE, TYR, GLN, ASP, SER, ILE, SER, LEU, ...","[038, 039, 040, 041, 042, 043, 044, 045, 046, ...",,...,A(Rhodopsin),Agonist,"[(153, 190, 197), (196, 239, 1001), (213, 1017...",791.0,"(P0ABE7, C562_ECOLX, 211, 316, 23, 128)","{45: ('SER', 1.28), 46: ('LEU', 1.29), 52: ('L...",YIYQDSISLPWKVLLVMLLALITLATTLSNAFVIATVYRTRKLHTP...,A,"[[A, 038, [-14.433, 26.356, 16.716], N], [A, 0...",VIEKADNAAQVKDALTKMRAAALDAQKATPPKLEDKSPDSPEMKDF...
89,sp|P0ABE7|C562_ECOLX,Inactive,P0ABE7,MRKSLLAILAVSSLVFSSASFAADLEDNMETLNDNLKVIEKADNAA...,4IAR,YIYQDSISLPWKVLLVMLLALITLATTLSNAFVIATVYRTRKLHTP...,,"[TYR, ILE, TYR, GLN, ASP, SER, ILE, SER, LEU, ...","[038, 039, 040, 041, 042, 043, 044, 045, 046, ...",,...,A(Rhodopsin),Agonist,"[(153, 190, 197), (196, 239, 1001), (302, 1106...",775.0,"(P0ABE7, C562_ECOLX, 211, 316, 23, 128)","{45: ('SER', 1.28), 46: ('LEU', 1.29), 47: ('P...",YIYQDSISLPWKVLLVMLLALITLATTLSNAFVIATVYRTRKLHTP...,A,"[[A, 038, [-27.66, -11.221, 13.64], N], [A, 03...",VIEKADNAAQVKDALTKMRAAALDAQKATPPKLEDKSPDSPEMKDF...
90,sp|P41595|5HT2B_HUMAN,Intermediate,P41595,MALSYRVSELQSTIPEHILQSTFVHVISSNWSGLQTESIPEEMKQI...,4IB4,EEQGNKLHWAALLILMVIIPTIGGNTLVILAVSLEKKLQYATNYFL...,,"[GLU, GLU, GLN, GLY, ASN, LYS, LEU, HIS, TRP, ...","[048, 049, 050, 051, 052, 053, 054, 055, 056, ...",,...,A(Rhodopsin),Agonist,"[(150, 197, 201), (198, 248, 1001), (240, 1042...",774.0,"(P41595, 5HT2B_HUMAN, 11, 223, 36, 248)","{53: ('LYS', 1.32), 55: ('HIS', 1.34), 56: ('T...",EEQGNKLHWAALLILMVIIPTIGGNTLVILAVSLEKKLQYATNYFL...,A,"[[A, 048, [18.652, 38.221, 29.331], N], [A, 04...",EEQGNKLHWAALLILMVIIPTIGGNTLVILAVSLEKKLQYATNYFL...
91,sp|P02699|OPSD_BOVIN,Active,P02699,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,4J4Q,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,,"[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...",,...,A(Rhodopsin),unknown,[],0.0,"(P02699, OPSD_BOVIN, 1, 348, 1, 348)",{},MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,A,"[[A, 001, [12.395, 71.733, 40.451], N], [A, 00...",MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...


In [51]:
pdb.data[['Uniprot_Sequence', 'PDB_Sequence', 'Res_ids']].iloc[93]

Uniprot_Sequence    MGGHPQLRLVKALLLLGLNPVSASLQDQHCESLSLASNISGLQCNA...
PDB_Sequence        HVAAIINYLGHCISLVALLVAFVLFLRARSIRCLRNIIHANLIAAF...
Res_ids             [117, 118, 119, 120, 121, 122, 123, 124, 125, ...
Name: 93, dtype: object

In [21]:
def represented_seq(x, y):
    try: 
        return ''.join(list(x)[int(y)-1:])
    except:
        return None

In [22]:
pdb.data['Uniprot_trunc'] = pdb.data.apply(lambda x: represented_seq(x.Uniprot_Sequence, x.Res_ids[0]), axis=1)

In [23]:
pdb.data[['Uniprot_trunc', 'PDB_Sequence']]

Unnamed: 0,Uniprot_trunc,PDB_Sequence
0,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
1,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
2,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
3,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
4,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
...,...,...
91,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
92,DF,GTADLEDNWETLNDNLKVIEKADNAAQVKDALTKMRAAALDAQKAT...
93,HVAVIINYLGHCISLVALLVAFVLFLRLRPGCTHWGDQADGALEVG...,HVAAIINYLGHCISLVALLVAFVLFLRARSIRCLRNIIHANLIAAF...
94,,ADLEDNWETLNDNLKVIEKADNAAQVKDALTKMRAAALDAQKATPP...


In [53]:
for i in range(len(pdb.data)):
    if pdb.data['Uniprot_trunc'].iloc[i][20:23] != pdb.data['PDB_Sequence'].iloc[i][20:23] :
        print()
        print(i)
        print(pdb.data['Uniprot_trunc'].iloc[i])
        print()
        print(pdb.data['PDB_Sequence'].iloc[i])


14
IGHLLTKSPSLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRCALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSIWYNQTPNRAKRVITTFRTGTWDAYKNL

DEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCL

35
LLTKSPSLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRCALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSIWYNQTPNRAKRVITTFRTGTWDAYKNL

WVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIWTLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGI

In [25]:
entity_data = None

In [26]:
res_data = None

In [27]:
atom_data = None

In [28]:
j = 0
l_err = []
for i in range(len(pdb.data)):
    numbering = pdb.data['Numbering'].iloc[i]
    if numbering == {}:
        j+=1
        l_err.append(pdb.data['PDB'].iloc[i])
    weird = False
    for num in numbering:
        try:
            num_str = str(num).zfill(3)
            idx = res_ids.index(num_str)
            if numbering[num][0] != res_names[idx]:
                weird = True
        except:
            pass
            # weird = True
            # print("Didnt find num_str", num_str)
        print(num, numbering[num][0], res_names[idx])
    if weird:
        pass

33 GLU GLU
34 PRO PRO
35 TRP TRP
36 GLN GLN
37 PHE PHE
38 SER SER
39 MET MET
40 LEU LEU
41 ALA ALA
42 ALA ALA
43 TYR TYR
44 MET MET
45 PHE PHE
46 LEU LEU
47 LEU LEU
48 ILE ILE
49 MET MET
50 LEU LEU
51 GLY GLY
52 PHE PHE
53 PRO PRO
54 ILE ILE
55 ASN ASN
56 PHE PHE
57 LEU LEU
58 THR THR
59 LEU LEU
60 TYR TYR
61 VAL VAL
62 THR THR
63 VAL VAL
64 GLN GLN
65 HIS HIS
70 THR THR
71 PRO PRO
72 LEU LEU
73 ASN ASN
74 TYR TYR
75 ILE ILE
76 LEU LEU
77 LEU LEU
78 ASN ASN
79 LEU LEU
80 ALA ALA
81 VAL VAL
82 ALA ALA
83 ASP ASP
84 LEU LEU
85 PHE PHE
86 MET MET
87 VAL VAL
88 PHE PHE
90 GLY GLY
91 PHE PHE
92 THR THR
93 THR THR
94 THR THR
95 LEU LEU
96 TYR TYR
97 THR THR
98 SER SER
99 LEU LEU
100 HIS HIS
106 GLY GLY
107 PRO PRO
108 THR THR
109 GLY GLY
110 CYS CYS
111 ASN ASN
112 LEU LEU
113 GLU GLU
114 GLY GLY
115 PHE PHE
116 PHE PHE
117 ALA ALA
118 THR THR
119 LEU LEU
120 GLY GLY
121 GLY GLY
122 GLU GLU
123 ILE ILE
124 ALA ALA
125 LEU LEU
126 TRP TRP
127 SER SER
128 LEU LEU
129 VAL VAL
130 VAL VAL
131 LE

90 GLY GLY
91 PHE PHE
92 THR THR
93 THR THR
94 THR THR
95 LEU LEU
96 TYR TYR
97 THR THR
98 SER SER
99 LEU LEU
100 HIS HIS
106 GLY GLY
107 PRO PRO
108 THR THR
109 GLY GLY
110 CYS CYS
111 ASN ASN
112 LEU LEU
113 GLU GLU
114 GLY GLY
115 PHE PHE
116 PHE PHE
117 ALA ALA
118 THR THR
119 LEU LEU
120 GLY GLY
121 GLY GLY
122 GLU GLU
123 ILE ILE
124 ALA ALA
125 LEU LEU
126 TRP TRP
127 SER SER
128 LEU LEU
129 VAL VAL
130 VAL VAL
131 LEU LEU
132 ALA ALA
133 ILE ILE
134 GLU GLU
135 ARG ARG
136 TYR TYR
137 VAL VAL
138 VAL VAL
139 VAL VAL
140 CYS CYS
141 LYS LYS
149 GLY GLY
150 GLU GLU
151 ASN ASN
152 HIS HIS
153 ALA ALA
154 ILE ILE
155 MET MET
156 GLY GLY
157 VAL VAL
158 ALA ALA
159 PHE PHE
160 THR THR
161 TRP TRP
162 VAL VAL
163 MET MET
164 ALA ALA
165 LEU LEU
166 ALA ALA
167 CYS CYS
168 ALA ALA
169 ALA ALA
170 PRO PRO
171 PRO PRO
172 LEU LEU
173 VAL VAL
199 ASN ASN
200 ASN ASN
201 GLU GLU
202 SER SER
203 PHE PHE
204 VAL VAL
205 ILE ILE
206 TYR TYR
207 MET MET
208 PHE PHE
209 VAL VAL
210 VAL VAL
21

214 ILE ILE
215 PRO PRO
216 LEU LEU
217 ILE ILE
218 VAL VAL
219 ILE ILE
220 PHE PHE
221 PHE PHE
222 CYS CYS
223 TYR TYR
224 GLY GLY
225 GLN GLN
226 LEU LEU
227 VAL VAL
228 PHE PHE
229 THR THR
230 VAL VAL
231 LYS LYS
232 GLU GLU
233 ALA ALA
234 ALA ALA
235 ALA ALA
236 GLN ALA
237 GLN ALA
241 ALA ALA
242 THR THR
243 THR THR
244 GLN GLN
245 LYS LYS
246 ALA ALA
247 GLU GLU
248 LYS LYS
249 GLU GLU
250 VAL VAL
251 THR THR
252 ARG ARG
253 MET MET
254 VAL VAL
255 ILE ILE
256 ILE ILE
257 MET MET
258 VAL VAL
259 ILE ILE
260 ALA ALA
261 PHE PHE
262 LEU LEU
263 ILE ILE
264 CYS CYS
265 TRP TRP
266 LEU LEU
267 PRO PRO
268 TYR TYR
269 ALA ALA
270 GLY GLY
271 VAL VAL
272 ALA ALA
273 PHE PHE
274 TYR TYR
275 ILE ILE
276 PHE PHE
277 THR THR
278 HIS HIS
284 GLY GLY
285 PRO PRO
286 ILE ILE
287 PHE PHE
288 MET MET
289 THR THR
290 ILE ILE
291 PRO PRO
292 ALA ALA
293 PHE PHE
294 PHE PHE
295 ALA ALA
296 LYS LYS
297 THR THR
298 SER SER
299 ALA ALA
300 VAL VAL
301 TYR TYR
302 ASN ASN
303 PRO PRO
304 VAL VAL
305 

200 ASN ASN
201 GLU GLU
202 SER SER
203 PHE PHE
204 VAL VAL
205 ILE ILE
206 TYR TYR
207 MET MET
208 PHE PHE
209 VAL VAL
210 VAL VAL
212 PHE PHE
213 ILE ILE
214 ILE ILE
215 PRO PRO
216 LEU LEU
217 ILE ILE
218 VAL VAL
219 ILE ILE
220 PHE PHE
221 PHE PHE
222 CYS CYS
223 TYR TYR
224 GLY GLY
225 GLN GLN
226 LEU LEU
227 VAL VAL
228 PHE PHE
229 THR THR
241 ALA ALA
242 THR THR
243 THR THR
244 GLN GLN
245 LYS LYS
246 ALA ALA
247 GLU GLU
248 LYS LYS
249 GLU GLU
250 VAL VAL
251 THR THR
252 ARG ARG
253 MET MET
254 VAL VAL
255 ILE ILE
256 ILE ILE
257 MET MET
258 VAL VAL
259 ILE ILE
260 ALA ALA
261 PHE PHE
262 LEU LEU
263 ILE ILE
264 CYS CYS
265 TRP TRP
266 LEU LEU
267 PRO PRO
268 TYR TYR
269 ALA ALA
270 GLY GLY
271 VAL VAL
272 ALA ALA
273 PHE PHE
274 TYR TYR
275 ILE ILE
276 PHE PHE
277 THR THR
278 HIS HIS
284 GLY GLY
285 PRO PRO
286 ILE ILE
287 PHE PHE
288 MET MET
289 THR THR
290 ILE ILE
291 PRO PRO
292 ALA ALA
293 PHE PHE
294 PHE PHE
295 ALA ALA
296 LYS LYS
297 THR THR
298 SER SER
299 ALA ALA
300 

283 THR PHE
284 LEU GLY
285 CYS PRO
286 TRP ILE
287 LEU PHE
288 PRO MET
289 PHE THR
290 PHE ILE
291 ILE PRO
311 LEU LYS
312 ASN GLN
313 TRP PHE
314 ILE ARG
315 GLY ASN
316 TYR CYS
317 VAL MET
318 ASN VAL
319 SER THR
320 GLY THR
321 PHE LEU
322 ASN CYS
323 PRO CYS
324 LEU GLY
325 ILE LYS
326 TYR ASN
327 CYS PRO
328 ARG PRO
329 SER PRO
330 PRO PRO
331 ASP PRO
332 PHE PRO
333 ARG PRO
334 ILE SER
335 ALA THR
336 PHE THR
337 GLN VAL
338 GLU SER
339 LEU LYS
340 LEU THR
341 CYS GLU
85 ILE PHE
141 LEU LYS
142 VAL PRO
144 GLY SER
165 THR LEU
170 LEU PRO
177 LEU ARG
181 VAL GLU
182 THR GLY
184 PRO GLN
194 THR PRO
195 CYS HIS
198 ALA THR
199 HIS ASN
37 GLY PHE
38 ILE SER
39 VAL MET
40 MET LEU
41 SER ALA
42 LEU ALA
43 ILE TYR
44 VAL MET
45 LEU PHE
46 ALA LEU
47 ILE LEU
48 VAL ILE
49 PHE MET
50 GLY LEU
51 ASN GLY
52 VAL PHE
53 LEU PRO
54 VAL ILE
55 ILE ASN
56 THR PHE
57 ALA LEU
58 ILE THR
59 ALA LEU
61 PHE VAL
66 THR LYS
67 VAL LYS
68 THR LEU
69 ASN ARG
70 TYR THR
71 PHE PRO
72 ILE LEU
73 THR ASN
7

49 ALA MET
50 LEU LEU
51 VAL GLY
52 VAL PHE
53 LEU PRO
54 LEU ILE
55 ILE ASN
56 VAL PHE
57 ALA LEU
58 GLY THR
59 ASN LEU
60 VAL TYR
61 LEU VAL
62 VAL THR
63 ILE VAL
64 ALA GLN
65 ALA HIS
66 ILE LYS
67 GLY LYS
69 THR ARG
74 THR TYR
75 LEU ILE
76 THR LEU
77 ASN LEU
78 LEU ASN
79 PHE LEU
80 ILE ALA
81 THR VAL
82 SER ALA
83 LEU ASP
84 ALA LEU
85 CYS PHE
86 ALA MET
87 ASP VAL
88 LEU PHE
89 VAL GLY
91 GLY PHE
92 LEU THR
94 VAL THR
95 VAL LEU
96 PRO TYR
97 PHE THR
98 GLY SER
99 ALA LEU
100 THR HIS
101 LEU GLY
102 VAL TYR
103 VAL PHE
104 ARG VAL
105 GLY PHE
110 GLY CYS
111 SER ASN
112 PHE LEU
113 LEU GLU
114 CYS GLY
115 GLU PHE
117 TRP ALA
118 THR THR
119 SER LEU
120 LEU GLY
121 ASP GLY
122 VAL GLU
123 LEU ILE
124 CYS ALA
125 VAL LEU
126 THR TRP
127 ALA SER
128 SER LEU
129 ILE VAL
130 GLU VAL
131 THR LEU
132 LEU ALA
133 CYS ILE
134 VAL GLU
135 ILE ARG
136 ALA TYR
137 ILE VAL
138 ASP VAL
139 ARG VAL
140 TYR CYS
141 LEU LYS
142 ALA PRO
143 ILE MET
144 THR SER
145 SER ASN
154 THR ILE
155 ARG MET


88 LEU PHE
89 VAL GLY
91 GLY PHE
92 LEU THR
94 VAL THR
95 VAL LEU
96 PRO TYR
97 PHE THR
98 GLY SER
99 ALA LEU
100 THR HIS
101 LEU GLY
102 VAL TYR
103 VAL PHE
104 ARG VAL
105 GLY PHE
110 GLY CYS
111 SER ASN
112 PHE LEU
113 LEU GLU
114 CYS GLY
115 GLU PHE
117 TRP ALA
118 THR THR
119 SER LEU
120 LEU GLY
121 ASP GLY
122 VAL GLU
123 LEU ILE
124 CYS ALA
125 VAL LEU
126 THR TRP
127 ALA SER
128 SER LEU
129 ILE VAL
130 GLU VAL
131 THR LEU
132 LEU ALA
133 CYS ILE
134 VAL GLU
135 ILE ARG
136 ALA TYR
137 ILE VAL
138 ASP VAL
139 ARG VAL
140 TYR CYS
141 LEU LYS
142 ALA PRO
143 ILE MET
144 THR SER
145 SER ASN
154 THR ILE
155 ARG MET
156 ALA GLY
157 ARG VAL
158 ALA ALA
159 LYS PHE
160 VAL THR
161 ILE TRP
162 ILE VAL
163 CYS MET
164 THR ALA
165 VAL LEU
166 TRP ALA
167 ALA CYS
168 ILE ALA
169 SER ALA
170 ALA PRO
171 LEU PRO
172 VAL LEU
173 SER VAL
174 PHE GLY
175 LEU TRP
176 PRO SER
177 ILE ARG
178 MET TYR
179 MET ILE
180 HIS PRO
204 ASN VAL
205 ARG ILE
206 ALA TYR
207 TYR MET
208 ALA PHE
209 ILE VAL
21

175 LEU TRP
176 PRO SER
177 ILE ARG
178 MET TYR
179 MET ILE
180 HIS PRO
204 ASN VAL
205 ARG ILE
206 ALA TYR
207 TYR MET
208 ALA PHE
209 ILE VAL
210 ALA VAL
211 SER HIS
212 SER PHE
213 ILE ILE
214 ILE ILE
216 PHE LEU
217 TYR ILE
218 ILE VAL
219 PRO ILE
220 LEU PHE
221 LEU PHE
222 ILE CYS
223 MET TYR
224 ILE GLY
225 PHE GLN
226 VAL LEU
228 LEU PHE
229 ARG THR
230 VAL VAL
231 TYR LYS
232 ARG GLU
233 GLU ALA
234 ALA ALA
235 LYS ALA
236 GLU ALA
237 GLN ALA
238 ILE ALA
239 ARG ALA
275 ARG ILE
276 LYS PHE
279 ARG GLN
280 VAL GLY
281 MET SER
283 MET PHE
284 ARG GLY
285 GLU PRO
286 HIS ILE
287 LYS PHE
288 ALA MET
289 LEU THR
290 LYS ILE
291 THR PRO
292 LEU ALA
293 GLY PHE
294 ILE PHE
295 ILE ALA
296 MET LYS
297 GLY THR
298 VAL SER
299 PHE ALA
300 THR VAL
301 LEU TYR
302 CYS ASN
303 TRP PRO
304 LEU VAL
305 PRO ILE
306 PHE TYR
307 PHE ILE
308 LEU MET
309 VAL MET
310 ASN ASN
311 ILE LYS
312 VAL GLN
313 ASN PHE
314 VAL ARG
315 PHE ASN
316 ASN CYS
321 PRO LEU
322 ASP CYS
323 TRP CYS
324 LEU GLY
325 

174 MET GLY
175 ASN TRP
176 TYR SER
177 MET ARG
178 VAL TYR
180 PHE PRO
181 ASN GLU
182 PHE GLY
183 PHE MET
184 ALA GLN
186 VAL SER
187 LEU CYS
188 VAL GLY
189 PRO ILE
190 LEU ASP
191 LEU TYR
192 LEU TYR
193 MET THR
194 LEU PRO
195 GLY HIS
196 VAL GLU
197 TYR GLU
198 LEU THR
199 ARG ASN
200 ILE ASN
201 PHE GLU
202 LEU SER
203 ALA PHE
204 ALA VAL
205 ARG ILE
206 ARG TYR
207 GLN MET
208 LEU PHE
209 LYS VAL
210 GLN VAL
211 MET HIS
212 GLU PHE
213 SER ILE
223 SER TYR
224 THR GLY
225 LEU GLN
226 GLN LEU
227 LYS VAL
228 GLU PHE
229 VAL THR
230 HIS VAL
231 ALA LYS
232 ALA GLU
233 LYS ALA
234 SER ALA
235 LEU ALA
236 ALA ALA
237 ILE ALA
238 ILE ALA
239 VAL ALA
240 GLY SER
241 LEU ALA
242 PHE THR
243 ALA THR
244 LEU GLN
245 CYS LYS
246 TRP ALA
247 LEU GLU
248 PRO LYS
249 LEU GLU
250 HIS VAL
251 ILE THR
252 ILE ARG
253 ASN MET
254 CYS VAL
255 PHE ILE
256 THR ILE
257 PHE MET
258 PHE VAL
259 CYS ILE
266 PRO LEU
267 LEU PRO
268 TRP TYR
269 LEU ALA
270 MET GLY
271 TYR VAL
272 LEU ALA
273 ALA PHE
274 

213 ILE ILE
214 LEU ILE
215 ILE PRO
216 ILE LEU
217 PHE ILE
218 PHE VAL
219 CYS ILE
220 TYR PHE
221 PHE PHE
222 ASN CYS
223 ILE TYR
224 VAL GLY
225 MET GLN
226 SER LEU
227 VAL VAL
228 SER PHE
229 ASN THR
230 HIS VAL
231 GLU LYS
232 LYS GLU
233 GLU ALA
234 MET ALA
235 ALA ALA
236 ALA ALA
237 MET ALA
238 ALA ALA
239 LYS ALA
243 ALA THR
244 LYS GLN
245 GLU LYS
246 LEU ALA
247 ARG GLU
248 LYS LYS
249 ALA GLU
250 GLN VAL
251 ALA THR
252 GLY ARG
253 ALA MET
254 ASN VAL
255 ALA ILE
256 GLU ILE
257 MET MET
258 ARG VAL
259 LEU ILE
260 ALA ALA
261 LYS PHE
262 ILE LEU
263 SER ILE
264 ILE CYS
265 VAL TRP
266 ILE LEU
267 VAL PRO
268 SER TYR
269 GLN ALA
270 PHE GLY
271 LEU VAL
272 LEU ALA
273 SER PHE
274 TRP TYR
275 SER ILE
276 PRO PHE
277 TYR THR
278 ALA HIS
279 VAL GLN
280 VAL GLY
281 ALA SER
282 LEU ASP
283 LEU PHE
284 ALA GLY
285 GLN PRO
286 PHE ILE
287 GLY PHE
293 THR PHE
294 PRO PHE
295 TYR ALA
296 ALA LYS
297 ALA THR
298 GLN SER
299 LEU ALA
300 PRO VAL
301 VAL TYR
302 MET ASN
303 PHE PRO
304 

278 HIS HIS
284 GLY GLY
285 PRO PRO
286 ILE ILE
287 PHE PHE
288 MET MET
289 THR THR
290 ILE ILE
291 PRO PRO
292 ALA ALA
293 PHE PHE
294 PHE PHE
295 ALA ALA
296 LYS LYS
297 THR THR
298 SER SER
299 ALA ALA
300 VAL VAL
301 TYR TYR
302 ASN ASN
303 PRO PRO
304 VAL VAL
305 ILE ILE
306 TYR TYR
307 ILE ILE
308 MET MET
309 MET MET
310 ASN ASN
311 LYS LYS
312 GLN GLN
313 PHE PHE
314 ARG ARG
315 ASN ASN
316 CYS CYS
317 MET MET
318 VAL VAL
319 THR THR
320 THR THR
321 LEU LEU
322 CYS CYS
33 GLU GLU
34 PRO PRO
35 TRP TRP
36 GLN GLN
37 PHE PHE
38 SER SER
39 MET MET
40 LEU LEU
41 ALA ALA
42 ALA ALA
43 TYR TYR
44 MET MET
45 PHE PHE
46 LEU LEU
47 LEU LEU
48 ILE ILE
49 MET MET
50 LEU LEU
51 GLY GLY
52 PHE PHE
53 PRO PRO
54 ILE ILE
55 ASN ASN
56 PHE PHE
57 LEU LEU
58 THR THR
59 LEU LEU
60 TYR TYR
61 VAL VAL
62 THR THR
63 VAL VAL
64 GLN GLN
65 HIS HIS
70 THR THR
71 PRO PRO
72 LEU LEU
73 ASN ASN
74 TYR TYR
75 ILE ILE
76 LEU LEU
77 LEU LEU
78 ASN ASN
79 LEU LEU
80 ALA ALA
81 VAL VAL
82 ALA ALA
83 ASP ASP
84 

255 PHE ILE
256 THR ILE
257 PHE MET
258 PHE VAL
259 CYS ILE
266 PRO LEU
267 LEU PRO
268 TRP TYR
269 LEU ALA
270 MET GLY
271 TYR VAL
272 LEU ALA
273 ALA PHE
274 ILE TYR
275 VAL ILE
276 LEU PHE
277 SER THR
278 HIS HIS
279 THR GLN
280 ASN GLY
281 SER SER
282 VAL ASP
283 VAL PHE
284 ASN GLY
285 PRO PRO
286 PHE ILE
287 ILE PHE
288 TYR MET
289 ALA THR
290 TYR ILE
291 ARG PRO
292 ILE ALA
293 ARG PHE
294 GLU PHE
295 PHE ALA
296 ARG LYS
297 GLN THR
298 THR SER
299 PHE ALA
300 ARG VAL
301 LYS TYR
302 ILE ASN
303 ILE PRO
304 ARG VAL
305 SER ILE
306 HIS TYR
307 VAL ILE
308 LEU MET
309 ARG MET
310 GLN ASN
35 GLY TRP
36 MET GLN
37 GLY PHE
38 ILE SER
39 VAL MET
40 MET LEU
41 SER ALA
42 LEU ALA
43 ILE TYR
44 VAL MET
45 LEU PHE
46 ALA LEU
47 ILE LEU
48 VAL ILE
49 PHE MET
50 GLY LEU
51 ASN GLY
52 VAL PHE
53 LEU PRO
54 VAL ILE
55 ILE ASN
56 THR PHE
57 ALA LEU
58 ILE THR
59 ALA LEU
61 PHE VAL
66 THR LYS
67 VAL LYS
68 THR LEU
69 ASN ARG
70 TYR THR
71 PHE PRO
72 ILE LEU
73 THR ASN
74 SER TYR
75 LEU ILE
76 A

114 VAL GLY
115 LEU PHE
116 CYS PHE
117 VAL ALA
118 THR THR
119 ALA LEU
120 SER GLY
121 ILE GLY
123 THR ILE
124 LEU ALA
125 CYS LEU
126 VAL TRP
127 ILE SER
128 ALA LEU
129 VAL VAL
130 ASP VAL
131 ARG LEU
132 TYR ALA
133 PHE ILE
134 ALA GLU
135 ILE ARG
136 THR TYR
137 SER VAL
146 THR PHE
147 LYS ARG
148 ASN PHE
149 LYS GLY
150 ALA GLU
151 ARG ASN
152 VAL HIS
153 ILE ALA
154 ILE ILE
155 LEU MET
156 MET GLY
157 VAL VAL
158 TRP ALA
159 ILE PHE
160 VAL THR
161 SER TRP
162 GLY VAL
163 LEU MET
164 THR ALA
165 SER LEU
166 PHE ALA
167 LEU CYS
168 PRO ALA
169 ILE ALA
170 GLN PRO
171 MET PRO
172 HIS LEU
196 ASN GLU
197 GLN GLU
198 ALA THR
199 TYR ASN
200 ALA ASN
201 ILE GLU
202 ALA SER
203 SER PHE
204 SER VAL
205 ILE ILE
206 VAL TYR
208 PHE PHE
209 TYR VAL
210 VAL VAL
211 PRO HIS
212 LEU PHE
213 VAL ILE
214 ILE ILE
215 MET PRO
216 VAL LEU
217 PHE ILE
218 VAL VAL
219 TYR ILE
220 SER PHE
221 ARG PHE
222 VAL CYS
223 PHE TYR
224 GLN GLY
225 GLU GLN
226 ALA LEU
227 LYS VAL
228 ARG PHE
229 GLN THR
230 

286 ILE ILE
287 THR PHE
288 GLU MET
289 ALA THR
290 LEU ILE
291 ALA PRO
292 PHE ALA
293 PHE PHE
294 HIS PHE
295 CYS ALA
296 CYS LYS
297 LEU THR
298 ASN SER
299 PRO ALA
300 ILE VAL
301 LEU TYR
302 TYR ASN
303 ALA PRO
304 PHE VAL
305 LEU ILE
306 GLY TYR
307 ALA ILE
308 LYS MET
309 PHE MET
310 LYS ASN
311 THR LYS
312 SER GLN
26 GLN ALA
28 ARG GLN
29 ASP TYR
30 GLU TYR
31 VAL LEU
32 TRP ALA
33 VAL GLU
34 VAL PRO
35 GLY TRP
36 MET GLN
37 GLY PHE
38 ILE SER
39 VAL MET
40 MET LEU
41 SER ALA
42 LEU ALA
43 ILE TYR
44 VAL MET
45 LEU PHE
46 ALA LEU
47 ILE LEU
48 VAL ILE
49 PHE MET
50 GLY LEU
51 ASN GLY
52 VAL PHE
53 LEU PRO
54 VAL ILE
55 ILE ASN
56 THR PHE
57 ALA LEU
58 ILE THR
59 ALA LEU
60 LYS TYR
61 PHE VAL
66 THR LYS
67 VAL LYS
68 THR LEU
69 ASN ARG
70 TYR THR
71 PHE PRO
72 ILE LEU
73 THR ASN
74 SER TYR
75 LEU ILE
76 ALA LEU
77 CYS LEU
78 ALA ASN
79 ASP LEU
80 LEU ALA
81 VAL VAL
82 MET ALA
83 GLY ASP
84 LEU LEU
86 VAL MET
87 VAL VAL
88 PRO PHE
89 PHE GLY
90 GLY GLY
91 ALA PHE
92 ALA THR
93 HI

44 MET MET
45 PHE PHE
46 LEU LEU
47 LEU LEU
48 ILE ILE
49 MET MET
50 LEU LEU
51 GLY GLY
52 PHE PHE
53 PRO PRO
54 ILE ILE
55 ASN ASN
56 PHE PHE
57 LEU LEU
58 THR THR
59 LEU LEU
60 TYR TYR
61 VAL VAL
62 THR THR
63 VAL VAL
64 GLN GLN
65 HIS HIS
70 THR THR
71 PRO PRO
72 LEU LEU
73 ASN ASN
74 TYR TYR
75 ILE ILE
76 LEU LEU
77 LEU LEU
78 ASN ASN
79 LEU LEU
80 ALA ALA
81 VAL VAL
82 ALA ALA
83 ASP ASP
84 LEU LEU
85 PHE PHE
86 MET MET
87 VAL VAL
88 PHE PHE
90 GLY GLY
91 PHE PHE
92 THR THR
93 THR THR
94 THR THR
95 LEU LEU
96 TYR TYR
97 THR THR
98 SER SER
99 LEU LEU
100 HIS HIS
106 GLY GLY
107 PRO PRO
108 THR THR
109 GLY GLY
110 CYS CYS
111 ASN ASN
112 LEU LEU
113 GLU GLU
114 GLY GLY
115 PHE PHE
116 PHE PHE
117 ALA ALA
118 THR THR
119 LEU LEU
120 GLY GLY
121 GLY GLY
122 GLU GLU
123 ILE ILE
124 ALA ALA
125 LEU LEU
126 TRP TRP
127 SER SER
128 LEU LEU
129 VAL VAL
130 VAL VAL
131 LEU LEU
132 ALA ALA
133 ILE ILE
134 GLU GLU
135 ARG ARG
136 TYR TYR
137 VAL VAL
138 VAL VAL
139 VAL VAL
140 CYS CYS
141 LYS

18 VAL GLY
19 LEU VAL
20 ALA VAL
21 ILE ARG
22 LEU SER
23 GLY PRO
24 ASN PHE
25 VAL GLU
26 LEU ALA
27 VAL PRO
28 CYS GLN
29 TRP TYR
30 ALA TYR
31 VAL LEU
32 TRP ALA
33 LEU GLU
34 ASN PRO
39 ASN MET
40 VAL LEU
41 THR ALA
42 ASN ALA
43 TYR TYR
44 PHE MET
45 VAL PHE
46 VAL LEU
47 SER LEU
48 LEU ILE
49 ALA MET
50 ALA LEU
51 ALA GLY
52 ASP PHE
53 ILE PRO
55 VAL ASN
56 GLY PHE
57 VAL LEU
59 ALA LEU
60 ILE TYR
61 PRO VAL
62 PHE THR
63 ALA VAL
64 ILE GLN
65 THR HIS
66 ILE LYS
67 SER LYS
68 THR LEU
69 GLY ARG
73 ALA ASN
74 CYS TYR
75 HIS ILE
76 GLY LEU
77 CYS LEU
78 LEU ASN
79 PHE LEU
80 ILE ALA
81 ALA VAL
82 CYS ALA
83 PHE ASP
84 VAL LEU
85 LEU PHE
86 VAL MET
87 LEU VAL
89 GLN GLY
90 SER GLY
91 SER PHE
92 ILE THR
93 PHE THR
94 SER THR
95 LEU LEU
96 LEU TYR
97 ALA THR
98 ILE SER
99 ALA LEU
100 ILE HIS
101 ASP GLY
102 ARG TYR
103 TYR PHE
104 ILE VAL
105 ALA PHE
106 ILE GLY
108 ILE THR
117 THR ALA
118 GLY THR
119 THR LEU
120 ARG GLY
121 ALA GLY
123 GLY ILE
124 ILE ALA
125 ILE LEU
126 ALA TRP
127 

40 VAL LEU
41 THR ALA
42 ASN ALA
43 TYR TYR
44 PHE MET
45 VAL PHE
46 VAL LEU
47 SER LEU
48 LEU ILE
49 ALA MET
50 ALA LEU
51 ALA GLY
52 ASP PHE
53 ILE PRO
55 VAL ASN
56 GLY PHE
57 VAL LEU
59 ALA LEU
60 ILE TYR
61 PRO VAL
62 PHE THR
63 ALA VAL
64 ILE GLN
65 THR HIS
66 ILE LYS
67 SER LYS
68 THR LEU
69 GLY ARG
73 ALA ASN
74 CYS TYR
75 HIS ILE
76 GLY LEU
77 CYS LEU
78 LEU ASN
79 PHE LEU
80 ILE ALA
81 ALA VAL
82 CYS ALA
83 PHE ASP
84 VAL LEU
85 LEU PHE
86 VAL MET
87 LEU VAL
89 GLN GLY
90 SER GLY
91 SER PHE
92 ILE THR
93 PHE THR
94 SER THR
95 LEU LEU
96 LEU TYR
97 ALA THR
98 ILE SER
99 ALA LEU
100 ILE HIS
101 ASP GLY
102 ARG TYR
103 TYR PHE
104 ILE VAL
105 ALA PHE
106 ILE GLY
108 ILE THR
117 THR ALA
118 GLY THR
119 THR LEU
120 ARG GLY
121 ALA GLY
123 GLY ILE
124 ILE ALA
125 ILE LEU
126 ALA TRP
127 ILE SER
128 CYS LEU
129 TRP VAL
130 VAL VAL
131 LEU LEU
132 SER ALA
133 PHE ILE
134 ALA GLU
135 ILE ARG
136 GLY TYR
137 LEU VAL
138 THR VAL
139 PRO VAL
140 MET CYS
141 LEU LYS
142 GLY PRO
173 PRO VA

167 CYS CYS
168 TRP ALA
169 VAL ALA
170 ILE PRO
171 SER PRO
172 LEU LEU
173 ILE VAL
174 LEU GLY
175 GLY TRP
176 GLY SER
177 LEU ARG
178 PRO TYR
179 ILE ILE
180 MET PRO
181 GLY GLU
199 HIS ASN
200 LYS ASN
201 HIS GLU
202 TYR SER
203 ILE PHE
204 LEU VAL
205 PHE ILE
206 CYS TYR
207 THR MET
208 THR PHE
209 VAL VAL
210 PHE VAL
211 THR HIS
212 LEU PHE
213 LEU ILE
214 LEU ILE
215 LEU PRO
216 SER LEU
217 ILE ILE
218 VAL VAL
219 ILE ILE
220 LEU PHE
221 TYR PHE
222 CYS CYS
223 ARG TYR
224 ILE GLY
225 TYR GLN
226 SER LEU
227 LEU VAL
228 VAL PHE
229 ARG THR
230 THR VAL
231 ARG LYS
246 SER ALA
247 ARG GLU
248 SER LYS
249 SER GLU
250 GLU VAL
253 ALA MET
254 LEU VAL
255 LEU ILE
256 LYS ILE
257 THR MET
258 VAL VAL
259 ILE ILE
260 ILE ALA
261 VAL PHE
262 LEU LEU
263 SER ILE
264 VAL CYS
265 PHE TRP
266 ILE LEU
267 ALA PRO
268 CYS TYR
269 TRP ALA
270 ALA GLY
271 PRO VAL
272 LEU ALA
273 PHE PHE
274 ILE TYR
275 LEU ILE
276 LEU PHE
277 LEU THR
278 LEU HIS
279 ASP GLN
280 VAL GLY
281 GLY SER
282 CYS ASP
288 

273 ALA PHE
274 PHE TYR
275 SER ILE
276 ALA PHE
277 VAL THR
278 PHE HIS
279 PHE GLN
280 PHE GLY
281 VAL SER
282 PRO ASP
283 LEU PHE
284 ILE GLY
285 ILE PRO
286 SER ILE
287 THR PHE
288 VAL MET
289 CYS THR
290 TYR ILE
291 VAL PRO
292 SER ALA
293 ILE PHE
294 ILE PHE
295 ARG ALA
296 CYS LYS
297 LEU THR
298 SER SER
299 SER ALA
304 ASN VAL
305 ARG ILE
306 SER TYR
307 LYS ILE
308 LYS MET
309 SER MET
310 ARG ASN
311 ALA LYS
312 LEU GLN
313 PHE PHE
314 LEU ARG
315 SER ASN
316 ALA CYS
317 ALA MET
318 VAL VAL
319 PHE THR
320 CYS THR
321 ILE LEU
322 PHE CYS
323 ILE CYS
325 CYS LYS
326 PHE ASN
327 GLY PRO
328 PRO PRO
329 THR PRO
330 ASN PRO
331 VAL PRO
332 LEU PRO
333 LEU PRO
334 ILE SER
335 ALA THR
336 HIS THR
337 TYR VAL
338 SER SER
339 PHE LYS
340 LEU THR
346 THR ALA
347 GLU PRO
348 ALA ALA
349 ALA ALA
350 TYR ALA
351 PHE ALA
352 ALA ALA
353 TYR ALA
354 LEU ALA
355 LEU ALA
356 CYS ALA
357 VAL ALA
358 CYS ALA
359 VAL ALA
360 SER ALA
361 SER ALA
362 ILE ALA
363 SER ALA
364 CYS ALA
365 CYS ALA
366 

242 ASP THR
243 ARG THR
275 ARG ILE
276 LYS PHE
277 THR THR
278 SER HIS
279 ARG GLN
280 VAL GLY
33 ALA GLU
34 GLU PRO
35 LEU TRP
36 LEU GLN
37 SER PHE
38 GLN SER
39 GLN MET
40 TRP LEU
41 GLU ALA
42 ALA ALA
43 GLY TYR
44 MET MET
45 SER PHE
46 LEU LEU
47 LEU LEU
48 MET ILE
49 ALA MET
50 LEU LEU
51 VAL GLY
52 VAL PHE
53 LEU PRO
54 LEU ILE
55 ILE ASN
56 VAL PHE
57 ALA LEU
58 GLY THR
59 ASN LEU
60 VAL TYR
61 LEU VAL
62 VAL THR
63 ILE VAL
64 ALA GLN
65 ALA HIS
66 ILE LYS
67 GLY LYS
69 THR ARG
74 THR TYR
75 LEU ILE
76 THR LEU
77 ASN LEU
78 LEU ASN
79 PHE LEU
80 ILE ALA
81 THR VAL
82 SER ALA
83 LEU ASP
84 ALA LEU
85 CYS PHE
86 ALA MET
87 ASP VAL
88 LEU PHE
89 VAL GLY
91 GLY PHE
92 LEU THR
94 VAL THR
95 VAL LEU
96 PRO TYR
97 PHE THR
98 GLY SER
99 ALA LEU
100 THR HIS
101 LEU GLY
102 VAL TYR
103 VAL PHE
104 ARG VAL
105 GLY PHE
110 GLY CYS
111 SER ASN
112 PHE LEU
113 LEU GLU
114 CYS GLY
115 GLU PHE
117 TRP ALA
118 THR THR
119 SER LEU
120 LEU GLY
121 ASP GLY
122 VAL GLU
123 LEU ILE
124 CYS ALA
125 

190 ILE ASP
191 SER TYR
192 ALA TYR
193 ILE THR
194 TRP PRO
195 LEU HIS
196 ALA GLU
197 SER GLU
198 ALA THR
199 LEU ASN
200 LEU ASN
201 ALA GLU
202 ILE SER
203 PRO PHE
204 MET VAL
205 LEU ILE
206 PHE TYR
207 THR MET
208 MET PHE
231 THR LYS
232 ALA GLU
233 THR ALA
235 LYS ALA
236 VAL ALA
237 VAL ALA
238 ILE ALA
239 GLN ALA
240 VAL SER
241 ASN ALA
242 THR THR
243 PHE THR
244 MET GLN
246 PHE ALA
247 LEU GLU
248 PHE LYS
249 PRO GLU
250 MET VAL
251 LEU THR
252 VAL ARG
254 SER VAL
255 ILE ILE
256 LEU ILE
257 ASN MET
258 THR VAL
259 VAL ILE
260 ILE ALA
261 ALA PHE
262 ASN LEU
263 LYS ILE
264 LEU CYS
265 THR TRP
266 VAL LEU
267 MET PRO
268 VAL TYR
269 HIS ALA
270 GLN GLY
297 PRO THR
298 GLY SER
299 ARG ALA
300 VAL VAL
301 GLN TYR
302 ALA ASN
303 LEU PRO
304 ARG VAL
306 GLY TYR
307 VAL ILE
308 LEU MET
309 VAL MET
310 LEU ASN
311 ARG LYS
312 ALA GLN
313 VAL PHE
314 VAL ARG
315 ILE ASN
316 ALA CYS
317 PHE MET
318 VAL VAL
319 VAL THR
320 CYS THR
321 TRP LEU
322 LEU CYS
323 PRO CYS
324 TYR GLY
325 

330 MET PRO
331 PHE PRO
333 TYR PRO
334 ILE SER
340 THR THR
341 THR GLU
343 LEU SER
344 PHE GLN
345 ASP VAL
346 PHE ALA
347 TYR PRO
348 HIS ALA
349 TYR ALA
350 PHE ALA
351 TYR ALA
352 MET ALA
353 LEU ALA
355 ASN ALA
356 ALA ALA
357 LEU ALA
359 TYR ALA
360 VAL ALA
361 SER ALA
363 ALA ALA
364 ILE ALA
365 ASN ALA
366 PRO ALA
367 ILE ALA
368 LEU ALA
369 TYR ALA
370 ASN ALA
371 LEU ALA
372 VAL ALA
373 SER ALA
374 ALA ALA
375 ASN ALA
376 PHE ALA
377 ARG ALA
378 GLN ALA
379 VAL ALA
380 PHE ALA
381 LEU ALA
382 SER ALA
383 THR ALA
384 LEU ALA
385 ALA ALA
386 CYS ALA
387 LEU ALA
388 CYS ALA
91 ARG PHE
92 LYS THR
269 HIS ALA
270 GLN GLY
271 ALA VAL
272 ALA ALA
65 MET HIS
67 THR LYS
68 ALA LEU
69 ILE ARG
70 THR THR
71 ILE PRO
72 MET LEU
73 ALA ASN
74 LEU TYR
75 TYR ILE
76 SER LEU
77 ILE LEU
78 VAL ASN
79 CYS LEU
80 VAL ALA
81 VAL VAL
82 GLY ALA
83 LEU ASP
84 PHE LEU
85 GLY PHE
86 ASN MET
87 PHE VAL
88 LEU PHE
89 VAL GLY
90 MET GLY
91 TYR PHE
92 VAL THR
93 ILE THR
94 VAL THR
95 ARG LEU
96 TYR TYR
1

141 MET LYS
142 MET PRO
143 SER MET
144 VAL SER
145 ASP ASN
146 ARG PHE
147 TYR ARG
148 ILE PHE
149 ALA GLY
150 VAL GLU
151 CYS ASN
152 HIS HIS
161 THR TRP
162 PRO VAL
163 ALA MET
164 LYS ALA
165 ALA LEU
166 LYS ALA
167 LEU CYS
168 ILE ALA
169 ASN ALA
170 ILE PRO
171 CYS PRO
172 ILE LEU
173 TRP VAL
174 VAL GLY
175 LEU TRP
176 ALA SER
177 SER ARG
178 GLY TYR
179 VAL ILE
180 GLY PRO
181 VAL GLU
182 PRO GLY
183 ILE MET
184 MET GLN
185 VAL CYS
186 MET SER
187 ALA CYS
205 PRO ILE
206 SER TYR
207 TRP MET
208 TYR PHE
209 TRP VAL
210 ASP VAL
211 THR HIS
212 VAL PHE
213 THR ILE
214 LYS ILE
215 ILE PRO
216 CYS LEU
217 VAL ILE
218 PHE VAL
219 LEU ILE
220 PHE PHE
222 PHE CYS
223 VAL TYR
224 VAL GLY
225 PRO GLN
226 ILE LEU
227 LEU VAL
228 ILE PHE
229 ILE THR
230 THR VAL
231 VAL LYS
232 CYS GLU
233 TYR ALA
234 GLY ALA
235 LEU ALA
236 MET ALA
237 LEU ALA
238 LEU ALA
239 ARG ALA
240 LEU SER
241 ARG ALA
242 SER THR
243 VAL THR
251 GLU THR
252 LYS ARG
253 ASP MET
254 ARG VAL
255 SER ILE
256 LEU ILE
257 

152 CYS HIS
153 THR ALA
154 TYR ILE
155 ALA MET
156 THR GLY
157 ALA VAL
158 LEU ALA
159 ASN PHE
160 VAL THR
161 ALA TRP
162 SER VAL
163 LEU MET
164 SER ALA
165 VAL LEU
167 ARG CYS
168 TYR ALA
169 LEU ALA
170 ALA PRO
171 ILE PRO
172 CYS LEU
173 HIS VAL
182 SER GLY
183 ARG MET
184 SER GLN
185 ARG CYS
186 THR SER
187 LYS CYS
188 LYS GLY
189 PHE ILE
190 ILE ASP
191 SER TYR
192 ALA TYR
193 ILE THR
194 TRP PRO
195 LEU HIS
196 ALA GLU
197 SER GLU
198 ALA THR
199 LEU ASN
200 LEU ASN
201 ALA GLU
202 ILE SER
203 PRO PHE
204 MET VAL
205 LEU ILE
206 PHE TYR
207 THR MET
208 MET PHE
231 THR LYS
232 ALA GLU
233 THR ALA
234 VAL ALA
235 LYS ALA
236 VAL ALA
237 VAL ALA
238 ILE ALA
239 GLN ALA
240 VAL SER
241 ASN ALA
242 THR THR
243 PHE THR
244 MET GLN
246 PHE ALA
247 LEU GLU
248 PHE LYS
249 PRO GLU
250 MET VAL
251 LEU THR
252 VAL ARG
253 ILE MET
254 SER VAL
255 ILE ILE
256 LEU ILE
257 ASN MET
258 THR VAL
259 VAL ILE
260 ILE ALA
261 ALA PHE
262 ASN LEU
263 LYS ILE
264 LEU CYS
265 THR TRP
266 VAL LEU
267 

390 ALA ALA
391 PHE ALA
393 ARG ALA
394 TYR ALA
395 ILE ALA
115 HIS PHE
116 TYR PHE
117 HIS ALA
118 VAL THR
119 ALA LEU
121 ILE GLY
122 ILE GLU
123 ASN ILE
124 TYR ALA
125 LEU LEU
126 GLY TRP
127 HIS SER
128 CYS LEU
129 ILE VAL
130 SER VAL
131 LEU LEU
132 VAL ALA
133 ALA ILE
134 LEU GLU
135 LEU ARG
136 VAL TYR
137 ALA VAL
138 PHE VAL
139 VAL VAL
140 LEU CYS
141 PHE LYS
142 LEU PRO
143 ARG MET
149 CYS GLY
150 LEU GLU
151 ARG ASN
152 ASN HIS
153 ILE ALA
154 ILE ILE
155 HIS MET
157 ASN VAL
158 LEU ALA
159 ILE PHE
161 ALA TRP
162 PHE VAL
163 ILE MET
164 LEU ALA
165 ARG LEU
166 ASN ALA
167 ALA CYS
168 THR ALA
169 TRP ALA
170 PHE PRO
171 VAL PRO
172 VAL LEU
173 GLN VAL
174 LEU GLY
175 THR TRP
185 VAL CYS
186 GLY SER
187 TRP CYS
188 CYS GLY
189 ARG ILE
190 LEU ASP
191 VAL TYR
192 THR TYR
193 ALA THR
194 ALA PRO
195 TYR HIS
196 ASN GLU
197 TYR GLU
198 PHE THR
199 HIS ASN
200 VAL ASN
201 THR GLU
202 ASN SER
203 PHE PHE
204 PHE VAL
205 TRP ILE
206 MET TYR
207 PHE MET
208 GLY PHE
209 GLU VAL
210 

In [29]:
print("Number of pdbs without corresponding position numberings: {}.".format(j))

Number of pdbs without corresponding position numberings: 15.


In [30]:
uniprot_l = []
for r in l_err:
    uniprot = pdb.data[pdb.data['PDB']==r]['Uniprot'].iloc[0]
    if uniprot not in uniprot_l:
        uniprot_l.append(uniprot)

In [31]:
len(uniprot_l)

4

In [32]:
for u in uniprot_l:
    lu = list(pdb.data[pdb.data['Uniprot']==u]['PDB'])
    ite = 0
    for u_ in lu:
        if u_ not in l_err:
            ite += 1
            
    print('{}: {} / {} of given uniprot id are numbered correctly.'.format(u, ite, len(lu)))
    print()
    print()

P02699: 18 / 24 of given uniprot id are numbered correctly.


P00720: 13 / 20 of given uniprot id are numbered correctly.


P35462: 0 / 1 of given uniprot id are numbered correctly.


Q99835: 0 / 1 of given uniprot id are numbered correctly.




In [33]:
print("Number of pdbs without corresponding position numberings: {}.".format(j))

Number of pdbs without corresponding position numberings: 15.


In [57]:
pdb.data[pdb.data['Uniprot']=='Q99835'][['PDB', 'Uniprot']]

Unnamed: 0,PDB,Uniprot
92,4JKV,Q99835


In [55]:
uniprot = pdbtouniprot('4JKV')

In [56]:
uniprot

'P0ABE7'

In [36]:
def get_numbering_json(entry_name: str):
    if not os.path.isfile('data/numbering/'+entry_name+'.txt') or reload:
        response = requests.get('https://gpcrdb.org/services/residues/extended/'+entry_name+'/')
        return response.json()

In [37]:
def get_entry_name(pdb_id:str):
    return requests.get('https://gpcrdb.org/services/structure/'+pdb_id+'/').json()

In [38]:
entry=get_entry_name('7LJC')

In [39]:
entry

{'pdb_code': '7LJC',
 'protein': 'drd1_human',
 'family': '001_001_004_001',
 'species': 'Homo sapiens',
 'preferred_chain': 'R',
 'resolution': 3.0,
 'publication_date': '2021-03-03',
 'type': 'Electron microscopy',
 'state': 'Active',
 'distance': 9.33,
 'publication': 'https://dx.doi.org/10.1101/2021.02.07.430101',
 'ligands': [{'name': 'Mevidalen',
   'type': 'small molecule',
   'function': 'PAM'},
  {'name': '743408-71-1', 'type': 'small molecule', 'function': 'Agonist'}]}

In [40]:
entry_name=entry['protein']

In [41]:
output = get_numbering_json(entry_name)

In [42]:
#output

In [43]:
parsed = []
for dic in output:
    seq_num = dic['sequence_number']
    aa = dic['amino_acid']
    num = dic['display_generic_number']
    parsed.append([seq_num, aa, num])

In [44]:
parsed

[[1, 'M', None],
 [2, 'R', None],
 [3, 'T', None],
 [4, 'L', None],
 [5, 'N', None],
 [6, 'T', None],
 [7, 'S', None],
 [8, 'A', None],
 [9, 'M', None],
 [10, 'D', None],
 [11, 'G', None],
 [12, 'T', None],
 [13, 'G', None],
 [14, 'L', None],
 [15, 'V', None],
 [16, 'V', None],
 [17, 'E', None],
 [18, 'R', None],
 [19, 'D', None],
 [20, 'F', None],
 [21, 'S', '1.30x30'],
 [22, 'V', '1.31x31'],
 [23, 'R', '1.32x32'],
 [24, 'I', '1.33x33'],
 [25, 'L', '1.34x34'],
 [26, 'T', '1.35x35'],
 [27, 'A', '1.36x36'],
 [28, 'C', '1.37x37'],
 [29, 'F', '1.38x38'],
 [30, 'L', '1.39x39'],
 [31, 'S', '1.40x40'],
 [32, 'L', '1.41x41'],
 [33, 'L', '1.42x42'],
 [34, 'I', '1.43x43'],
 [35, 'L', '1.44x44'],
 [36, 'S', '1.45x45'],
 [37, 'T', '1.46x46'],
 [38, 'L', '1.47x47'],
 [39, 'L', '1.48x48'],
 [40, 'G', '1.49x49'],
 [41, 'N', '1.50x50'],
 [42, 'T', '1.51x51'],
 [43, 'L', '1.52x52'],
 [44, 'V', '1.53x53'],
 [45, 'C', '1.54x54'],
 [46, 'A', '1.55x55'],
 [47, 'A', '1.56x56'],
 [48, 'V', '1.57x57'],
 [49,

In [45]:
pdb.data.Structure.iloc[14]

[['A', '029', array([-52.822,  -1.611,  23.137], dtype=float32), 'N'],
 ['A', '029', array([-51.922,  -2.262,  22.148], dtype=float32), 'CA'],
 ['A', '029', array([-52.178,  -1.713,  20.742], dtype=float32), 'C'],
 ['A', '029', array([-51.291,  -1.1  ,  20.143], dtype=float32), 'O'],
 ['A', '030', array([-53.394,  -1.944,  20.236], dtype=float32), 'N'],
 ['A', '030', array([-53.821,  -1.515,  18.887], dtype=float32), 'CA'],
 ['A', '030', array([-54.424,  -0.104,  18.879], dtype=float32), 'C'],
 ['A', '030', array([-54.197,   0.649,  17.943], dtype=float32), 'O'],
 ['A', '031', array([-55.19 ,   0.248,  19.918], dtype=float32), 'N'],
 ['A', '031', array([-55.757,   1.618,  20.079], dtype=float32), 'CA'],
 ['A', '031', array([-54.643,   2.678,  20.185], dtype=float32), 'C'],
 ['A', '031', array([-54.838,   3.837,  19.803], dtype=float32), 'O'],
 ['A', '032', array([-53.486,   2.259,  20.709], dtype=float32), 'N'],
 ['A', '032', array([-52.267,   3.063,  20.735], dtype=float32), 'CA'],
 [

In [46]:
for i in range(len(pdb.data)):
    try:
        u = len(pdb.data['Uniprot_Sequence'].iloc[i])
        p = len(pdb.data['PDB_Sequence_padded'].iloc[i])
        print('uniprot: {} | pdb_padded: {} | missing tail length: {}'.format(u, p, u-p))
    except:
        pass

uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 332 | missing tail length: 16
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 332 | missing tail length: 16
uniprot: 348 | pdb_padded: 326 | missing tail length: 22
uniprot: 348 | pdb_padded: 329 | missing tail length: 19
uniprot: 348 | pdb_padded: 327 | missing tail length: 21
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 413 | pdb_padded: 312 | missing tail length: 101
uniprot: 413 | pdb_padded: 312 | missing tail length: 101
uniprot: 164 | pdb_padded: 1213 | missing tail length: -1049
uniprot: 483 | pdb_padded: 321 | missing tail length: 162
uniprot: 348 | pdb_padded: 326 | missing tail length: 22
uniprot: 483 | pdb_padded: 326 

In [47]:
for i in range(len(pdb.data)):
    print(pdb.data['Uniprot_Sequence'].iloc[i])
    print("\n")
    print(pdb.data['PDB_Sequence_padded'].iloc[i])
    print("\n\n\n\n\n")

MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA


MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAA____SATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNP______STTVSKTETSQVAPA






MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFF

In [60]:
print(pdb.data['PDB_Sequence_padded'].iloc[14])

DEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLNIFEMLRIDEGLRLKIYKDTEGYYTIGI__________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________

In [82]:
names = pdb.data['Res_names'].iloc[14]

In [83]:
ids = pdb.data['Res_ids'].iloc[14]

In [87]:
zipped = zip(ids, names)

In [88]:
sort_z = sorted(zipped)

In [89]:
sort_z

[('029', 'ASP'),
 ('030', 'GLU'),
 ('031', 'VAL'),
 ('032', 'TRP'),
 ('033', 'VAL'),
 ('034', 'VAL'),
 ('035', 'GLY'),
 ('036', 'MET'),
 ('037', 'GLY'),
 ('038', 'ILE'),
 ('039', 'VAL'),
 ('040', 'MET'),
 ('041', 'SER'),
 ('042', 'LEU'),
 ('043', 'ILE'),
 ('044', 'VAL'),
 ('045', 'LEU'),
 ('046', 'ALA'),
 ('047', 'ILE'),
 ('048', 'VAL'),
 ('049', 'PHE'),
 ('050', 'GLY'),
 ('051', 'ASN'),
 ('052', 'VAL'),
 ('053', 'LEU'),
 ('054', 'VAL'),
 ('055', 'ILE'),
 ('056', 'THR'),
 ('057', 'ALA'),
 ('058', 'ILE'),
 ('059', 'ALA'),
 ('060', 'LYS'),
 ('061', 'PHE'),
 ('062', 'GLU'),
 ('063', 'ARG'),
 ('064', 'LEU'),
 ('065', 'GLN'),
 ('066', 'THR'),
 ('067', 'VAL'),
 ('068', 'THR'),
 ('069', 'ASN'),
 ('070', 'TYR'),
 ('071', 'PHE'),
 ('072', 'ILE'),
 ('073', 'THR'),
 ('074', 'SER'),
 ('075', 'LEU'),
 ('076', 'ALA'),
 ('077', 'CYS'),
 ('078', 'ALA'),
 ('079', 'ASP'),
 ('080', 'LEU'),
 ('081', 'VAL'),
 ('082', 'MET'),
 ('083', 'GLY'),
 ('084', 'LEU'),
 ('085', 'ALA'),
 ('086', 'VAL'),
 ('087', 'VAL'

In [81]:
tuples = zip(*sort_z)
ids, list2 = [ list(tuple) for tuple in  tuples]

In [72]:
mapping = pdb.data['Mapping'].iloc[14]

In [73]:
mapping

('P00720', 'ENLYS_BPT4', 238, 398, 2, 162)

In [74]:
start = mapping[2]

In [75]:
end = mapping[3]

In [76]:
start_uniprot = mapping[4]

In [77]:
end_uniprot = mapping[5]

In [63]:
pdb.data['Uniprot_Sequence'].iloc[14]  # 2RH1

'MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRCALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSIWYNQTPNRAKRVITTFRTGTWDAYKNL'

Something weird is going on! I think this possibly has to do with the complex adding ligands/gproteins to the preferred chain. Will try to fix this by using the residue numbers.. (i.e. everything below 500)

In [None]:
pdb.data.iloc[14]

In [None]:
pdb.data.apply(lambda x: [i for i, _ in enumerate(x.Phi) if _ == None], axis=1)  
#  this is how we can mask residues without angles

In [None]:
pdb.data.apply(lambda x: [len(x.Res_ids), len(x.Psi), len(x.Phi)], axis=1)

In [None]:
# mmcif parser, oes it have both numbers (authors and residues)

In [None]:
class Gproteins():
    def __init__(self,
                 path='data/'):
        #https://gpcrdb.org/structure/g_protein_structure_browser
        pass

In [181]:
class GPCR():
    def __init__(self,
                 path='data/',
                 columns=['PDB', 'Name', 
                          'Uniprot', 
                          'Sequence', 'Sections', 'Gaps',
                          'Structure', 'Ca','CaC','Psi','Phi','Omega',
                          'State', 'G-protein'],
                 alignment="GPCRdb_alignment_TM7_ICL4_H8.csv",
                 limit=5,
                 verbose=1):
        
        """
        path: pdb file path
        columns: [
            Index:         Identifier for our data
            Type:          G-Protein/Receptor/?
            Function:      Agonist/Antagonist
            Seq:           Protein sequence from PDBs
            Uniprot_Seq:   Protein sequence from uniprot entry
            Sections:      Polypeptides with structure (Sequence)
            Gaps:          Indices of starts/ends in full protein sequence of the polypeptides
            Position_idx:  GPCRdb alignment position index (if possible)
            XYZ:           Structure Data contained in the PDB (saved as per-residue)
            XYZ_padded:    Including NaN values for AAs in Gaps
            Ca:            Position of the C-alphas
            CaC:           Direction of backbone
            Psi:           Psi angle
            Phi:           Phi angle
            Omega:         Omega angle
            State:         Active/Inactive (for Receptors)
            G-Protein:     Primary target I guess
            Selectivities: Selectivity Values
            References:    ???
            ]
        limit: maximum number of proteins loaded
        """
        self.verbose = verbose
        self.limit = limit
        
        self.path_pdb = path + 'pdb/'
        self.path_alignment = path + 'alignments/' + alignment
        self.path_table = 'gpcrdb/'
        
        self.alignment = None
        self.data = pd.DataFrame(columns = columns)
    
    # ======================================================================================================================
    
    def load_pkl(self, path: str):
        self.data = pd.read_pickle(path)
        
    def save_pkl(self, path: str):
        self.data.save_pickle(path)
    
    # ======================================================================================================================
    
    def filter_pos(self, start, end):
        pass
    
    
    def view(self, pdb=None, uniprot=None, idx=None, start=None, end=None, start_idx=None, end_idx=None):
        # implement a view of a protein
        # implement view of overlap of proteins
        pass
    
    
    
        
    def plot_distr(self, mode='len', idx=None, save=False):
        """
        plot different data visualizations
        modes: - 'len': plot distribution of amino sequence lengths
               - 'residues': plot occurances of residues overall
        """
        assert mode in ['len', 'residues'], print("make sure mode is either 'len' or 'residues'")
        if mode == 'len':
            plot_len_hist(self.data, save)
        else:
            plot_res_hist(self.data, idx, save)

            
    def get_res_list(self):
        """
        Run this on your dataset, and copypaste the list to utils: "RES_LIST"
        """
        flatten = lambda l: [item for sublist in l for item in sublist]
        l = self.data['PDB_Sequence'].tolist()
        return sorted(list(set(flatten(l))))
    
    
    def rl_remove_variants(self, n_aa=20):
        """
        {'LEU': 13034, 'GLU': 9023, ' Ca': 5, ..} --> Remove proteins where rare variants occur,
        use a cutoff of n_aa = 20 to remove all rare variants by only selecting (the normal) 20 aminoacids.
        """
        d = Counter(flat_l(self.data['PDB_Sequence'].tolist()))
        v, k = zip(*sorted(zip(d.values(), d.keys())))
        self.res_list = dict(zip(k[-n_aa:], v[-n_aa:]))
        return k[:-n_aa], k[-n_aa:]  # return k[-n_aa] just so I can copy it to utils.py as an initialization array
    
    
    def remove_variants(self):
        c_tot = self.data.count().len
        k, _ = self.rl_remove_variants()
        new = self.data
        for key in k:
            new = new[new['PDB_Sequence'].apply(lambda x: key not in x)]
        self.data = new.reset_index()
        c_red = self.data.count().len
        print("After removing {} variants our data contains {} proteins.".format(c_tot-c_red, c_red))
        self.res_list = _
        
    
    def remove_outliers(self, min_len=100, max_len=6000):
        c_tot = self.data.count().len
        new = self.data[(min_len < self.data['n_residues']) & (self.data['n_residues'] < max_len)]
        self.data = new.reset_index()
        c_red = self.data.count().len
        print("After removing {} outliers with {} > length > {} our data contains {} proteins."\
              .format(c_tot-c_red, min_len, max_len, c_red))
        del new
        return
    
    
    def label_ca(self):
        def mkdict(rl):
            d = {}
            for i, l in enumerate(reversed(rl)):
                d.update({l:i})
            return d
        def mapaa(aas, d):
            y = []
            for aa in aas:
                y.append(d[aa])
            return np.asarray(y)
        ltodict = mkdict(self.get_common_aas())
        temp = self.data.copy()
        self.data['y'] = temp[['res']].applymap(lambda x: mapaa(x, ltodict))
        del temp
        return

In [182]:
loader = GPCR()

In [183]:
loader.data

Unnamed: 0,PDB,Name,Uniprot,Sequence,Sections,Gaps,Structure,Ca,CaC,Psi,Phi,Omega,State,G-protein


In [36]:
loader.load_pdbs()

  0%|                                                                                                                                                                                                                                                                                            | 0/5 [00:00<?, ?it/s]

Found 528 files. Using 5 files.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.09s/it]
