In [64]:
from utils import *
from utils2 import *
from utils3 import *
from plotting import *
from gpcrdb_soup import *

In [65]:
import pandas as pd
from tqdm import tqdm
from os import walk
import requests

from Bio.PDB.Polypeptide import *
from Bio.PDB import PDBParser

In [66]:
PATH = 'data/pdb/active/'

In [67]:
class Download():
    def __init__(self, 
                 path='data/',
                 fileformat='pdb'):
        self.path_pdb = path + 'pdb/'
        self.path_alignment = path + 'alignments/'
        self.path_table = path + 'gpcrdb/'
        
        self.fileformat = fileformat
        
        self.table = None
        self.filenames, self.pdb_ids = get_pdb_files(path=self.path_pdb)
    
    # ======================================================================================================================
        
    def download_alignment(self):
        print("Not Implemented! TBD manually.")
    
    def download_pdbs(self, reload=False, update=False):
        pdb_table_ids = self.table['PDB'].tolist()
        missing = [x for x in pdb_table_ids if x not in self.pdb_ids]
        if reload or (len(missing)>0):
            print("Reloading pdb files...")
            print("Missing pdbs:", missing)
            for pdb in tqdm(missing):
                url = get_rcsb_download(pdb, self.fileformat)
                download_pdb(url, folder=self.path_pdb, fileformat=self.fileformat)
        elif update:
            self.update_pdbs()
        self.filenames, self.pdb_ids = get_pdb_files(path=self.path_pdb)
                
    def download_table(self, reload=True, filename='structures.pkl'):
        table_path = self.path_table + filename
        if reload or (not os.path.isfile(self.path_table+filename)):
            self. table = get_table(reload=True, uniprot=False)
            self.table = self.table.drop(columns=['filler', 'filler2', 'receptor family', 
                                                  'Species', 'Method', 'Refined Structure', 'Degree active %', 
                                                  '% of Seq1', 'Family', 'Subtype', 'Name2', 'Fusion', 'Note', 
                                                  '% of Seq2', 'Antibodies', 'Name1', 'Type1', 'Type2', 
                                                  'D2x50 - S3x39', 'Sodium in structure', 'Authors', 'Reference', 
                                                  'PDB date', 'Annotated', 'pdb_link'])
        else:
            self.table = get_table(reload=False, uniprot=False)
            self.table = self.table[self.table['Species'].str.contains('Human')]
        self.table.to_pickle(self.path_table + filename)
        
    # ======================================================================================================================
    
    def update_pdbs(self):
        updatepdbs(self.path_pdb)
    
    # ======================================================================================================================
    
    

In [68]:
D = Download()
D.download_alignment()
D.download_table(reload=True)
D.download_pdbs(reload=True)

Not Implemented! TBD manually.


0it [00:00, ?it/s]

Reloading pdb files...
Missing pdbs: []





In [69]:
D.table

Unnamed: 0,uniprot(gene),Cl.,PDB,Resolution,Preferred Chain,State,Function
0,STE2,D1(Ste2-likefungalpheromone),7AD3,3.5,A,Active,Agonist
1,CCR2,A(Rhodopsin),5T1A,2.8,A,Inactive,AntagonistNAM
2,OPRM,A(Rhodopsin),4DKL,2.8,A,Inactive,Antagonist
3,CNR2,A(Rhodopsin),5ZTY,2.8,A,Inactive,Antagonist
4,5HT1B,A(Rhodopsin),6G79,3.8,S,Active,Agonist
...,...,...,...,...,...,...,...
523,OPSD,A(Rhodopsin),6PH7,2.9,A,Active,unknown
524,NTR1,A(Rhodopsin),7L0S,4.5,C,Active,Agonist
525,AA2AR,A(Rhodopsin),6S0Q,2.7,A,Inactive,Antagonist
526,GLP1R,B1(Secretin),7LCJ,2.8,R,Active,Agonist


In [70]:
# A.alignment

In [71]:
class PDBProcessor():
    def __init__(self,
                 path='data/',
                 limit=5,
                 verbose=1):
        
        self.verbose = verbose
        self.limit = limit
        
        columns = ['Name',
                   'State',
                   'Uniprot', 'Uniprot_Sequence', 
                   'PDB', 'PDB_Sequence', 'PDB_Sequence_padded'
                   'Numbering',
                   'Res_names', 'Res_ids', 'Res_pos',
                   'n_residues', 'Gaps'
                   'Structure', 'Ca','CaC','Psi','Phi','Omega']
        self.data = pd.DataFrame(columns=columns)
        self.path_pdb = path + 'pdb/'
        self.path_table = path + 'gpcrdb/' + 'structures.pkl'
        self.table = pd.read_pickle(self.path_table)
        
        # Filter settings
        self.setting_explained = {0: "Only C-alphas - labelled according to their residue",
                         1: "Hetero-representation - Backbone atoms are labelled according to their element,"\
                             " except C-alpha, which is labelled according to its AA",
                         2: "Hetero-representation - Backbone atoms are labelled according to their element,"\
                             "C-beta is labelled according to its AA",
                         3: "Hetero-representation 3, we include H atoms",
                         4: "Hetero-representation 4, we include H atoms",
                         5: "Full Protein is represented as element-wise pointcloud "\
                             "(drastically reduces embedding space)"}
        
        self.settings = {0: ['CA'],
                         1: ATOM_LIST,
                         2: ATOM_LIST+['CB'],
                         3: ATOM_LIST+['HN'],
                         4: ATOM_LIST+['CB']+['HN'],
                         5: None}
    
    def load_pdbs(self):
        files = self.get_pdb_files()
        p = PDBParser()
        
        for _, file in enumerate(tqdm(files)):
            # gpcrdb table
            table_row = self.table[self.table['PDB']==file[-8:-4]].iloc[0]
            cl = table_row['Cl.']
            state = table_row['State']
            pref_chain = table_row['Preferred Chain']
            function = table_row['Function']
            
            # pdb
            structure = p.get_structure(file[-8:-4], file)
            
            sequence, name, uniprot = None, None, None
            for _, model in enumerate(structure):
                res_names, res_ids, psi, phi, omega, atoms, seq, res_chain = self.parsing_pdb(model, pref_chain)
            seq = ''.join(seq)
            
            # numbering / position assignment
            numbering = self.get_numbering_json(file[-8:-4])   
            
            if res_names == None:
                pass
            else:
                sifts = get_uniprot_pdb_residue_mapping(file[-8:-4], chain_id=pref_chain)
                uniprot = sifts[0]
                if uniprot == None:
                    try: 
                        uniprot = pdbtouniprot(file[-8:-4])
                    except:
                        pass
                if uniprot != None:
                    sequence, name = get_sequence_name(uniprot)
                    gaps = self.map_gaps(res_ids)
                    row = {'Name': name, 
                           'Cl.': cl, 'State': state, 'Preferred Chain': pref_chain, 'Function': function,
                           'Uniprot': uniprot, 'Uniprot_Sequence': sequence,
                           'PDB': file[-8:-4], 'PDB_Sequence': seq, 'PDB_Sequence_padded': self.pad_gaps(gaps, seq),
                           'Numbering': numbering,
                           'Res_names': res_names, 'Res_ids': res_ids, 'Res_pos': None,
                           'n_residues':len(res_ids), 'Gaps': gaps, 'Gapsum': self.gap_sum(gaps),
                           'Psi': psi, 'Phi': phi, 'Omega': omega,
                           'Structure': atoms,
                           'Mapping': sifts}
                    self.data = self.data.append(row, ignore_index=True)

    # ======================================================================================================================
    
    def get_pdb_files(self):
        files = [os.path.join(self.path_pdb, f) for f in os.listdir(self.path_pdb) \
                 if (os.path.isfile(os.path.join(self.path_pdb, f)) and 'pdb' in f)]
        if self.limit != None:
            if self.verbose==1:
                print("Found {} files. Using {} files.".format(len(files), self.limit))
            return files[:self.limit]
        else:
            if self.verbose==1:
                print("Found {} files.".format(len(files)))
            return files
        
    def filter_by_chain(self, res_names, res_ids, psi, phi, omega, atoms, seq, res_chain, chain_id):
        chain_mask = [True if rc == chain_id else False for rc in res_chain]
        res_names = [b for a, b in zip(chain_mask, res_names) if a]
        res_ids = [b for a, b in zip(chain_mask, res_ids) if a]
        psi = [b for a, b in zip(chain_mask, psi) if a]
        phi = [b for a, b in zip(chain_mask, phi) if a]
        omega = [b for a, b in zip(chain_mask, omega) if a]
        atoms = [b for a, b in zip(chain_mask, atoms) if a]
        seq = [b for a, b in zip(chain_mask, seq) if a]
        atoms = [b for a, b in zip(chain_mask, atoms) if a]
        res_chain = [b for a, b in zip(chain_mask, res_chain) if a]
        return res_names, res_ids, psi, phi, omega, atoms, seq, res_chain
        
    def map_gaps(self, res_ids):
        min_num = -100
        gaps = []
        for i, z in enumerate(res_ids):
            new_num = int(z)
            if not abs(new_num - min_num) <= 1:
                gaps.append((i, min_num, new_num))
                min_num = new_num
            else:
                min_num += 1
        return gaps[1:]
    
    def gap_sum(self, gaps):
        sum = 0
        for gap in gaps:
            diff = gap[2] - gap[1]
            if diff > 0:
                sum += diff
        return sum
    
    def pad_gaps(self, gaps, seq):
        _ = 0
        for gap in gaps:
            start = gap[1]+_
            end = gap[2]+_
            length = end-start
            gap_str = (length-1) * '_'
            seq = seq[_:start] + gap_str + seq[start:]
            end = _
        return seq
        
    
    def get_numbering_json(self, pdb_id: str, reload=False):
        files = {
            'pdb_file': ('data/pdb/'+pdb_id+'.pdb', open('data/pdb/'+pdb_id+'.pdb', 'r')),
        }
        if not os.path.isfile('data/numbering/'+pdb_id+'.txt') or reload:
            response = requests.post('https://gpcrdb.org/services/structure/assign_generic_numbers', files=files)
            with open('filler.json', 'w') as f:
                f.write(response.text)
            os.rename(r'filler.json',r'data/numbering/'+pdb_id+'.txt')
        res_dict = {}
        f = open('data/numbering/'+pdb_id+'.txt', 'r')
        for line in f.readlines():
            line_ = list(filter(None, line.split(' ')))
            if 'CA' in line_:
                if len(line_) == 13:
                    if len(line_[-3]) == 4:
                        res_dict.update({int(line_[5]): (line_[3], float(line_[-3]))})
        f.close()
        return res_dict
    
    # TODO: CHECK / FIX PROBLEMS WITH RESIDUE NUMBERS (THEY NEED TO BE IDENTICAL)
    
    def parsing_pdb(self, model, pref_chain, water=False):
        """
        parse the pdb structure information into our dataframe columns
        model: first structure in the pdb file (since model 2 etc may not be formatted correctly)
        water: boolean to specify if water should be included
        """
        # Todo: implement a check of whether water is included
        try:
            model.atom_to_internal_coordinates()
            res_names = []
            res_ids = []
            psi = []
            phi = []
            omega = []
            atoms = []
            seq = []
            res_chain=[]
            chain = 0
            prev_resseq = -1
            skipped = []
            for _, res in enumerate(model.get_residues()):
                name = res.get_resname()
                id_ = str(res.get_id()[1]).zfill(3)
                chain_str = res.get_full_id()[2]
                if chain_str in pref_chain:
                    if name in RESIDUE_LIST:
                        for atom in res:
                            if atom.get_id() in ATOM_LIST:
                                atoms.append([chain_str, id_, atom.get_coord(), atom.get_id()])
                        seq.append(three_to_one(name))
                        res_names.append(name)
                        res_ids.append(id_)
                        res_chain.append(chain)
                        try:
                            psi.append(res.internal_coord.get_angle("psi"))
                        except:
                            psi.append(None)
                        try:
                            phi.append(res.internal_coord.get_angle("phi"))
                        except:
                            phi.append(None)
                        try:
                            omega.append(res.internal_coord.get_angle("omega"))
                        except:
                            omega.append(None)
                else:
                    if chain_str not in skipped:
                        skipped.append(chain_str)
            # print("Selected preferred chain {}, skipped chains {}.".format(pref_chain, skipped))
            return res_names, res_ids, psi, phi, omega, atoms, seq, res_chain
        except:
            return None, None, None, None, None, None, None, None
        
    def update_seq(self):
        """
        Update the sequence based on residues (e.g. after adressing gaps)
        """
        def update_seq_(residues):
            seq = []
            for r in residues:
                seq.append(three_to_one(r))
            return seq
        self.data['']
        
        

In [72]:
pdb = PDBProcessor(limit=None)

In [73]:
pdb.load_pdbs()

  0%|                                                                                                                                                                                                                                                                                          | 0/528 [00:00<?, ?it/s]

Found 528 files.


 33%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                                                       | 172/528 [04:36<09:31,  1.61s/it]


IndexError: list index out of range

In [76]:
pdb.data[['PDB', 'Uniprot', 'PDB_Sequence', 'Res_ids', 'Uniprot_Sequence', 'Mapping', 'Preferred Chain']].iloc[12]

PDB                                                              2R4R
Uniprot                                                        P07550
PDB_Sequence        GIVMSLIVLAIVFGNVLVITAIAFERLQTVTNYFITSLACADLVMG...
Res_ids             [037, 038, 039, 040, 041, 042, 043, 044, 045, ...
Uniprot_Sequence    MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLA...
Mapping                         (P07550, ADRB2_HUMAN, 1, 365, 1, 365)
Preferred Chain                                                     A
Name: 12, dtype: object

In [77]:
pdb.data[['PDB', 'Uniprot', 'PDB_Sequence', 'Res_ids', 'Uniprot_Sequence', 'Mapping', 'Preferred Chain']].iloc[13]

PDB                                                              2R4S
Uniprot                                                        P07550
PDB_Sequence        GIVMSLIVLAIVFGNVLVITAIAFERLQTVTNYFITSLACADLVMG...
Res_ids             [037, 038, 039, 040, 041, 042, 043, 044, 045, ...
Uniprot_Sequence    MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLA...
Mapping                        (P07550, ADRB2_HUMAN, 1, 342, 24, 365)
Preferred Chain                                                     A
Name: 13, dtype: object

If there is no numbering via our base-query we have to query the structure with the pdb id, 
use the entity name to get generic numbers and then use sequence alignment to get the numbers in... 

Sifts seems to not work very well (comparing the two samples above the indices are off)

In [78]:
pdb.data.to_pickle('data.pkl')

In [79]:
select = pdb.data[['Numbering', 'Res_ids', 'Res_names']]

In [80]:
select

Unnamed: 0,Numbering,Res_ids,Res_names
0,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
1,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
2,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
3,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
4,"{33: ('GLU', 1.28), 34: ('PRO', 1.29), 35: ('T...","[001, 002, 003, 004, 005, 006, 007, 008, 009, ...","[MET, ASN, GLY, THR, GLU, GLY, PRO, ASN, PHE, ..."
...,...,...,...
167,"{2: ('PRO', 1.28), 3: ('ILE', 1.29), 4: ('MET'...","[-02, -01, 000, 001, 002, 003, 004, 005, 006, ...","[ASP, GLY, ALA, PRO, PRO, ILE, MET, GLY, SER, ..."
168,"{2: ('PRO', 1.28), 3: ('ILE', 1.29), 4: ('MET'...","[-02, -01, 000, 001, 002, 003, 004, 005, 006, ...","[ASP, GLY, ALA, PRO, PRO, ILE, MET, GLY, SER, ..."
169,"{2: ('PRO', 1.28), 3: ('ILE', 1.29), 4: ('MET'...","[-02, -01, 000, 001, 002, 003, 004, 005, 006, ...","[ASP, GLY, ALA, PRO, PRO, ILE, MET, GLY, SER, ..."
170,"{2: ('PRO', 1.28), 3: ('ILE', 1.29), 4: ('MET'...","[-02, -01, 000, 001, 002, 003, 004, 005, 006, ...","[ASP, GLY, ALA, PRO, PRO, ILE, MET, GLY, SER, ..."


In [81]:
res_ids = select['Res_ids'].iloc[0]

In [82]:
res_names = select['Res_names'].iloc[0]

In [83]:
numbering = select['Numbering'].iloc[0]

In [84]:
pdb.data[pdb.data['Cl.'].str.contains('B')]

Unnamed: 0,Name,State,Uniprot,Uniprot_Sequence,PDB,PDB_Sequence,PDB_Sequence_paddedNumbering,Res_names,Res_ids,Res_pos,...,Omega,Cl.,Function,Gaps,Gapsum,Mapping,Numbering,PDB_Sequence_padded,Preferred Chain,Structure
93,sp|P34998|CRFR1_HUMAN,Inactive,P34998,MGGHPQLRLVKALLLLGLNPVSASLQDQHCESLSLASNISGLQCNA...,4K5Y,HVAAIINYLGHCISLVALLVAFVLFLRARSIRCLRNIIHANLIAAF...,,"[HIS, VAL, ALA, ALA, ILE, ILE, ASN, TYR, LEU, ...","[117, 118, 119, 120, 121, 122, 123, 124, 125, ...",,...,"[None, -179.99607866099404, 179.94698691202322...",B1(Secretin),Antagonist,"[(104, 220, 224)]",4.0,"(None, None, None, None, None, None)","{115: ('HIS', 1.35), 116: ('TYR', 1.36), 117: ...",HVAAIINYLGHCISLVALLVAFVLFLRARSIRCLRNIIHANLIAAF...,C,"[[C, 117, [-64.566, 0.819, 63.498], N], [C, 11..."
94,sp|P0ABE7|C562_ECOLX,Inactive,P0ABE7,MRKSLLAILAVSSLVFSSASFAADLEDNMETLNDNLKVIEKADNAA...,4L6R,ADLEDNWETLNDNLKVIEKADNAAQVKDALTKMRAAALDAQKATPP...,,"[ALA, ASP, LEU, GLU, ASP, ASN, TRP, GLU, THR, ...","[1001, 1002, 1003, 1004, 1005, 1006, 1007, 100...",,...,"[None, 178.55622933847178, -175.6514863663833,...",B1(Secretin),-,"[(106, 1106, 123), (184, 200, 216)]",16.0,"(P0ABE7, C562_ECOLX, 1, 106, 23, 128)","{131: ('GLN', 1.29), 132: ('LYS', 1.3), 133: (...",ADLEDNWETLNDNLKVIEKADNAAQVKDALTKMRAAALDAQKATPP...,A,"[[A, 1001, [13.645, 14.436, -11.317], N], [A, ..."
136,sp|P34998|CRFR1_HUMAN,Inactive,P34998,MGGHPQLRLVKALLLLGLNPVSASLQDQHCESLSLASNISGLQCNA...,4Z9G,KSKVHYHVAAIINYLGHCISLVALLVAFVLFLRARSIRCLRNIIHA...,,"[LYS, SER, LYS, VAL, HIS, TYR, HIS, VAL, ALA, ...","[111, 112, 113, 114, 115, 116, 117, 118, 119, ...",,...,"[None, -179.65748267777883, -178.9569487886989...",B1(Secretin),Antagonist,"[(110, 220, 1002), (270, 1161, 222)]",782.0,"(P34998, CRFR1_HUMAN, 2, 118, 104, 249)",{},KSKVHYHVAAIINYLGHCISLVALLVAFVLFLRARSIRCLRNIIHA...,A,"[[A, 111, [-11.077, 39.714, 8.167], N], [A, 11..."
154,sp|P00720|ENLYS_BPT4,Inactive,P00720,MNIFEMLRIDERLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,5EE7,YSSFQVMYTVGYSLSLAALLLALAILGGLSKLHCTANAIHANLFLS...,,"[TYR, SER, SER, PHE, GLN, VAL, MET, TYR, THR, ...","[138, 139, 140, 141, 142, 143, 144, 145, 146, ...",,...,"[None, 179.5209949161551, 179.09808789509543, ...",B1(Secretin),AntagonistNAM,"[(63, 200, 212), (107, 255, 1000), (126, 1018,...",771.0,"(P00720, ENLYS_BPT4, 121, 279, 2, 160)","{138: ('TYR', 1.36), 139: ('SER', 1.37), 140: ...",YSSFQVMYTVGYSLSLAALLLALAILGGLSKLHCTANAIHANLFLS...,A,"[[A, 138, [-26.173, 9.248, -69.199], N], [A, 1..."


In [91]:
pdb.data[['Uniprot_Sequence', 'PDB_Sequence', 'Res_ids']].iloc[93]

Uniprot_Sequence    MGGHPQLRLVKALLLLGLNPVSASLQDQHCESLSLASNISGLQCNA...
PDB_Sequence        HVAAIINYLGHCISLVALLVAFVLFLRARSIRCLRNIIHANLIAAF...
Res_ids             [117, 118, 119, 120, 121, 122, 123, 124, 125, ...
Name: 93, dtype: object

In [108]:
def represented_seq(x, y):
    try: 
        return ''.join(list(x)[int(y)-1:])
    except:
        return None

In [109]:
pdb.data['Uniprot_trunc'] = pdb.data.apply(lambda x: represented_seq(x.Uniprot_Sequence, x.Res_ids[0]), axis=1)

In [110]:
pdb.data[['Uniprot_trunc', 'PDB_Sequence']]

Unnamed: 0,Uniprot_trunc,PDB_Sequence
0,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
1,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
2,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
3,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
4,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...,MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFL...
...,...,...
167,GVS,DGAPPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNY...
168,GVS,DGAPPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNY...
169,GVS,DGAPPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNY...
170,GVS,DGAPPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNY...


In [120]:
for i in range(len(pdb.data)):
    if pdb.data['Uniprot_trunc'].iloc[i][:10] != pdb.data['PDB_Sequence'].iloc[i][:10] :
        print()
        print(i)
        print(pdb.data['Uniprot_trunc'].iloc[i])
        print()
        print(pdb.data['PDB_Sequence'].iloc[i])


10
MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA

MCGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSCFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNP

14
IGHLLTKSPSLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRCALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSIWYNQTPNRAKRVITTFRTGTWDAYKNL

DEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASS

In [85]:
entity_data = None

In [None]:
res_data = None

In [None]:
atom_data = None

In [None]:
j = 0
l_err = []
for i in range(len(pdb.data)):
    numbering = pdb.data['Numbering'].iloc[i]
    if numbering == {}:
        j+=1
        l_err.append(pdb.data['PDB'].iloc[i])
    weird = False
    for num in numbering:
        try:
            num_str = str(num).zfill(3)
            idx = res_ids.index(num_str)
            if numbering[num][0] != res_names[idx]:
                weird = True
        except:
            pass
            # weird = True
            # print("Didnt find num_str", num_str)
        print(num, numbering[num][0], res_names[idx])
    if weird:
        pass

In [None]:
print("Number of pdbs without corresponding position numberings: {}.".format(j))

In [None]:
uniprot_l = []
for r in l_err:
    uniprot = pdb.data[pdb.data['PDB']==r]['Uniprot'].iloc[0]
    if uniprot not in uniprot_l:
        uniprot_l.append(uniprot)

In [None]:
len(uniprot_l)

In [None]:
for u in uniprot_l:
    lu = list(pdb.data[pdb.data['Uniprot']==u]['PDB'])
    ite = 0
    for u_ in lu:
        if u_ not in l_err:
            ite += 1
            
    print('{}: {} / {} of given uniprot id are numbered correctly.'.format(u, ite, len(lu)))
    print()
    print()

In [None]:
print("Number of pdbs without corresponding position numberings: {}.".format(j))

In [None]:
uniprot = pdbtouniprot('7LJC')

In [None]:
uniprot

In [None]:
def get_numbering_json(entry_name: str):
    if not os.path.isfile('data/numbering/'+entry_name+'.txt') or reload:
        response = requests.get('https://gpcrdb.org/services/residues/extended/'+entry_name+'/')
        return response.json()

In [None]:
def get_entry_name(pdb_id:str):
    return requests.get('https://gpcrdb.org/services/structure/'+pdb_id+'/').json()

In [None]:
entry=get_entry_name('7LJC')

In [None]:
entry

In [None]:
entry_name=entry['protein']

In [None]:
output = get_numbering_json(entry_name)

In [None]:
#output

In [None]:
parsed = []
for dic in output:
    seq_num = dic['sequence_number']
    aa = dic['amino_acid']
    num = dic['display_generic_number']
    parsed.append([seq_num, aa, num])

In [None]:
parsed

In [None]:
pdb.data.Structure.iloc[14]

In [62]:
for i in range(len(pdb.data)):
    try:
        u = len(pdb.data['Uniprot_Sequence'].iloc[i])
        p = len(pdb.data['PDB_Sequence_padded'].iloc[i])
        print('uniprot: {} | pdb_padded: {} | missing tail length: {}'.format(u, p, u-p))
    except:
        pass

uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 332 | missing tail length: 16
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 348 | pdb_padded: 332 | missing tail length: 16
uniprot: 348 | pdb_padded: 326 | missing tail length: 22
uniprot: 348 | pdb_padded: 329 | missing tail length: 19
uniprot: 348 | pdb_padded: 327 | missing tail length: 21
uniprot: 348 | pdb_padded: 348 | missing tail length: 0
uniprot: 413 | pdb_padded: 312 | missing tail length: 101
uniprot: 413 | pdb_padded: 312 | missing tail length: 101
uniprot: 164 | pdb_padded: 1213 | missing tail length: -1049
uniprot: 483 | pdb_padded: 321 | missing tail length: 162


In [63]:
for i in range(len(pdb.data)):
    print(pdb.data['Uniprot_Sequence'].iloc[i])
    print("\n")
    print(pdb.data['PDB_Sequence_padded'].iloc[i])
    print("\n\n\n\n\n")

MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNPLGDDEASTTVSKTETSQVAPA


MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAA____SATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFFAKTSAVYNPVIYIMMNKQFRNCMVTTLCCGKNP______STTVSKTETSQVAPA






MNGTEGPNFYVPFSNKTGVVRSPFEAPQYYLAEPWQFSMLAAYMFLLIMLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVFGGFTTTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLVGWSRYIPEGMQCSCGIDYYTPHEETNNESFVIYMFVVHFIIPLIVIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWLPYAGVAFYIFTHQGSDFGPIFMTIPAFF

In [110]:
print(pdb.data['PDB_Sequence_padded'].iloc[14])

DEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLNIFEMLRIDEGLRLKIYKDTEGYYTIGI__________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________

In [None]:
pdb.data['']

In [130]:
print(pdb.data['Uniprot_Sequence'].iloc[14])

MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSGYHVEQEKENKLLCEDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL


Something weird is going on! I think this possibly has to do with the complex adding ligands/gproteins to the preferred chain. Will try to fix this by using the residue numbers.. (i.e. everything below 500)

In [131]:
pdb.data.iloc[14]

Name                                                        sp|P07550|ADRB2_HUMAN
State                                                                    Inactive
Uniprot                                                                    P07550
Uniprot_Sequence                MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLA...
PDB                                                                          2RH1
PDB_Sequence                    DEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITS...
PDB_Sequence_paddedNumbering                                                  NaN
Res_names                       [ASP, GLU, VAL, TRP, VAL, VAL, GLY, MET, GLY, ...
Res_ids                         [029, 030, 031, 032, 033, 034, 035, 036, 037, ...
Res_pos                                                                      None
n_residues                                                                    442
GapsStructure                                                                 NaN
Ca              

In [132]:
pdb.data.apply(lambda x: [i for i, _ in enumerate(x.Phi) if _ == None], axis=1)  
#  this is how we can mask residues without angles

0           [0, 235, 323]
1           [0, 326, 328]
2           [0, 235, 325]
3           [0, 235, 325]
4                     [0]
              ...        
523               [0, 87]
524              [0, 101]
525              [0, 101]
526    [0, 148, 204, 239]
527    [0, 148, 208, 243]
Length: 528, dtype: object

In [133]:
pdb.data.apply(lambda x: [len(x.Res_ids), len(x.Psi), len(x.Phi)], axis=1)

0      [338, 338, 338]
1      [329, 329, 329]
2      [340, 340, 340]
3      [340, 340, 340]
4      [348, 348, 348]
            ...       
523    [393, 393, 393]
524    [390, 390, 390]
525    [390, 390, 390]
526    [280, 280, 280]
527    [285, 285, 285]
Length: 528, dtype: object

In [None]:
class Gproteins():
    def __init__(self,
                 path='data/'):
        #https://gpcrdb.org/structure/g_protein_structure_browser
        pass

In [181]:
class GPCR():
    def __init__(self,
                 path='data/',
                 columns=['PDB', 'Name', 
                          'Uniprot', 
                          'Sequence', 'Sections', 'Gaps',
                          'Structure', 'Ca','CaC','Psi','Phi','Omega',
                          'State', 'G-protein'],
                 alignment="GPCRdb_alignment_TM7_ICL4_H8.csv",
                 limit=5,
                 verbose=1):
        
        """
        path: pdb file path
        columns: [
            Index:         Identifier for our data
            Type:          G-Protein/Receptor/?
            Function:      Agonist/Antagonist
            Seq:           Protein sequence from PDBs
            Uniprot_Seq:   Protein sequence from uniprot entry
            Sections:      Polypeptides with structure (Sequence)
            Gaps:          Indices of starts/ends in full protein sequence of the polypeptides
            Position_idx:  GPCRdb alignment position index (if possible)
            XYZ:           Structure Data contained in the PDB (saved as per-residue)
            XYZ_padded:    Including NaN values for AAs in Gaps
            Ca:            Position of the C-alphas
            CaC:           Direction of backbone
            Psi:           Psi angle
            Phi:           Phi angle
            Omega:         Omega angle
            State:         Active/Inactive (for Receptors)
            G-Protein:     Primary target I guess
            Selectivities: Selectivity Values
            References:    ???
            ]
        limit: maximum number of proteins loaded
        """
        self.verbose = verbose
        self.limit = limit
        
        self.path_pdb = path + 'pdb/'
        self.path_alignment = path + 'alignments/' + alignment
        self.path_table = 'gpcrdb/'
        
        self.alignment = None
        self.data = pd.DataFrame(columns = columns)
    
    # ======================================================================================================================
    
    def load_pkl(self, path: str):
        self.data = pd.read_pickle(path)
        
    def save_pkl(self, path: str):
        self.data.save_pickle(path)
    
    # ======================================================================================================================
    
    def filter_pos(self, start, end):
        pass
    
    
    def view(self, pdb=None, uniprot=None, idx=None, start=None, end=None, start_idx=None, end_idx=None):
        # implement a view of a protein
        # implement view of overlap of proteins
        pass
    
    
    
        
    def plot_distr(self, mode='len', idx=None, save=False):
        """
        plot different data visualizations
        modes: - 'len': plot distribution of amino sequence lengths
               - 'residues': plot occurances of residues overall
        """
        assert mode in ['len', 'residues'], print("make sure mode is either 'len' or 'residues'")
        if mode == 'len':
            plot_len_hist(self.data, save)
        else:
            plot_res_hist(self.data, idx, save)

            
    def get_res_list(self):
        """
        Run this on your dataset, and copypaste the list to utils: "RES_LIST"
        """
        flatten = lambda l: [item for sublist in l for item in sublist]
        l = self.data['PDB_Sequence'].tolist()
        return sorted(list(set(flatten(l))))
    
    
    def rl_remove_variants(self, n_aa=20):
        """
        {'LEU': 13034, 'GLU': 9023, ' Ca': 5, ..} --> Remove proteins where rare variants occur,
        use a cutoff of n_aa = 20 to remove all rare variants by only selecting (the normal) 20 aminoacids.
        """
        d = Counter(flat_l(self.data['PDB_Sequence'].tolist()))
        v, k = zip(*sorted(zip(d.values(), d.keys())))
        self.res_list = dict(zip(k[-n_aa:], v[-n_aa:]))
        return k[:-n_aa], k[-n_aa:]  # return k[-n_aa] just so I can copy it to utils.py as an initialization array
    
    
    def remove_variants(self):
        c_tot = self.data.count().len
        k, _ = self.rl_remove_variants()
        new = self.data
        for key in k:
            new = new[new['PDB_Sequence'].apply(lambda x: key not in x)]
        self.data = new.reset_index()
        c_red = self.data.count().len
        print("After removing {} variants our data contains {} proteins.".format(c_tot-c_red, c_red))
        self.res_list = _
        
    
    def remove_outliers(self, min_len=100, max_len=6000):
        c_tot = self.data.count().len
        new = self.data[(min_len < self.data['n_residues']) & (self.data['n_residues'] < max_len)]
        self.data = new.reset_index()
        c_red = self.data.count().len
        print("After removing {} outliers with {} > length > {} our data contains {} proteins."\
              .format(c_tot-c_red, min_len, max_len, c_red))
        del new
        return
    
    
    def label_ca(self):
        def mkdict(rl):
            d = {}
            for i, l in enumerate(reversed(rl)):
                d.update({l:i})
            return d
        def mapaa(aas, d):
            y = []
            for aa in aas:
                y.append(d[aa])
            return np.asarray(y)
        ltodict = mkdict(self.get_common_aas())
        temp = self.data.copy()
        self.data['y'] = temp[['res']].applymap(lambda x: mapaa(x, ltodict))
        del temp
        return

In [182]:
loader = GPCR()

In [183]:
loader.data

Unnamed: 0,PDB,Name,Uniprot,Sequence,Sections,Gaps,Structure,Ca,CaC,Psi,Phi,Omega,State,G-protein


In [36]:
loader.load_pdbs()

  0%|                                                                                                                                                                                                                                                                                            | 0/5 [00:00<?, ?it/s]

Found 528 files. Using 5 files.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.09s/it]
