In [1]:
from utils import *
from utils2 import *
from utils3 import *

In [2]:
import os
from os import walk

import pandas as pd
from Bio.PDB import PDBParser
from tqdm import tqdm

In [3]:
# Default folder used by get_pdb_files() and load_pdb() when no path is passed.
PATH = 'data/pdb/active/'

In [4]:
def get_pdb_files(path=PATH):
    """
    List the PDB files that sit directly inside `path` (sub-folders are not searched).

    Parameters
    ----------
    path : str
        Directory to scan. Defaults to the notebook-level PATH.

    Returns
    -------
    list of str
        Bare file names (no directory prefix) whose name contains '.pdb',
        in the order os.walk reports them. Empty when `path` does not exist
        or holds no such file.
    """
    # os.walk yields nothing for a missing/unreadable directory; give next() a
    # default so the caller gets an empty list instead of a bare StopIteration.
    _, _, filenames = next(walk(path), (path, [], []))
    # Substring match kept as-is: names such as 'x.pdb.gz' qualify as well.
    return [x for x in filenames if '.pdb' in x]

In [5]:
def load_pdb(name, path=PATH):
    """
    Parse one PDB file and collect its residues and atoms (first model only).

    Parameters
    ----------
    name : str
        File name of the structure; also passed to Biopython as the structure id.
    path : str
        Directory prefix. It is joined to `name` by plain string concatenation,
        so it has to end with a path separator.

    Returns
    -------
    res : list of [str, str]
        One entry per residue, over all chains in file order: the residue type
        (ALA, PRO etc.) and the residue number zero-padded to three digits.
    atoms : list of list
        Parallel to `res`: for every residue a list of
        [xyz-coordinates, atom type (CA, H, N, NZ etc.)] pairs.
    """
    p = PDBParser()
    structure = p.get_structure(name, path + name)
    res = []
    atoms = []
    # Only the first model is read; any further models in the file are ignored.
    model = structure[0]
    for chain in model:
        for residue in chain:
            # Biopython residue ids are (hetero flag, sequence number, insertion code);
            # only the sequence number is kept here.
            res.append([residue.get_resname(), str(residue.get_id()[1]).zfill(3)])
            atoms.append([[atom.get_coord(), atom.get_id()] for atom in residue])
    return res, atoms

In [6]:
# res, atoms = load_pdb(filenames[0])

In [7]:
def map_gaps(x):
    """
    Find the positions where the residue numbering is not continuous.

    Parameters
    ----------
    x : iterable
        Items whose first element is a residue entry [resname, number], e.g.
        zip(res, atoms) built from the output of load_pdb. The number may be
        a zero-padded string; it is converted with int().

    Returns
    -------
    list of (int, int, int)
        One tuple per discontinuity: (index of the item in x, previous residue
        number, residue number found). The numbering is assumed to start next
        to 0, so a chain whose first residue is numbered above 1 is reported
        as a gap at index 0. Steps of -1, 0 or +1 are not reported.
    """
    prev_num = 0
    gaps = []
    for i, z in enumerate(x):
        new_num = int(z[0][1])
        if abs(new_num - prev_num) > 1:
            gaps.append((i, prev_num, new_num))
        # Always track the number actually seen. Incrementing a counter instead
        # drifts ahead after a repeated number (or a chain starting at 0) and
        # then hides gaps of a single missing residue.
        prev_num = new_num
    return gaps

In [8]:
#map_gaps(zip(res, atoms))

In [9]:
from gpcrdb_soup import *

In [12]:
class Download():
    """
    Fetches and keeps track of the raw inputs: the structure table and the
    PDB structure files listed in its 'PDB' column.

    Attributes (all set in __init__):
        path_pdb, path_alignment -- sub-folders 'pdb/' and 'alignments/' of `path`
        path_table               -- fixed to 'gpcrdb/' (not derived from `path`)
        table                    -- structure table; None until download_table() ran
        filenames, pdb_ids       -- the two values returned by get_pdb_files(path_pdb)
    """
    def __init__(self,
                 path='data/'):
        """
        path : str
            Root data folder; must end with a path separator because the
            sub-folder names are appended by string concatenation.
        """
        self.path_pdb = path + 'pdb/'
        self.path_alignment = path + 'alignments/'
        self.path_table = 'gpcrdb/'

        self.table = None
        # get_pdb_files has to return two values here (file names, pdb ids).
        # NOTE(review): the get_pdb_files defined earlier in this notebook returns
        # a single list; the two-value version presumably comes from one of the
        # star imports -- confirm which definition is in scope.
        self.filenames, self.pdb_ids = get_pdb_files(path=self.path_pdb)

    # ======================================================================================================================

    def download_alignment(self):
        """Placeholder: only prints a reminder that alignments are fetched manually."""
        print("Not Implemented! TBD manually.")

    def download_pdbs(self, reload=False, update=False):
        """
        Download the structures of the table that are not in self.pdb_ids.

        reload : bool
            Enter the download branch even when nothing is missing.
            NOTE(review): only the missing ids are fetched in that case too, so
            reload=True does not re-download existing files -- confirm intent.
        update : bool
            When nothing is missing and reload is False, call update_pdbs().

        Raises RuntimeError when no table has been loaded yet.
        Afterwards self.filenames / self.pdb_ids are refreshed from disk.
        """
        if self.table is None:
            # Fail with a clear message instead of "'NoneType' is not subscriptable".
            raise RuntimeError("No structure table loaded; call download_table() first.")
        pdb_table_ids = self.table['PDB'].tolist()
        missing = [x for x in pdb_table_ids if x not in self.pdb_ids]
        if reload or (len(missing)>0):
            print("Reloading pdb files...")
            print("Missing pdbs:", missing)
            fileformat='mmcif'
            for pdb in tqdm(missing):
                url = get_rcsb_download(pdb, fileformat)
                download_pdb(url, folder=self.path_pdb, fileformat=fileformat)
        elif update:
            self.update_pdbs()
        self.filenames, self.pdb_ids = get_pdb_files(path=self.path_pdb)

    def download_table(self, reload=True, filename='structure.pkl'):
        """
        Load the structure table into self.table and drop the unused columns.

        reload : bool
            True (default) always calls get_table(reload=True). False calls
            get_table(reload=False) when path_table + filename exists, and
            falls back to get_table(reload=True) when it does not.
        filename : str
            Name of the file looked up inside self.path_table.

        NOTE(review): the 'Human' species filter is applied only in the
        reload=False branch, so the two branches can yield different rows --
        confirm whether that is intended.
        """
        table_path = self.path_table + filename
        if reload or (not os.path.isfile(table_path)):
            self.table = get_table(reload=True, uniprot=False)
        else:
            self.table = get_table(reload=False, uniprot=False)
            self.table = self.table[self.table['Species'].str.contains('Human')]
        # Raises KeyError if one of these columns is absent from the table.
        self.table = self.table.drop(columns=['filler', 'filler2', 'receptor family', 
                                              'Species', 'Method', 'Refined Structure', 'Degree active %', 
                                              '% of Seq1', 'Family', 'Subtype', 'Name2', 'Fusion', 'Note', 
                                              '% of Seq2', 'Antibodies', 'Name1', 'Type1', 'Type2', 
                                              'D2x50 - S3x39', 'Sodium in structure', 'Authors', 'Reference', 
                                              'PDB date', 'Annotated', 'pdb_link'])

    # ======================================================================================================================

    def update_pdbs(self):
        """Hand the pdb folder to the external updatepdbs() helper."""
        updatepdbs(self.path_pdb)

    # ======================================================================================================================

    def preprocess(self):
        """Placeholder, not implemented yet."""
        # drop unimportant 
        pass
    

In [13]:
# Build the downloader with the default 'data/' root; no table is loaded yet.
D = Download()

In [14]:
# Stub call: it only prints a reminder that alignments are handled manually.
D.download_alignment()

Not Implemented! TBD manually.


In [15]:
# Check that no table is present before download_table() has been called.
print(D.table)

None


In [16]:
# reload=False: get_table(reload=False) is used when gpcrdb/structure.pkl exists
# (presumably a cached copy -- confirm), otherwise the table is fetched anew.
D.download_table(reload=False)

In [17]:
# Inspect the table that remains after the unused columns were dropped.
D.table

Unnamed: 0,uniprot(gene),Cl.,PDB,Resolution,Preferred Chain,State,Function
0,STE2,D1(Ste2-likefungalpheromone),7AD3,3.5,A,Active,Agonist
1,CCR2,A(Rhodopsin),5T1A,2.8,A,Inactive,AntagonistNAM
2,OPRM,A(Rhodopsin),4DKL,2.8,A,Inactive,Antagonist
3,CNR2,A(Rhodopsin),5ZTY,2.8,A,Inactive,Antagonist
4,5HT1B,A(Rhodopsin),6G79,3.8,S,Active,Agonist
...,...,...,...,...,...,...,...
523,OPSD,A(Rhodopsin),6PH7,2.9,A,Active,unknown
524,NTR1,A(Rhodopsin),7L0S,4.5,C,Active,Agonist
525,AA2AR,A(Rhodopsin),6S0Q,2.7,A,Inactive,Antagonist
526,GLP1R,B1(Secretin),7LCJ,2.8,R,Active,Agonist


In [18]:
# Fetch the structures listed in the table that are not on disk yet.
# NOTE(review): the recorded run below aborted with a NameError.
D.download_pdbs(reload=True)

  0%|                                                                                                                                                                                                                                                                                          | 0/528 [00:00<?, ?it/s]

Reloading pdb files...
Missing pdbs: ['7AD3', '5T1A', '4DKL', '5ZTY', '6G79', '6TPK', '6K42', '6K41', '7KNU', '7KNT', '6AK3', '5XSZ', '6LFM', '6LFL', '6LFO', '1GZM', '5NDD', '5NJ6', '5NDZ', '4N6H', '6OSA', '6OS9', '6CM4', '4OO9', '6NBF', '6NBI', '6NBH', '3VGA', '3VG9', '5JQH', '5YC8', '5ZKC', '5ZKB', '5ZK8', '5ZK3', '5O9H', '6WWZ', '6A93', '6A94', '7JV5', '7JVQ', '7JVP', '7JVR', '6RNK', '6IBB', '5TGZ', '5ZHP', '6PB0', '6PB1', '3OE8', '3ODU', '3OE6', '3OE9', '3OE0', '5C1M', '4MQT', '4MQS', '4XNV', '4XNW', '4UG2', '4UHR', '6U1N', '4AMJ', '4AMI', '6ME4', '6ME2', '6ME5', '6ME3', '6W2Y', '6W2X', '5WQC', '5WS3', '6QZH', '6OFJ', '6RZ8', '6RZ6', '6RZ9', '6RZ7', '7D77', '7D76', '5L7D', '5L7I', '6PWC', '5XEZ', '5XF1', '3RZE', '5VEW', '5VEX', '6OIJ', '6OIK', '7CZ5', '4Z36', '4Z34', '4Z35', '4Z9G', '5WIU', '5WIV', '3PBL', '6B73', '3VW7', '5ZKP', '5ZKQ', '4IAQ', '4IAR', '6W25', '6UO9', '6UOA', '6UO8', '6VJM', '6LI2', '6LI3', '6LI0', '6LI1', '6H7N', '6IBL', '4ZWJ', '6OL9', '5XPR', '5X93', '4ZUD', '3




NameError: name 'fileformat' is not defined

In [17]:
# Inspect the structure files found for the pdb folder.
D.filenames

['data/pdb/00.m.pdb',
 'data/pdb/01.m.pdb',
 'data/pdb/02.m.pdb',
 'data/pdb/03.m.pdb',
 'data/pdb/04.m.pdb',
 'data/pdb/09.m.pdb',
 'data/pdb/0G.m.pdb',
 'data/pdb/0L.m.pdb',
 'data/pdb/0P.m.pdb',
 'data/pdb/0Q.m.pdb',
 'data/pdb/0R.m.pdb',
 'data/pdb/0S.m.pdb',
 'data/pdb/0V.m.pdb',
 'data/pdb/10.m.pdb',
 'data/pdb/14.m.pdb',
 'data/pdb/15.m.pdb',
 'data/pdb/16.m.pdb',
 'data/pdb/18.m.pdb',
 'data/pdb/19.m.pdb',
 'data/pdb/1A.m.pdb',
 'data/pdb/1H.m.pdb',
 'data/pdb/1I.m.pdb',
 'data/pdb/1M.m.pdb',
 'data/pdb/1N.m.pdb',
 'data/pdb/1Q.m.pdb',
 'data/pdb/1R.m.pdb',
 'data/pdb/20.m.pdb',
 'data/pdb/21.m.pdb',
 'data/pdb/25.m.pdb',
 'data/pdb/26.m.pdb',
 'data/pdb/27.m.pdb',
 'data/pdb/2A.m.pdb',
 'data/pdb/2B.m.pdb',
 'data/pdb/2C.m.pdb',
 'data/pdb/2D.m.pdb',
 'data/pdb/2E.m.pdb',
 'data/pdb/2R.m.pdb',
 'data/pdb/2S.m.pdb',
 'data/pdb/2W.m.pdb',
 'data/pdb/2X.m.pdb',
 'data/pdb/2Y.m.pdb',
 'data/pdb/32.m.pdb',
 'data/pdb/33.m.pdb',
 'data/pdb/34.m.pdb',
 'data/pdb/35.m.pdb',
 'data/pdb

In [30]:
# Delegates to updatepdbs() with the pdb folder.
# NOTE(review): the recorded run below raised ValueError (unknown url type).
D.update_pdbs()



ValueError: unknown url type: 'data/pdb/pub/pdb/data/status/latest/added.pdb'

In [84]:
class References:
    """Placeholder for linking the data loaders together; holds no state yet."""

    def __init__(self):
        # Open question: how to connect the Dataloaders.
        return None

In [10]:
# ['PDB', 'Name', 'Uniprot', 'Sequence', 'Sections', 'Gaps','Structure', 'Ca','CaC','Psi','Phi','State', 'G-protein']

In [24]:
class DataLoader():
    """
    Holds the per-structure data table as a pandas DataFrame in `self.data`
    and offers pickle load/save for it. Most processing steps are still stubs.
    """
    # G-protein keys meant for mapping the (not yet created) Selectivities dict.
    GPROTEINS = ['gs', 'gi', 'gq']

    def __init__(self):
        """
        Create the empty table. Columns:

        Index:         Identifier for our data
        Type:          G-Protein/Receptor/?
        PDB:           PDB identifier
        Name:          Name of the entry
        Uniprot:       Uniprot identifier
        Seq:           Protein sequence from PDBs
        Uniprot_Seq:   Protein sequence from uniprot entry
        Sections:      Polypeptides with structure (Sequence)
        Gaps:          Indices of starts/ends in full protein sequence of the polypeptides
        Position_Idx:  GPCRdb alignment position index (if possible)
        XYZ:           Structure Data contained in the PDB (saved as per-residue)
        XYZ_padded:    Including NaN values for AAs in Gaps
        Ca:            Position of the C-alphas
        CaC:           Direction of backbone
        Psi:           Psi angle
        Phi:           Phi angle
        State:         Active/Inactive (for Receptors)
        G-protein:     Primary target I guess
        References:    ???

        Planned but not created yet (TODO):
        Function:      Agonist/Antagonist
        Selectivities: Selectivity Values
        """
        # Every name needs its own trailing comma: adjacent string literals are
        # silently concatenated ('Position_Idx' 'XYZ' -> 'Position_IdxXYZ').
        columns = ['Index',
                   'Type',
                   'PDB', 'Name', 'Uniprot',
                   'Seq', 'Uniprot_Seq',
                   'Sections', 'Gaps', 'Position_Idx',
                   'XYZ', 'XYZ_padded',
                   'Ca', 'CaC', 'Psi', 'Phi',
                   'State', 'G-protein',
                   'References']
        self.data = pd.DataFrame(columns=columns)

        # think about how to 

    # ======================================================================================================================

    def load_alignment(self):
        """Placeholder, not implemented yet."""
        pass

    # ======================================================================================================================

    # PROCESSING

    # ======================================================================================================================

    def load_pkl(self, path: str):
        """Replace self.data with the object unpickled from `path`."""
        # SECURITY: unpickling can execute arbitrary code; only load trusted files.
        self.data = pd.read_pickle(path)

    def save_pkl(self, path: str):
        """Pickle self.data to `path`."""
        # DataFrame has no save_pickle(); to_pickle() is the pandas writer.
        self.data.to_pickle(path)

    # ======================================================================================================================

    def separate_structure(self):
        """Placeholder, not implemented yet."""
        # if structure is active: use another dict for g-protein/complex?
        pass

    # ======================================================================================================================

    def view(self, pdb=None, uniprot=None, idx=None, start=None, end=None, start_idx=None, end_idx=None):
        """Placeholder, not implemented yet; all arguments are currently ignored."""
        # implement a view of a protein
        # implement view of overlap of proteins
        pass
    

In [28]:
# Create the loader; its table starts without rows, only with the column layout.
loader = DataLoader()

In [29]:
# Show the empty frame to check the column names.
loader.data

Unnamed: 0,Index,Type,PDB,Name,Uniprot,Sequence,Sections,Gaps,Position_IdxStructure,Full_StructCa,CaC,Psi,Phi,State,G-protein,References


In [27]:
# Remove the loader name from the notebook namespace.
del loader