In [1]:
from utils import *
from gpcrdb_soup import *
import gemmi

In [2]:
from Bio.PDB.MMCIFParser import MMCIFParser
parser = MMCIFParser()

In [3]:
eg = '1f88'

In [4]:
path = get_rcsb_download(eg, fileformat = 'cif')

In [5]:
path

'https://files.rcsb.org/download/1f88.cif'

In [6]:

def download(url: str, folder: str, fileformat: str):
    if not os.path.isdir(folder):
        os.mkdir(folder)
    try:
        r = requests.get(url)
        loc = len(fileformat)+1
        fname = folder + '/' + url[-(loc+4):-loc] + '.' + fileformat
        with open(fname, 'wb') as f:
            f.write(r.content)
    except Exception:
        print("Url invalid:", url)

    
def download_pdb(url, folder, fileformat):
    download(url, folder, fileformat)

In [7]:
download(url=path, folder='data/mmcif', fileformat='cif')

In [8]:
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
import pandas as pd
d = MMCIF2Dict("data/mmcif/"+eg+".cif")
pd.DataFrame({k:v for k,v in d.items() if "_atom_site." in k})

Unnamed: 0,_atom_site.group_PDB,_atom_site.id,_atom_site.type_symbol,_atom_site.label_atom_id,_atom_site.label_alt_id,_atom_site.label_comp_id,_atom_site.label_asym_id,_atom_site.label_entity_id,_atom_site.label_seq_id,_atom_site.pdbx_PDB_ins_code,...,_atom_site.Cartn_y,_atom_site.Cartn_z,_atom_site.occupancy,_atom_site.B_iso_or_equiv,_atom_site.pdbx_formal_charge,_atom_site.auth_seq_id,_atom_site.auth_comp_id,_atom_site.auth_asym_id,_atom_site.auth_atom_id,_atom_site.pdbx_PDB_model_num
0,ATOM,1,N,N,.,MET,A,1,1,?,...,-5.980,-27.758,1.00,54.29,?,1,MET,A,N,1
1,ATOM,2,C,CA,.,MET,A,1,1,?,...,-5.054,-26.911,1.00,53.52,?,1,MET,A,CA,1
2,ATOM,3,C,C,.,MET,A,1,1,?,...,-4.848,-25.543,1.00,52.77,?,1,MET,A,C,1
3,ATOM,4,O,O,.,MET,A,1,1,?,...,-4.618,-25.451,1.00,51.10,?,1,MET,A,O,1
4,ATOM,5,C,CB,.,MET,A,1,1,?,...,-3.699,-27.610,1.00,53.58,?,1,MET,A,CB,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5262,HETATM,5263,O,O,.,HOH,T,7,.,?,...,-4.467,38.118,1.00,32.05,?,961,HOH,B,O,1
5263,HETATM,5264,O,O,.,HOH,T,7,.,?,...,4.389,47.804,1.00,38.08,?,969,HOH,B,O,1
5264,HETATM,5265,O,O,.,HOH,T,7,.,?,...,26.355,-7.866,1.00,28.34,?,973,HOH,B,O,1
5265,HETATM,5266,O,O,.,HOH,T,7,.,?,...,7.105,39.757,1.00,53.03,?,975,HOH,B,O,1


In [9]:
structure = parser.get_structure("", "data/mmcif/"+eg+".cif")


In [10]:
import nglview as nv
view = nv.show_biopython(structure)



In [11]:
view

NGLWidget()

In [12]:
cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'auth_seq_id', 'label_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']

In [13]:
import sys
import gemmi
from gemmi import cif


print("Loading cif file of", eg)
path = 'data/mmcif/'+eg+'.cif'
try:
    doc = cif.read_file(path)  # copy all the data from mmCIF file
    lol = []  # list of lists
    for b, block in enumerate(doc):
        table = block.find('_atom_site.', cols)
        for row in table:
            lol.append(list(row))
except Exception as e:
    print("Oops. %s" % e)
    sys.exit(1)

Loading cif file of 1f88


In [14]:
cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'auth_seq_id', 'label_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'atom_type', 'x', 'y', 'z']
df = pd.DataFrame(data=lol, columns=cols)

In [15]:
def fix_label_seq_id(x):
    try:
        return int(x)
    except:
        return None

In [16]:
df

Unnamed: 0,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z
0,ATOM,A,A,1,1,MET,1,N,N,43.958,-5.980,-27.758
1,ATOM,A,A,1,1,MET,2,CA,C,44.718,-5.054,-26.911
2,ATOM,A,A,1,1,MET,3,C,C,44.069,-4.848,-25.543
3,ATOM,A,A,1,1,MET,4,O,O,42.854,-4.618,-25.451
4,ATOM,A,A,1,1,MET,5,CB,C,44.868,-3.699,-27.610
...,...,...,...,...,...,...,...,...,...,...,...,...
5262,HETATM,B,T,961,.,HOH,5263,O,O,33.819,-4.467,38.118
5263,HETATM,B,T,969,.,HOH,5264,O,O,66.200,4.389,47.804
5264,HETATM,B,T,973,.,HOH,5265,O,O,26.109,26.355,-7.866
5265,HETATM,B,T,975,.,HOH,5266,O,O,66.293,7.105,39.757


In [17]:
df.groupby('auth_seq_id')['label_comp_id'].first()

auth_seq_id
1      MET
10     TYR
100    HIS
101    GLY
102    TYR
      ... 
976    HOH
977    RET
978    RET
98     SER
99     LEU
Name: label_comp_id, Length: 377, dtype: object

In [18]:
df.groupby('label_atom_id').sum()

Unnamed: 0_level_0,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_seq_id,label_comp_id,id,atom_type,x,y,z
label_atom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,1234567891011121314151617181920212223242526272...,1234567891011121314151617181920212223242526272...,METASNGLYTHRGLUGLYPROASNPHETYRVALPROPHESERASNL...,3111923303943505869818895106112120129136140147...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,44.06944.47444.97545.40947.14648.52050.49650.0...,-4.848-3.296-0.1932.7754.6517.63010.07810.0956...,-25.543-22.697-24.576-25.508-27.612-28.257-28....
C1,HETATMHETATMHETATMHETATMHETATMHETATMHETATMHETA...,CCDDEEEFFAB,CCDDEEEFFLR,121212312977978,...........,NAGNAGNAGNAGNAGNAGMANNAGNAGRETRET,50685082509651105124513851525163517751965221,CCCCCCCCCCC,37.62035.56447.22550.56859.51262.86964.48246.8...,3.7577.698-8.617-12.3540.9713.7248.734-9.992-1...,-28.808-31.709-23.091-21.37241.02843.58444.035...
C10,HETATMHETATM,AB,LR,977978,..,RETRET,52055230,CC,57.62638.639,10.21111.044,-10.42426.150
C11,HETATMHETATM,AB,LR,977978,..,RETRET,52065231,CC,56.58239.898,10.87111.497,-11.23626.800
C12,HETATMHETATM,AB,LR,977978,..,RETRET,52075232,CC,55.33541.161,11.24311.577,-10.84826.282
...,...,...,...,...,...,...,...,...,...,...,...
OH,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBB,AAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBB,1029304360749610213617819119220622326827430130...,1029304360749610213617819119220622326827430130...,TYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRT...,7822824034749060977982610871410150515171635178...,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO,48.67134.61339.66949.37859.44374.24639.29738.2...,-1.77713.8576.17010.53036.32729.09324.30218.60...,-22.305-12.642-20.924-1.0399.6573.050-14.559-1...
OXT,ATOM,A,A,348,348,ALA,2638,O,90.145,31.762,16.624
SD,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAABBBBBBBBBBBBBBB,AAAAAAAAAAAAAAAABBBBBBBBBBBBBBB,1394449861431551631832072532572883083093171394...,1394449861431551631832072532572883083093171394...,METMETMETMETMETMETMETMETMETMETMETMETMETMETMETM...,7316354397700113712361299144516421976200722572...,SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS,46.86337.63247.72743.61957.66089.57179.47374.0...,-4.87610.29514.55422.26416.85415.58025.37714.1...,-29.066-0.070-4.4345.183-4.2293.163-4.682-11.0...
SG,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAABBBBBBBBBB,AAAAAAAAAABBBBBBBBBB,1101401671851872222643163223231101401671851872...,1101401671851872222643163223231101401671851872...,CYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSC...,8831114132414611473177120612486252925353521375...,SSSSSSSSSSSSSSSSSSSS,51.21984.37465.32742.66950.40378.02856.89866.5...,14.0094.84411.23911.95013.3177.4468.87227.5622...,-18.3868.988-13.144-12.401-16.6654.194-0.22016...


In [19]:
df['label_seq_id']=df.apply(lambda x: fix_label_seq_id(x.label_seq_id), axis=1)

Columns
group_PDB 	id 	type_symbol 	label_atom_id 	label_alt_id 	label_comp_id 	label_asym_id 	label_entity_id 	label_seq_id 	pdbx_PDB_ins_code 	Cartn_x 	Cartn_y 	Cartn_z 	occupancy 	B_iso_or_equiv 	Cartn_x_esd 	Cartn_y_esd 	Cartn_z_esd 	occupancy_esd 	B_iso_or_equiv_esd 	pdbx_formal_charge 	auth_seq_id 	auth_comp_id 	auth_asym_id 	auth_atom_id 	pdbx_PDB_model_num

In [20]:
import sys
from math import degrees
import gemmi

ramas = {aa: [] for aa in [
    'LEU', 'ALA', 'GLY', 'VAL', 'GLU', 'SER', 'LYS', 'ASP', 'THR', 'ILE',
    'ARG', 'PRO', 'ASN', 'PHE', 'GLN', 'TYR', 'HIS', 'MET', 'CYS', 'TRP']}

rol = []
for path in ['data/mmcif/'+eg+'.cif']:
    st = gemmi.read_structure(path)
    if 0.1 < st.resolution < 5:
        model = st[0]
        if len(st) > 1:
            print("There are multiple models!")
        for chain in model:
            for r, res in enumerate(chain.get_polymer()):
                # previous_residue() and next_residue() return previous/next
                # residue only if the residues are bonded. Otherwise -- None.
                prev_res = chain.previous_residue(res)
                next_res = chain.next_residue(res)
                phi, psi = gemmi.calculate_phi_psi(prev_res, res, next_res)
                omega = gemmi.calculate_omega(res, next_res)
                rol.append([res.name, res.label_seq, res.subchain, 
                            degrees(phi), degrees(omega), degrees(psi)])
                try:
                    ramas[res.name].append([degrees(phi), degrees(omega), degrees(psi)])
                except KeyError:
                    pass

In [21]:
"""
# Write data to files
for aa, data in ramas.items():
    with open('data/ramach/' + aa + '.tsv', 'w') as f:
        for phi, omega, psi in data:
            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"""

"\n# Write data to files\nfor aa, data in ramas.items():\n    with open('data/ramach/' + aa + '.tsv', 'w') as f:\n        for phi, omega, psi in data:\n            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"

In [22]:
cols = ['name', 'label_seq_id', 'chain', 'phi', 'omega', 'psi']

In [23]:
res_df = pd.DataFrame(data=rol, columns=cols)

In [24]:
res_df

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi
0,MET,1,A,,-179.718577,133.146625
1,ASN,2,A,-87.567941,179.953276,-5.295383
2,GLY,3,A,-150.672305,178.921162,143.916355
3,THR,4,A,-85.441746,178.477151,112.209423
4,GLU,5,A,-96.636867,178.659871,123.041558
...,...,...,...,...,...,...
638,CYS,322,B,-74.457863,179.107101,-7.588421
639,CYS,323,B,65.675548,179.210471,29.106972
640,GLY,324,B,134.859554,-179.348699,-24.685362
641,LYS,325,B,-77.173222,-179.767435,94.072619


In [25]:
full = pd.merge(res_df, df, on=['label_seq_id', 'label_seq_id'])

In [26]:
import numpy as np
full = full.astype({'label_seq_id': np.int16, 'auth_seq_id': np.int16, 'id': np.int16, 
                    'phi': np.float32, 'omega': np.float32, 'psi': np.float32,
                    'x': np.float32, 'y': np.float32, 'z': np.float32})

In [27]:
def get_atom_id(x):
    atom_dict = {'C': 0, 'O': 1, 'N': 2, 'H': 3, 'S': 4}
    try:
        return atom_dict[str(x)]
    except:
        return 5
full['atom_id'] = full.apply(lambda x: get_atom_id(x.atom_type), axis=1)

In [28]:
full

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z,atom_id
0,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,1,N,N,43.958000,-5.980000,-27.757999,2
1,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,2,CA,C,44.717999,-5.054000,-26.910999,0
2,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,3,C,C,44.069000,-4.848000,-25.542999,0
3,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,4,O,O,42.854000,-4.618000,-25.451000,1
4,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,5,CB,C,44.868000,-3.699000,-27.610001,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9920,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2634,CA,C,88.039001,30.749001,17.236000,0
9921,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2635,C,C,89.197998,31.023001,16.242001,0
9922,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2636,O,O,89.163002,30.493000,15.102000,1
9923,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2637,CB,C,88.360001,29.510000,18.103001,0


In [29]:
xyz = full[['x', 'y', 'z']].to_numpy(dtype=float)

In [30]:
ids = full[['label_seq_id', 'auth_seq_id']].to_numpy(dtype='int')

In [31]:
atoms = full['atom_id'].to_numpy(dtype=int)

In [32]:
xyz

array([[ 43.95800018,  -5.98000002, -27.75799942],
       [ 44.7179985 ,  -5.0539999 , -26.9109993 ],
       [ 44.06900024,  -4.84800005, -25.54299927],
       ...,
       [ 89.16300201,  30.49300003,  15.10200024],
       [ 88.36000061,  29.51000023,  18.10300064],
       [ 90.14499664,  31.76199913,  16.62400055]])

In [33]:
ids

array([[  1,   1],
       [  1,   1],
       [  1,   1],
       ...,
       [348, 348],
       [348, 348],
       [348, 348]])

In [34]:
atoms[:100]

array([2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0,
       4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1,
       0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0,
       0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 2,
       0, 0, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1])

In [35]:
"""import plotly.express as px
select = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]
df = px.data.iris()
fig = px.scatter_3d(select, x='x', y='y', z='z',
                    color='atom_type', opacity=0.5)
fig.update_traces(marker=dict(size=7))
fig.show()"""

"import plotly.express as px\nselect = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]\ndf = px.data.iris()\nfig = px.scatter_3d(select, x='x', y='y', z='z',\n                    color='atom_type', opacity=0.5)\nfig.update_traces(marker=dict(size=7))\nfig.show()"

In [36]:
"""import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator


def plot(data_file, label, output=None):
    x, y = [], []
    for line in open(data_file):
        phi, psi = line.split()
        if phi != 'nan' and psi != 'nan':
            x.append(float(phi))
            y.append(float(psi))
    print('Plotting %d points for %s' % (len(x), label))

    plt.figure(figsize=(5.5, 5.5))
    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)
    plt.xlim([-180, 180])
    plt.ylim([-180, 180])
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(60))
    ax.yaxis.set_major_locator(MultipleLocator(60))
    plt.xlabel(r'$\phi$', fontsize=14)
    plt.ylabel(r'$\psi$', fontsize=14, labelpad=0)
    plt.grid(color='#AAAAAA', linestyle='--')
    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')
    if output:
        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs
    else:
        plt.show()


for aa in ramas:
    plot('data/ramach/%s.tsv' % aa, aa)
    #plot('ramas/%s.tsv' % aa, aa, 'ramas/%s.png' % aa)"""

"import sys\nimport matplotlib.pyplot as plt\nfrom matplotlib.ticker import MultipleLocator\n\n\ndef plot(data_file, label, output=None):\n    x, y = [], []\n    for line in open(data_file):\n        phi, psi = line.split()\n        if phi != 'nan' and psi != 'nan':\n            x.append(float(phi))\n            y.append(float(psi))\n    print('Plotting %d points for %s' % (len(x), label))\n\n    plt.figure(figsize=(5.5, 5.5))\n    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)\n    plt.xlim([-180, 180])\n    plt.ylim([-180, 180])\n    ax = plt.gca()\n    ax.xaxis.set_major_locator(MultipleLocator(60))\n    ax.yaxis.set_major_locator(MultipleLocator(60))\n    plt.xlabel(r'$\\phi$', fontsize=14)\n    plt.ylabel(r'$\\psi$', fontsize=14, labelpad=0)\n    plt.grid(color='#AAAAAA', linestyle='--')\n    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')\n    if output:\n        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs\n    else:\n        plt.show()\

In [1]:
from utils import *
from utils2 import *
from utils3 import *
from plotting import *
from gpcrdb_soup import *

In [2]:
from tqdm import tqdm, trange

In [3]:
import sys
import pandas
import gemmi
from gemmi import cif

In [40]:
class DataLoader():
    def __init__(self, 
                 path = 'data/',
                 structure = 'mmcif/',
                 limit=10,
                 remove_hetatm=True):
        self.path = path
        self.structure_path = self.path + structure
        self.path_table = path + 'gpcrdb/' + 'structures.pkl'
        
        self.filenames, self.pdb_ids = get_pdb_files(path=self.structure_path)
        # Columns for structure dataframe
        self.cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 
                     'label_comp_id', 'id', 'label_atom_id', 
                     'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']
        
        self.numbering = pd.DataFrame()
        self.table = pd.read_pickle(self.path_table)
        for i, pdb_id in tqdm(enumerate(self.pdb_ids)):
            if i < limit:
                protein, family = self.get_prot_info(pdb_id)
                numbering = self.get_res_nums(protein)
                if i == 0:
                    self.structure = self.load_cifs(pdb_id)
                    self.structure['identifier'] = protein.upper()
                    self.structure['label_comp_sid'] = self.structure.apply(lambda x: \
                                                                            gemmi.find_tabulated_residue(\
                                                                            x.label_comp_id).one_letter_code, axis=1)
                    self.mappings = self.get_mapping(pdb_id)
                    numb = [pdb_id, protein, family, numbering]
                    # numb = [pdb_id, protein, self.entry_to_ac(protein), family, numbering]
                    self.numbering = self.numbering.append(pd.DataFrame(numb).T)
                else:
                    structure = self.load_cifs(pdb_id)
                    structure['identifier'] = protein.upper()
                    self.structure = self.structure.append(structure, ignore_index=True)
                    self.mappings = self.mappings.append(self.get_mapping(pdb_id), ignore_index=True)
                    # self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, self.entry_to_ac(protein),
                    #                                                           family, numbering]).T, 
                    self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, family, numbering]).T, 
                                                           ignore_index=True)
        self.numbering.columns = ['PDB', 'identifier', 'family', 'numbering']
        
        if remove_hetatm:
            self.structure = self.structure[self.structure['group_PDB']!='HETATM']
            self.structure['label_seq_id'] = self.structure['label_seq_id'].astype(np.int64)
        self.structure['label_comp_sid'] = self.structure.\
                                                    apply(lambda x:
                                                    gemmi.find_tabulated_residue(x.label_comp_id).one_letter_code, 
                                                    axis=1)
        
        
    def entry_to_ac(self, entry: str):
        query = 'https://www.uniprot.org/uniprot/'+entry+'.txt'
        print('querying', query)
        response = requests.get(query)
        return response.text.split('\n')[1].split('AC   ')[1][:6]
    
    def get_prot_info(self, pdb_id):
        # query structure
        query = 'https://gpcrdb.org/services/structure/'+pdb_id.upper()+'/'
        print('querying', query)
        response = requests.get(query)
        protein = response.json()['protein']
        family = response.json()['family']
        return protein, family
    
    def get_res_nums(self, protein):
        # query uniprot -> res num
        query = 'https://gpcrdb.org/services/residues/extended/'+protein+'/'
        print("querying", query)
        response = requests.get(query)
        # select res num
        # assign res_num to structure data
        return response.json()
    
    def get_mapping(self, pdb_id):
        print("mapping")
        maps = get_mappings_data(pdb_id)[pdb_id.lower()]['UniProt']
        uniprots = maps.keys()
        full_table=pd.DataFrame()
        for i, uniprot in enumerate(uniprots):
            table = pd.DataFrame.from_dict(maps[uniprot])
            table['PDB'] = pdb_id
            table['uniprot'] = uniprot
            if i == 0:
                full_table = table
            else:
                full_table = full_table.append(table, ignore_index=True)
        return full_table
    
    def load_cifs(self, pdb_id):
        print("Loading cif file of", pdb_id)
        path = 'data/mmcif/' + pdb_id + '.cif'
        print(path)
        try:
            doc = cif.read_file(path)  # copy all the data from mmCIF file
            lol = []  # list of lists
            for b, block in enumerate(doc):
                table = block.find('_atom_site.', self.cols)
                for row in table:
                    lol.append([pdb_id]+list(row))
        except Exception as e:
            print("Oops. %s" % e)
            sys.exit(1)
        cols = ['PDB']+self.cols
        return pd.DataFrame(data=lol, columns=cols)

In [41]:
data = DataLoader(limit=20)

0it [00:00, ?it/s]

querying https://gpcrdb.org/services/structure/5UZ7/
querying https://gpcrdb.org/services/residues/extended/calcr_human/
Loading cif file of 5UZ7
data/mmcif/5UZ7.cif
mapping
getting the mapping...


1it [02:07, 127.06s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6LPB/
querying https://gpcrdb.org/services/residues/extended/pacr_human/
Loading cif file of 6LPB
data/mmcif/6LPB.cif
mapping
getting the mapping...


2it [03:13, 108.96s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6MET/
querying https://gpcrdb.org/services/residues/extended/ccr5_human/
Loading cif file of 6MET
data/mmcif/6MET.cif
mapping
getting the mapping...


3it [04:01, 90.53s/it] 

got the mapping
querying https://gpcrdb.org/services/structure/6LI3/
querying https://gpcrdb.org/services/residues/extended/gpr52_human/
Loading cif file of 6LI3
data/mmcif/6LI3.cif
mapping
getting the mapping...


4it [04:43, 76.11s/it]

got the mapping
querying https://gpcrdb.org/services/structure/4LDL/
querying https://gpcrdb.org/services/residues/extended/adrb2_human/
Loading cif file of 4LDL
data/mmcif/4LDL.cif
mapping
getting the mapping...


5it [05:48, 72.56s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6HLP/
querying https://gpcrdb.org/services/residues/extended/nk1r_human/
Loading cif file of 6HLP
data/mmcif/6HLP.cif
mapping
getting the mapping...


6it [06:47, 68.67s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6WQA/
querying https://gpcrdb.org/services/residues/extended/aa2ar_human/
Loading cif file of 6WQA
data/mmcif/6WQA.cif
mapping
getting the mapping...


7it [08:59, 87.73s/it]

got the mapping
querying https://gpcrdb.org/services/structure/5OLZ/
querying https://gpcrdb.org/services/residues/extended/aa2ar_human/
Loading cif file of 5OLZ
data/mmcif/5OLZ.cif
mapping
getting the mapping...


8it [09:46, 75.39s/it]

got the mapping
querying https://gpcrdb.org/services/structure/4NC3/
querying https://gpcrdb.org/services/residues/extended/5ht2b_human/
Loading cif file of 4NC3
data/mmcif/4NC3.cif
mapping
getting the mapping...


9it [11:10, 78.12s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6WWZ/
querying https://gpcrdb.org/services/residues/extended/ccr6_human/
Loading cif file of 6WWZ
data/mmcif/6WWZ.cif
mapping
getting the mapping...


10it [12:22, 76.09s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6NI3/
querying https://gpcrdb.org/services/residues/extended/adrb2_human/
Loading cif file of 6NI3
data/mmcif/6NI3.cif
mapping
getting the mapping...


11it [13:48, 79.14s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6IIV/
querying https://gpcrdb.org/services/residues/extended/ta2r_human/
Loading cif file of 6IIV
data/mmcif/6IIV.cif
mapping
getting the mapping...


12it [14:44, 72.17s/it]

got the mapping
querying https://gpcrdb.org/services/structure/4OR2/
querying https://gpcrdb.org/services/residues/extended/grm1_human/
Loading cif file of 4OR2
data/mmcif/4OR2.cif
mapping
getting the mapping...


13it [15:25, 62.73s/it]

got the mapping
querying https://gpcrdb.org/services/structure/3PBL/
querying https://gpcrdb.org/services/residues/extended/drd3_human/
Loading cif file of 3PBL
data/mmcif/3PBL.cif
mapping
getting the mapping...


14it [16:24, 61.75s/it]

got the mapping
querying https://gpcrdb.org/services/structure/4N6H/
querying https://gpcrdb.org/services/residues/extended/oprd_human/
Loading cif file of 4N6H
data/mmcif/4N6H.cif
mapping
getting the mapping...


15it [16:54, 52.10s/it]

got the mapping
querying https://gpcrdb.org/services/structure/5UIG/
querying https://gpcrdb.org/services/residues/extended/aa2ar_human/
Loading cif file of 5UIG
data/mmcif/5UIG.cif
mapping
getting the mapping...


16it [17:55, 54.91s/it]

got the mapping
querying https://gpcrdb.org/services/structure/5YWY/
querying https://gpcrdb.org/services/residues/extended/pe2r4_human/
Loading cif file of 5YWY
data/mmcif/5YWY.cif
mapping
getting the mapping...


17it [19:33, 67.72s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6HLO/
querying https://gpcrdb.org/services/residues/extended/nk1r_human/
Loading cif file of 6HLO
data/mmcif/6HLO.cif
mapping
getting the mapping...


18it [20:38, 66.95s/it]

got the mapping
querying https://gpcrdb.org/services/structure/6PS5/
querying https://gpcrdb.org/services/residues/extended/adrb2_human/
Loading cif file of 6PS5
data/mmcif/6PS5.cif
mapping
getting the mapping...


19it [21:08, 56.00s/it]

got the mapping
querying https://gpcrdb.org/services/structure/4MBS/
querying https://gpcrdb.org/services/residues/extended/ccr5_human/
Loading cif file of 4MBS
data/mmcif/4MBS.cif
mapping
getting the mapping...


530it [21:52,  2.48s/it]

got the mapping





In [None]:
def lookup(pdb, identifier, mappings):
    return list(mappings[(mappings['identifier']==identifier.upper()) & (mappings['PDB']==pdb)]['mappings'])

In [None]:
# df = pd.merge(data.table, data.structure, on='PDB')
# df = pd.merge(df, data.numbering, on='PDB')

In [42]:
data.structure['identifier']

0         CALCR_HUMAN
1         CALCR_HUMAN
2         CALCR_HUMAN
3         CALCR_HUMAN
4         CALCR_HUMAN
             ...     
112266     CCR5_HUMAN
112267     CCR5_HUMAN
112268     CCR5_HUMAN
112269     CCR5_HUMAN
112270     CCR5_HUMAN
Name: identifier, Length: 108150, dtype: object

In [43]:
"""for i, pdb in enumerate(list(set(list(data.structure['PDB'])))):
    data.structure[data.structure['identifier']] = data.table[data.table['PDB']==pdb]['uniprot(gene)'].iloc[0]"""

"for i, pdb in enumerate(list(set(list(data.structure['PDB'])))):\n    data.structure[data.structure['identifier']] = data.table[data.table['PDB']==pdb]['uniprot(gene)'].iloc[0]"

In [63]:
# add gene to mapping

for i, pdb in enumerate(list(set(list(data.structure['PDB'])))):
    mappings = data.mappings[data.mappings['PDB']==pdb]
    pref_chain = data.table[data.table['PDB']==pdb]['Preferred Chain'].iloc[0]
    map_df_list = []
    for j in range(len(mappings)):
        chain = pd.DataFrame.from_dict(mappings.iloc[j]['mappings'])['chain_id'].iloc[0]
        identifier = mappings.iloc[j]['name']
        dict_ = pd.DataFrame.from_dict(mappings.iloc[j]['mappings'])
        dict_['identifier'] = identifier
        map_df_list.append(pd.DataFrame.from_dict(dict_))
    _ = pd.concat(map_df_list)
    _ = _[_['chain_id']==pref_chain]
    _['PDB'] = pdb
    if i == 0:
        maps_stacked = _
    else:
        maps_stacked = maps_stacked.append(_)

In [64]:
maps_stacked

Unnamed: 0,entity_id,chain_id,start,unp_end,unp_start,end,struct_asym_id,identifier,PDB
author_residue_number,5,R,,474,25,,E,CALCR_HUMAN,5UZ7
author_insertion_code,5,R,,474,25,,E,CALCR_HUMAN,5UZ7
residue_number,5,R,33,474,25,482,E,CALCR_HUMAN,5UZ7
author_residue_number,2,R,,417,21,,B,PACR_HUMAN,6LPB
author_insertion_code,2,R,,417,21,,B,PACR_HUMAN,6LPB
...,...,...,...,...,...,...,...,...,...
author_insertion_code,1,A,,352,227,,A,CCR5_HUMAN,4MBS
residue_number,1,A,280,352,227,405,A,CCR5_HUMAN,4MBS
author_residue_number,1,A,1001,54,1,1054,A,RUBR_CLOPA,4MBS
author_insertion_code,1,A,,54,1,,A,RUBR_CLOPA,4MBS


In [65]:
def get_generic_nums(pdb_id, numbering):
    sequence_numbers = []
    amino_acids = []
    generic_numbers = []
    for i in numbering[numbering['PDB']==pdb_id].iloc[0]['numbering']:
        if i['alternative_generic_numbers'] != []:
            sequence_numbers.append(i['sequence_number'])
            amino_acids.append(i['amino_acid'])
            generic_numbers.append(i['display_generic_number'])
    return list(zip(sequence_numbers, amino_acids, generic_numbers))

In [92]:
data.structure.drop(['gen_pos','gen_pos1','gen_pos2','uniprot_comp_sid'], axis=1, inplace=True)

In [89]:
data.structure['label_2_uni'] = 0
data.structure['gen_pos'] = ''
data.structure['gen_pos1'] = 0
data.structure['gen_pos2'] = 0
data.structure['uniprot_comp_sid'] = ''



def get_pos(pdb_id, numbering, l2u, comp_sid):
    zipped_pos_dict = get_generic_nums(pdb_id, data.numbering)  # nums, aas, gn
    if l2u >= 0:
        if l2u in list(zip(*zipped_pos_dict))[0]:
            idx = list(zip(*zipped_pos_dict))[0].index(l2u)
            row = zipped_pos_dict[idx]
            if row[1] == comp_sid:
                # doesnt run...
                print("found row", row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1]), comp_sid)
                return row[2], row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1])
            else:
                print("found row, but residue are not the same", row, comp_sid)
                return row[2]+'?', row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1])
        else:
            return ['', '', 0, 0]
    else:
        return ['', '', 0, 0]


pdbs = list(set(list(data.structure['PDB'])))
for i, pdb in enumerate(pdbs[:2]):
    print(pdb)
    uniprot_list = []
    if type(maps_stacked[maps_stacked['PDB']==pdb].\
            loc['residue_number'][['chain_id', 'start','end','unp_start','unp_end', 'identifier', 'PDB']]) == pandas.core.series.Series:
        pref_mapping = maps_stacked[maps_stacked['PDB']==pdb].\
            loc['residue_number'][['chain_id', 'start','end','unp_start','unp_end', 'identifier', 'PDB']].to_frame().T
    else:
        pref_mapping = maps_stacked[maps_stacked['PDB']==pdb].\
            loc['residue_number'][['chain_id', 'start','end','unp_start','unp_end', 'identifier', 'PDB']]
    pref_chain = pref_mapping['chain_id'].iloc[0]
    pref_mapping = pref_mapping.sort_values('start')
    uniprot_identifier = data.structure[data.structure['PDB']==pdb]['identifier'].iloc[0]
        
    for j in range(len(pref_mapping)):
        row = pref_mapping.iloc[j].to_dict()
        map_identifier = row['identifier']
        map_pdb = row['PDB']
        start_label_seq_id = row['start']
        start_uniprot = row['unp_start']
        end_label_seq_id = row['end']
        end_uniprot = row['unp_end']
        if map_identifier == uniprot_identifier:
            print('Found correct uniprot map:', uniprot_identifier, map_identifier)
            idxs = [x for x in range(10000) \
                    if ((x <= end_label_seq_id) & (x >= start_label_seq_id))]
            vals = [x + start_uniprot - start_label_seq_id for x in range(10000) \
                    if ((x <= end_label_seq_id) & (x >= start_label_seq_id))]

            for k, idx in enumerate(idxs):
                line = data.structure[(data.structure['PDB'] == pdb) & 
                                      (data.structure['label_asym_id'] == pref_chain) & 
                                      (data.structure['label_seq_id'] == idx) &
                                      (data.structure['label_atom_id'] == 'CA')]
                lines = len(line)
                if lines > 0:
                    data.structure.at[line.index[0], 'label_2_uni'] = int(vals[k])
        else:
            print('Didnt find correct uniprotmap (not a gpcr):', uniprot_identifier, map_identifier)
# statement to drive labelling of residues by gene only if gene is a gpcr
data.structure[['gen_pos', 'uniprot_comp_sid', 'gen_pos1', 'gen_pos2']] = data.structure.\
    apply(lambda x: get_pos(x.PDB, data.numbering, x.label_2_uni, x.label_comp_sid), axis=1, result_type='expand')

5UZ7
Found correct uniprot map: CALCR_HUMAN CALCR_HUMAN
6LPB
Found correct uniprot map: PACR_HUMAN PACR_HUMAN


In [77]:
data.structure['label_comp_sid','label_2_uni','gen_pos','gen_pos1','gen_pos2','uniprot_comp_sid']

KeyError: ('label_comp_sid', 'label_2_uni', 'gen_pos', 'gen_pos1', 'gen_pos2', 'uniprot_comp_sid')

In [93]:
data.structure.to_pickle('data_structure.pkl')
data.numbering.to_pickle('data_numbering.pkl')
data.table.to_pickle('data_table.pkl')
data.mappings.to_pickle('data_mappings.pkl')


In [4]:
structure = pd.read_pickle('data_structure.pkl')
numbering = pd.read_pickle('data_numbering.pkl')
table = pd.read_pickle('data_table.pkl')
mappings = pd.read_pickle('data_mappings.pkl')


In [7]:
table

Unnamed: 0,uniprot(gene),Cl.,PDB,Resolution,Preferred Chain,State,Function
0,STE2,D1(Ste2-likefungalpheromone),7AD3,3.5,A,Active,Agonist
1,CCR2,A(Rhodopsin),5T1A,2.8,A,Inactive,AntagonistNAM
2,OPRM,A(Rhodopsin),4DKL,2.8,A,Inactive,Antagonist
3,CNR2,A(Rhodopsin),5ZTY,2.8,A,Inactive,Antagonist
4,5HT1B,A(Rhodopsin),6G79,3.8,S,Active,Agonist
...,...,...,...,...,...,...,...
523,OPSD,A(Rhodopsin),6PH7,2.9,A,Active,unknown
524,NTR1,A(Rhodopsin),7L0S,4.5,C,Active,Agonist
525,AA2AR,A(Rhodopsin),6S0Q,2.7,A,Inactive,Antagonist
526,GLP1R,B1(Secretin),7LCJ,2.8,R,Active,Agonist


In [5]:
numbering

Unnamed: 0,PDB,identifier,family,numbering
0,5UZ7,calcr_human,002_001_001_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
1,6LPB,pacr_human,002_001_005_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
2,6MET,ccr5_human,001_003_002_005,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
3,6LI3,gpr52_human,001_011_001_028,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
4,4LDL,adrb2_human,001_001_003_008,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
5,6HLP,nk1r_human,001_002_029_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
6,6WQA,aa2ar_human,001_006_001_002,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
7,5OLZ,aa2ar_human,001_006_001_002,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
8,4NC3,5ht2b_human,001_001_001_007,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
9,6WWZ,ccr6_human,001_003_002_006,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."


In [6]:
mappings

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,GBB1_HUMAN,GBB1_HUMAN,"{'entity_id': 2, 'chain_id': 'B', 'start': {'a...",5UZ7,P62873
1,CALCR_HUMAN,CALCR_HUMAN,"{'entity_id': 5, 'chain_id': 'R', 'start': {'a...",5UZ7,P30988
2,GNAS2_HUMAN,GNAS2_HUMAN,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",5UZ7,P63092
3,GBG2_HUMAN,GBG2_HUMAN,"{'entity_id': 3, 'chain_id': 'G', 'start': {'a...",5UZ7,P59768
4,GBB1_RAT,GBB1_RAT,"{'entity_id': 3, 'chain_id': 'B', 'start': {'a...",6LPB,P54311
...,...,...,...,...,...
70,CCR5_HUMAN,CCR5_HUMAN,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",4MBS,P51681
71,CCR5_HUMAN,CCR5_HUMAN,"{'entity_id': 1, 'chain_id': 'B', 'start': {'a...",4MBS,P51681
72,CCR5_HUMAN,CCR5_HUMAN,"{'entity_id': 1, 'chain_id': 'B', 'start': {'a...",4MBS,P51681
73,RUBR_CLOPA,RUBR_CLOPA,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",4MBS,P00268


In [25]:
def get_stacked_maps(structure, mappings, table):
    # add gene to mapping
    for i, pdb in enumerate(list(set(list(structure['PDB'])))):
        mappings_ = mappings[mappings['PDB']==pdb]
        pref_chain = table[table['PDB']==pdb]['Preferred Chain'].iloc[0]
        map_df_list = []
        for j in range(len(mappings_)):
            chain = pd.DataFrame.from_dict(mappings_.iloc[j]['mappings'])['chain_id'].iloc[0]
            identifier = mappings.iloc[j]['name']
            dict_ = pd.DataFrame.from_dict(mappings_.iloc[j]['mappings'])
            dict_['identifier'] = identifier
            map_df_list.append(pd.DataFrame.from_dict(dict_))
        _ = pd.concat(map_df_list)
        _ = _[_['chain_id']==pref_chain]
        _['PDB'] = pdb
        if i == 0:
            maps_stacked = _
        else:
            maps_stacked = maps_stacked.append(_)
    return maps_stacked


def get_generic_nums(pdb_id, numbering):
    sequence_numbers = []
    amino_acids = []
    generic_numbers = []
    for i in numbering[numbering['PDB']==pdb_id].iloc[0]['numbering']:
        if i['alternative_generic_numbers'] != []:
            sequence_numbers.append(i['sequence_number'])
            amino_acids.append(i['amino_acid'])
            generic_numbers.append(i['display_generic_number'])
    return list(zip(sequence_numbers, amino_acids, generic_numbers))


def get_generic_number(zipped_pos_dict, l2u, comp_sid):
    if l2u >= 0:
        print(l2u)
        if l2u in list(zip(*zipped_pos_dict))[0]:
            idx = list(zip(*zipped_pos_dict))[0].index(l2u)
            row = zipped_pos_dict[idx]
            if row[1] == comp_sid:
                print("found row", row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1]), comp_sid)
                return row[2], row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1])
            else:
                print("found row, but residue are not the same", row, comp_sid)
                return row[2]+'?', row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1])
        else:
            return ['', '', 0, 0]
    else:
        return ['', '', 0, 0]


def assign_generic_numbers(structure, numbering, mappings, table, overwrite=True, limit=3):
    columns = ['gen_pos','gen_pos1','gen_pos2','uniprot_comp_sid']
    overwrite_cols=[]
    for c in structure.columns:
        if c in columns:
            overwrite_cols.append(c)
    if overwrite:
        structure.drop(overwrite_cols, axis=1, inplace=True)
    structure['label_2_uni'] = 0
    structure['gen_pos'] = ''
    structure['gen_pos1'] = 0
    structure['gen_pos2'] = 0
    structure['uniprot_comp_sid'] = ''
    
    pdbs = list(set(list(structure['PDB'])))[:limit]
    maps_stacked = get_stacked_maps(structure, mappings, table)
    for i, pdb in enumerate(pdbs):
        if type(maps_stacked[maps_stacked['PDB']==pdb].\
                loc['residue_number'][['chain_id', 'start','end','unp_start','unp_end', 'identifier', 'PDB']])\
            == pandas.core.series.Series: 
            pref_mapping = maps_stacked[maps_stacked['PDB']==pdb].\
                loc['residue_number'][['chain_id', 'start','end','unp_start','unp_end', 'identifier', 'PDB']].to_frame().T
        else:
            pref_mapping = maps_stacked[maps_stacked['PDB']==pdb].\
                loc['residue_number'][['chain_id', 'start','end','unp_start','unp_end', 'identifier', 'PDB']]
            
        pref_chain = pref_mapping['chain_id'].iloc[0]
        pref_mapping = pref_mapping.sort_values('start')
        uniprot_identifier_ = structure[structure['PDB']==pdb]['identifier']
        assert len(uniprot_identifier_) >= 1, print(uniprot_identifier_)
        uniprot_identifier = uniprot_identifier_.iloc[0]
        added_labels = False
        for j in range(len(pref_mapping)):
            row = pref_mapping.iloc[j].to_dict()
            map_identifier = row['identifier']
            map_pdb = row['PDB']
            start_label_seq_id = row['start']
            start_uniprot = row['unp_start']
            end_label_seq_id = row['end']
            end_uniprot = row['unp_end']
            if map_identifier == uniprot_identifier:
                added_labels=True
                print('Found correct uniprot map:', uniprot_identifier, map_identifier)
                max_idx = max(end_label_seq_id, end_uniprot)
                idxs = [x for x in range(max_idx) \
                        if ((x <= end_label_seq_id) & (x >= start_label_seq_id))]
                vals = [x + start_uniprot - start_label_seq_id for x in range(max_idx) \
                        if ((x <= end_label_seq_id) & (x >= start_label_seq_id))]

                for k, idx in enumerate(idxs):
                    line = structure[(structure['PDB'] == pdb) & 
                                     (structure['label_asym_id'] == pref_chain) & 
                                     (structure['label_seq_id'] == idx) &
                                     (structure['label_atom_id'] == 'CA')]
                    lines = len(line)
                    if lines > 0:
                        structure.at[line.index[0], 'label_2_uni'] = int(vals[k])
            else:
                print('Didnt find correct uniprotmap (not a gpcr):', uniprot_identifier, map_identifier)
        if added_labels:
            # Generate generic numbers
            print("making number list")
            zipped_pos_dict = get_generic_nums(pdb, numbering)
            print("generating generic numbers for structures of pdbs:\n",pdbs)
            structure[['gen_pos', 'uniprot_comp_sid', 'gen_pos1', 'gen_pos2']] = structure.\
                apply(lambda x: get_generic_number(zipped_pos_dict, x.label_2_uni, x.label_comp_sid) if x.PDB==pdb\
                      else [x.gen_pos, x.uniprot_comp_sid, x.gen_pos1, x.gen_pos2], axis=1, 
                      result_type='expand')
    return structure

In [28]:
s = assign_generic_numbers(structure, numbering, mappings, table, limit=20)

Didnt find correct uniprotmap (not a gpcr): AA2AR_HUMAN CALCR_HUMAN
Didnt find correct uniprotmap (not a gpcr): AA2AR_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): AA2AR_HUMAN GNAS2_HUMAN
Didnt find correct uniprotmap (not a gpcr): CCR5_HUMAN GNAS2_HUMAN
Didnt find correct uniprotmap (not a gpcr): ADRB2_HUMAN CALCR_HUMAN
Didnt find correct uniprotmap (not a gpcr): ADRB2_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): ADRB2_HUMAN CALCR_HUMAN
Didnt find correct uniprotmap (not a gpcr): ADRB2_HUMAN GBB1_RAT
Didnt find correct uniprotmap (not a gpcr): DRD3_HUMAN GNAS2_HUMAN
Didnt find correct uniprotmap (not a gpcr): DRD3_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): DRD3_HUMAN GBG2_HUMAN
Didnt find correct uniprotmap (not a gpcr): TA2R_HUMAN GNAS2_HUMAN
Didnt find correct uniprotmap (not a gpcr): TA2R_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): TA2R_HUMAN GBG2_HUMAN
Didnt find correct uniprotmap (not a gpcr): TA2R_HUMAN CALCR_HUM

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


Didnt find correct uniprotmap (not a gpcr): PE2R4_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): 5HT2B_HUMAN CALCR_HUMAN
Didnt find correct uniprotmap (not a gpcr): 5HT2B_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): 5HT2B_HUMAN GNAS2_HUMAN
Didnt find correct uniprotmap (not a gpcr): AA2AR_HUMAN CALCR_HUMAN
Didnt find correct uniprotmap (not a gpcr): AA2AR_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): AA2AR_HUMAN GNAS2_HUMAN
Didnt find correct uniprotmap (not a gpcr): CCR6_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): CCR6_HUMAN GNAS2_HUMAN
Didnt find correct uniprotmap (not a gpcr): NK1R_HUMAN CALCR_HUMAN
Didnt find correct uniprotmap (not a gpcr): NK1R_HUMAN GBB1_HUMAN
Didnt find correct uniprotmap (not a gpcr): NK1R_HUMAN GNAS2_HUMAN


In [123]:
s[(s['label_atom_id']=='CA')]

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z,identifier,label_comp_sid,label_2_uni,gen_pos,uniprot_comp_sid,gen_pos1,gen_pos2
1,5UZ7,ATOM,A,A,9,9,THR,2,CA,C,105.877,74.963,138.307,CALCR_HUMAN,T,0,,,0,0
8,5UZ7,ATOM,A,A,10,10,GLU,9,CA,C,105.948,71.942,135.992,CALCR_HUMAN,E,0,,,0,0
17,5UZ7,ATOM,A,A,11,11,ASP,18,CA,C,102.328,72.433,134.898,CALCR_HUMAN,D,0,,,0,0
25,5UZ7,ATOM,A,A,12,12,GLN,26,CA,C,102.871,76.175,134.425,CALCR_HUMAN,Q,0,,,0,0
34,5UZ7,ATOM,A,A,13,13,ARG,35,CA,C,106.024,75.629,132.344,CALCR_HUMAN,R,0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112228,4MBS,ATOM,B,B,362,309,LEU,5444,CA,C,202.627,110.855,57.841,CCR5_HUMAN,L,0,,,0,0
112236,4MBS,ATOM,B,B,363,310,VAL,5452,CA,C,201.412,108.610,60.787,CCR5_HUMAN,V,0,,,0,0
112241,4MBS,ATOM,B,B,364,311,PHE,5457,CA,C,201.745,105.693,58.280,CCR5_HUMAN,F,0,,,0,0
112252,4MBS,ATOM,B,B,365,312,PHE,5468,CA,C,205.441,104.962,57.317,CCR5_HUMAN,F,0,,,0,0
