In [1]:
from utils import *
from gpcrdb_soup import *
import gemmi

In [2]:
from Bio.PDB.MMCIFParser import MMCIFParser
parser = MMCIFParser()

In [3]:
eg = '1f88'

In [4]:
path = get_rcsb_download(eg, fileformat = 'cif')

In [5]:
path

'https://files.rcsb.org/download/1f88.cif'

In [6]:

def download(url: str, folder: str, fileformat: str):
    if not os.path.isdir(folder):
        os.mkdir(folder)
    try:
        r = requests.get(url)
        loc = len(fileformat)+1
        fname = folder + '/' + url[-(loc+4):-loc] + '.' + fileformat
        with open(fname, 'wb') as f:
            f.write(r.content)
    except Exception:
        print("Url invalid:", url)

    
def download_pdb(url, folder, fileformat):
    download(url, folder, fileformat)

In [7]:
download(url=path, folder='data/mmcif', fileformat='cif')

In [8]:
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
import pandas as pd
d = MMCIF2Dict("data/mmcif/"+eg+".cif")
pd.DataFrame({k:v for k,v in d.items() if "_atom_site." in k})

Unnamed: 0,_atom_site.group_PDB,_atom_site.id,_atom_site.type_symbol,_atom_site.label_atom_id,_atom_site.label_alt_id,_atom_site.label_comp_id,_atom_site.label_asym_id,_atom_site.label_entity_id,_atom_site.label_seq_id,_atom_site.pdbx_PDB_ins_code,...,_atom_site.Cartn_y,_atom_site.Cartn_z,_atom_site.occupancy,_atom_site.B_iso_or_equiv,_atom_site.pdbx_formal_charge,_atom_site.auth_seq_id,_atom_site.auth_comp_id,_atom_site.auth_asym_id,_atom_site.auth_atom_id,_atom_site.pdbx_PDB_model_num
0,ATOM,1,N,N,.,MET,A,1,1,?,...,-5.980,-27.758,1.00,54.29,?,1,MET,A,N,1
1,ATOM,2,C,CA,.,MET,A,1,1,?,...,-5.054,-26.911,1.00,53.52,?,1,MET,A,CA,1
2,ATOM,3,C,C,.,MET,A,1,1,?,...,-4.848,-25.543,1.00,52.77,?,1,MET,A,C,1
3,ATOM,4,O,O,.,MET,A,1,1,?,...,-4.618,-25.451,1.00,51.10,?,1,MET,A,O,1
4,ATOM,5,C,CB,.,MET,A,1,1,?,...,-3.699,-27.610,1.00,53.58,?,1,MET,A,CB,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5262,HETATM,5263,O,O,.,HOH,T,7,.,?,...,-4.467,38.118,1.00,32.05,?,961,HOH,B,O,1
5263,HETATM,5264,O,O,.,HOH,T,7,.,?,...,4.389,47.804,1.00,38.08,?,969,HOH,B,O,1
5264,HETATM,5265,O,O,.,HOH,T,7,.,?,...,26.355,-7.866,1.00,28.34,?,973,HOH,B,O,1
5265,HETATM,5266,O,O,.,HOH,T,7,.,?,...,7.105,39.757,1.00,53.03,?,975,HOH,B,O,1


In [9]:
structure = parser.get_structure("", "data/mmcif/"+eg+".cif")


In [10]:
import nglview as nv
view = nv.show_biopython(structure)



In [11]:
view

NGLWidget()

In [27]:
cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'auth_seq_id', 'label_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']

In [28]:
import sys
import gemmi
from gemmi import cif


print("Loading cif file of", eg)
path = 'data/mmcif/'+eg+'.cif'
try:
    doc = cif.read_file(path)  # copy all the data from mmCIF file
    lol = []  # list of lists
    for b, block in enumerate(doc):
        table = block.find('_atom_site.', cols)
        for row in table:
            lol.append(list(row))
except Exception as e:
    print("Oops. %s" % e)
    sys.exit(1)

Loading cif file of 1f88


In [29]:
cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'auth_seq_id', 'label_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'atom_type', 'x', 'y', 'z']
df = pd.DataFrame(data=lol, columns=cols)

In [30]:
def fix_label_seq_id(x):
    try:
        return int(x)
    except:
        return None

In [39]:
df

Unnamed: 0,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z
0,ATOM,A,A,1,1,MET,1,N,N,43.958,-5.980,-27.758
1,ATOM,A,A,1,1,MET,2,CA,C,44.718,-5.054,-26.911
2,ATOM,A,A,1,1,MET,3,C,C,44.069,-4.848,-25.543
3,ATOM,A,A,1,1,MET,4,O,O,42.854,-4.618,-25.451
4,ATOM,A,A,1,1,MET,5,CB,C,44.868,-3.699,-27.610
...,...,...,...,...,...,...,...,...,...,...,...,...
5262,HETATM,B,T,961,.,HOH,5263,O,O,33.819,-4.467,38.118
5263,HETATM,B,T,969,.,HOH,5264,O,O,66.200,4.389,47.804
5264,HETATM,B,T,973,.,HOH,5265,O,O,26.109,26.355,-7.866
5265,HETATM,B,T,975,.,HOH,5266,O,O,66.293,7.105,39.757


In [40]:
df.groupby('auth_seq_id')['label_comp_id'].first()

auth_seq_id
1      MET
10     TYR
100    HIS
101    GLY
102    TYR
      ... 
976    HOH
977    RET
978    RET
98     SER
99     LEU
Name: label_comp_id, Length: 377, dtype: object

In [41]:
df.groupby('label_atom_id').sum()

Unnamed: 0_level_0,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_seq_id,label_comp_id,id,atom_type,x,y,z
label_atom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,1234567891011121314151617181920212223242526272...,1234567891011121314151617181920212223242526272...,METASNGLYTHRGLUGLYPROASNPHETYRVALPROPHESERASNL...,3111923303943505869818895106112120129136140147...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,44.06944.47444.97545.40947.14648.52050.49650.0...,-4.848-3.296-0.1932.7754.6517.63010.07810.0956...,-25.543-22.697-24.576-25.508-27.612-28.257-28....
C1,HETATMHETATMHETATMHETATMHETATMHETATMHETATMHETA...,CCDDEEEFFAB,CCDDEEEFFLR,121212312977978,...........,NAGNAGNAGNAGNAGNAGMANNAGNAGRETRET,50685082509651105124513851525163517751965221,CCCCCCCCCCC,37.62035.56447.22550.56859.51262.86964.48246.8...,3.7577.698-8.617-12.3540.9713.7248.734-9.992-1...,-28.808-31.709-23.091-21.37241.02843.58444.035...
C10,HETATMHETATM,AB,LR,977978,..,RETRET,52055230,CC,57.62638.639,10.21111.044,-10.42426.150
C11,HETATMHETATM,AB,LR,977978,..,RETRET,52065231,CC,56.58239.898,10.87111.497,-11.23626.800
C12,HETATMHETATM,AB,LR,977978,..,RETRET,52075232,CC,55.33541.161,11.24311.577,-10.84826.282
...,...,...,...,...,...,...,...,...,...,...,...
OH,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBB,AAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBB,1029304360749610213617819119220622326827430130...,1029304360749610213617819119220622326827430130...,TYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRT...,7822824034749060977982610871410150515171635178...,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO,48.67134.61339.66949.37859.44374.24639.29738.2...,-1.77713.8576.17010.53036.32729.09324.30218.60...,-22.305-12.642-20.924-1.0399.6573.050-14.559-1...
OXT,ATOM,A,A,348,348,ALA,2638,O,90.145,31.762,16.624
SD,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAABBBBBBBBBBBBBBB,AAAAAAAAAAAAAAAABBBBBBBBBBBBBBB,1394449861431551631832072532572883083093171394...,1394449861431551631832072532572883083093171394...,METMETMETMETMETMETMETMETMETMETMETMETMETMETMETM...,7316354397700113712361299144516421976200722572...,SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS,46.86337.63247.72743.61957.66089.57179.47374.0...,-4.87610.29514.55422.26416.85415.58025.37714.1...,-29.066-0.070-4.4345.183-4.2293.163-4.682-11.0...
SG,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAABBBBBBBBBB,AAAAAAAAAABBBBBBBBBB,1101401671851872222643163223231101401671851872...,1101401671851872222643163223231101401671851872...,CYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSC...,8831114132414611473177120612486252925353521375...,SSSSSSSSSSSSSSSSSSSS,51.21984.37465.32742.66950.40378.02856.89866.5...,14.0094.84411.23911.95013.3177.4468.87227.5622...,-18.3868.988-13.144-12.401-16.6654.194-0.22016...


In [42]:
df['label_seq_id']=df.apply(lambda x: fix_label_seq_id(x.label_seq_id), axis=1)

Columns
group_PDB 	id 	type_symbol 	label_atom_id 	label_alt_id 	label_comp_id 	label_asym_id 	label_entity_id 	label_seq_id 	pdbx_PDB_ins_code 	Cartn_x 	Cartn_y 	Cartn_z 	occupancy 	B_iso_or_equiv 	Cartn_x_esd 	Cartn_y_esd 	Cartn_z_esd 	occupancy_esd 	B_iso_or_equiv_esd 	pdbx_formal_charge 	auth_seq_id 	auth_comp_id 	auth_asym_id 	auth_atom_id 	pdbx_PDB_model_num

In [43]:
import sys
from math import degrees
import gemmi

ramas = {aa: [] for aa in [
    'LEU', 'ALA', 'GLY', 'VAL', 'GLU', 'SER', 'LYS', 'ASP', 'THR', 'ILE',
    'ARG', 'PRO', 'ASN', 'PHE', 'GLN', 'TYR', 'HIS', 'MET', 'CYS', 'TRP']}

rol = []
for path in ['data/mmcif/'+eg+'.cif']:
    st = gemmi.read_structure(path)
    if 0.1 < st.resolution < 5:
        model = st[0]
        if len(st) > 1:
            print("There are multiple models!")
        for chain in model:
            for r, res in enumerate(chain.get_polymer()):
                # previous_residue() and next_residue() return previous/next
                # residue only if the residues are bonded. Otherwise -- None.
                prev_res = chain.previous_residue(res)
                next_res = chain.next_residue(res)
                phi, psi = gemmi.calculate_phi_psi(prev_res, res, next_res)
                omega = gemmi.calculate_omega(res, next_res)
                rol.append([res.name, res.label_seq, res.subchain, 
                            degrees(phi), degrees(omega), degrees(psi)])
                try:
                    ramas[res.name].append([degrees(phi), degrees(omega), degrees(psi)])
                except KeyError:
                    pass

In [44]:
"""
# Write data to files
for aa, data in ramas.items():
    with open('data/ramach/' + aa + '.tsv', 'w') as f:
        for phi, omega, psi in data:
            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"""

"\n# Write data to files\nfor aa, data in ramas.items():\n    with open('data/ramach/' + aa + '.tsv', 'w') as f:\n        for phi, omega, psi in data:\n            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"

In [45]:
cols = ['name', 'label_seq_id', 'chain', 'phi', 'omega', 'psi']

In [46]:
res_df = pd.DataFrame(data=rol, columns=cols)

In [47]:
res_df

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi
0,MET,1,A,,-179.718577,133.146625
1,ASN,2,A,-87.567941,179.953276,-5.295383
2,GLY,3,A,-150.672305,178.921162,143.916355
3,THR,4,A,-85.441746,178.477151,112.209423
4,GLU,5,A,-96.636867,178.659871,123.041558
...,...,...,...,...,...,...
638,CYS,322,B,-74.457863,179.107101,-7.588421
639,CYS,323,B,65.675548,179.210471,29.106972
640,GLY,324,B,134.859554,-179.348699,-24.685362
641,LYS,325,B,-77.173222,-179.767435,94.072619


In [48]:
full = pd.merge(res_df, df, on=['label_seq_id', 'label_seq_id'])

In [49]:
import numpy as np
full = full.astype({'label_seq_id': np.int16, 'auth_seq_id': np.int16, 'id': np.int16, 
                    'phi': np.float32, 'omega': np.float32, 'psi': np.float32,
                    'x': np.float32, 'y': np.float32, 'z': np.float32})

In [50]:
def get_atom_id(x):
    atom_dict = {'C': 0, 'O': 1, 'N': 2, 'H': 3, 'S': 4}
    try:
        return atom_dict[str(x)]
    except:
        return 5
full['atom_id'] = full.apply(lambda x: get_atom_id(x.atom_type), axis=1)

In [51]:
full

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z,atom_id
0,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,1,N,N,43.958000,-5.980000,-27.757999,2
1,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,2,CA,C,44.717999,-5.054000,-26.910999,0
2,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,3,C,C,44.069000,-4.848000,-25.542999,0
3,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,4,O,O,42.854000,-4.618000,-25.451000,1
4,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,5,CB,C,44.868000,-3.699000,-27.610001,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9920,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2634,CA,C,88.039001,30.749001,17.236000,0
9921,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2635,C,C,89.197998,31.023001,16.242001,0
9922,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2636,O,O,89.163002,30.493000,15.102000,1
9923,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2637,CB,C,88.360001,29.510000,18.103001,0


In [52]:
xyz = full[['x', 'y', 'z']].to_numpy(dtype=float)

In [53]:
ids = full[['label_seq_id', 'auth_seq_id']].to_numpy(dtype='int')

In [54]:
atoms = full['atom_id'].to_numpy(dtype=int)

In [55]:
xyz

array([[ 43.95800018,  -5.98000002, -27.75799942],
       [ 44.7179985 ,  -5.0539999 , -26.9109993 ],
       [ 44.06900024,  -4.84800005, -25.54299927],
       ...,
       [ 89.16300201,  30.49300003,  15.10200024],
       [ 88.36000061,  29.51000023,  18.10300064],
       [ 90.14499664,  31.76199913,  16.62400055]])

In [56]:
ids

array([[  1,   1],
       [  1,   1],
       [  1,   1],
       ...,
       [348, 348],
       [348, 348],
       [348, 348]])

In [57]:
atoms[:100]

array([2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0,
       4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1,
       0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0,
       0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 2,
       0, 0, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1])

In [58]:
"""import plotly.express as px
select = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]
df = px.data.iris()
fig = px.scatter_3d(select, x='x', y='y', z='z',
                    color='atom_type', opacity=0.5)
fig.update_traces(marker=dict(size=7))
fig.show()"""

"import plotly.express as px\nselect = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]\ndf = px.data.iris()\nfig = px.scatter_3d(select, x='x', y='y', z='z',\n                    color='atom_type', opacity=0.5)\nfig.update_traces(marker=dict(size=7))\nfig.show()"

In [59]:
"""import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator


def plot(data_file, label, output=None):
    x, y = [], []
    for line in open(data_file):
        phi, psi = line.split()
        if phi != 'nan' and psi != 'nan':
            x.append(float(phi))
            y.append(float(psi))
    print('Plotting %d points for %s' % (len(x), label))

    plt.figure(figsize=(5.5, 5.5))
    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)
    plt.xlim([-180, 180])
    plt.ylim([-180, 180])
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(60))
    ax.yaxis.set_major_locator(MultipleLocator(60))
    plt.xlabel(r'$\phi$', fontsize=14)
    plt.ylabel(r'$\psi$', fontsize=14, labelpad=0)
    plt.grid(color='#AAAAAA', linestyle='--')
    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')
    if output:
        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs
    else:
        plt.show()


for aa in ramas:
    plot('data/ramach/%s.tsv' % aa, aa)
    #plot('ramas/%s.tsv' % aa, aa, 'ramas/%s.png' % aa)"""

"import sys\nimport matplotlib.pyplot as plt\nfrom matplotlib.ticker import MultipleLocator\n\n\ndef plot(data_file, label, output=None):\n    x, y = [], []\n    for line in open(data_file):\n        phi, psi = line.split()\n        if phi != 'nan' and psi != 'nan':\n            x.append(float(phi))\n            y.append(float(psi))\n    print('Plotting %d points for %s' % (len(x), label))\n\n    plt.figure(figsize=(5.5, 5.5))\n    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)\n    plt.xlim([-180, 180])\n    plt.ylim([-180, 180])\n    ax = plt.gca()\n    ax.xaxis.set_major_locator(MultipleLocator(60))\n    ax.yaxis.set_major_locator(MultipleLocator(60))\n    plt.xlabel(r'$\\phi$', fontsize=14)\n    plt.ylabel(r'$\\psi$', fontsize=14, labelpad=0)\n    plt.grid(color='#AAAAAA', linestyle='--')\n    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')\n    if output:\n        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs\n    else:\n        plt.show()\

In [60]:
uniprot = pdbtouniprot('4JKV')

In [61]:
uniprot

'P0ABE7'

In [62]:
pdb = uniprottopdb(uniprot)

In [63]:
pdb

['1APC',
 '1LM3',
 '1M6T',
 '1QPU',
 '1QQ3',
 '256B',
 '2BC5',
 '2QLA',
 '3C62',
 '3C63',
 '3DE8',
 '3DE9',
 '3FOO',
 '3FOP',
 '3HNI',
 '3HNJ',
 '3HNK',
 '3HNL',
 '3IQ5',
 '3IQ6',
 '3L1M',
 '3M15',
 '3M4B',
 '3M4C',
 '3M79',
 '3NMI',
 '3NMJ',
 '3NMK',
 '3TOL',
 '3TOM',
 '3U8P',
 '4EA3',
 '4EIY',
 '4IAQ',
 '4IAR',
 '4IB4',
 '4JE9',
 '4JEA',
 '4JEB',
 '4JKV',
 '4L6R',
 '4N6H',
 '4NC3',
 '4NTJ',
 '4O9R',
 '4OR2',
 '4PXZ',
 '4PY0',
 '4QIM',
 '4QIN',
 '4RWA',
 '4RWD',
 '4U9D',
 '4U9E',
 '4YAY',
 '4Z34',
 '4Z35',
 '4Z36',
 '4ZUD',
 '5AWI',
 '5BU7',
 '5DHG',
 '5DHH',
 '5IU4',
 '5IU7',
 '5IU8',
 '5IUA',
 '5IUB',
 '5JTB',
 '5K2A',
 '5K2B',
 '5K2C',
 '5K2D',
 '5L31',
 '5L32',
 '5L7D',
 '5L7I',
 '5MZJ',
 '5MZP',
 '5N2R',
 '5N2S',
 '5NDD',
 '5NDZ',
 '5NJ6',
 '5NLX',
 '5NM2',
 '5NM4',
 '5OLG',
 '5OLH',
 '5OLO',
 '5OLV',
 '5OLZ',
 '5OM1',
 '5OM4',
 '5TUD',
 '5TVN',
 '5UEN',
 '5UIG',
 '5UNF',
 '5UNG',
 '5UNH',
 '5UVI',
 '5VRA',
 '5WIU',
 '5WIV',
 '5XJM',
 '5XZI',
 '5XZJ',
 '5YM7',
 '5YO3',
 '5YO4',
 

In [64]:
from utils3 import *


pdb_id_list = ['2RH1', '1HZX', '1GZM']

In [65]:
import pandas as pd


for j in range(len(pdb_id_list)):
    pdb_id = pdb_id_list[j]
    maps = get_mappings_data(pdb_id)[pdb_id.lower()]['UniProt']
    uniprots = maps.keys()
    for i, uniprot in enumerate(uniprots):
        table = pd.DataFrame.from_dict(maps[uniprot])
        table['PDB'] = pdb_id
        table['uniprot'] = uniprot
        if i + j == 0:
            full_table = table
        else:
            full_table = full_table.append(table, ignore_index=True)

In [66]:
full_table

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,ADRB2_HUMAN,ADRB2_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P07550
1,ADRB2_HUMAN,ADRB2_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P07550
2,ENLYS_BPT4,ENLYS_BPT4,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P00720
3,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'end': {'author_residue_numbe...",1HZX,P02699
4,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'end': {'author_residue_numbe...",1HZX,P02699
5,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'end': {'author_residue_numbe...",1GZM,P02699
6,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'end': {'author_residue_numbe...",1GZM,P02699


In [67]:
pd.DataFrame.from_dict(full_table.iloc[0]['mappings'])

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,230.0,A,,230,1,A
author_insertion_code,1,,A,,230,1,A
residue_number,1,237.0,A,8.0,230,1,A


In [68]:
pd.DataFrame.from_dict(full_table.iloc[1]['mappings'])

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,,A,264.0,365,264,A
author_insertion_code,1,,A,,365,264,A
residue_number,1,500.0,A,399.0,365,264,A


In [69]:
from utils import *
from utils2 import *
from utils3 import *
from plotting import *
from gpcrdb_soup import *

In [70]:
from tqdm import tqdm, trange

In [71]:
import sys
from gemmi import cif

In [406]:
class DataLoader():
    def __init__(self, 
                 path = 'data/',
                 structure = 'mmcif/',
                 limit=10,
                 remove_hetatm=True):
        self.path = path
        self.structure_path = self.path + structure
        self.path_table = path + 'gpcrdb/' + 'structures.pkl'
        
        self.filenames, self.pdb_ids = get_pdb_files(path=self.structure_path)
        # Columns for structure dataframe
        self.cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 
                     'label_comp_id', 'id', 'label_atom_id', 
                     'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']
        
        self.numbering = pd.DataFrame()
        self.table = pd.read_pickle(self.path_table)
        for i, pdb_id in tqdm(enumerate(self.pdb_ids)):
            if i < limit:
                protein, family = self.get_prot_info(pdb_id)
                numbering = self.get_res_nums(protein)
                if i == 0:
                    self.structure = self.load_cifs(pdb_id)
                    self.mappings = self.get_mapping(pdb_id)
                    numb = [pdb_id, protein, family, numbering]
                    # numb = [pdb_id, protein, self.entry_to_ac(protein), family, numbering]
                    self.numbering = self.numbering.append(pd.DataFrame(numb).T)
                else:
                    structure = self.load_cifs(pdb_id)
                    self.structure = self.structure.append(structure, ignore_index=True)
                    self.mappings = self.mappings.append(self.get_mapping(pdb_id), ignore_index=True)
                    # self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, self.entry_to_ac(protein),
                    #                                                           family, numbering]).T, 
                    self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, family, numbering]).T, 
                                                           ignore_index=True)
        self.numbering.columns = ['PDB', 'identifier', 'family', 'numbering']
        
        if remove_hetatm:
            self.structure = self.structure[self.structure['group_PDB']!='HETATM']
            data.structure['label_seq_id'] = data.structure['label_seq_id'].astype(np.int64)
        # self.numbering.columns = ['pdb', 'identifier', 'uniprot', 'family', 'numbering']
        # self.structure['auth_seq_id'] = self.structure['auth_seq_id'].astype(np.int8)
        # self.structure['id'] = self.structure['id'].astype(np.int8)
        # self.structure['Cartn_x'] = self.structure['Cartn_x'].astype(np.float64)
        # self.structure['Cartn_y'] = self.structure['Cartn_y'].astype(np.float64)
        # self.structure['Cartn_z'] = self.structure['Cartn_z'].astype(np.float64)
        
        
    def entry_to_ac(self, entry: str):
        response = requests.get('https://www.uniprot.org/uniprot/'+entry+'.txt')
        return response.text.split('\n')[1].split('AC   ')[1][:6]
    
    def get_prot_info(self, pdb_id):
        # query structure
        response = requests.get('https://gpcrdb.org/services/structure/'+pdb_id.upper()+'/')
        protein = response.json()['protein']
        family = response.json()['family']
        return protein, family
    
    def get_res_nums(self, protein):
        # query uniprot -> res num
        response = requests.get('https://gpcrdb.org/services/residues/extended/'+protein+'/')    
        # select res num
        # assign res_num to structure data
        return response.json()
    
    def get_mapping(self, pdb_id):
        maps = get_mappings_data(pdb_id)[pdb_id.lower()]['UniProt']
        uniprots = maps.keys()
        full_table=pd.DataFrame()
        for i, uniprot in enumerate(uniprots):
            table = pd.DataFrame.from_dict(maps[uniprot])
            table['PDB'] = pdb_id
            table['uniprot'] = uniprot
            if i + j == 0:
                full_table = table
            else:
                full_table = full_table.append(table, ignore_index=True)
        return full_table
    
    def load_cifs(self, pdb_id):
        print("Loading cif file of", pdb_id)
        path = 'data/mmcif/' + pdb_id + '.cif'
        print(path)
        try:
            doc = cif.read_file(path)  # copy all the data from mmCIF file
            lol = []  # list of lists
            for b, block in enumerate(doc):
                table = block.find('_atom_site.', self.cols)
                for row in table:
                    lol.append([pdb_id]+list(row))
        except Exception as e:
            print("Oops. %s" % e)
            sys.exit(1)
        cols = ['PDB']+self.cols
        return pd.DataFrame(data=lol, columns=cols)

In [407]:
data = DataLoader(limit=2)

0it [00:00, ?it/s]

Loading cif file of 5L7D
data/mmcif/5L7D.cif


1it [00:03,  3.71s/it]

Loading cif file of 6LML
data/mmcif/6LML.cif


530it [00:06, 79.15it/s]


In [408]:
data.numbering['numbering'].iloc[0]

[{'sequence_number': 1,
  'amino_acid': 'M',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 2,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 3,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 4,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 5,
  'amino_acid': 'R',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 6,
  'amino_acid': 'P',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 7,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  

In [409]:
data.mappings

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,C562_ECOLX,C562_ECOLX,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,P0ABE7
1,C562_ECOLX,C562_ECOLX,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,P0ABE7
2,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
3,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
4,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
5,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
6,GNAI1_HUMAN,GNAI1_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",6LML,P63096
7,GBB1_HUMAN,GBB1_HUMAN,"{'entity_id': 2, 'end': {'author_residue_numbe...",6LML,P62873
8,GLR_HUMAN,GLR_HUMAN,"{'entity_id': 6, 'end': {'author_residue_numbe...",6LML,P47871
9,GBG2_HUMAN,GBG2_HUMAN,"{'entity_id': 3, 'end': {'author_residue_numbe...",6LML,P59768


In [360]:
def lookup(pdb, identifier, mappings):
    return list(mappings[(mappings['identifier']==identifier.upper()) & (mappings['PDB']==pdb)]['mappings'])

In [361]:
maps = data.mappings

In [362]:
new = data.numbering

In [363]:
new['mapping'] = new.apply(lambda x: lookup(x.PDB, x.identifier, maps), axis=1)

In [364]:
for i in range(len(new)):
    print(new['mapping'].iloc[i])

[{'entity_id': 1, 'end': {'author_residue_number': 428, 'author_insertion_code': '', 'residue_number': 397}, 'chain_id': 'A', 'start': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 1}, 'unp_end': 428, 'unp_start': 32, 'struct_asym_id': 'A'}, {'entity_id': 1, 'end': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 628}, 'chain_id': 'A', 'start': {'author_residue_number': 1129, 'author_insertion_code': '', 'residue_number': 516}, 'unp_end': 555, 'unp_start': 443, 'struct_asym_id': 'A'}, {'entity_id': 1, 'end': {'author_residue_number': 428, 'author_insertion_code': '', 'residue_number': 397}, 'chain_id': 'B', 'start': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 1}, 'unp_end': 428, 'unp_start': 32, 'struct_asym_id': 'B'}, {'entity_id': 1, 'end': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 628}, 'chain_id': 'B', 'start': {'author_residue_number': 1129, 'autho

In [365]:
new

Unnamed: 0,PDB,identifier,family,numbering,mapping
0,5L7D,smo_human,006_001_001_011,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb..."
1,6LML,glr_human,002_001_003_005,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 6, 'end': {'author_residue_numb..."


In [370]:
def get_gaps(mapping):
    starts = []
    ends = []
    for section in mapping:
        start = int(section['start']['residue_number'])
        auth_start = section['start']['author_residue_number']
        end = int(section['end']['residue_number'])
        auth_end = section['end']['author_residue_number']
        unp_start = int(section['unp_start'])
        unp_end = section['unp_end']
        length = end-start
        unp_length = unp_end-unp_start
        starts.append([start, unp_start])
        ends.append([end, unp_end])
    print(starts)
    print(ends, '\n')
    return starts, ends

In [371]:
df['lol'] = df.apply(lambda x: get_gaps(x.mapping), axis=1)

[[1, 32], [516, 443], [1, 32], [516, 443]]
[[397, 428], [628, 555], [397, 428], [628, 555]] 

[[1, 27]]
[[406, 432]] 



In [372]:
df['lol'].iloc[0]

([[1, 32], [516, 443], [1, 32], [516, 443]],
 [[397, 428], [628, 555], [397, 428], [628, 555]])

In [373]:
df['starts'] = df.apply(lambda x: x.lol[0], axis=1)
df['ends'] = df.apply(lambda x: x.lol[1], axis=1)
df.drop('lol', axis=1)

Unnamed: 0,PDB,identifier,family,numbering,mapping,uniprot(gene),Cl.,Resolution,Preferred Chain,State,Function,starts,ends
0,5L7D,smo_human,006_001_001_011,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb...",SMO,F(Frizzled),3.2,B,Inactive,-,"[[1, 32], [516, 443], [1, 32], [516, 443]]","[[397, 428], [628, 555], [397, 428], [628, 555]]"
1,6LML,glr_human,002_001_003_005,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 6, 'end': {'author_residue_numb...",GLR,B1(Secretin),3.9,R,Active,Agonist,"[[1, 27]]","[[406, 432]]"


In [374]:
eg

'5L7D'

In [375]:
list(set(list(data.mappings['PDB'])))

['5L7D', '6LML']

In [376]:
lm = list(set(list(data.mappings['PDB'])))

In [377]:
eg = lm[0]

In [378]:
print(eg)

5L7D


In [379]:
data.numbering[data.numbering['PDB']==eg]

Unnamed: 0,PDB,identifier,family,numbering,mapping
0,5L7D,smo_human,006_001_001_011,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb..."


In [380]:
sequence_numbers = []
amino_acids = []
generic_numbers = []


for i in data.numbering[data.numbering['PDB']==eg].iloc[0]['numbering']:
    if i['alternative_generic_numbers'] != []:
        print(i)
        sequence_numbers.append(i['sequence_number'])
        amino_acids.append(i['amino_acid'])
        generic_numbers.append(['display_generic_number'])

{'sequence_number': 223, 'amino_acid': 'T', 'protein_segment': 'TM1', 'display_generic_number': '1.25x25', 'alternative_generic_numbers': [{'scheme': 'BW', 'label': '1.28'}, {'scheme': 'Wootten', 'label': '1.32'}, {'scheme': 'Pin', 'label': '1.32'}, {'scheme': 'Wang', 'label': '1.25'}, {'scheme': 'Fungal', 'label': '1.30'}, {'scheme': 'GPCRdb(A)', 'label': '1.28x28'}, {'scheme': 'GPCRdb(B)', 'label': '1.32x32'}, {'scheme': 'GPCRdb(C)', 'label': '1.32x32'}, {'scheme': 'GPCRdb(F)', 'label': '1.25x25'}, {'scheme': 'GPCRdb(D)', 'label': '1.30x30'}, {'scheme': 'Oliveira', 'label': '108'}, {'scheme': 'BS', 'label': 'I:-05'}]}
{'sequence_number': 224, 'amino_acid': 'E', 'protein_segment': 'TM1', 'display_generic_number': '1.26x26', 'alternative_generic_numbers': [{'scheme': 'BW', 'label': '1.29'}, {'scheme': 'Wootten', 'label': '1.33'}, {'scheme': 'Pin', 'label': '1.33'}, {'scheme': 'Wang', 'label': '1.26'}, {'scheme': 'Fungal', 'label': '1.31'}, {'scheme': 'GPCRdb(A)', 'label': '1.29x29'}, {

In [381]:
for i in zip(sequence_numbers, amino_acids):
    print(i)

(223, 'T')
(224, 'E')
(225, 'A')
(226, 'E')
(227, 'H')
(228, 'Q')
(229, 'D')
(230, 'M')
(231, 'H')
(232, 'S')
(233, 'Y')
(234, 'I')
(235, 'A')
(236, 'A')
(237, 'F')
(238, 'G')
(239, 'A')
(240, 'V')
(241, 'T')
(242, 'G')
(243, 'L')
(244, 'C')
(245, 'T')
(246, 'L')
(247, 'F')
(248, 'T')
(249, 'L')
(250, 'A')
(251, 'T')
(252, 'F')
(253, 'V')
(254, 'A')
(255, 'D')
(256, 'W')
(257, 'R')
(258, 'N')
(259, 'S')
(260, 'N')
(261, 'R')
(262, 'Y')
(263, 'P')
(264, 'A')
(265, 'V')
(266, 'I')
(267, 'L')
(268, 'F')
(269, 'Y')
(270, 'V')
(271, 'N')
(272, 'A')
(273, 'C')
(274, 'F')
(275, 'F')
(276, 'V')
(277, 'G')
(278, 'S')
(279, 'I')
(280, 'G')
(281, 'W')
(282, 'L')
(283, 'A')
(284, 'Q')
(285, 'F')
(312, 'L')
(313, 'S')
(314, 'C')
(315, 'V')
(316, 'I')
(317, 'I')
(318, 'F')
(319, 'V')
(320, 'I')
(321, 'V')
(322, 'Y')
(323, 'Y')
(324, 'A')
(325, 'L')
(326, 'M')
(327, 'A')
(328, 'G')
(329, 'V')
(330, 'V')
(331, 'W')
(332, 'F')
(333, 'V')
(334, 'V')
(335, 'L')
(336, 'T')
(337, 'Y')
(338, 'A')
(339, 'W')

1. figure out gpcr uniprot id (not auxilary etc)
2. select correpsonding sifts mapping
3. use sifts mapping to enumerate residues
4. number residues

In [382]:
start_seq_nums = sequence_numbers[0]

In [383]:
start_seq_nums

223

In [384]:
data.table[data.table['PDB']==eg]

Unnamed: 0,uniprot(gene),Cl.,PDB,Resolution,Preferred Chain,State,Function
80,SMO,F(Frizzled),5L7D,3.2,B,Inactive,-


In [385]:
pref_chain = data.table[data.table['PDB']==eg].iloc[0]['Preferred Chain']

In [386]:
pref_chain

'B'

In [400]:
# data.structure.drop(data.structure[data.structure['label_seq_id']=='.'].index, inplace=True)
data.structure['label_seq_id'] = data.structure['label_seq_id'].astype(np.int64)

In [402]:
data.structure[(data.structure['PDB']==eg)&
               (data.structure['auth_asym_id']==pref_chain)]

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
4640,5L7D,ATOM,B,B,27,58,PRO,4641,N,N,39.917,22.187,-16.837
4641,5L7D,ATOM,B,B,27,58,PRO,4642,CA,C,40.760,20.983,-16.794
4642,5L7D,ATOM,B,B,27,58,PRO,4643,C,C,39.946,19.688,-16.894
4643,5L7D,ATOM,B,B,27,58,PRO,4644,O,O,38.776,19.698,-16.505
4644,5L7D,ATOM,B,B,27,58,PRO,4645,CB,C,41.462,21.093,-15.442
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9202,5L7D,ATOM,B,B,624,551,ARG,9203,CD,C,-5.558,44.475,69.444
9203,5L7D,ATOM,B,B,624,551,ARG,9204,NE,N,-4.184,44.841,69.100
9204,5L7D,ATOM,B,B,624,551,ARG,9205,CZ,C,-3.372,44.099,68.353
9205,5L7D,ATOM,B,B,624,551,ARG,9206,NH1,N,-3.790,42.942,67.852


In [320]:
x.label_seq_id.max()

127

In [330]:
def get_gaps(mapping):
    starts = []
    ends = []
    for section in mapping:
        start = int(section['start']['residue_number'])
        auth_start = section['start']['author_residue_number']
        end = int(section['end']['residue_number'])
        auth_end = section['end']['author_residue_number']
        unp_start = int(section['unp_start'])
        unp_end = section['unp_end']
        length = end-start
        unp_length = unp_end-unp_start
        """
        print(start, '\n', 
              end,  '\n',
              'diff:', length, '\n\n', 
              unp_start,  '\n', 
              unp_end, '\n',
              'diff:', unp_length, '\n\n\n\n')"""
        starts.append([start, unp_start])
        ends.append([end, unp_end])
    print('starts:',starts)
    print('ends:',ends, '\n')
    return starts, ends

In [331]:
mapping  = df[df['PDB']==eg]['mapping'].iloc[0]

In [332]:
pref_mapping = []
for m in mapping:
    if m['chain_id'] == pref_chain:
        pref_mapping.append(m)

In [333]:
pref_mapping

[{'entity_id': 1,
  'end': {'author_residue_number': 428,
   'author_insertion_code': '',
   'residue_number': 397},
  'chain_id': 'B',
  'start': {'author_residue_number': None,
   'author_insertion_code': '',
   'residue_number': 1},
  'unp_end': 428,
  'unp_start': 32,
  'struct_asym_id': 'B'},
 {'entity_id': 1,
  'end': {'author_residue_number': None,
   'author_insertion_code': '',
   'residue_number': 628},
  'chain_id': 'B',
  'start': {'author_residue_number': 1129,
   'author_insertion_code': '',
   'residue_number': 516},
  'unp_end': 555,
  'unp_start': 443,
  'struct_asym_id': 'B'}]

In [335]:
s_e = get_gaps(pref_mapping)

starts: [[1, 32], [516, 443]]
ends: [[397, 428], [628, 555]] 



In [336]:
s_e

([[1, 32], [516, 443]], [[397, 428], [628, 555]])

In [342]:
se1 = s_e[0][0]
se2 = s_e[0][1]
start_label_seq_id = se1[0]
start_uniprot = se1[1]
end_label_seq_id = se2[0]
end_uniprot = se2[1]

In [343]:
data.structure[(data.structure['PDB']==eg) & 
               (data.structure['label_seq_id'] > start_label_seq_id) &
               (data.structure['label_seq_id'] < end_label_seq_id) &
               (data.structure['label_atom_id']=='CA')]

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
1,5L7D,ATOM,A,A,27,58,PRO,2,CA,C,-67.148,-8.259,115.290
8,5L7D,ATOM,A,A,28,59,PRO,9,CA,C,-64.607,-10.130,117.555
15,5L7D,ATOM,A,A,29,60,PRO,16,CA,C,-61.193,-10.607,115.822
22,5L7D,ATOM,A,A,30,61,LEU,23,CA,C,-60.461,-14.030,114.245
30,5L7D,ATOM,A,A,31,62,SER,31,CA,C,-58.073,-16.249,116.307
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9159,5L7D,ATOM,B,B,108,547,ARG,9160,CA,C,-11.609,40.572,70.829
9170,5L7D,ATOM,B,B,109,548,THR,9171,CA,C,-9.275,42.119,68.165
9177,5L7D,ATOM,B,B,110,549,TRP,9178,CA,C,-12.320,44.139,66.827
9191,5L7D,ATOM,B,B,111,550,CYS,9192,CA,C,-12.618,45.987,70.204


In [None]:
data

In [221]:
df2 = pd.DataFrame(data=[sequence_numbers, amino_acids]).T
df2.columns = ['id', 'res']

In [222]:
df2

Unnamed: 0,id,res
0,57,A
1,58,I
2,59,P
3,60,V
4,61,I
...,...,...
259,342,R
260,343,D
261,344,F
262,345,C


In [419]:
data.numbering[data.numbering['PDB']==eg]['numbering'].iloc[0]

[{'sequence_number': 1,
  'amino_acid': 'M',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 2,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 3,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 4,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 5,
  'amino_acid': 'R',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 6,
  'amino_acid': 'P',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  'alternative_generic_numbers': []},
 {'sequence_number': 7,
  'amino_acid': 'A',
  'protein_segment': 'N-term',
  'display_generic_number': None,
  

In [422]:
def get_generic_nums(pdb_id, numbering):
    sequence_numbers = []
    amino_acids = []
    generic_numbers = []
    for i in numbering[numbering['PDB']==pdb_id].iloc[0]['numbering']:
        if i['alternative_generic_numbers'] != []:
            sequence_numbers.append(i['sequence_number'])
            amino_acids.append(i['amino_acid'])
            generic_numbers.append(['display_generic_number'])
    return sequence_numbers, amino_acids, generic_numbers

In [424]:
nums, aas, gn = get_generic_nums(eg, data.numbering)

In [413]:
seq_id = list(x['label_seq_id'])
seq_res_ = list(x['label_comp_id'])
seq_res = [gemmi.find_tabulated_residue(resname).one_letter_code for resname in seq_res_]

In [414]:
y = zip([int(x) for x in seq_id], seq_res)

In [415]:
for j in y:
    if int(j[0]) >= start_seq_nums:
        print(j)

(223, 'A')
(224, 'D')
(225, 'W')
(226, 'R')
(227, 'N')
(228, 'S')
(229, 'N')
(230, 'R')
(231, 'Y')
(232, 'P')
(233, 'A')
(234, 'V')
(235, 'I')
(236, 'L')
(237, 'F')
(238, 'Y')
(239, 'V')
(240, 'N')
(241, 'A')
(242, 'C')
(243, 'F')
(244, 'F')
(245, 'V')
(246, 'G')
(247, 'S')
(248, 'I')
(249, 'G')
(250, 'W')
(251, 'L')
(252, 'A')
(253, 'Q')
(254, 'F')
(255, 'M')
(256, 'D')
(257, 'G')
(258, 'A')
(259, 'R')
(260, 'R')
(261, 'E')
(262, 'I')
(263, 'V')
(264, 'C')
(265, 'R')
(266, 'A')
(267, 'D')
(268, 'G')
(269, 'T')
(270, 'M')
(271, 'R')
(272, 'L')
(273, 'G')
(274, 'E')
(275, 'P')
(276, 'T')
(277, 'S')
(278, 'N')
(279, 'E')
(280, 'T')
(281, 'L')
(282, 'S')
(283, 'C')
(284, 'V')
(285, 'I')
(286, 'I')
(287, 'F')
(288, 'V')
(289, 'I')
(290, 'V')
(291, 'Y')
(292, 'Y')
(293, 'A')
(294, 'L')
(295, 'M')
(296, 'A')
(297, 'G')
(298, 'F')
(299, 'V')
(300, 'W')
(301, 'F')
(302, 'V')
(303, 'V')
(304, 'L')
(305, 'T')
(306, 'Y')
(307, 'A')
(308, 'W')
(309, 'H')
(310, 'T')
(311, 'S')
(312, 'F')
(313, 'K')

In [226]:
df1 = pd.DataFrame([seq_id, seq_res]).T
df1.columns = ['id', 'res']

In [227]:
df1

Unnamed: 0,id,res
0,55,S
1,56,P
2,57,A
3,58,I
4,59,P
...,...,...
442,343,D
443,344,F
444,345,C
445,346,F


In [228]:
j = 0
for i in range(len(df1)):
    x = df1.iloc[i]
    while(int(df2.iloc[j]['id']) < int(x['id'])):
        j+=1
        print(df2.iloc[j]['id'], x['id'])
    y = df2.iloc[j]
    if x['res'] != y['res']:
        print(x['res'], y['res'])
        print(x['id'], y['id'])
    j+=1

S A
55 57
P I
56 58
A P
57 59
I V
58 60
P I
59 61
V I
60 62
I T
61 63
I A
62 64
T V
63 65
A Y
64 66
V S
65 67
Y V
66 68
S V
67 69
V F
68 70
F V
70 72
V G
71 73
V L
72 74
G V
73 75
L G
74 76
V N
75 77
G S
76 78
N L
77 79
S V
78 80
L M
79 81
V F
80 82
M V
81 83
F I
82 84
V I
83 85
I R
84 86
I Y
85 87
R T
86 88
Y K
87 89
T M
88 90
M T
90 92
K A
91 93
A N
93 95
T I
94 96
N Y
95 97
Y F
97 99
I N
98 100
F L
99 101
N A
100 102
L D
103 105
D L
105 107
A V
106 108
L T
107 109
V T
108 110
T M
110 112
T P
111 113
M F
112 114
P Q
113 115
F S
114 116
Q T
115 117
S V
116 118
T Y
117 119
V L
118 120
Y M
119 121
L N
120 122
M S
121 123
N W
122 124
S P
123 125
W F
124 126
P G
125 127
F D
126 128
G V
127 129
D L
128 130
V C
129 131
L K
130 132
C I
131 133
K V
132 134
V S
134 136
L I
135 137
S D
136 138
I Y
137 139
D Y
138 140
Y N
139 141
Y M
140 142
N F
141 143
M T
142 144
F S
143 145
T I
144 146
S F
145 147
I T
146 148
F L
147 149
L M
149 151
T M
150 152
M S
151 153
M V
152 154
S D
153 155
V R
154 156


IndexError: single positional indexer is out-of-bounds

In [None]:
df['same'] = 

In [None]:
df

In [None]:
eg

In [229]:
data.mappings

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,C562_ECOLX,C562_ECOLX,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,P0ABE7
1,C562_ECOLX,C562_ECOLX,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,P0ABE7
2,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
3,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
4,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
5,SMO_HUMAN,SMO_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",5L7D,Q99835
6,GNAI1_HUMAN,GNAI1_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",6LML,P63096
7,GBB1_HUMAN,GBB1_HUMAN,"{'entity_id': 2, 'end': {'author_residue_numbe...",6LML,P62873
8,GLR_HUMAN,GLR_HUMAN,"{'entity_id': 6, 'end': {'author_residue_numbe...",6LML,P47871
9,GBG2_HUMAN,GBG2_HUMAN,"{'entity_id': 3, 'end': {'author_residue_numbe...",6LML,P59768


In [207]:
pd.DataFrame.from_dict(data.mappings[data.mappings['PDB']==eg]['mappings'])

Unnamed: 0,mappings
11,"{'entity_id': 1, 'end': {'author_residue_numbe..."
12,"{'entity_id': 1, 'end': {'author_residue_numbe..."
13,"{'entity_id': 1, 'end': {'author_residue_numbe..."
14,"{'entity_id': 1, 'end': {'author_residue_numbe..."
15,"{'entity_id': 1, 'end': {'author_residue_numbe..."
16,"{'entity_id': 1, 'end': {'author_residue_numbe..."


fRom mappings I have to choose the correct chain and use it to select the protein region

In [208]:
data.structure

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
0,5L7D,ATOM,A,A,27,58,PRO,1,N,N,-67.220,-6.826,114.969
1,5L7D,ATOM,A,A,27,58,PRO,2,CA,C,-67.148,-8.259,115.290
2,5L7D,ATOM,A,A,27,58,PRO,3,C,C,-65.862,-8.633,116.051
3,5L7D,ATOM,A,A,27,58,PRO,4,O,O,-64.855,-7.926,115.903
4,5L7D,ATOM,A,A,27,58,PRO,5,CB,C,-67.195,-8.927,113.911
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25787,4DJH,HETATM,B,K,.,1315,HOH,7085,O,O,3.315,-71.138,25.416
25788,4DJH,HETATM,B,K,.,1316,HOH,7086,O,O,4.681,-69.269,62.196
25789,4DJH,HETATM,B,K,.,1318,HOH,7087,O,O,39.994,-79.892,18.776
25790,4DJH,HETATM,B,K,.,1319,HOH,7088,O,O,38.265,-63.223,12.142


In [210]:
data.table[data.table['PDB']==eg]

Unnamed: 0,uniprot(gene),Cl.,PDB,Resolution,Preferred Chain,State,Function
476,GLR,B1(Secretin),6LML,3.9,R,Active,Agonist


In [125]:
pref_chain = data.table[data.table['PDB']==eg]['Preferred Chain'].iloc[0]

In [126]:
pref_chain

'R'

In [127]:
data.structure[data.structure['PDB']==eg]['label_asym_id'].unique()

array(['A', 'B', 'C', 'D', 'E', 'F'], dtype=object)

In [128]:
a = data.structure[(data.structure['PDB']==eg) & 
                   (data.structure['label_atom_id']=='CA') & 
                   (data.structure['group_PDB']!='HETATM') &
                   (data.structure['auth_asym_id']==pref_chain)]
b = data.mappings[data.mappings['PDB']==eg]

In [129]:
a

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
15551,6LML,ATOM,R,F,1,27,GLN,6287,CA,C,67.14400,101.14100,93.94000
15560,6LML,ATOM,R,F,2,28,VAL,6296,CA,C,63.73300,102.33500,92.79300
15567,6LML,ATOM,R,F,3,29,MET,6303,CA,C,64.28900,102.51500,89.04200
15575,6LML,ATOM,R,F,4,30,ASP,6311,CA,C,65.34100,98.88300,88.65100
15583,6LML,ATOM,R,F,5,31,PHE,6319,CA,C,62.31300,98.04300,90.75700
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18656,6LML,ATOM,R,F,391,417,ARG,9392,CA,C,124.20500,121.41900,96.08100
18667,6LML,ATOM,R,F,392,418,TRP,9403,CA,C,122.33800,122.56800,92.96800
18681,6LML,ATOM,R,F,393,419,ARG,9417,CA,C,123.98600,126.01800,93.12900
18692,6LML,ATOM,R,F,394,420,LEU,9428,CA,C,126.75600,124.79800,90.81000


In [130]:
b.iloc[0]

identifier                                          GNAI1_HUMAN
name                                                GNAI1_HUMAN
mappings      {'entity_id': 1, 'end': {'author_residue_numbe...
PDB                                                        6LML
uniprot                                                  P63096
Name: 6, dtype: object

In [131]:
correct = []
for d in range(len(b)):
    chain = pd.DataFrame.from_dict(b.iloc[d]['mappings'])['chain_id'].iloc[0]
    # if chain == pref_chain:
    correct.append(pd.DataFrame.from_dict(b.iloc[d]['mappings']))

In [132]:
tot = pd.concat(correct)

In [133]:
tot

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,354.0,A,,354,1,A
author_insertion_code,1,,A,,354,1,A
residue_number,1,354.0,A,1.0,354,1,A
author_residue_number,2,340.0,B,,340,2,B
author_insertion_code,2,,B,,340,2,B
residue_number,2,351.0,B,13.0,340,2,B
author_residue_number,6,,R,27.0,432,27,F
author_insertion_code,6,,R,,432,27,F
residue_number,6,406.0,R,1.0,432,27,F
author_residue_number,3,,C,,71,1,C


In [134]:
tot[tot['chain_id']==pref_chain]

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,6,,R,27.0,432,27,F
author_insertion_code,6,,R,,432,27,F
residue_number,6,406.0,R,1.0,432,27,F
