In [1]:
from utils import *
from gpcrdb_soup import *
import gemmi

In [2]:
from Bio.PDB.MMCIFParser import MMCIFParser
parser = MMCIFParser()

In [3]:
eg = '1f88'

In [4]:
path = get_rcsb_download(eg, fileformat = 'cif')

In [5]:
path

'https://files.rcsb.org/download/1f88.cif'

In [6]:

def download(url: str, folder: str, fileformat: str):
    if not os.path.isdir(folder):
        os.mkdir(folder)
    try:
        r = requests.get(url)
        loc = len(fileformat)+1
        fname = folder + '/' + url[-(loc+4):-loc] + '.' + fileformat
        with open(fname, 'wb') as f:
            f.write(r.content)
    except Exception:
        print("Url invalid:", url)

    
def download_pdb(url, folder, fileformat):
    download(url, folder, fileformat)

In [7]:
download(url=path, folder='data/mmcif', fileformat='cif')

In [8]:
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
import pandas as pd
d = MMCIF2Dict("data/mmcif/"+eg+".cif")
pd.DataFrame({k:v for k,v in d.items() if "_atom_site." in k})

Unnamed: 0,_atom_site.group_PDB,_atom_site.id,_atom_site.type_symbol,_atom_site.label_atom_id,_atom_site.label_alt_id,_atom_site.label_comp_id,_atom_site.label_asym_id,_atom_site.label_entity_id,_atom_site.label_seq_id,_atom_site.pdbx_PDB_ins_code,...,_atom_site.Cartn_y,_atom_site.Cartn_z,_atom_site.occupancy,_atom_site.B_iso_or_equiv,_atom_site.pdbx_formal_charge,_atom_site.auth_seq_id,_atom_site.auth_comp_id,_atom_site.auth_asym_id,_atom_site.auth_atom_id,_atom_site.pdbx_PDB_model_num
0,ATOM,1,N,N,.,MET,A,1,1,?,...,-5.980,-27.758,1.00,54.29,?,1,MET,A,N,1
1,ATOM,2,C,CA,.,MET,A,1,1,?,...,-5.054,-26.911,1.00,53.52,?,1,MET,A,CA,1
2,ATOM,3,C,C,.,MET,A,1,1,?,...,-4.848,-25.543,1.00,52.77,?,1,MET,A,C,1
3,ATOM,4,O,O,.,MET,A,1,1,?,...,-4.618,-25.451,1.00,51.10,?,1,MET,A,O,1
4,ATOM,5,C,CB,.,MET,A,1,1,?,...,-3.699,-27.610,1.00,53.58,?,1,MET,A,CB,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5262,HETATM,5263,O,O,.,HOH,T,7,.,?,...,-4.467,38.118,1.00,32.05,?,961,HOH,B,O,1
5263,HETATM,5264,O,O,.,HOH,T,7,.,?,...,4.389,47.804,1.00,38.08,?,969,HOH,B,O,1
5264,HETATM,5265,O,O,.,HOH,T,7,.,?,...,26.355,-7.866,1.00,28.34,?,973,HOH,B,O,1
5265,HETATM,5266,O,O,.,HOH,T,7,.,?,...,7.105,39.757,1.00,53.03,?,975,HOH,B,O,1


In [9]:
structure = parser.get_structure("", "data/mmcif/"+eg+".cif")


In [10]:
import nglview as nv
view = nv.show_biopython(structure)



In [11]:
view

NGLWidget()

In [12]:
cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'auth_seq_id', 'label_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']

In [13]:
import sys
import gemmi
from gemmi import cif


print("Loading cif file of", eg)
path = 'data/mmcif/'+eg+'.cif'
try:
    doc = cif.read_file(path)  # copy all the data from mmCIF file
    lol = []  # list of lists
    for b, block in enumerate(doc):
        table = block.find('_atom_site.', cols)
        for row in table:
            lol.append(list(row))
except Exception as e:
    print("Oops. %s" % e)
    sys.exit(1)

Loading cif file of 1f88


In [14]:
cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'auth_seq_id', 'label_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'atom_type', 'x', 'y', 'z']
df = pd.DataFrame(data=lol, columns=cols)

In [15]:
def fix_label_seq_id(x):
    try:
        return int(x)
    except:
        return None

In [16]:
df

Unnamed: 0,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z
0,ATOM,A,A,1,1,MET,1,N,N,43.958,-5.980,-27.758
1,ATOM,A,A,1,1,MET,2,CA,C,44.718,-5.054,-26.911
2,ATOM,A,A,1,1,MET,3,C,C,44.069,-4.848,-25.543
3,ATOM,A,A,1,1,MET,4,O,O,42.854,-4.618,-25.451
4,ATOM,A,A,1,1,MET,5,CB,C,44.868,-3.699,-27.610
...,...,...,...,...,...,...,...,...,...,...,...,...
5262,HETATM,B,T,961,.,HOH,5263,O,O,33.819,-4.467,38.118
5263,HETATM,B,T,969,.,HOH,5264,O,O,66.200,4.389,47.804
5264,HETATM,B,T,973,.,HOH,5265,O,O,26.109,26.355,-7.866
5265,HETATM,B,T,975,.,HOH,5266,O,O,66.293,7.105,39.757


In [17]:
df.groupby('auth_seq_id')['label_comp_id'].first()

auth_seq_id
1      MET
10     TYR
100    HIS
101    GLY
102    TYR
      ... 
976    HOH
977    RET
978    RET
98     SER
99     LEU
Name: label_comp_id, Length: 377, dtype: object

In [18]:
df.groupby('label_atom_id').sum()

Unnamed: 0_level_0,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_seq_id,label_comp_id,id,atom_type,x,y,z
label_atom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,1234567891011121314151617181920212223242526272...,1234567891011121314151617181920212223242526272...,METASNGLYTHRGLUGLYPROASNPHETYRVALPROPHESERASNL...,3111923303943505869818895106112120129136140147...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,44.06944.47444.97545.40947.14648.52050.49650.0...,-4.848-3.296-0.1932.7754.6517.63010.07810.0956...,-25.543-22.697-24.576-25.508-27.612-28.257-28....
C1,HETATMHETATMHETATMHETATMHETATMHETATMHETATMHETA...,CCDDEEEFFAB,CCDDEEEFFLR,121212312977978,...........,NAGNAGNAGNAGNAGNAGMANNAGNAGRETRET,50685082509651105124513851525163517751965221,CCCCCCCCCCC,37.62035.56447.22550.56859.51262.86964.48246.8...,3.7577.698-8.617-12.3540.9713.7248.734-9.992-1...,-28.808-31.709-23.091-21.37241.02843.58444.035...
C10,HETATMHETATM,AB,LR,977978,..,RETRET,52055230,CC,57.62638.639,10.21111.044,-10.42426.150
C11,HETATMHETATM,AB,LR,977978,..,RETRET,52065231,CC,56.58239.898,10.87111.497,-11.23626.800
C12,HETATMHETATM,AB,LR,977978,..,RETRET,52075232,CC,55.33541.161,11.24311.577,-10.84826.282
...,...,...,...,...,...,...,...,...,...,...,...
OH,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBB,AAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBB,1029304360749610213617819119220622326827430130...,1029304360749610213617819119220622326827430130...,TYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRTYRT...,7822824034749060977982610871410150515171635178...,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO,48.67134.61339.66949.37859.44374.24639.29738.2...,-1.77713.8576.17010.53036.32729.09324.30218.60...,-22.305-12.642-20.924-1.0399.6573.050-14.559-1...
OXT,ATOM,A,A,348,348,ALA,2638,O,90.145,31.762,16.624
SD,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAAAAAAAABBBBBBBBBBBBBBB,AAAAAAAAAAAAAAAABBBBBBBBBBBBBBB,1394449861431551631832072532572883083093171394...,1394449861431551631832072532572883083093171394...,METMETMETMETMETMETMETMETMETMETMETMETMETMETMETM...,7316354397700113712361299144516421976200722572...,SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS,46.86337.63247.72743.61957.66089.57179.47374.0...,-4.87610.29514.55422.26416.85415.58025.37714.1...,-29.066-0.070-4.4345.183-4.2293.163-4.682-11.0...
SG,ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMAT...,AAAAAAAAAABBBBBBBBBB,AAAAAAAAAABBBBBBBBBB,1101401671851872222643163223231101401671851872...,1101401671851872222643163223231101401671851872...,CYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSCYSC...,8831114132414611473177120612486252925353521375...,SSSSSSSSSSSSSSSSSSSS,51.21984.37465.32742.66950.40378.02856.89866.5...,14.0094.84411.23911.95013.3177.4468.87227.5622...,-18.3868.988-13.144-12.401-16.6654.194-0.22016...


In [19]:
df['label_seq_id']=df.apply(lambda x: fix_label_seq_id(x.label_seq_id), axis=1)

Columns
group_PDB 	id 	type_symbol 	label_atom_id 	label_alt_id 	label_comp_id 	label_asym_id 	label_entity_id 	label_seq_id 	pdbx_PDB_ins_code 	Cartn_x 	Cartn_y 	Cartn_z 	occupancy 	B_iso_or_equiv 	Cartn_x_esd 	Cartn_y_esd 	Cartn_z_esd 	occupancy_esd 	B_iso_or_equiv_esd 	pdbx_formal_charge 	auth_seq_id 	auth_comp_id 	auth_asym_id 	auth_atom_id 	pdbx_PDB_model_num

In [20]:
import sys
from math import degrees
import gemmi

ramas = {aa: [] for aa in [
    'LEU', 'ALA', 'GLY', 'VAL', 'GLU', 'SER', 'LYS', 'ASP', 'THR', 'ILE',
    'ARG', 'PRO', 'ASN', 'PHE', 'GLN', 'TYR', 'HIS', 'MET', 'CYS', 'TRP']}

rol = []
for path in ['data/mmcif/'+eg+'.cif']:
    st = gemmi.read_structure(path)
    if 0.1 < st.resolution < 5:
        model = st[0]
        if len(st) > 1:
            print("There are multiple models!")
        for chain in model:
            for r, res in enumerate(chain.get_polymer()):
                # previous_residue() and next_residue() return previous/next
                # residue only if the residues are bonded. Otherwise -- None.
                prev_res = chain.previous_residue(res)
                next_res = chain.next_residue(res)
                phi, psi = gemmi.calculate_phi_psi(prev_res, res, next_res)
                omega = gemmi.calculate_omega(res, next_res)
                rol.append([res.name, res.label_seq, res.subchain, 
                            degrees(phi), degrees(omega), degrees(psi)])
                try:
                    ramas[res.name].append([degrees(phi), degrees(omega), degrees(psi)])
                except KeyError:
                    pass

In [21]:
"""
# Write data to files
for aa, data in ramas.items():
    with open('data/ramach/' + aa + '.tsv', 'w') as f:
        for phi, omega, psi in data:
            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"""

"\n# Write data to files\nfor aa, data in ramas.items():\n    with open('data/ramach/' + aa + '.tsv', 'w') as f:\n        for phi, omega, psi in data:\n            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"

In [22]:
cols = ['name', 'label_seq_id', 'chain', 'phi', 'omega', 'psi']

In [23]:
res_df = pd.DataFrame(data=rol, columns=cols)

In [24]:
res_df

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi
0,MET,1,A,,-179.718577,133.146625
1,ASN,2,A,-87.567941,179.953276,-5.295383
2,GLY,3,A,-150.672305,178.921162,143.916355
3,THR,4,A,-85.441746,178.477151,112.209423
4,GLU,5,A,-96.636867,178.659871,123.041558
...,...,...,...,...,...,...
638,CYS,322,B,-74.457863,179.107101,-7.588421
639,CYS,323,B,65.675548,179.210471,29.106972
640,GLY,324,B,134.859554,-179.348699,-24.685362
641,LYS,325,B,-77.173222,-179.767435,94.072619


In [25]:
full = pd.merge(res_df, df, on=['label_seq_id', 'label_seq_id'])

In [26]:
import numpy as np
full = full.astype({'label_seq_id': np.int16, 'auth_seq_id': np.int16, 'id': np.int16, 
                    'phi': np.float32, 'omega': np.float32, 'psi': np.float32,
                    'x': np.float32, 'y': np.float32, 'z': np.float32})

In [27]:
def get_atom_id(x):
    atom_dict = {'C': 0, 'O': 1, 'N': 2, 'H': 3, 'S': 4}
    try:
        return atom_dict[str(x)]
    except:
        return 5
full['atom_id'] = full.apply(lambda x: get_atom_id(x.atom_type), axis=1)

In [28]:
full

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi,group_PDB,auth_asym_id,label_asym_id,auth_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z,atom_id
0,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,1,N,N,43.958000,-5.980000,-27.757999,2
1,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,2,CA,C,44.717999,-5.054000,-26.910999,0
2,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,3,C,C,44.069000,-4.848000,-25.542999,0
3,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,4,O,O,42.854000,-4.618000,-25.451000,1
4,MET,1,A,,-179.718582,133.146622,ATOM,A,A,1,MET,5,CB,C,44.868000,-3.699000,-27.610001,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9920,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2634,CA,C,88.039001,30.749001,17.236000,0
9921,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2635,C,C,89.197998,31.023001,16.242001,0
9922,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2636,O,O,89.163002,30.493000,15.102000,1
9923,ALA,348,A,-165.916489,,,ATOM,A,A,348,ALA,2637,CB,C,88.360001,29.510000,18.103001,0


In [29]:
xyz = full[['x', 'y', 'z']].to_numpy(dtype=float)

In [30]:
ids = full[['label_seq_id', 'auth_seq_id']].to_numpy(dtype='int')

In [31]:
atoms = full['atom_id'].to_numpy(dtype=int)

In [32]:
xyz

array([[ 43.95800018,  -5.98000002, -27.75799942],
       [ 44.7179985 ,  -5.0539999 , -26.9109993 ],
       [ 44.06900024,  -4.84800005, -25.54299927],
       ...,
       [ 89.16300201,  30.49300003,  15.10200024],
       [ 88.36000061,  29.51000023,  18.10300064],
       [ 90.14499664,  31.76199913,  16.62400055]])

In [33]:
ids

array([[  1,   1],
       [  1,   1],
       [  1,   1],
       ...,
       [348, 348],
       [348, 348],
       [348, 348]])

In [34]:
atoms[:100]

array([2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0,
       4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1,
       0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0,
       0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 2,
       0, 0, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1])

In [35]:
"""import plotly.express as px
select = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]
df = px.data.iris()
fig = px.scatter_3d(select, x='x', y='y', z='z',
                    color='atom_type', opacity=0.5)
fig.update_traces(marker=dict(size=7))
fig.show()"""

"import plotly.express as px\nselect = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]\ndf = px.data.iris()\nfig = px.scatter_3d(select, x='x', y='y', z='z',\n                    color='atom_type', opacity=0.5)\nfig.update_traces(marker=dict(size=7))\nfig.show()"

In [36]:
"""import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator


def plot(data_file, label, output=None):
    x, y = [], []
    for line in open(data_file):
        phi, psi = line.split()
        if phi != 'nan' and psi != 'nan':
            x.append(float(phi))
            y.append(float(psi))
    print('Plotting %d points for %s' % (len(x), label))

    plt.figure(figsize=(5.5, 5.5))
    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)
    plt.xlim([-180, 180])
    plt.ylim([-180, 180])
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(60))
    ax.yaxis.set_major_locator(MultipleLocator(60))
    plt.xlabel(r'$\phi$', fontsize=14)
    plt.ylabel(r'$\psi$', fontsize=14, labelpad=0)
    plt.grid(color='#AAAAAA', linestyle='--')
    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')
    if output:
        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs
    else:
        plt.show()


for aa in ramas:
    plot('data/ramach/%s.tsv' % aa, aa)
    #plot('ramas/%s.tsv' % aa, aa, 'ramas/%s.png' % aa)"""

"import sys\nimport matplotlib.pyplot as plt\nfrom matplotlib.ticker import MultipleLocator\n\n\ndef plot(data_file, label, output=None):\n    x, y = [], []\n    for line in open(data_file):\n        phi, psi = line.split()\n        if phi != 'nan' and psi != 'nan':\n            x.append(float(phi))\n            y.append(float(psi))\n    print('Plotting %d points for %s' % (len(x), label))\n\n    plt.figure(figsize=(5.5, 5.5))\n    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)\n    plt.xlim([-180, 180])\n    plt.ylim([-180, 180])\n    ax = plt.gca()\n    ax.xaxis.set_major_locator(MultipleLocator(60))\n    ax.yaxis.set_major_locator(MultipleLocator(60))\n    plt.xlabel(r'$\\phi$', fontsize=14)\n    plt.ylabel(r'$\\psi$', fontsize=14, labelpad=0)\n    plt.grid(color='#AAAAAA', linestyle='--')\n    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')\n    if output:\n        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs\n    else:\n        plt.show()\

In [37]:
uniprot = pdbtouniprot('4JKV')

In [38]:
uniprot

'P0ABE7'

In [39]:
pdb = uniprottopdb(uniprot)

In [40]:
pdb

['1APC',
 '1LM3',
 '1M6T',
 '1QPU',
 '1QQ3',
 '256B',
 '2BC5',
 '2QLA',
 '3C62',
 '3C63',
 '3DE8',
 '3DE9',
 '3FOO',
 '3FOP',
 '3HNI',
 '3HNJ',
 '3HNK',
 '3HNL',
 '3IQ5',
 '3IQ6',
 '3L1M',
 '3M15',
 '3M4B',
 '3M4C',
 '3M79',
 '3NMI',
 '3NMJ',
 '3NMK',
 '3TOL',
 '3TOM',
 '3U8P',
 '4EA3',
 '4EIY',
 '4IAQ',
 '4IAR',
 '4IB4',
 '4JE9',
 '4JEA',
 '4JEB',
 '4JKV',
 '4L6R',
 '4N6H',
 '4NC3',
 '4NTJ',
 '4O9R',
 '4OR2',
 '4PXZ',
 '4PY0',
 '4QIM',
 '4QIN',
 '4RWA',
 '4RWD',
 '4U9D',
 '4U9E',
 '4YAY',
 '4Z34',
 '4Z35',
 '4Z36',
 '4ZUD',
 '5AWI',
 '5BU7',
 '5DHG',
 '5DHH',
 '5IU4',
 '5IU7',
 '5IU8',
 '5IUA',
 '5IUB',
 '5JTB',
 '5K2A',
 '5K2B',
 '5K2C',
 '5K2D',
 '5L31',
 '5L32',
 '5L7D',
 '5L7I',
 '5MZJ',
 '5MZP',
 '5N2R',
 '5N2S',
 '5NDD',
 '5NDZ',
 '5NJ6',
 '5NLX',
 '5NM2',
 '5NM4',
 '5OLG',
 '5OLH',
 '5OLO',
 '5OLV',
 '5OLZ',
 '5OM1',
 '5OM4',
 '5TUD',
 '5TVN',
 '5UEN',
 '5UIG',
 '5UNF',
 '5UNG',
 '5UNH',
 '5UVI',
 '5VRA',
 '5WIU',
 '5WIV',
 '5XJM',
 '5XZI',
 '5XZJ',
 '5YM7',
 '5YO3',
 '5YO4',
 

In [41]:
from utils3 import *


pdb_id_list = ['2RH1', '1HZX', '1GZM']

In [42]:
import pandas as pd


for j in range(len(pdb_id_list)):
    pdb_id = pdb_id_list[j]
    maps = get_mappings_data(pdb_id)[pdb_id.lower()]['UniProt']
    uniprots = maps.keys()
    for i, uniprot in enumerate(uniprots):
        table = pd.DataFrame.from_dict(maps[uniprot])
        table['PDB'] = pdb_id
        table['uniprot'] = uniprot
        if i + j == 0:
            full_table = table
        else:
            full_table = full_table.append(table, ignore_index=True)

In [43]:
full_table

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,ADRB2_HUMAN,ADRB2_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P07550
1,ADRB2_HUMAN,ADRB2_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P07550
2,ENLYS_BPT4,ENLYS_BPT4,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P00720
3,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",1HZX,P02699
4,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'B', 'start': {'a...",1HZX,P02699
5,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",1GZM,P02699
6,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'B', 'start': {'a...",1GZM,P02699


In [44]:
pd.DataFrame.from_dict(full_table.iloc[0]['mappings'])

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,230.0,A,,230,1,A
author_insertion_code,1,,A,,230,1,A
residue_number,1,237.0,A,8.0,230,1,A


In [45]:
pd.DataFrame.from_dict(full_table.iloc[1]['mappings'])

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,,A,264.0,365,264,A
author_insertion_code,1,,A,,365,264,A
residue_number,1,500.0,A,399.0,365,264,A


In [316]:
from utils import *
from utils2 import *
from utils3 import *
from plotting import *
from gpcrdb_soup import *

In [317]:
from tqdm import tqdm, trange

In [318]:
import sys
from gemmi import cif

In [319]:
class DataLoader():
    def __init__(self, 
                 path = 'data/',
                 structure = 'mmcif/',
                 limit=10,
                 remove_hetatm=True):
        self.path = path
        self.structure_path = self.path + structure
        self.path_table = path + 'gpcrdb/' + 'structures.pkl'
        
        self.filenames, self.pdb_ids = get_pdb_files(path=self.structure_path)
        # Columns for structure dataframe
        self.cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 
                     'label_comp_id', 'id', 'label_atom_id', 
                     'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']
        
        self.numbering = pd.DataFrame()
        self.table = pd.read_pickle(self.path_table)
        for i, pdb_id in tqdm(enumerate(self.pdb_ids)):
            if i < limit:
                protein, family = self.get_prot_info(pdb_id)
                numbering = self.get_res_nums(protein)
                if i == 0:
                    self.structure = self.load_cifs(pdb_id)
                    self.structure['identifier'] = protein.upper()
                    self.mappings = self.get_mapping(pdb_id)
                    numb = [pdb_id, protein, family, numbering]
                    # numb = [pdb_id, protein, self.entry_to_ac(protein), family, numbering]
                    self.numbering = self.numbering.append(pd.DataFrame(numb).T)
                else:
                    structure = self.load_cifs(pdb_id)
                    structure['identifier'] = protein.upper()
                    self.structure = self.structure.append(structure, ignore_index=True)
                    self.mappings = self.mappings.append(self.get_mapping(pdb_id), ignore_index=True)
                    # self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, self.entry_to_ac(protein),
                    #                                                           family, numbering]).T, 
                    self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, family, numbering]).T, 
                                                           ignore_index=True)
        self.numbering.columns = ['PDB', 'identifier', 'family', 'numbering']
        
        if remove_hetatm:
            self.structure = self.structure[self.structure['group_PDB']!='HETATM']
            self.structure['label_seq_id'] = self.structure['label_seq_id'].astype(np.int64)
        self.structure['label_comp_sid'] = self.structure.apply(lambda x: \
                                                                gemmi.find_tabulated_residue(x.label_comp_id).one_letter_code, 
                                                                axis=1)
        # self.numbering.columns = ['pdb', 'identifier', 'uniprot', 'family', 'numbering']
        # self.structure['auth_seq_id'] = self.structure['auth_seq_id'].astype(np.int8)
        # self.structure['id'] = self.structure['id'].astype(np.int8)
        # self.structure['Cartn_x'] = self.structure['Cartn_x'].astype(np.float64)
        # self.structure['Cartn_y'] = self.structure['Cartn_y'].astype(np.float64)
        # self.structure['Cartn_z'] = self.structure['Cartn_z'].astype(np.float64)
        
        
    def entry_to_ac(self, entry: str):
        response = requests.get('https://www.uniprot.org/uniprot/'+entry+'.txt')
        return response.text.split('\n')[1].split('AC   ')[1][:6]
    
    def get_prot_info(self, pdb_id):
        # query structure
        response = requests.get('https://gpcrdb.org/services/structure/'+pdb_id.upper()+'/')
        protein = response.json()['protein']
        family = response.json()['family']
        return protein, family
    
    def get_res_nums(self, protein):
        # query uniprot -> res num
        response = requests.get('https://gpcrdb.org/services/residues/extended/'+protein+'/')    
        # select res num
        # assign res_num to structure data
        return response.json()
    
    def get_mapping(self, pdb_id):
        maps = get_mappings_data(pdb_id)[pdb_id.lower()]['UniProt']
        uniprots = maps.keys()
        full_table=pd.DataFrame()
        for i, uniprot in enumerate(uniprots):
            table = pd.DataFrame.from_dict(maps[uniprot])
            table['PDB'] = pdb_id
            table['uniprot'] = uniprot
            if i + j == 0:
                full_table = table
            else:
                full_table = full_table.append(table, ignore_index=True)
        return full_table
    
    def load_cifs(self, pdb_id):
        print("Loading cif file of", pdb_id)
        path = 'data/mmcif/' + pdb_id + '.cif'
        print(path)
        try:
            doc = cif.read_file(path)  # copy all the data from mmCIF file
            lol = []  # list of lists
            for b, block in enumerate(doc):
                table = block.find('_atom_site.', self.cols)
                for row in table:
                    lol.append([pdb_id]+list(row))
        except Exception as e:
            print("Oops. %s" % e)
            sys.exit(1)
        cols = ['PDB']+self.cols
        return pd.DataFrame(data=lol, columns=cols)

In [320]:
data = DataLoader(limit=2)

0it [00:00, ?it/s]

Loading cif file of 4XNV
data/mmcif/4XNV.cif


1it [00:11, 11.09s/it]

Loading cif file of 6C1R
data/mmcif/6C1R.cif


528it [00:21, 24.82it/s]


In [321]:
def lookup(pdb, identifier, mappings):
    return list(mappings[(mappings['identifier']==identifier.upper()) & (mappings['PDB']==pdb)]['mappings'])

In [322]:
data.structure

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z,identifier,label_comp_sid
0,4XNV,ATOM,A,A,38,39,SER,1,N,N,-1.110,11.647,-24.095,P2RY1_HUMAN,S
1,4XNV,ATOM,A,A,38,39,SER,2,CA,C,-1.527,11.293,-22.741,P2RY1_HUMAN,S
2,4XNV,ATOM,A,A,38,39,SER,3,C,C,-3.051,11.212,-22.626,P2RY1_HUMAN,S
3,4XNV,ATOM,A,A,38,39,SER,4,O,O,-3.732,12.245,-22.573,P2RY1_HUMAN,S
4,4XNV,ATOM,A,A,38,39,SER,5,CB,C,-0.955,12.276,-21.719,P2RY1_HUMAN,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5955,6C1R,ATOM,L,B,7,6,ARG,2842,CD,C,0.728,6.459,-25.367,C5AR1_HUMAN,R
5956,6C1R,ATOM,L,B,7,6,ARG,2843,NE,N,-0.170,7.588,-25.618,C5AR1_HUMAN,R
5957,6C1R,ATOM,L,B,7,6,ARG,2844,CZ,C,0.133,8.635,-26.382,C5AR1_HUMAN,R
5958,6C1R,ATOM,L,B,7,6,ARG,2845,NH1,N,-0.745,9.616,-26.542,C5AR1_HUMAN,R


In [285]:
mappings

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,P2RY1_HUMAN,P2RY1_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",4XNV,P47900
1,P2RY1_HUMAN,P2RY1_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",4XNV,P47900
2,RUBR_CLOPA,RUBR_CLOPA,"{'entity_id': 1, 'end': {'author_residue_numbe...",4XNV,P00268


In [127]:
# df = pd.merge(data.table, data.structure, on='PDB')
# df = pd.merge(df, data.numbering, on='PDB')

In [307]:
for i, pdb in enumerate(list(set(list(data.structure['PDB'])))):
    data.structure[data.structure['identifier']] = data.table[data.table['PDB']==pdb]['uniprot(gene)'].iloc[0]

'C5AR1'

In [323]:
# add gene to mapping

for i, pdb in enumerate(list(set(list(data.structure['PDB'])))):
    mappings = data.mappings[data.mappings['PDB']==pdb]
    pref_chain = data.table[data.table['PDB']==pdb]['Preferred Chain'].iloc[0]
    map_df_list = []
    identifier_list = []
    for j in range(len(mappings)):
        chain = pd.DataFrame.from_dict(mappings.iloc[j]['mappings'])['chain_id'].iloc[0]
        identifier = mappings.iloc[j]['name']
        dict_ = pd.DataFrame.from_dict(mappings.iloc[j]['mappings'])
        dict_['identifier'] = identifier
        map_df_list.append(pd.DataFrame.from_dict(dict_))
    _ = pd.concat(map_df_list)
    _ = _[_['chain_id']==pref_chain]
    _['PDB'] = pdb
    if i == 0:
        maps_stacked = _
    else:
        maps_stacked = maps_stacked.append(_)

In [324]:
data.structure

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z,identifier,label_comp_sid
0,4XNV,ATOM,A,A,38,39,SER,1,N,N,-1.110,11.647,-24.095,P2RY1_HUMAN,S
1,4XNV,ATOM,A,A,38,39,SER,2,CA,C,-1.527,11.293,-22.741,P2RY1_HUMAN,S
2,4XNV,ATOM,A,A,38,39,SER,3,C,C,-3.051,11.212,-22.626,P2RY1_HUMAN,S
3,4XNV,ATOM,A,A,38,39,SER,4,O,O,-3.732,12.245,-22.573,P2RY1_HUMAN,S
4,4XNV,ATOM,A,A,38,39,SER,5,CB,C,-0.955,12.276,-21.719,P2RY1_HUMAN,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5955,6C1R,ATOM,L,B,7,6,ARG,2842,CD,C,0.728,6.459,-25.367,C5AR1_HUMAN,R
5956,6C1R,ATOM,L,B,7,6,ARG,2843,NE,N,-0.170,7.588,-25.618,C5AR1_HUMAN,R
5957,6C1R,ATOM,L,B,7,6,ARG,2844,CZ,C,0.133,8.635,-26.382,C5AR1_HUMAN,R
5958,6C1R,ATOM,L,B,7,6,ARG,2845,NH1,N,-0.745,9.616,-26.542,C5AR1_HUMAN,R


In [325]:
maps_stacked

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id,identifier,PDB
author_residue_number,1,,B,116.0,331,30,A,C5AR1_HUMAN,6C1R
author_insertion_code,1,,B,,331,30,A,C5AR1_HUMAN,6C1R
residue_number,1,417.0,B,116.0,331,30,A,C5AR1_HUMAN,6C1R
author_residue_number,1,,B,,128,23,A,C562_ECOLX,6C1R
author_insertion_code,1,,B,,128,23,A,C562_ECOLX,6C1R
residue_number,1,115.0,B,10.0,128,23,A,C562_ECOLX,6C1R
author_residue_number,1,247.0,A,,247,2,A,P2RY1_HUMAN,4XNV
author_insertion_code,1,,A,,247,2,A,P2RY1_HUMAN,4XNV
residue_number,1,246.0,A,1.0,247,2,A,P2RY1_HUMAN,4XNV
author_residue_number,1,,A,253.0,373,253,A,P2RY1_HUMAN,4XNV


In [326]:
def get_generic_nums(pdb_id, numbering):
    sequence_numbers = []
    amino_acids = []
    generic_numbers = []
    for i in numbering[numbering['PDB']==pdb_id].iloc[0]['numbering']:
        if i['alternative_generic_numbers'] != []:
            sequence_numbers.append(i['sequence_number'])
            amino_acids.append(i['amino_acid'])
            generic_numbers.append(i['display_generic_number'])
    return list(zip(sequence_numbers, amino_acids, generic_numbers))

In [344]:
eg = '5.40x41'

In [346]:
eg.split('x')

['5.40', '41']

In [349]:
data.structure['label_2_uni'] = -1
data.structure['gen_pos'] = ''
data.structure['gen_pos1'] = -1
data.structure['gen_pos2'] = -1
data.structure['uniprot_comp_sid'] = ''
data.structure['label_comp_sid'] = data.structure.apply(lambda x: \
                                                        gemmi.find_tabulated_residue(x.label_comp_id).one_letter_code, axis=1)


def get_pos(zipped_pos_dict, l2u, comp_sid):
    # nums, aas, gn
    if l2u >= 0:
        if l2u in list(zip(*zipped_pos_dict))[0]:
            print('found idx')
            idx = list(zip(*zipped_pos_dict))[0].index(l2u)
            row = zipped_pos_dict[idx]
            if row[1] == comp_sid:
                print("found row", row, comp_sid)
                
                return [row[2], row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1])]
            else:                
                print("found row, but residue are not the same", row, comp_sid)
                return [row[2]+'?', row[1], float(row[2].split('x')[0]), int(row[2].split('x')[1])]
        else:
            return ['', '', -1., -1]
    else:
        return ['', '', -1., -1]



for i, pdb in enumerate(list(set(list(data.structure['PDB'])))):
    uniprot_list = []
    pref_mapping = maps_stacked[maps_stacked['PDB']==pdb].loc['residue_number'][['start','end','unp_start','unp_end', 'identifier']]
    pref_mapping = pref_mapping.sort_values('start')
    zipped_list = get_generic_nums(pdb, data.numbering)
    uniprot_identifier = data.structure[data.structure['PDB']==pdb]['identifier'].iloc[0]
    print(uniprot_identifier)
        
    for j in range(len(pref_mapping)):
        row = pref_mapping.iloc[j].to_dict()
        map_identifier = row['identifier']
        start_label_seq_id = row['start']
        start_uniprot = row['unp_start']
        end_label_seq_id = row['end']
        end_uniprot = row['unp_end']
        if map_identifier == uniprot_identifier:
            print('Found correct uniprot map:', uniprot_identifier, map_identifier)
            idxs = [x for x in range(10000) \
                    if ((x <= end_label_seq_id) & (x >= start_label_seq_id))]
            vals = [x + start_uniprot - start_label_seq_id for x in range(5000) \
                    if ((x <= end_label_seq_id) & (x >= start_label_seq_id))]

            for k, idx in enumerate(idxs):
                line = data.structure[(data.structure['PDB'] == pdb) & 
                                      (data.structure['label_seq_id'] == idx) &
                                      (data.structure['label_atom_id'] == 'CA')]
                lines = len(line)
                if lines > 0:
                    data.structure.at[line.index[0], 'label_2_uni'] = int(vals[k])
        else:
            print('Didnt find correct uniprotmap (not a gpcr):', uniprot_identifier, map_identifier)
    # statement to drive labelling of residues by gene only if gene is a gpcr
    data.structure[['gen_pos', 'uniprot_comp_sid', 'gen_pos1', 'gen_pos2']] = data.structure.\
        apply(lambda x: get_pos(zipped_list, x.label_2_uni, x.label_comp_sid), axis=1, result_type='expand')

C5AR1_HUMAN
Didnt find correct uniprotmap (not a gpcr): C5AR1_HUMAN C562_ECOLX
Found correct uniprot map: C5AR1_HUMAN C5AR1_HUMAN
found idx
found row (34, 'R', '1.29x29') R
found idx
found row (35, 'V', '1.30x30') V
found idx
found row (36, 'P', '1.31x31') P
found idx
found row (37, 'D', '1.32x32') D
found idx
found row (38, 'I', '1.33x33') I
found idx
found row (39, 'L', '1.34x34') L
found idx
found row (40, 'A', '1.35x35') A
found idx
found row (41, 'L', '1.36x36') L
found idx
found row (42, 'V', '1.37x37') V
found idx
found row (43, 'I', '1.38x38') I
found idx
found row (44, 'F', '1.39x39') F
found idx
found row (45, 'A', '1.40x40') A
found idx
found row (46, 'V', '1.41x41') V
found idx
found row (47, 'V', '1.42x42') V
found idx
found row (48, 'F', '1.43x43') F
found idx
found row (49, 'L', '1.44x44') L
found idx
found row (50, 'V', '1.45x45') V
found idx
found row (51, 'G', '1.46x46') G
found idx
found row (52, 'V', '1.47x47') V
found idx
found row (53, 'L', '1.48x48') L
found idx


P2RY1_HUMAN
Found correct uniprot map: P2RY1_HUMAN P2RY1_HUMAN
Didnt find correct uniprotmap (not a gpcr): P2RY1_HUMAN RUBR_CLOPA
Found correct uniprot map: P2RY1_HUMAN P2RY1_HUMAN
found idx
found row (49, 'F', '1.30x30') F
found idx
found row (50, 'Q', '1.31x31') Q
found idx
found row (51, 'F', '1.32x32') F
found idx
found row (52, 'Y', '1.33x33') Y
found idx
found row (53, 'Y', '1.34x34') Y
found idx
found row (54, 'L', '1.35x35') L
found idx
found row (55, 'P', '1.36x36') P
found idx
found row (56, 'A', '1.37x37') A
found idx
found row (57, 'V', '1.38x38') V
found idx
found row (58, 'Y', '1.39x39') Y
found idx
found row (59, 'I', '1.40x40') I
found idx
found row (60, 'L', '1.41x41') L
found idx
found row (61, 'V', '1.42x42') V
found idx
found row (62, 'F', '1.43x43') F
found idx
found row (63, 'I', '1.44x44') I
found idx
found row (64, 'I', '1.45x45') I
found idx
found row (65, 'G', '1.46x46') G
found idx
found row (66, 'F', '1.47x47') F
found idx
found row (67, 'L', '1.48x48') L
fo

In [350]:
data.structure[data.structure['label_atom_id']=='CA'].loc[1400:1500].head(20)

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,...,Cartn_y,Cartn_z,identifier,label_comp_sid,label_2_uni,gen_pos,uniprot_comp_sid,primary_seq,gen_pos1,gen_pos2
1401,4XNV,ATOM,A,A,212,213,SER,1402,CA,C,...,15.224,-5.178,P2RY1_HUMAN,S,213,5.34x35,S,False,5.34,35
1407,4XNV,ATOM,A,A,213,214,TYR,1408,CA,C,...,16.19,-3.539,P2RY1_HUMAN,Y,214,5.35x36,Y,False,5.35,36
1419,4XNV,ATOM,A,A,214,215,PHE,1420,CA,C,...,12.49,-3.309,P2RY1_HUMAN,F,215,5.36x37,F,False,5.36,37
1430,4XNV,ATOM,A,A,215,216,ILE,1431,CA,C,...,11.624,-1.612,P2RY1_HUMAN,I,216,5.37x38,I,False,5.37,38
1438,4XNV,ATOM,A,A,216,217,TYR,1439,CA,C,...,14.55,0.801,P2RY1_HUMAN,Y,217,5.38x39,Y,False,5.38,39
1450,4XNV,ATOM,A,A,217,218,SER,1451,CA,C,...,13.614,1.52,P2RY1_HUMAN,S,218,5.39x40,S,False,5.39,40
1456,4XNV,ATOM,A,A,218,219,MET,1457,CA,C,...,10.005,2.32,P2RY1_HUMAN,M,219,5.40x41,M,False,5.4,41
1464,4XNV,ATOM,A,A,219,220,CYS,1465,CA,C,...,11.346,4.872,P2RY1_HUMAN,C,220,5.41x42,C,False,5.41,42
1470,4XNV,ATOM,A,A,220,221,THR,1471,CA,C,...,13.481,6.685,P2RY1_HUMAN,T,221,5.42x43,T,False,5.42,43
1477,4XNV,ATOM,A,A,221,222,THR,1478,CA,C,...,10.672,6.403,P2RY1_HUMAN,T,222,5.43x44,T,False,5.43,44


In [351]:
data.structure[(data.structure['label_atom_id']=='CA') &
               (data.structure['gen_pos1'] > 7.45) &
               (data.structure['gen_pos1'] < 8.48)]

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,...,Cartn_y,Cartn_z,identifier,label_comp_sid,label_2_uni,gen_pos,uniprot_comp_sid,primary_seq,gen_pos1,gen_pos2
2579,4XNV,ATOM,A,A,365,317,SER,2580,CA,C,...,21.118,12.618,P2RY1_HUMAN,S,317,7.46x46,S,False,7.46,46
2585,4XNV,ATOM,A,A,366,318,CYS,2586,CA,C,...,21.009,13.66,P2RY1_HUMAN,C,318,7.47x47,C,False,7.47,47
2591,4XNV,ATOM,A,A,367,319,VAL,2592,CA,C,...,17.552,15.417,P2RY1_HUMAN,V,319,7.48x48,V,False,7.48,48
2598,4XNV,ATOM,A,A,368,320,ASN,2599,CA,C,...,18.414,17.762,P2RY1_HUMAN,N,320,7.49x49?,D,False,7.49,49
2606,4XNV,ATOM,A,A,369,321,PRO,2607,CA,C,...,20.942,20.006,P2RY1_HUMAN,P,321,7.50x50,P,False,7.5,50
2613,4XNV,ATOM,A,A,370,322,ILE,2614,CA,C,...,17.972,21.333,P2RY1_HUMAN,I,322,7.51x51,I,False,7.51,51
2621,4XNV,ATOM,A,A,371,323,LEU,2622,CA,C,...,16.617,22.911,P2RY1_HUMAN,L,323,7.52x52,L,False,7.52,52
2629,4XNV,ATOM,A,A,372,324,TYR,2630,CA,C,...,19.734,25.129,P2RY1_HUMAN,Y,324,7.53x53,Y,False,7.53,53
2641,4XNV,ATOM,A,A,373,325,PHE,2642,CA,C,...,19.189,26.754,P2RY1_HUMAN,F,325,7.54x54,F,False,7.54,54
2652,4XNV,ATOM,A,A,374,326,LEU,2653,CA,C,...,15.323,26.938,P2RY1_HUMAN,L,326,7.55x55,L,False,7.55,55


In [None]:
list(set(list(data.mappings['PDB'])))

In [95]:
def get_generic_nums(pdb_id, numbering):
    sequence_numbers = []
    amino_acids = []
    generic_numbers = []
    for i in numbering[numbering['PDB']==pdb_id].iloc[0]['numbering']:
        if i['alternative_generic_numbers'] != []:
            sequence_numbers.append(i['sequence_number'])
            amino_acids.append(i['amino_acid'])
            generic_numbers.append(i['display_generic_number'])
    return list(zip(sequence_numbers, amino_acids, generic_numbers))

In [96]:
nums, aas, gn = get_generic_nums(eg, data.numbering)

In [97]:
zipped_pos_list = list(zip(nums, aas, gn))

In [98]:
def get_pos(zipped_pos_dict, l2u, aa):
    # nums, aas, gn
    idx = list(zip(*zipped_pos_dict))[0].index(l2u)
    row = zipped_pos_dict[idx]
    return row[2]

In [99]:
get_pos(zipped_pos_list, 6, 'SER')

'1.32x32'

In [164]:
stack

Unnamed: 0,entity_id,chain_id,start,unp_end,unp_start,end,struct_asym_id,PDB
author_residue_number,1,A,1001,128,23,1106,A,5MZJ
author_insertion_code,1,A,,128,23,,A,5MZJ
residue_number,1,A,219,128,23,324,A,5MZJ
author_residue_number,1,A,1001,128,23,1106,A,5MZJ
author_insertion_code,1,A,,128,23,,A,5MZJ
...,...,...,...,...,...,...,...,...
author_insertion_code,1,A,,208,2,,A,6GT3
residue_number,1,A,11,208,2,217,A,6GT3
author_residue_number,1,A,219,317,219,,A,6GT3
author_insertion_code,1,A,,317,219,,A,6GT3


In [161]:
list(set(list(stack['PDB'])))

['5MZJ', '6GT3']