In [1]:
from utils import *
from gpcrdb_soup import *

In [2]:
from Bio.PDB.MMCIFParser import MMCIFParser
parser = MMCIFParser()

In [3]:
eg = '1f88'

In [4]:
path = get_rcsb_download(eg, fileformat = 'cif')

In [5]:
path

'https://files.rcsb.org/download/1f88.cif'

In [6]:

def download(url: str, folder: str, fileformat: str):
    if not os.path.isdir(folder):
        os.mkdir(folder)
    try:
        r = requests.get(url)
        loc = len(fileformat)+1
        fname = folder + '/' + url[-(loc+4):-loc] + '.' + fileformat
        with open(fname, 'wb') as f:
            f.write(r.content)
    except Exception:
        print("Url invalid:", url)

    
def download_pdb(url, folder, fileformat):
    download(url, folder, fileformat)

In [7]:
download(url=path, folder='data/mmcif', fileformat='cif')

In [8]:
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
import pandas as pd
d = MMCIF2Dict("data/mmcif/"+eg+".cif")
pd.DataFrame({k:v for k,v in d.items() if "_atom_site." in k})

Unnamed: 0,_atom_site.group_PDB,_atom_site.id,_atom_site.type_symbol,_atom_site.label_atom_id,_atom_site.label_alt_id,_atom_site.label_comp_id,_atom_site.label_asym_id,_atom_site.label_entity_id,_atom_site.label_seq_id,_atom_site.pdbx_PDB_ins_code,...,_atom_site.Cartn_y,_atom_site.Cartn_z,_atom_site.occupancy,_atom_site.B_iso_or_equiv,_atom_site.pdbx_formal_charge,_atom_site.auth_seq_id,_atom_site.auth_comp_id,_atom_site.auth_asym_id,_atom_site.auth_atom_id,_atom_site.pdbx_PDB_model_num
0,ATOM,1,N,N,.,MET,A,1,1,?,...,-5.980,-27.758,1.00,54.29,?,1,MET,A,N,1
1,ATOM,2,C,CA,.,MET,A,1,1,?,...,-5.054,-26.911,1.00,53.52,?,1,MET,A,CA,1
2,ATOM,3,C,C,.,MET,A,1,1,?,...,-4.848,-25.543,1.00,52.77,?,1,MET,A,C,1
3,ATOM,4,O,O,.,MET,A,1,1,?,...,-4.618,-25.451,1.00,51.10,?,1,MET,A,O,1
4,ATOM,5,C,CB,.,MET,A,1,1,?,...,-3.699,-27.610,1.00,53.58,?,1,MET,A,CB,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5262,HETATM,5263,O,O,.,HOH,T,7,.,?,...,-4.467,38.118,1.00,32.05,?,961,HOH,B,O,1
5263,HETATM,5264,O,O,.,HOH,T,7,.,?,...,4.389,47.804,1.00,38.08,?,969,HOH,B,O,1
5264,HETATM,5265,O,O,.,HOH,T,7,.,?,...,26.355,-7.866,1.00,28.34,?,973,HOH,B,O,1
5265,HETATM,5266,O,O,.,HOH,T,7,.,?,...,7.105,39.757,1.00,53.03,?,975,HOH,B,O,1


In [9]:
structure = parser.get_structure("", "data/mmcif/"+eg+".cif")


In [10]:
import nglview as nv
view = nv.show_biopython(structure)



In [11]:
view

NGLWidget()

In [190]:
cols = ['group_PDB', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'auth_asym_id']

In [191]:
import sys
import gemmi
from gemmi import cif


print("Loading cif file of", eg)
path = 'data/mmcif/'+eg+'.cif'
try:
    doc = cif.read_file(path)  # copy all the data from mmCIF file
    lol = []  # list of lists
    for b, block in enumerate(doc):
        table = block.find('_atom_site.', cols)
        for row in table:
            lol.append(list(row))
except Exception as e:
    print("Oops. %s" % e)
    sys.exit(1)

Loading cif file of 6WPW


In [192]:
cols = ['group_PDB', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'atom_type', 'x', 'y', 'z', 'auth_asym_id']
df = pd.DataFrame(data=lol, columns=cols)

In [15]:
def fix_label_seq_id(x):
    try:
        return int(x)
    except:
        return None

In [193]:
df

Unnamed: 0,group_PDB,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z,auth_asym_id
0,ATOM,A,9,9,THR,1,N,N,128.348,95.767,159.547,C
1,ATOM,A,9,9,THR,2,CA,C,128.920,94.459,159.263,C
2,ATOM,A,9,9,THR,3,C,C,129.458,94.392,157.840,C
3,ATOM,A,9,9,THR,4,O,O,129.283,95.323,157.055,C
4,ATOM,A,9,9,THR,5,CB,C,127.886,93.330,159.467,C
...,...,...,...,...,...,...,...,...,...,...,...,...
9257,ATOM,F,432,424,LEU,9258,N,N,106.541,138.131,153.474,R
9258,ATOM,F,432,424,LEU,9259,CA,C,105.135,137.743,153.464,R
9259,ATOM,F,432,424,LEU,9260,C,C,104.970,136.268,153.812,R
9260,ATOM,F,432,424,LEU,9261,O,O,103.879,135.712,153.703,R


In [16]:
df['label_seq_id']=df.apply(lambda x: fix_label_seq_id(x.label_seq_id), axis=1)

Columns
group_PDB 	id 	type_symbol 	label_atom_id 	label_alt_id 	label_comp_id 	label_asym_id 	label_entity_id 	label_seq_id 	pdbx_PDB_ins_code 	Cartn_x 	Cartn_y 	Cartn_z 	occupancy 	B_iso_or_equiv 	Cartn_x_esd 	Cartn_y_esd 	Cartn_z_esd 	occupancy_esd 	B_iso_or_equiv_esd 	pdbx_formal_charge 	auth_seq_id 	auth_comp_id 	auth_asym_id 	auth_atom_id 	pdbx_PDB_model_num

In [17]:
import sys
from math import degrees
import gemmi

ramas = {aa: [] for aa in [
    'LEU', 'ALA', 'GLY', 'VAL', 'GLU', 'SER', 'LYS', 'ASP', 'THR', 'ILE',
    'ARG', 'PRO', 'ASN', 'PHE', 'GLN', 'TYR', 'HIS', 'MET', 'CYS', 'TRP']}

rol = []
for path in ['data/mmcif/'+eg+'.cif']:
    st = gemmi.read_structure(path)
    if 0.1 < st.resolution < 5:
        model = st[0]
        if len(st) > 1:
            print("There are multiple models!")
        for chain in model:
            for r, res in enumerate(chain.get_polymer()):
                # previous_residue() and next_residue() return previous/next
                # residue only if the residues are bonded. Otherwise -- None.
                prev_res = chain.previous_residue(res)
                next_res = chain.next_residue(res)
                phi, psi = gemmi.calculate_phi_psi(prev_res, res, next_res)
                omega = gemmi.calculate_omega(res, next_res)
                rol.append([res.name, res.label_seq, res.subchain, 
                            degrees(phi), degrees(omega), degrees(psi)])
                try:
                    ramas[res.name].append([degrees(phi), degrees(omega), degrees(psi)])
                except KeyError:
                    pass

In [20]:
"""
# Write data to files
for aa, data in ramas.items():
    with open('data/ramach/' + aa + '.tsv', 'w') as f:
        for phi, omega, psi in data:
            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"""

"\n# Write data to files\nfor aa, data in ramas.items():\n    with open('data/ramach/' + aa + '.tsv', 'w') as f:\n        for phi, omega, psi in data:\n            f.write('%.4f\t%.4f\n' % (degrees(phi), degrees(psi)))"

In [21]:
cols = ['name', 'label_seq_id', 'chain', 'phi', 'omega', 'psi']

In [22]:
res_df = pd.DataFrame(data=rol, columns=cols)

In [23]:
res_df

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi
0,MET,1,A,,-179.718577,133.146625
1,ASN,2,A,-87.567941,179.953276,-5.295383
2,GLY,3,A,-150.672305,178.921162,143.916355
3,THR,4,A,-85.441746,178.477151,112.209423
4,GLU,5,A,-96.636867,178.659871,123.041558
...,...,...,...,...,...,...
638,CYS,322,B,-74.457863,179.107101,-7.588421
639,CYS,323,B,65.675548,179.210471,29.106972
640,GLY,324,B,134.859554,-179.348699,-24.685362
641,LYS,325,B,-77.173222,-179.767435,94.072619


In [24]:
full = pd.merge(res_df, df, on=['label_seq_id', 'label_seq_id'])

In [25]:
import numpy as np
full = full.astype({'label_seq_id': np.int16, 'auth_seq_id': np.int16, 'id': np.int16, 
                    'phi': np.float32, 'omega': np.float32, 'psi': np.float32,
                    'x': np.float32, 'y': np.float32, 'z': np.float32})

In [26]:
def get_atom_id(x):
    atom_dict = {'C': 0, 'O': 1, 'N': 2, 'H': 3, 'S': 4}
    try:
        return atom_dict[str(x)]
    except:
        return 5
full['atom_id'] = full.apply(lambda x: get_atom_id(x.atom_type), axis=1)

In [27]:
full

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi,group_PDB,label_asym_id,auth_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z,atom_id
0,MET,1,A,,-179.718582,133.146622,ATOM,A,1,MET,1,N,N,43.958000,-5.980000,-27.757999,2
1,MET,1,A,,-179.718582,133.146622,ATOM,A,1,MET,2,CA,C,44.717999,-5.054000,-26.910999,0
2,MET,1,A,,-179.718582,133.146622,ATOM,A,1,MET,3,C,C,44.069000,-4.848000,-25.542999,0
3,MET,1,A,,-179.718582,133.146622,ATOM,A,1,MET,4,O,O,42.854000,-4.618000,-25.451000,1
4,MET,1,A,,-179.718582,133.146622,ATOM,A,1,MET,5,CB,C,44.868000,-3.699000,-27.610001,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9920,ALA,348,A,-165.916489,,,ATOM,A,348,ALA,2634,CA,C,88.039001,30.749001,17.236000,0
9921,ALA,348,A,-165.916489,,,ATOM,A,348,ALA,2635,C,C,89.197998,31.023001,16.242001,0
9922,ALA,348,A,-165.916489,,,ATOM,A,348,ALA,2636,O,O,89.163002,30.493000,15.102000,1
9923,ALA,348,A,-165.916489,,,ATOM,A,348,ALA,2637,CB,C,88.360001,29.510000,18.103001,0


In [28]:
xyz = full[['x', 'y', 'z']].to_numpy(dtype=float)

In [29]:
ids = full[['label_seq_id', 'auth_seq_id']].to_numpy(dtype='int')

In [30]:
atoms = full['atom_id'].to_numpy(dtype=int)

In [31]:
xyz

array([[ 43.95800018,  -5.98000002, -27.75799942],
       [ 44.7179985 ,  -5.0539999 , -26.9109993 ],
       [ 44.06900024,  -4.84800005, -25.54299927],
       ...,
       [ 89.16300201,  30.49300003,  15.10200024],
       [ 88.36000061,  29.51000023,  18.10300064],
       [ 90.14499664,  31.76199913,  16.62400055]])

In [32]:
ids

array([[  1,   1],
       [  1,   1],
       [  1,   1],
       ...,
       [348, 348],
       [348, 348],
       [348, 348]])

In [33]:
atoms[:100]

array([2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0,
       4, 0, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1,
       0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 1, 2, 2, 0,
       0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 2,
       0, 0, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1])

In [34]:
"""import plotly.express as px
select = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]
df = px.data.iris()
fig = px.scatter_3d(select, x='x', y='y', z='z',
                    color='atom_type', opacity=0.5)
fig.update_traces(marker=dict(size=7))
fig.show()"""

"import plotly.express as px\nselect = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]\ndf = px.data.iris()\nfig = px.scatter_3d(select, x='x', y='y', z='z',\n                    color='atom_type', opacity=0.5)\nfig.update_traces(marker=dict(size=7))\nfig.show()"

In [35]:
"""import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator


def plot(data_file, label, output=None):
    x, y = [], []
    for line in open(data_file):
        phi, psi = line.split()
        if phi != 'nan' and psi != 'nan':
            x.append(float(phi))
            y.append(float(psi))
    print('Plotting %d points for %s' % (len(x), label))

    plt.figure(figsize=(5.5, 5.5))
    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)
    plt.xlim([-180, 180])
    plt.ylim([-180, 180])
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(60))
    ax.yaxis.set_major_locator(MultipleLocator(60))
    plt.xlabel(r'$\phi$', fontsize=14)
    plt.ylabel(r'$\psi$', fontsize=14, labelpad=0)
    plt.grid(color='#AAAAAA', linestyle='--')
    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')
    if output:
        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs
    else:
        plt.show()


for aa in ramas:
    plot('data/ramach/%s.tsv' % aa, aa)
    #plot('ramas/%s.tsv' % aa, aa, 'ramas/%s.png' % aa)"""

"import sys\nimport matplotlib.pyplot as plt\nfrom matplotlib.ticker import MultipleLocator\n\n\ndef plot(data_file, label, output=None):\n    x, y = [], []\n    for line in open(data_file):\n        phi, psi = line.split()\n        if phi != 'nan' and psi != 'nan':\n            x.append(float(phi))\n            y.append(float(psi))\n    print('Plotting %d points for %s' % (len(x), label))\n\n    plt.figure(figsize=(5.5, 5.5))\n    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)\n    plt.xlim([-180, 180])\n    plt.ylim([-180, 180])\n    ax = plt.gca()\n    ax.xaxis.set_major_locator(MultipleLocator(60))\n    ax.yaxis.set_major_locator(MultipleLocator(60))\n    plt.xlabel(r'$\\phi$', fontsize=14)\n    plt.ylabel(r'$\\psi$', fontsize=14, labelpad=0)\n    plt.grid(color='#AAAAAA', linestyle='--')\n    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')\n    if output:\n        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs\n    else:\n        plt.show()\

In [36]:
uniprot = pdbtouniprot('4JKV')

In [37]:
uniprot

'P0ABE7'

In [38]:
pdb = uniprottopdb(uniprot)

In [39]:
pdb

['1APC',
 '1LM3',
 '1M6T',
 '1QPU',
 '1QQ3',
 '256B',
 '2BC5',
 '2QLA',
 '3C62',
 '3C63',
 '3DE8',
 '3DE9',
 '3FOO',
 '3FOP',
 '3HNI',
 '3HNJ',
 '3HNK',
 '3HNL',
 '3IQ5',
 '3IQ6',
 '3L1M',
 '3M15',
 '3M4B',
 '3M4C',
 '3M79',
 '3NMI',
 '3NMJ',
 '3NMK',
 '3TOL',
 '3TOM',
 '3U8P',
 '4EA3',
 '4EIY',
 '4IAQ',
 '4IAR',
 '4IB4',
 '4JE9',
 '4JEA',
 '4JEB',
 '4JKV',
 '4L6R',
 '4N6H',
 '4NC3',
 '4NTJ',
 '4O9R',
 '4OR2',
 '4PXZ',
 '4PY0',
 '4QIM',
 '4QIN',
 '4RWA',
 '4RWD',
 '4U9D',
 '4U9E',
 '4YAY',
 '4Z34',
 '4Z35',
 '4Z36',
 '4ZUD',
 '5AWI',
 '5BU7',
 '5DHG',
 '5DHH',
 '5IU4',
 '5IU7',
 '5IU8',
 '5IUA',
 '5IUB',
 '5JTB',
 '5K2A',
 '5K2B',
 '5K2C',
 '5K2D',
 '5L31',
 '5L32',
 '5L7D',
 '5L7I',
 '5MZJ',
 '5MZP',
 '5N2R',
 '5N2S',
 '5NDD',
 '5NDZ',
 '5NJ6',
 '5NLX',
 '5NM2',
 '5NM4',
 '5OLG',
 '5OLH',
 '5OLO',
 '5OLV',
 '5OLZ',
 '5OM1',
 '5OM4',
 '5TUD',
 '5TVN',
 '5UEN',
 '5UIG',
 '5UNF',
 '5UNG',
 '5UNH',
 '5UVI',
 '5VRA',
 '5WIU',
 '5WIV',
 '5XJM',
 '5XZI',
 '5XZJ',
 '5YM7',
 '5YO3',
 '5YO4',
 

In [40]:
from utils3 import *


pdb_id_list = ['2RH1', '1HZX', '1GZM']

In [43]:
import pandas as pd


for j in range(len(pdb_id_list)):
    pdb_id = pdb_id_list[j]
    maps = get_mappings_data(pdb_id)[pdb_id.lower()]['UniProt']
    uniprots = maps.keys()
    for i, uniprot in enumerate(uniprots):
        table = pd.DataFrame.from_dict(maps[uniprot])
        table['PDB'] = pdb_id
        table['uniprot'] = uniprot
        if i + j == 0:
            full_table = table
        else:
            full_table = full_table.append(table, ignore_index=True)

In [44]:
full_table

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,ADRB2_HUMAN,ADRB2_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P07550
1,ADRB2_HUMAN,ADRB2_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P07550
2,ENLYS_BPT4,ENLYS_BPT4,"{'entity_id': 1, 'end': {'author_residue_numbe...",2RH1,P00720
3,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",1HZX,P02699
4,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'B', 'start': {'a...",1HZX,P02699
5,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",1GZM,P02699
6,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'B', 'start': {'a...",1GZM,P02699


In [45]:
pd.DataFrame.from_dict(full_table.iloc[0]['mappings'])

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,230.0,A,,230,1,A
author_insertion_code,1,,A,,230,1,A
residue_number,1,237.0,A,8.0,230,1,A


In [46]:
pd.DataFrame.from_dict(full_table.iloc[1]['mappings'])

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,,A,264.0,365,264,A
author_insertion_code,1,,A,,365,264,A
residue_number,1,500.0,A,399.0,365,264,A


In [47]:
from utils import *
from utils2 import *
from utils3 import *
from plotting import *
from gpcrdb_soup import *

In [48]:
from tqdm import tqdm, trange

In [49]:
import sys
from gemmi import cif

In [194]:
class DataLoader():
    def __init__(self, 
                 path = 'data/',
                 structure = 'mmcif/',
                 limit=10):
        self.path = path
        self.structure_path = self.path + structure
        self.path_table = path + 'gpcrdb/' + 'structures.pkl'
        
        self.filenames, self.pdb_ids = get_pdb_files(path=self.structure_path)
        # Columns for structure dataframe
        self.cols = ['group_PDB', 'auth_asym_id', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 
                     'label_comp_id', 'id', 'label_atom_id', 
                     'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']
        
        self.numbering = pd.DataFrame()
        self.table = pd.read_pickle(self.path_table)
        for i, pdb_id in tqdm(enumerate(self.pdb_ids)):
            if i <= limit:
                protein, family = self.get_prot_info(pdb_id)
                numbering = self.get_res_nums(protein)
                if i == 0:
                    self.structure = self.load_cifs(pdb_id)
                    self.mappings = self.get_mapping(pdb_id)
                    numb = [pdb_id, protein, family, numbering]
                    # numb = [pdb_id, protein, self.entry_to_ac(protein), family, numbering]
                    self.numbering = self.numbering.append(pd.DataFrame(numb).T)
                else:
                    self.structure = self.structure.append(self.load_cifs(pdb_id), ignore_index=True)
                    self.mappings = self.mappings.append(self.get_mapping(pdb_id), ignore_index=True)
                    # self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, self.entry_to_ac(protein),
                    #                                                           family, numbering]).T, 
                    self.numbering = self.numbering.append(pd.DataFrame(data=[pdb_id, protein, family, numbering]).T, 
                                                           ignore_index=True)
        self.numbering.columns = ['PDB', 'identifier', 'family', 'numbering']            
        # self.numbering.columns = ['pdb', 'identifier', 'uniprot', 'family', 'numbering']
        
        
    def entry_to_ac(self, entry: str):
        response = requests.get('https://www.uniprot.org/uniprot/'+entry+'.txt')
        return response.text.split('\n')[1].split('AC   ')[1][:6]

    
    def get_prot_info(self, pdb_id):
        # query structure
        response = requests.get('https://gpcrdb.org/services/structure/'+pdb_id.upper()+'/')
        protein = response.json()['protein']
        family = response.json()['family']                    
        return protein, family
    
    def get_res_nums(self, protein):
        # query uniprot -> res num
        response = requests.get('https://gpcrdb.org/services/residues/extended/'+protein+'/')    
        # select res num
        # assign res_num to structure data
        return response.json()
    
    def get_mapping(self, pdb_id):
        maps = get_mappings_data(pdb_id)[pdb_id.lower()]['UniProt']
        uniprots = maps.keys()
        full_table=pd.DataFrame()
        for i, uniprot in enumerate(uniprots):
            table = pd.DataFrame.from_dict(maps[uniprot])
            table['PDB'] = pdb_id
            table['uniprot'] = uniprot
            if i + j == 0:
                full_table = table
            else:
                full_table = full_table.append(table, ignore_index=True)
        return full_table
    

    def load_cifs(self, pdb_id):
        print("Loading cif file of", pdb_id)
        path = 'data/mmcif/' + pdb_id + '.cif'
        print(path)
        try:
            doc = cif.read_file(path)  # copy all the data from mmCIF file
            lol = []  # list of lists
            for b, block in enumerate(doc):
                table = block.find('_atom_site.', self.cols)
                for row in table:
                    lol.append([pdb_id]+list(row))
        except Exception as e:
            print("Oops. %s" % e)
            sys.exit(1)
            
        cols = ['PDB']+self.cols
        return pd.DataFrame(data=lol, columns=cols)

In [195]:
data = DataLoader(limit=3)

0it [00:00, ?it/s]

Loading cif file of 4IB4
data/mmcif/4IB4.cif


1it [00:04,  4.02s/it]

Loading cif file of 7L0S
data/mmcif/7L0S.cif


2it [00:07,  3.93s/it]

Loading cif file of 4PXF
data/mmcif/4PXF.cif


3it [00:10,  3.31s/it]

Loading cif file of 6WPW
data/mmcif/6WPW.cif


528it [00:13, 40.48it/s]


In [196]:
data.numbering

Unnamed: 0,PDB,identifier,family,numbering
0,4IB4,5ht2b_human,001_001_001_007,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
1,7L0S,ntr1_rat,001_002_021_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
2,4PXF,opsd_bovin,001_009_001_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."
3,6WPW,glr_human,002_001_003_005,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr..."


In [197]:
data.mappings

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,C562_ECOLX,C562_ECOLX,"{'entity_id': 1, 'end': {'author_residue_numbe...",4IB4,P0ABE7
1,5HT2B_HUMAN,5HT2B_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",4IB4,P41595
2,5HT2B_HUMAN,5HT2B_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",4IB4,P41595
3,GNAI1_HUMAN,GNAI1_HUMAN,"{'entity_id': 3, 'end': {'author_residue_numbe...",7L0S,P63096
4,GBB1_HUMAN,GBB1_HUMAN,"{'entity_id': 4, 'end': {'author_residue_numbe...",7L0S,P62873
5,NTR1_RAT,NTR1_RAT,"{'entity_id': 1, 'end': {'author_residue_numbe...",7L0S,P20789
6,NEUT_RAT,NEUT_RAT,"{'entity_id': 2, 'end': {'author_residue_numbe...",7L0S,P20068
7,GBG1_HUMAN,GBG1_HUMAN,"{'entity_id': 5, 'end': {'author_residue_numbe...",7L0S,P63211
8,ARRS_BOVIN,ARRS_BOVIN,"{'entity_id': 2, 'end': {'author_residue_numbe...",4PXF,P08168
9,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'end': {'author_residue_numbe...",4PXF,P02699


In [198]:
def lookup(pdb, identifier, mappings):
    return list(mappings[(mappings['identifier']==identifier.upper()) & (mappings['PDB']==pdb)]['mappings'])

In [199]:
maps = data.mappings

In [200]:
new = data.numbering

In [201]:
new['mapping'] = new.apply(lambda x: lookup(x.PDB, x.identifier, maps), axis=1)

In [202]:
for i in range(len(new)):
    print(new['mapping'].iloc[i])

[{'entity_id': 1, 'end': {'author_residue_number': 248, 'author_insertion_code': '', 'residue_number': 223}, 'chain_id': 'A', 'start': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 11}, 'unp_end': 248, 'unp_start': 36, 'struct_asym_id': 'A'}, {'entity_id': 1, 'end': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 421}, 'chain_id': 'A', 'start': {'author_residue_number': 314, 'author_insertion_code': '', 'residue_number': 330}, 'unp_end': 405, 'unp_start': 314, 'struct_asym_id': 'A'}]
[{'entity_id': 1, 'end': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 327}, 'chain_id': 'C', 'start': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 5}, 'unp_end': 390, 'unp_start': 50, 'struct_asym_id': 'A'}]
[{'entity_id': 1, 'end': {'author_residue_number': None, 'author_insertion_code': '', 'residue_number': 348}, 'chain_id': 'A', 'start': {'author_residue_number': None, 'au

In [203]:
new

Unnamed: 0,PDB,identifier,family,numbering,mapping
0,4IB4,5ht2b_human,001_001_001_007,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb..."
1,7L0S,ntr1_rat,001_002_021_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb..."
2,4PXF,opsd_bovin,001_009_001_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb..."
3,6WPW,glr_human,002_001_003_005,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 6, 'end': {'author_residue_numb..."


In [204]:
new.iloc[1]['mapping']

[{'entity_id': 1,
  'end': {'author_residue_number': None,
   'author_insertion_code': '',
   'residue_number': 327},
  'chain_id': 'C',
  'start': {'author_residue_number': None,
   'author_insertion_code': '',
   'residue_number': 5},
  'unp_end': 390,
  'unp_start': 50,
  'struct_asym_id': 'A'}]

In [205]:
df = pd.merge(new, data.table, on='PDB')

In [206]:
# add uniprot id to structure

In [207]:
data.structure

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
0,4IB4,ATOM,A,A,23,48,GLU,1,N,N,18.652,38.221,29.331
1,4IB4,ATOM,A,A,23,48,GLU,2,CA,C,18.815,38.640,27.940
2,4IB4,ATOM,A,A,23,48,GLU,3,C,C,17.736,38.042,27.032
3,4IB4,ATOM,A,A,23,48,GLU,4,O,O,17.244,36.939,27.287
4,4IB4,ATOM,A,A,23,48,GLU,5,CB,C,20.213,38.268,27.406
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23044,6WPW,ATOM,R,F,432,424,LEU,9258,N,N,106.541,138.131,153.474
23045,6WPW,ATOM,R,F,432,424,LEU,9259,CA,C,105.135,137.743,153.464
23046,6WPW,ATOM,R,F,432,424,LEU,9260,C,C,104.970,136.268,153.812
23047,6WPW,ATOM,R,F,432,424,LEU,9261,O,O,103.879,135.712,153.703


In [208]:
pdbs = list(set(list(data.structure['PDB'])))

In [209]:
pdbs

['4PXF', '4IB4', '7L0S', '6WPW']

In [210]:
for pdb in pdbs:
    print("Check out", pdb)
    mapping  = df[df['PDB']==pdb]['mapping'].iloc[0]
    pref  = df[df['PDB']==pdb]['Preferred Chain'].iloc[0]
    struc_ = data.structure[(data.structure['PDB']==pdb) & (data.structure['label_asym_id']==pref)]

Check out 4PXF
Check out 4IB4
Check out 7L0S
Check out 6WPW


In [211]:
struc_

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z


In [212]:
mapping

[{'entity_id': 6,
  'end': {'author_residue_number': None,
   'author_insertion_code': '',
   'residue_number': 485},
  'chain_id': 'R',
  'start': {'author_residue_number': 27,
   'author_insertion_code': '',
   'residue_number': 35},
  'unp_end': 477,
  'unp_start': 27,
  'struct_asym_id': 'F'}]

In [327]:
def get_gaps(mapping):
    starts = []
    ends = []
    for section in mapping:
        start = int(section['start']['residue_number'])
        end = int(section['end']['residue_number'])
        unp_start = int(section['unp_start'])
        unp_end = int(section['unp_end'])
        length = end-start
        unp_length = unp_end-unp_start
        """
        print(start, '\n', 
              end,  '\n',
              'diff:', length, '\n\n', 
              unp_start,  '\n', 
              unp_end, '\n',
              'diff:', unp_length, '\n\n\n\n')"""
        starts.append([start, unp_start])
        ends.append([end, unp_end])
    print(starts)
    print(ends, '\n')
    return starts, ends

In [328]:
df.iloc[2]

PDB                                                             4PXF
identifier                                                opsd_bovin
family                                               001_009_001_001
numbering          [{'sequence_number': 1, 'amino_acid': 'M', 'pr...
mapping            [{'entity_id': 1, 'end': {'author_residue_numb...
uniprot(gene)                                                   OPSD
Cl.                                                     A(Rhodopsin)
Resolution                                                       2.8
Preferred Chain                                                    A
State                                                         Active
Function                                                     unknown
starts                                                           NaN
ends                                                             NaN
try                                         [[[1, 1]], [[348, 348]]]
Name: 2, dtype: object

In [338]:
df['lol'] = df.apply(lambda x: get_gaps(x.mapping), axis=1)

[[11, 36], [330, 314]]
[[223, 248], [421, 405]] 

[[5, 50]]
[[327, 390]] 

[[1, 1]]
[[348, 348]] 

[[35, 27]]
[[485, 477]] 



In [339]:
df['lol'].iloc[0]

([[11, 36], [330, 314]], [[223, 248], [421, 405]])

In [343]:
df['starts'] = df.apply(lambda x: x.lol[0], axis=1)
df['ends'] = df.apply(lambda x: x.lol[1], axis=1)
df.drop('lol', axis=1)

Unnamed: 0,PDB,identifier,family,numbering,mapping,uniprot(gene),Cl.,Resolution,Preferred Chain,State,Function,starts,ends,try
0,4IB4,5ht2b_human,001_001_001_007,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb...",5HT2B,A(Rhodopsin),2.7,A,Intermediate,Agonist,"[[11, 36], [330, 314]]","[[223, 248], [421, 405]]","([[11, 36], [330, 314]], [[223, 248], [421, 40..."
1,7L0S,ntr1_rat,001_002_021_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb...",NTR1,A(Rhodopsin),4.5,C,Active,Agonist,"[[5, 50]]","[[327, 390]]","([[5, 50]], [[327, 390]])"
2,4PXF,opsd_bovin,001_009_001_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb...",OPSD,A(Rhodopsin),2.8,A,Active,unknown,"[[1, 1]]","[[348, 348]]","([[1, 1]], [[348, 348]])"
3,6WPW,glr_human,002_001_003_005,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 6, 'end': {'author_residue_numb...",GLR,B1(Secretin),3.1,R,Active,Agonist,"[[35, 27]]","[[485, 477]]","([[35, 27]], [[485, 477]])"


In [216]:
list(set(list(data.mappings['PDB'])))

['4PXF', '4IB4', '7L0S', '6WPW']

In [217]:
lm = list(set(list(data.mappings['PDB'])))

In [218]:
eg = lm[0]

In [219]:
print("Checking out", eg)

Checking out 4PXF


In [220]:
data.numbering[data.numbering['PDB']==eg]

Unnamed: 0,PDB,identifier,family,numbering,mapping
2,4PXF,opsd_bovin,001_009_001_001,"[{'sequence_number': 1, 'amino_acid': 'M', 'pr...","[{'entity_id': 1, 'end': {'author_residue_numb..."


In [221]:
sequence_numbers = []
amino_acids = []
generic_numbers = []


for i in data.numbering[data.numbering['PDB']==eg].iloc[0]['numbering']:
    if i['alternative_generic_numbers'] != []:
        print(i)
        sequence_numbers.append(i['sequence_number'])
        amino_acids.append(i['amino_acid'])
        generic_numbers.append(['display_generic_number'])

{'sequence_number': 33, 'amino_acid': 'E', 'protein_segment': 'TM1', 'display_generic_number': '1.28x28', 'alternative_generic_numbers': [{'scheme': 'BW', 'label': '1.28'}, {'scheme': 'Wootten', 'label': '1.32'}, {'scheme': 'Pin', 'label': '1.32'}, {'scheme': 'Wang', 'label': '1.25'}, {'scheme': 'Fungal', 'label': '1.30'}, {'scheme': 'GPCRdb(A)', 'label': '1.28x28'}, {'scheme': 'GPCRdb(B)', 'label': '1.32x32'}, {'scheme': 'GPCRdb(C)', 'label': '1.32x32'}, {'scheme': 'GPCRdb(F)', 'label': '1.25x25'}, {'scheme': 'GPCRdb(D)', 'label': '1.30x30'}, {'scheme': 'Oliveira', 'label': '108'}, {'scheme': 'BS', 'label': 'I:-05'}]}
{'sequence_number': 34, 'amino_acid': 'P', 'protein_segment': 'TM1', 'display_generic_number': '1.29x29', 'alternative_generic_numbers': [{'scheme': 'BW', 'label': '1.29'}, {'scheme': 'Wootten', 'label': '1.33'}, {'scheme': 'Pin', 'label': '1.33'}, {'scheme': 'Wang', 'label': '1.26'}, {'scheme': 'Fungal', 'label': '1.31'}, {'scheme': 'GPCRdb(A)', 'label': '1.29x29'}, {'s

In [222]:
for i in zip(sequence_numbers, amino_acids):
    print(i)

(33, 'E')
(34, 'P')
(35, 'W')
(36, 'Q')
(37, 'F')
(38, 'S')
(39, 'M')
(40, 'L')
(41, 'A')
(42, 'A')
(43, 'Y')
(44, 'M')
(45, 'F')
(46, 'L')
(47, 'L')
(48, 'I')
(49, 'M')
(50, 'L')
(51, 'G')
(52, 'F')
(53, 'P')
(54, 'I')
(55, 'N')
(56, 'F')
(57, 'L')
(58, 'T')
(59, 'L')
(60, 'Y')
(61, 'V')
(62, 'T')
(63, 'V')
(64, 'Q')
(65, 'H')
(66, 'K')
(67, 'K')
(68, 'L')
(69, 'R')
(70, 'T')
(71, 'P')
(72, 'L')
(73, 'N')
(74, 'Y')
(75, 'I')
(76, 'L')
(77, 'L')
(78, 'N')
(79, 'L')
(80, 'A')
(81, 'V')
(82, 'A')
(83, 'D')
(84, 'L')
(85, 'F')
(86, 'M')
(87, 'V')
(88, 'F')
(89, 'G')
(90, 'G')
(91, 'F')
(92, 'T')
(93, 'T')
(94, 'T')
(95, 'L')
(96, 'Y')
(97, 'T')
(98, 'S')
(99, 'L')
(100, 'H')
(101, 'G')
(102, 'Y')
(103, 'F')
(104, 'V')
(105, 'F')
(106, 'G')
(107, 'P')
(108, 'T')
(109, 'G')
(110, 'C')
(111, 'N')
(112, 'L')
(113, 'E')
(114, 'G')
(115, 'F')
(116, 'F')
(117, 'A')
(118, 'T')
(119, 'L')
(120, 'G')
(121, 'G')
(122, 'E')
(123, 'I')
(124, 'A')
(125, 'L')
(126, 'W')
(127, 'S')
(128, 'L')
(129, 'V')


1. figure out gpcr uniprot id (not auxilary etc)
2. select correpsonding sifts mapping
3. use sifts mapping to enumerate residues
4. number residues

In [223]:
pref_chain = data.table[data.table['PDB']==eg].iloc[0]['Preferred Chain']

In [227]:
x = data.structure[(data.structure['PDB']==eg) & 
               (data.structure['label_atom_id']=='CA') & 
               (data.structure['auth_asym_id']==pref_chain)]

In [228]:
x

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
10971,4PXF,ATOM,A,A,2,2,ASN,2,CA,C,135.590,278.207,41.991
10979,4PXF,ATOM,A,A,3,3,GLY,10,CA,C,138.231,276.345,39.993
10983,4PXF,ATOM,A,A,4,4,THR,14,CA,C,141.627,276.715,38.345
10990,4PXF,ATOM,A,A,5,5,GLU,21,CA,C,142.155,275.612,34.738
10999,4PXF,ATOM,A,A,6,6,GLY,30,CA,C,145.221,274.374,32.883
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13539,4PXF,ATOM,A,A,323,323,CYS,2570,CA,C,146.402,222.939,52.431
13545,4PXF,ATOM,A,A,324,324,GLY,2576,CA,C,144.025,220.370,50.934
13549,4PXF,ATOM,A,A,325,325,LYS,2580,CA,C,141.092,218.576,52.614
13558,4PXF,ATOM,A,A,326,326,ASN,2589,CA,C,138.118,220.272,54.359


In [124]:
df2 = pd.DataFrame(data=[sequence_numbers, amino_acids]).T
df2.columns = ['id', 'res']

In [125]:
df2

Unnamed: 0,id,res
0,33,E
1,34,P
2,35,W
3,36,Q
4,37,F
...,...,...
256,318,V
257,319,T
258,320,T
259,321,L


In [103]:
import gemmi


seq_id = list(x['label_seq_id'])
seq_res_ = list(x['label_comp_id'])
seq_res = [gemmi.find_tabulated_residue(resname).one_letter_code for resname in seq_res_]

In [114]:
y = zip([int(x) for x in seq_id], seq_res)

In [115]:
for j in y:
    if int(j[0]) >= 33:
        print(j)

(33, 'E')
(34, 'P')
(35, 'W')
(36, 'Q')
(37, 'F')
(38, 'S')
(39, 'M')
(40, 'L')
(41, 'A')
(42, 'A')
(43, 'Y')
(44, 'M')
(45, 'F')
(46, 'L')
(47, 'L')
(48, 'I')
(49, 'M')
(50, 'L')
(51, 'G')
(52, 'F')
(53, 'P')
(54, 'I')
(55, 'N')
(56, 'F')
(57, 'L')
(58, 'T')
(59, 'L')
(60, 'Y')
(61, 'V')
(62, 'T')
(63, 'V')
(64, 'Q')
(65, 'H')
(66, 'K')
(67, 'K')
(68, 'L')
(69, 'R')
(70, 'T')
(71, 'P')
(72, 'L')
(73, 'N')
(74, 'Y')
(75, 'I')
(76, 'L')
(77, 'L')
(78, 'N')
(79, 'L')
(80, 'A')
(81, 'V')
(82, 'A')
(83, 'D')
(84, 'L')
(85, 'F')
(86, 'M')
(87, 'V')
(88, 'F')
(89, 'G')
(90, 'G')
(91, 'F')
(92, 'T')
(93, 'T')
(94, 'T')
(95, 'L')
(96, 'Y')
(97, 'T')
(98, 'S')
(99, 'L')
(100, 'H')
(101, 'G')
(102, 'Y')
(103, 'F')
(104, 'V')
(105, 'F')
(106, 'G')
(107, 'P')
(108, 'T')
(109, 'G')
(110, 'C')
(111, 'N')
(112, 'L')
(113, 'E')
(114, 'G')
(115, 'F')
(116, 'F')
(117, 'A')
(118, 'T')
(119, 'L')
(120, 'G')
(121, 'G')
(122, 'E')
(123, 'I')
(124, 'A')
(125, 'L')
(126, 'W')
(127, 'S')
(128, 'L')
(129, 'V')


In [119]:
df1 = pd.DataFrame([seq_id, seq_res]).T
df1.columns = ['id', 'res']

In [141]:
df1

0       2
1       3
2       4
3       5
4       6
       ..
321    66
322    67
323    68
324    69
325    70
Name: id, Length: 326, dtype: int8

In [134]:
j = 0
for i in range(len(df1)):
    x = df1.iloc[i]
    while(int(df2.iloc[j]['id']) < int(x['id'])):
        j+=1
        print(df2.iloc[j]['id'], x['id'])
    y = df2.iloc[j]
    if x['res'] != y['res']:
        print(x['res'], y['res'])
        print(x['id'], y['id'])
    j+=1

N E
2 33
G P
3 34
T W
4 35
E Q
5 36
G F
6 37
P S
7 38
N M
8 39
F L
9 40
Y A
10 41
V A
11 42
P Y
12 43
F M
13 44
S F
14 45
N L
15 46
K L
16 47
T I
17 48
G M
18 49
V L
19 50
V G
20 51
R F
21 52
S P
22 53
P I
23 54
F N
24 55
E F
25 56
A L
26 57
P T
27 58
Q L
28 59
Y V
30 61
L T
31 62
A V
32 63
E Q
33 64
P H
34 65
W K
35 66
Q K
36 67
F L
37 68
S R
38 69
M T
39 70
L P
40 71
A L
41 72
A N
42 73
M I
44 75
F L
45 76
L N
47 78
I L
48 79
M A
49 80
L V
50 81
G A
51 82
F D
52 83
P L
53 84
I F
54 85
N M
55 86
F V
56 87
L F
57 88
T G
58 89
L G
59 90
Y F
60 91
V T
61 92
V T
63 94
Q L
64 95
H Y
65 96
K T
66 97
K S
67 98
R H
69 100
T G
70 101
P Y
71 102
L F
72 103
N V
73 104
Y F
74 105
I G
75 106
L P
76 107
L T
77 108
N G
78 109
L C
79 110
A N
80 111
V L
81 112
A E
82 113
D G
83 114
L F
84 115
M A
86 117
V T
87 118
F L
88 119
F E
91 122
T I
92 123
T A
93 124
T L
94 125
L W
95 126
Y S
96 127
T L
97 128
S V
98 129
L V
99 130
H L
100 131
G A
101 132
Y I
102 133
F E
103 134
V R
104 135
F Y
105 136
G V
106 

IndexError: single positional indexer is out-of-bounds

In [131]:
df['same'] = 

In [274]:
df

Unnamed: 0,id,res_x,res_y


In [146]:
eg

'4PXF'

In [147]:
data.mappings

Unnamed: 0,identifier,name,mappings,PDB,uniprot
0,C562_ECOLX,C562_ECOLX,"{'entity_id': 1, 'end': {'author_residue_numbe...",4IB4,P0ABE7
1,5HT2B_HUMAN,5HT2B_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",4IB4,P41595
2,5HT2B_HUMAN,5HT2B_HUMAN,"{'entity_id': 1, 'end': {'author_residue_numbe...",4IB4,P41595
3,GNAI1_HUMAN,GNAI1_HUMAN,"{'entity_id': 3, 'chain_id': 'A', 'start': {'a...",7L0S,P63096
4,GBB1_HUMAN,GBB1_HUMAN,"{'entity_id': 4, 'chain_id': 'B', 'start': {'a...",7L0S,P62873
5,NTR1_RAT,NTR1_RAT,"{'entity_id': 1, 'chain_id': 'C', 'start': {'a...",7L0S,P20789
6,NEUT_RAT,NEUT_RAT,"{'entity_id': 2, 'chain_id': 'D', 'start': {'a...",7L0S,P20068
7,GBG1_HUMAN,GBG1_HUMAN,"{'entity_id': 5, 'chain_id': 'G', 'start': {'a...",7L0S,P63211
8,ARRS_BOVIN,ARRS_BOVIN,"{'entity_id': 2, 'chain_id': 'B', 'start': {'a...",4PXF,P08168
9,OPSD_BOVIN,OPSD_BOVIN,"{'entity_id': 1, 'chain_id': 'A', 'start': {'a...",4PXF,P02699


In [151]:
pd.DataFrame.from_dict(data.mappings[data.mappings['PDB']==eg]['mappings'])

Unnamed: 0,mappings
10,"{'entity_id': 2, 'chain_id': 'D', 'start': {'a..."
11,"{'entity_id': 6, 'chain_id': 'R', 'start': {'a..."
12,"{'entity_id': 1, 'chain_id': 'C', 'start': {'a..."
13,"{'entity_id': 3, 'chain_id': 'G', 'start': {'a..."


fRom mappings I have to choose the correct chain and use it to select the protein region

In [162]:
data.structure

Unnamed: 0,PDB,group_PDB,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
0,4IB4,ATOM,A,23,48,GLU,1,N,N,18.652,38.221,29.331
1,4IB4,ATOM,A,23,48,GLU,2,CA,C,18.815,38.640,27.940
2,4IB4,ATOM,A,23,48,GLU,3,C,C,17.736,38.042,27.032
3,4IB4,ATOM,A,23,48,GLU,4,O,O,17.244,36.939,27.287
4,4IB4,ATOM,A,23,48,GLU,5,CB,C,20.213,38.268,27.406
...,...,...,...,...,...,...,...,...,...,...,...,...
23044,6WPW,ATOM,F,432,424,LEU,9258,N,N,106.541,138.131,153.474
23045,6WPW,ATOM,F,432,424,LEU,9259,CA,C,105.135,137.743,153.464
23046,6WPW,ATOM,F,432,424,LEU,9260,C,C,104.970,136.268,153.812
23047,6WPW,ATOM,F,432,424,LEU,9261,O,O,103.879,135.712,153.703


In [278]:
eg = lm[2]
print(eg)

7L0S


In [279]:
data.table[data.table['PDB']==eg]

Unnamed: 0,uniprot(gene),Cl.,PDB,Resolution,Preferred Chain,State,Function
524,NTR1,A(Rhodopsin),7L0S,4.5,C,Active,Agonist


In [280]:
pref_chain = data.table[data.table['PDB']==eg]['Preferred Chain'].iloc[0]

In [281]:
pref_chain

'C'

In [282]:
data.structure[data.structure['PDB']==eg]['label_asym_id'].unique()

array(['A', 'B', 'C', 'D', 'E'], dtype=object)

In [283]:
a = data.structure[(data.structure['PDB']==eg) & 
                   (data.structure['label_atom_id']=='CA') & 
                   (data.structure['group_PDB']!='HETATM') &
                   (data.structure['auth_asym_id']==pref_chain)]
b = data.mappings[data.mappings['PDB']==eg]

In [284]:
a

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,Cartn_x,Cartn_y,Cartn_z
3065,7L0S,ATOM,C,A,7,52,ASN,2,CA,C,91.212,135.067,142.112
3073,7L0S,ATOM,C,A,8,53,SER,10,CA,C,91.897,135.531,138.396
3079,7L0S,ATOM,C,A,9,54,ASP,16,CA,C,95.477,136.231,137.353
3087,7L0S,ATOM,C,A,10,55,LEU,24,CA,C,98.151,135.126,134.810
3095,7L0S,ATOM,C,A,11,56,ASP,32,CA,C,95.649,136.168,132.083
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5455,7L0S,ATOM,C,A,318,381,LEU,2392,CA,C,91.100,116.562,91.494
5463,7L0S,ATOM,C,A,319,382,SER,2400,CA,C,89.383,113.841,89.463
5469,7L0S,ATOM,C,A,320,383,THR,2406,CA,C,88.363,111.974,92.612
5476,7L0S,ATOM,C,A,321,384,LEU,2413,CA,C,87.280,115.272,94.193


In [285]:
b.iloc[0]

identifier                                          GNAI1_HUMAN
name                                                GNAI1_HUMAN
mappings      {'entity_id': 3, 'end': {'author_residue_numbe...
PDB                                                        7L0S
uniprot                                                  P63096
Name: 3, dtype: object

In [286]:
correct = []
for d in range(len(b)):
    chain = pd.DataFrame.from_dict(b.iloc[d]['mappings'])['chain_id'].iloc[0]
    # if chain == pref_chain:
    correct.append(pd.DataFrame.from_dict(b.iloc[d]['mappings']))

In [296]:
tot = pd.concat(correct)

In [297]:
tot

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,3,354.0,A,,354,1,C
author_insertion_code,3,,A,,354,1,C
residue_number,3,354.0,A,1.0,354,1,C
author_residue_number,4,340.0,B,,340,2,D
author_insertion_code,4,,B,,340,2,D
residue_number,4,361.0,B,23.0,340,2,D
author_residue_number,1,,C,,390,50,A
author_insertion_code,1,,C,,390,50,A
residue_number,1,327.0,C,5.0,390,50,A
author_residue_number,2,13.0,D,8.0,162,157,B


In [298]:
tot[tot['chain_id']==pref_chain]

Unnamed: 0,entity_id,end,chain_id,start,unp_end,unp_start,struct_asym_id
author_residue_number,1,,C,,390,50,A
author_insertion_code,1,,C,,390,50,A
residue_number,1,327.0,C,5.0,390,50,A
