In [305]:
from utils import *
from gpcrdb_soup import *

In [306]:
from Bio.PDB.MMCIFParser import MMCIFParser
parser = MMCIFParser()

In [307]:
eg = '2rh1'

In [308]:
# NOTE: despite its name, `path` holds the download URL returned by
# get_rcsb_download (see the value displayed by the next cell), not a local path.
path = get_rcsb_download(eg, fileformat = 'cif')

In [309]:
path

'https://files.rcsb.org/download/2rh1.cif'

In [310]:

def download(url: str, folder: str, fileformat: str):
    """Download ``url`` and save it as ``<folder>/<stem>.<fileformat>``.

    The stem is the last path component of ``url`` with its extension removed,
    so ``https://files.rcsb.org/download/2rh1.cif`` is stored as ``2rh1.cif``.

    Args:
        url: Address of the file to fetch.
        folder: Destination directory; created together with any missing
            parent directories.
        fileformat: Extension (without the leading dot) of the saved file.

    Returns:
        None. Download and write failures are reported with ``print`` and
        are not raised, so a batch of downloads keeps going.
    """
    # makedirs (not mkdir) so nested targets such as 'data/mmcif' work even
    # when the parent directory does not exist yet.
    os.makedirs(folder, exist_ok=True)

    try:
        r = requests.get(url, timeout=60)
        # Without this check an HTTP error page (e.g. a 404 body) would be
        # written to disk as if it were the requested file.
        r.raise_for_status()
        content = r.content
    except Exception:
        print("Url invalid:", url)
        return

    # Derive the stem from the URL itself instead of assuming a 4-character id.
    name = url.rstrip('/').rsplit('/', 1)[-1]
    suffix = '.' + fileformat
    if name.endswith(suffix):
        stem = name[:-len(suffix)]
    else:
        stem = os.path.splitext(name)[0]
    fname = folder + '/' + stem + '.' + fileformat

    try:
        with open(fname, 'wb') as f:
            f.write(content)
    except OSError as err:
        # A filesystem problem is not a URL problem; report it as such.
        print("Could not write file:", fname, err)

    
def download_pdb(url, folder, fileformat):
    """Alias of ``download``: forwards the three arguments unchanged."""
    download(url=url, folder=folder, fileformat=fileformat)

In [311]:
download(url=path, folder='data/mmcif', fileformat='cif')

In [312]:
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
import pandas as pd
# Parse the downloaded mmCIF file into a dict-like object keyed by CIF tag.
d = MMCIF2Dict("data/mmcif/" + eg + ".cif")

# Collect only the entries whose tag contains "_atom_site." and show them
# as a table (one column per tag).
atom_site = {}
for tag, values in d.items():
    if "_atom_site." in tag:
        atom_site[tag] = values
pd.DataFrame(atom_site)

Unnamed: 0,_atom_site.group_PDB,_atom_site.id,_atom_site.type_symbol,_atom_site.label_atom_id,_atom_site.label_alt_id,_atom_site.label_comp_id,_atom_site.label_asym_id,_atom_site.label_entity_id,_atom_site.label_seq_id,_atom_site.pdbx_PDB_ins_code,...,_atom_site.Cartn_y,_atom_site.Cartn_z,_atom_site.occupancy,_atom_site.B_iso_or_equiv,_atom_site.pdbx_formal_charge,_atom_site.auth_seq_id,_atom_site.auth_comp_id,_atom_site.auth_asym_id,_atom_site.auth_atom_id,_atom_site.pdbx_PDB_model_num
0,ATOM,1,N,N,.,ASP,A,1,36,?,...,-1.611,23.137,1.00,98.48,?,29,ASP,A,N,1
1,ATOM,2,C,CA,.,ASP,A,1,36,?,...,-2.262,22.148,1.00,98.06,?,29,ASP,A,CA,1
2,ATOM,3,C,C,.,ASP,A,1,36,?,...,-1.713,20.742,1.00,97.74,?,29,ASP,A,C,1
3,ATOM,4,O,O,.,ASP,A,1,36,?,...,-1.100,20.143,1.00,96.54,?,29,ASP,A,O,1
4,ATOM,5,C,CB,.,ASP,A,1,36,?,...,-3.786,22.184,1.00,97.64,?,29,ASP,A,CB,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3799,HETATM,3800,O,O,.,HOH,R,10,.,?,...,-5.317,-4.531,1.00,75.67,?,544,HOH,A,O,1
3800,HETATM,3801,O,O,.,HOH,R,10,.,?,...,59.621,17.222,1.00,67.78,?,545,HOH,A,O,1
3801,HETATM,3802,O,O,.,HOH,R,10,.,?,...,27.672,6.120,1.00,59.62,?,546,HOH,A,O,1
3802,HETATM,3803,O,O,.,HOH,R,10,.,?,...,49.833,44.410,1.00,55.49,?,547,HOH,A,O,1


In [313]:
structure = parser.get_structure("", "data/mmcif/"+eg+".cif")


In [314]:
import nglview as nv
view = nv.show_biopython(structure)

In [315]:
view

NGLWidget()

In [316]:
# Item names of the `_atom_site.` category to extract; this list is passed as
# the tag list to `block.find('_atom_site.', cols)` in the gemmi loading cell.
cols = ['group_PDB', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 'label_comp_id', 'id', 'label_atom_id',
        'type_symbol', 'Cartn_x', 'Cartn_y', 'Cartn_z']

In [317]:
import sys
from gemmi import cif


print("Loading cif file of", eg)
path = 'data/mmcif/'+eg+'.cif'
# list of lists: one entry per _atom_site row, values in the order of `cols`.
# Initialised before the try so the name is always bound.
lol = []
try:
    doc = cif.read_file(path)  # copy all the data from mmCIF file
    for block in doc:
        table = block.find('_atom_site.', cols)
        for row in table:
            lol.append(list(row))
except Exception as e:
    print("Oops. %s" % e)
    # Re-raise rather than sys.exit(1): inside a notebook SystemExit hides the
    # real traceback, and later cells must not run on partial data.
    raise

if not lol:
    # NOTE(review): presumably happens when a tag in `cols` is not present in
    # the file (e.g. `cols` was rebound to other names) -- confirm with gemmi docs.
    print("Warning: no _atom_site rows were read from", path)

Loading cif file of 2rh1


In [318]:
# Column labels for the atom table, given in the same order as the
# `_atom_site.` tags that were read into `lol`. Kept under a separate name so
# that `cols` (the CIF tag list used by the loading cell) is not overwritten
# and the loading cell stays re-runnable.
atom_columns = ['group_PDB', 'label_asym_id', 'label_seq_id', 'auth_seq_id', 'label_comp_id', 'id', 'label_atom_id',
                'atom_type', 'x', 'y', 'z']
df = pd.DataFrame(data=lol, columns=atom_columns)

In [319]:
def fix_label_seq_id(x):
    """Convert a ``label_seq_id`` value to ``int``.

    Args:
        x: Value to convert (for example a string read from a CIF table).

    Returns:
        ``int(x)``, or ``None`` when ``x`` cannot be converted, e.g. the
        ``'.'`` placeholder or ``None``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no sequence id"; a bare `except:`
        # would also swallow KeyboardInterrupt and SystemExit.
        return None

In [320]:
# Convert label_seq_id row by row; fix_label_seq_id returns None for values
# that int() cannot convert (the table above shows '.' for the HETATM rows).
df['label_seq_id']=df.apply(lambda x: fix_label_seq_id(x.label_seq_id), axis=1)

Columns
group_PDB 	id 	type_symbol 	label_atom_id 	label_alt_id 	label_comp_id 	label_asym_id 	label_entity_id 	label_seq_id 	pdbx_PDB_ins_code 	Cartn_x 	Cartn_y 	Cartn_z 	occupancy 	B_iso_or_equiv 	Cartn_x_esd 	Cartn_y_esd 	Cartn_z_esd 	occupancy_esd 	B_iso_or_equiv_esd 	pdbx_formal_charge 	auth_seq_id 	auth_comp_id 	auth_asym_id 	auth_atom_id 	pdbx_PDB_model_num

In [321]:
import sys
from math import degrees
import gemmi

# Per-residue-name accumulator, keyed by the three-letter codes of the 20
# standard amino acids. Each stored entry is [phi, omega, psi] in degrees.
ramas = {aa: [] for aa in [
    'LEU', 'ALA', 'GLY', 'VAL', 'GLU', 'SER', 'LYS', 'ASP', 'THR', 'ILE',
    'ARG', 'PRO', 'ASN', 'PHE', 'GLN', 'TYR', 'HIS', 'MET', 'CYS', 'TRP']}

# One [name, label_seq, subchain, phi, omega, psi] row per polymer residue,
# angles in degrees.
rol = []
for path in ['data/mmcif/'+eg+'.cif']:
    st = gemmi.read_structure(path)
    # Structures whose reported resolution falls outside (0.1, 5) are skipped
    # without any message.
    if 0.1 < st.resolution < 5:
        model = st[0]
        if len(st) > 1:
            print("There are multiple models!")
        for chain in model:
            for res in chain.get_polymer():
                # previous_residue() and next_residue() return previous/next
                # residue only if the residues are bonded. Otherwise -- None.
                prev_res = chain.previous_residue(res)
                next_res = chain.next_residue(res)
                phi, psi = gemmi.calculate_phi_psi(prev_res, res, next_res)
                omega = gemmi.calculate_omega(res, next_res)
                # Convert to degrees once and reuse for both containers.
                torsions = [degrees(phi), degrees(omega), degrees(psi)]
                rol.append([res.name, res.label_seq, res.subchain] + torsions)
                # Residue names outside the 20 keys above are kept in `rol`
                # but not accumulated in `ramas`.
                if res.name in ramas:
                    ramas[res.name].append(torsions)

In [322]:

# Write data to files
import os

# open(..., 'w') does not create missing directories.
os.makedirs('data/ramach', exist_ok=True)
for aa, data in ramas.items():
    with open('data/ramach/' + aa + '.tsv', 'w') as f:
        for phi, omega, psi in data:
            # `ramas` already stores the angles in degrees, so they are written
            # as-is; calling degrees() again would scale them a second time.
            f.write('%.4f\t%.4f\n' % (phi, psi))

In [323]:
cols = ['name', 'label_seq_id', 'chain', 'phi', 'omega', 'psi']

In [324]:
res_df = pd.DataFrame(data=rol, columns=cols)

In [325]:
res_df

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi
0,ASP,36,A,,-178.854617,-63.354934
1,GLU,37,A,-88.932088,175.628098,-37.957306
2,VAL,38,A,-61.144594,171.915204,-28.890129
3,TRP,39,A,-57.800759,-179.414726,-36.478257
4,VAL,40,A,-62.667860,176.504892,-50.509324
...,...,...,...,...,...,...
437,GLU,473,A,-68.371975,-178.763342,-49.869200
438,LEU,474,A,-63.647181,177.505669,-29.818260
439,LEU,475,A,-95.306659,174.452619,21.992094
440,CYS,476,A,54.544220,179.407366,56.480819


In [326]:
# Attach the per-residue torsion angles to every atom row of that residue.
# The join uses the chain as well as the sequence number: label_seq_id alone
# is not unique across chains, so a multi-chain structure would otherwise get
# rows cross-matched between chains.
# NOTE(review): assumes res_df['chain'] (gemmi subchain) carries the same ids as
# _atom_site.label_asym_id, as it does in the 2rh1 output -- confirm for other entries.
full = pd.merge(res_df, df,
                left_on=['label_seq_id', 'chain'],
                right_on=['label_seq_id', 'label_asym_id'])

In [327]:
import numpy as np
# Atom serial numbers (`id`) exceed the int16 maximum of 32767 in large
# structures, so that column uses int32; the residue numbers stay int16.
full = full.astype({'label_seq_id': np.int16, 'auth_seq_id': np.int16, 'id': np.int32,
                    'phi': np.float32, 'omega': np.float32, 'psi': np.float32,
                    'x': np.float32, 'y': np.float32, 'z': np.float32})

In [328]:
def get_atom_id(x):
    """Map an element symbol to a small integer class id.

    Args:
        x: Element symbol; it is converted with ``str`` before the lookup.

    Returns:
        0-4 for ``'C'``, ``'O'``, ``'N'``, ``'H'``, ``'S'`` respectively and
        5 for anything else.
    """
    atom_dict = {'C': 0, 'O': 1, 'N': 2, 'H': 3, 'S': 4}
    # dict.get supplies the fallback without a bare `except:` that would also
    # hide unrelated errors such as KeyboardInterrupt.
    return atom_dict.get(str(x), 5)
# Map over the single column that is needed instead of a row-wise
# DataFrame.apply, which builds a Series object for every row.
full['atom_id'] = full['atom_type'].map(get_atom_id)

In [329]:
full

Unnamed: 0,name,label_seq_id,chain,phi,omega,psi,group_PDB,label_asym_id,auth_seq_id,label_comp_id,id,label_atom_id,atom_type,x,y,z,atom_id
0,ASP,36,A,,-178.854614,-63.354935,ATOM,A,29,ASP,1,N,N,-52.821999,-1.611000,23.136999,2
1,ASP,36,A,,-178.854614,-63.354935,ATOM,A,29,ASP,2,CA,C,-51.922001,-2.262000,22.148001,0
2,ASP,36,A,,-178.854614,-63.354935,ATOM,A,29,ASP,3,C,C,-52.178001,-1.713000,20.742001,0
3,ASP,36,A,,-178.854614,-63.354935,ATOM,A,29,ASP,4,O,O,-51.291000,-1.100000,20.143000,1
4,ASP,36,A,,-178.854614,-63.354935,ATOM,A,29,ASP,5,CB,C,-52.105999,-3.786000,22.184000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3538,LEU,477,A,-121.579613,,,ATOM,A,342,LEU,3539,O,O,-55.715000,40.650002,10.971000,1
3539,LEU,477,A,-121.579613,,,ATOM,A,342,LEU,3540,CB,C,-55.575001,37.877998,11.303000,0
3540,LEU,477,A,-121.579613,,,ATOM,A,342,LEU,3541,CG,C,-55.513000,36.347000,11.407000,0
3541,LEU,477,A,-121.579613,,,ATOM,A,342,LEU,3542,CD1,C,-54.415001,35.912998,12.368000,0


In [330]:
xyz = full[['x', 'y', 'z']].to_numpy(dtype=float)

In [331]:
ids = full[['label_seq_id', 'auth_seq_id']].to_numpy(dtype='int')

In [332]:
atoms = full['atom_id'].to_numpy(dtype=int)

In [333]:
xyz

array([[-52.8219986 ,  -1.61099994,  23.13699913],
       [-51.92200089,  -2.26200008,  22.14800072],
       [-52.1780014 ,  -1.71300006,  20.74200058],
       ...,
       [-55.51300049,  36.34700012,  11.40699959],
       [-54.41500092,  35.9129982 ,  12.36800003],
       [-56.85100174,  35.7820015 ,  11.8380003 ]])

In [334]:
ids

array([[ 36,  29],
       [ 36,  29],
       [ 36,  29],
       ...,
       [477, 342],
       [477, 342],
       [477, 342]])

In [335]:
atoms[:100]

array([2, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 1, 0, 0, 0, 2,
       0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 2, 0,
       0, 1, 0, 0, 0, 2, 0, 0, 1, 2, 0, 0, 1, 0, 0, 4, 0, 2, 0, 0, 1, 2,
       0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 4, 0,
       2, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0])

In [338]:
"""import plotly.express as px
select = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]
df = px.data.iris()
fig = px.scatter_3d(select, x='x', y='y', z='z',
                    color='atom_type', opacity=0.5)
fig.update_traces(marker=dict(size=7))
fig.show()"""

"import plotly.express as px\nselect = full[(100 < full['label_seq_id']) & (full['label_seq_id'] < 120)]\ndf = px.data.iris()\nfig = px.scatter_3d(select, x='x', y='y', z='z',\n                    color='atom_type', opacity=0.5)\nfig.update_traces(marker=dict(size=7))\nfig.show()"

In [340]:
"""import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator


def plot(data_file, label, output=None):
    x, y = [], []
    for line in open(data_file):
        phi, psi = line.split()
        if phi != 'nan' and psi != 'nan':
            x.append(float(phi))
            y.append(float(psi))
    print('Plotting %d points for %s' % (len(x), label))

    plt.figure(figsize=(5.5, 5.5))
    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)
    plt.xlim([-180, 180])
    plt.ylim([-180, 180])
    ax = plt.gca()
    ax.xaxis.set_major_locator(MultipleLocator(60))
    ax.yaxis.set_major_locator(MultipleLocator(60))
    plt.xlabel(r'$\phi$', fontsize=14)
    plt.ylabel(r'$\psi$', fontsize=14, labelpad=0)
    plt.grid(color='#AAAAAA', linestyle='--')
    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')
    if output:
        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs
    else:
        plt.show()


for aa in ramas:
    plot('data/ramach/%s.tsv' % aa, aa)
    #plot('ramas/%s.tsv' % aa, aa, 'ramas/%s.png' % aa)"""

"import sys\nimport matplotlib.pyplot as plt\nfrom matplotlib.ticker import MultipleLocator\n\n\ndef plot(data_file, label, output=None):\n    x, y = [], []\n    for line in open(data_file):\n        phi, psi = line.split()\n        if phi != 'nan' and psi != 'nan':\n            x.append(float(phi))\n            y.append(float(psi))\n    print('Plotting %d points for %s' % (len(x), label))\n\n    plt.figure(figsize=(5.5, 5.5))\n    plt.title('%s, %d points.' % (label, len(x)), fontsize=12)\n    plt.xlim([-180, 180])\n    plt.ylim([-180, 180])\n    ax = plt.gca()\n    ax.xaxis.set_major_locator(MultipleLocator(60))\n    ax.yaxis.set_major_locator(MultipleLocator(60))\n    plt.xlabel(r'$\\phi$', fontsize=14)\n    plt.ylabel(r'$\\psi$', fontsize=14, labelpad=0)\n    plt.grid(color='#AAAAAA', linestyle='--')\n    plt.hexbin(x, y, gridsize=2*180, bins='log', cmap='Blues')\n    if output:\n        plt.savefig(output, dpi=300)  # dpi=70 for small images in docs\n    else:\n        plt.show()\

In [297]:
uniprot = pdbtouniprot('4JKV')

In [298]:
uniprot

'P0ABE7'

In [299]:
pdb = uniprottopdb(uniprot)

In [300]:
pdb

['1APC',
 '1LM3',
 '1M6T',
 '1QPU',
 '1QQ3',
 '256B',
 '2BC5',
 '2QLA',
 '3C62',
 '3C63',
 '3DE8',
 '3DE9',
 '3FOO',
 '3FOP',
 '3HNI',
 '3HNJ',
 '3HNK',
 '3HNL',
 '3IQ5',
 '3IQ6',
 '3L1M',
 '3M15',
 '3M4B',
 '3M4C',
 '3M79',
 '3NMI',
 '3NMJ',
 '3NMK',
 '3TOL',
 '3TOM',
 '3U8P',
 '4EA3',
 '4EIY',
 '4IAQ',
 '4IAR',
 '4IB4',
 '4JE9',
 '4JEA',
 '4JEB',
 '4JKV',
 '4L6R',
 '4N6H',
 '4NC3',
 '4NTJ',
 '4O9R',
 '4OR2',
 '4PXZ',
 '4PY0',
 '4QIM',
 '4QIN',
 '4RWA',
 '4RWD',
 '4U9D',
 '4U9E',
 '4YAY',
 '4Z34',
 '4Z35',
 '4Z36',
 '4ZUD',
 '5AWI',
 '5BU7',
 '5DHG',
 '5DHH',
 '5IU4',
 '5IU7',
 '5IU8',
 '5IUA',
 '5IUB',
 '5JTB',
 '5K2A',
 '5K2B',
 '5K2C',
 '5K2D',
 '5L31',
 '5L32',
 '5L7D',
 '5L7I',
 '5MZJ',
 '5MZP',
 '5N2R',
 '5N2S',
 '5NDD',
 '5NDZ',
 '5NJ6',
 '5NLX',
 '5NM2',
 '5NM4',
 '5OLG',
 '5OLH',
 '5OLO',
 '5OLV',
 '5OLZ',
 '5OM1',
 '5OM4',
 '5TUD',
 '5TVN',
 '5UEN',
 '5UIG',
 '5UNF',
 '5UNG',
 '5UNH',
 '5UVI',
 '5VRA',
 '5WIU',
 '5WIV',
 '5XJM',
 '5XZI',
 '5XZJ',
 '5YM7',
 '5YO3',
 '5YO4',
 

In [304]:
from utils3 import *


get_mappings_data('2RH1')

{'2rh1': {'UniProt': {'P07550': {'identifier': 'ADRB2_HUMAN',
    'name': 'ADRB2_HUMAN',
    'mappings': [{'entity_id': 1,
      'end': {'author_residue_number': 230,
       'author_insertion_code': '',
       'residue_number': 237},
      'chain_id': 'A',
      'start': {'author_residue_number': None,
       'author_insertion_code': '',
       'residue_number': 8},
      'unp_end': 230,
      'unp_start': 1,
      'struct_asym_id': 'A'},
     {'entity_id': 1,
      'end': {'author_residue_number': None,
       'author_insertion_code': '',
       'residue_number': 500},
      'chain_id': 'A',
      'start': {'author_residue_number': 264,
       'author_insertion_code': '',
       'residue_number': 399},
      'unp_end': 365,
      'unp_start': 264,
      'struct_asym_id': 'A'}]},
   'P00720': {'identifier': 'ENLYS_BPT4',
    'name': 'ENLYS_BPT4',
    'mappings': [{'entity_id': 1,
      'end': {'author_residue_number': 263,
       'author_insertion_code': '',
       'residue_number': 39