In [1]:
import csv, json, glob
import numpy as np

### Read atomic types

In [2]:
# Build two lookup tables from the groups file (space-delimited, two columns used):
#   types: label in the 2nd column -> integer type id, numbered in order of first appearance
#   tmap:  atom name in the 1st column -> integer type id of its label
tmap = {}
types = {}
# newline='' is what the csv module recommends for files handed to csv.reader
with open('data/groups20.txt', 'r', newline='') as groups_file:
    for row in csv.reader(groups_file, delimiter=' '):
        if not row:
            # csv.reader yields [] for a blank line; skip it instead of failing on row[1]
            continue
        atom_name, group = row[0], row[1]
        # setdefault hands out the next free id the first time a label is seen
        tmap[atom_name] = types.setdefault(group, len(types))

# ids are 0..len(types)-1, so the count of distinct types is the dict size
ntypes = len(types)

print("Number of distinct atom types: ", ntypes)

print("Mapping between atom names and types: ", tmap)


Number of distinct atom types:  20
Mapping between atom names and types:  {'ALA_CA': 0, 'ALA_CB': 1, 'ALA_C': 2, 'ALA_N': 3, 'ALA_O': 4, 'ARG_CA': 0, 'ARG_CB': 5, 'ARG_C': 2, 'ARG_CD': 5, 'ARG_CG': 5, 'ARG_CZ': 6, 'ARG_NE': 7, 'ARG_NH1': 7, 'ARG_NH2': 7, 'ARG_N': 3, 'ARG_O': 4, 'ASN_CA': 0, 'ASN_CB': 5, 'ASN_C': 2, 'ASN_CG': 8, 'ASN_ND2': 9, 'ASN_N': 3, 'ASN_OD1': 10, 'ASN_O': 4, 'ASP_CA': 0, 'ASP_CB': 5, 'ASP_C': 2, 'ASP_CG': 11, 'ASP_N': 3, 'ASP_OD1': 12, 'ASP_OD2': 12, 'ASP_O': 4, 'CYS_CA': 0, 'CYS_CB': 5, 'CYS_C': 2, 'CYS_N': 3, 'CYS_O': 4, 'CYS_SG': 13, 'GLN_CA': 0, 'GLN_CB': 5, 'GLN_C': 2, 'GLN_CD': 8, 'GLN_CG': 5, 'GLN_NE2': 9, 'GLN_N': 3, 'GLN_OE1': 10, 'GLN_O': 4, 'GLU_CA': 0, 'GLU_CB': 5, 'GLU_C': 2, 'GLU_CD': 11, 'GLU_CG': 5, 'GLU_N': 3, 'GLU_OE1': 12, 'GLU_OE2': 12, 'GLU_O': 4, 'GLY_CA': 0, 'GLY_C': 2, 'GLY_N': 3, 'GLY_O': 4, 'HIS_CA': 0, 'HIS_CB': 5, 'HIS_C': 2, 'HIS_CD2': 6, 'HIS_CE1': 6, 'HIS_CG': 6, 'HIS_ND1': 14, 'HIS_NE2': 15, 'HIS_N': 3, 'HIS_O': 4, 'ILE_CA': 0, 'ILE

### Read & process JSON

In [3]:
# max distance between atoms: contacts whose distance is not strictly below DMAX are dropped
DMAX = 8.0

# only the parse needs the file handle, so it is closed before post-processing
with open("example/tag0001.al.json") as jsonfile:
    data = json.load(jsonfile)

# convert atom names to types (i.e. features); a name missing from tmap raises KeyError
data['nodes'] = np.array([tmap[a] for a in data['atoms']], dtype=np.int16)

# clean contacts based on DMAX - filter once so edges and distances stay aligned
# each contact is read as cont[:2] (the edge pair) and cont[2] (the distance)
close_contacts = [cont for cont in data['contacts'] if cont[2] < DMAX]

# reshape(-1, 2) keeps edges 2-D, i.e. shape (0, 2), even when no contact passes the cutoff
# NOTE(review): int16 tops out at 32767 - confirm edge indices never exceed that
data['edges'] = np.array([cont[:2] for cont in close_contacts], dtype=np.int16).reshape(-1, 2)
data['distances'] = np.array([cont[2] for cont in close_contacts], dtype=np.float32)

# for scores, use 2nd column only - LDDT(TS)
data['scores'] = np.array(data['scores'], dtype=np.float32)[:, 1]

# skip unused data
del data['atoms']
del data['bonds']
del data['contacts']

In [4]:
# node features: one integer type id per entry of the original 'atoms' list
data['nodes']

array([ 3,  0,  2, ..., 16, 19,  1], dtype=int16)

In [5]:
# second column of the raw 'scores' table, stored as float32
data['scores']

array([0.77667, 0.81173, 0.85119, ..., 0.63717, 0.49464, 0.60488],
      dtype=float32)

In [6]:
# first two fields of every contact whose distance is below DMAX
data['edges']

array([[   0,   29],
       [   0,   28],
       [   0,    9],
       ...,
       [1015, 1017],
       [1015, 1016],
       [1016, 1017]], dtype=int16)

In [7]:
# distance of every kept contact, in the same order as data['edges']
data['distances']

array([7.294, 7.579, 6.492, ..., 1.509, 1.432, 2.402], dtype=float32)