# Euclidean Distance Approach

We compare the actual, physical distance between atoms in the ligand and atoms in the protein.

If the distance between any ligand atom and any protein atom is below a threshold (tunable, typically 3 to 5
angstroms), we conclude that they are "interacting".

In [77]:
import pandas as pd
import numpy as np
import os
import re

In [78]:
"""
IO related functionality
"""
def read_pdbs(pdb_dir):
    """Load every ``<id>_<molecule>_cg.pdb`` file in *pdb_dir* as a table.

    Args:
        pdb_dir: Directory to scan (non-recursively).

    Returns:
        A dict mapping the integer identifier from the filename to a dict
        that maps the molecule tag from the filename to the result of
        ``pd.read_table`` for that file. An identifier whose files all
        failed to parse still appears, with an empty inner dict.
    """
    pdbs = {}
    with os.scandir(pdb_dir) as pdb_files:
        for entry in pdb_files:
            try:
                identifier, molecule = get_id_from_filename(entry.name)
            except IndexError:
                # The name does not follow the expected scheme (for example a
                # checkpoint directory or a stray file); skip it instead of
                # aborting the whole scan.
                print('skipping %s: unexpected file name' % entry.name)
                continue
            identifier, molecule = int(identifier), str(molecule)
            per_id = pdbs.setdefault(identifier, {})
            try:
                per_id[molecule] = pd.read_table(entry.path)
            except Exception as e:
                # Best effort: report the failure (with its cause) and keep going.
                print('failed to parse %s: %s' % (entry.name, e))
    return pdbs


def get_id_from_filename(filename):
    """Split a ``<digits>_<lowercase letters>_cg.pdb`` filename into its parts.

    Args:
        filename: Bare file name (no directory component).

    Returns:
        A ``(identifier, molecule)`` tuple of strings, e.g.
        ``('0006', 'pro')`` for ``'0006_pro_cg.pdb'``.

    Raises:
        IndexError: If *filename* does not match the expected scheme.
    """
    # Raw string: in a normal literal '\.' is an invalid escape sequence that
    # newer Python versions warn about.
    pattern = re.compile(r'^([0-9]+)_([a-z]+)_cg\.pdb$')
    return pattern.findall(filename)[0]


def read_pdb(filename):
    """Parse a fixed-width PDB file into atom coordinates and atom classes.

    Each line is stripped of surrounding whitespace and must then be exactly
    78 characters long. Three floats are read from columns 30-38, 38-46 and
    46-54, and the atom type from columns 76-78.

    Args:
        filename: Path of the PDB file to read.

    Returns:
        ``(positions, atoms)`` where ``positions`` is a list of float64
        arrays of length 3 and ``atoms`` is a parallel list holding ``'h'``
        (hydrophobic) for atom type ``'C'`` and ``'p'`` (polar) otherwise.
        If any line has an unexpected length, an error is printed and
        ``None`` is returned instead.
    """
    expected_width = 78
    coordinate_columns = ((30, 38), (38, 46), (46, 54))

    with open(filename, 'r') as handle:
        records = handle.readlines()

    positions = []
    atoms = []
    for raw in records:
        # Drop the newline (and any other surrounding whitespace) before measuring.
        record = raw.strip()
        width = len(record)
        if width != expected_width:
            print("ERROR: line length is different. Expected=78, current={}".format(width))
            return None

        xyz = [float(record[start:stop].strip()) for start, stop in coordinate_columns]
        positions.append(np.array(xyz, dtype=np.float64))

        element = record[76:78].strip()
        # 'h' means hydrophobic, 'p' means polar
        atoms.append('h' if element == 'C' else 'p')

    return positions, atoms



In [None]:
def euclidean_distance(p, l):
    """Return the Euclidean distance between two points.

    Args:
        p: Coordinates of the first point (NumPy array or any array-like
            such as a list or tuple).
        l: Coordinates of the second point, same length as *p*.

    Returns:
        The norm of ``p - l`` as computed by ``np.linalg.norm``.
    """
    # asarray leaves ndarrays untouched and lets plain sequences be
    # subtracted element-wise instead of raising TypeError.
    return np.linalg.norm(np.asarray(p) - np.asarray(l))


def find_interacting_ligands(protein_file, ligands_path, interaction_threshold=5,
                             ligand_ids=None):
    """Find ligands that have at least one atom close to an atom of a protein.

    For every ligand file ``<ligands_path>/<id:04d>_lig_cg.pdb`` the distance
    between each ligand atom and each protein atom is computed; a pair closer
    than *interaction_threshold* counts as a hit and is printed.

    Args:
        protein_file: Path of the protein PDB file.
        ligands_path: Directory containing the ligand PDB files.
        interaction_threshold: Distance below which two atoms are considered
            interacting, in the units of the PDB coordinates.
        ligand_ids: Iterable of integer ligand ids to test. Defaults to
            ``range(1, 89)``, the range that was previously hard-coded.

    Returns:
        A list of ``(ligand_id, num_matches)`` tuples, one per ligand with at
        least one hit, in the order the ligands were tested.
    """
    if ligand_ids is None:
        ligand_ids = range(1, 89)

    # Only the coordinates are needed here; the atom classes are ignored.
    p_pos, _ = read_pdb(protein_file)
    candidates = []

    for ligand_id in ligand_ids:
        ligand_name = '{}/{:04d}_lig_cg.pdb'.format(ligands_path, ligand_id)
        print('reading file {}'.format(ligand_name))
        l_pos, _ = read_pdb(ligand_name)
        num_matches = 0
        for ligand_atom in l_pos:
            for protein_atom in p_pos:
                dist = euclidean_distance(protein_atom, ligand_atom)
                if dist < interaction_threshold:
                    num_matches += 1
                    print('hit! %30s interacts with %30s at distance %.6f'
                          % (protein_atom, ligand_atom, dist))
        if num_matches > 0:
            candidates.append((ligand_id, num_matches))
        # The id reported here is the ligand's, not the protein's.
        print('end of search for ligand %s' % ligand_id)
    return candidates

In [80]:
# Screen the ligand files in the training directory against protein 0006,
# using a 3-unit distance cut-off instead of the default of 5. The returned
# list of (ligand_id, num_matches) tuples is displayed as the cell result.
find_interacting_ligands('/home/darren/drug-discovery/data/train/0006_pro_cg.pdb',
                         '/home/darren/drug-discovery/data/train',
                        interaction_threshold=3)

reading file /home/darren/drug-discovery/data/train/0001_lig_cg.pdb
hit!      [ -0.348   6.991 -19.714] interacts with      [ -2.503   5.387 -20.876] at distance 2.926958
end of search for protein 1
reading file /home/darren/drug-discovery/data/train/0002_lig_cg.pdb
end of search for protein 2
reading file /home/darren/drug-discovery/data/train/0003_lig_cg.pdb
end of search for protein 3
reading file /home/darren/drug-discovery/data/train/0004_lig_cg.pdb
hit!      [-27.256  18.638 -12.483] interacts with      [-25.343  19.688 -10.989] at distance 2.644637
hit!      [-27.698  18.755 -11.007] interacts with      [-25.343  19.688 -10.989] at distance 2.533148
hit!      [-27.256  18.638 -12.483] interacts with      [-25.68   17.811 -14.127] at distance 2.422899
hit!      [-23.625  17.606 -12.673] interacts with      [-25.68   17.811 -14.127] at distance 2.525701
hit!      [-25.907  14.474 -16.136] interacts with      [-25.838  15.439 -18.666] at distance 2.708669
end of search for protein 

end of search for protein 53
reading file /home/darren/drug-discovery/data/train/0054_lig_cg.pdb
end of search for protein 54
reading file /home/darren/drug-discovery/data/train/0055_lig_cg.pdb
end of search for protein 55
reading file /home/darren/drug-discovery/data/train/0056_lig_cg.pdb
hit!      [  1.303  -6.851 -14.09 ] interacts with      [  0.683  -8.536 -14.71 ] at distance 1.899480
hit!      [  1.882  -6.924 -12.679] interacts with      [  0.683  -8.536 -14.71 ] at distance 2.856765
hit!      [  1.303  -6.851 -14.09 ] interacts with      [  2.399  -7.662 -13.167] at distance 1.646471
hit!      [  1.882  -6.924 -12.679] interacts with      [  2.399  -7.662 -13.167] at distance 1.024733
end of search for protein 56
reading file /home/darren/drug-discovery/data/train/0057_lig_cg.pdb
end of search for protein 57
reading file /home/darren/drug-discovery/data/train/0058_lig_cg.pdb
end of search for protein 58
reading file /home/darren/drug-discovery/data/train/0059_lig_cg.pdb
hit!  

[(1, 1),
 (4, 5),
 (8, 4),
 (9, 11),
 (14, 6),
 (31, 2),
 (52, 6),
 (56, 4),
 (59, 14),
 (62, 4)]