# Drug name to ChEMBL function

Single function using chembl_webresource_client: https://pypi.org/project/chembl-webresource-client/

Tries 3 different things (in order):

1. Case insensitive match against molecule_dictionary.pref_name
2. Case insensitive match against molecule_synonyms.synonyms
3. Use the Elasticsearch-backed free-text search as a last resort (optional, enabled with `use_search=True`)

Note: not every pref_name is also present in molecule_synonyms, so step 1 cannot be skipped.

A name can match several ChEMBL compounds. When it does, the compounds are returned sorted by max_phase (highest first), and manual curation should be considered; this is the reason the structures (SMILES, InChI and InChI key) are kept in the results.

In [1]:
# install the webresource client
!pip install chembl_webresource_client



In [2]:
from chembl_webresource_client.new_client import new_client

def _max_phase_sort_key(record):
    """Return a numeric sort key for a molecule record's ``max_phase``.

    ``max_phase`` is not guaranteed to be a plain number: depending on the
    ChEMBL release it may be missing/``None`` or a numeric string. Comparing
    ``None`` with a number (or a string with a number) raises ``TypeError`` in
    Python 3, so everything is normalised to ``float`` here and values that
    cannot be converted sort below every real phase.
    """
    try:
        return float(record.get("max_phase"))
    except (TypeError, ValueError):
        return float("-inf")


def _filter_sorted(molecule, fields, **criteria):
    """Run a ``filter`` query and return the hits, highest ``max_phase`` first."""
    hits = list(molecule.filter(**criteria).only(fields))
    # list.sort is stable (also with reverse=True), so records with the same
    # max_phase keep the order in which the service returned them.
    hits.sort(key=_max_phase_sort_key, reverse=True)
    return hits


def name2chembl(name, use_search=False):
    """
    Tries to retrieve the chembl_id and the structure for a given drug name.

    Lookup steps, tried in order until one yields results:

    1. case-insensitive exact match on ``pref_name``;
    2. case-insensitive exact match on the molecule synonyms;
    3. (only if ``use_search`` is true) free-text search, keeping the first hit.

    Args:
        name: Drug name to look up.
        use_search: Enable step 3. Search can return inexact matches and may be
            slower than the exact lookups, so it is off by default.

    Returns:
        A ``(matches, where)`` tuple. ``matches`` is a list of molecule records
        (dicts restricted to ``molecule_chembl_id``, ``pref_name``,
        ``max_phase`` and ``molecule_structures``), sorted by ``max_phase`` in
        descending order for steps 1 and 2. ``where`` is ``"pref_name"``,
        ``"synonyms"`` or ``"search"`` depending on the step that matched.
        ``(None, None)`` is returned when nothing matched or when ``name`` is
        empty/blank.
    """
    # An empty or blank name cannot identify a compound; do not query the
    # service with it.
    if name is None or not str(name).strip():
        return None, None

    molecule = new_client.molecule
    fields = ["molecule_chembl_id", "pref_name", "max_phase", "molecule_structures"]

    # search in pref_name
    # iexact does exact case insensitive search
    res = _filter_sorted(molecule, fields, pref_name__iexact=name)
    if res:
        return res, "pref_name"

    # if no pref_name match, look at the synonyms
    # some pref_name are not included in the molecule_synonyms table, so it is
    # not possible to skip the first step
    res = _filter_sorted(
        molecule, fields, molecule_synonyms__molecule_synonym__iexact=name
    )
    if res:
        return res, "synonyms"

    if use_search:
        # last resort:
        #    search function uses elastic and could eventually retrieve inexact matches
        #    can also take longer than previous calls
        try:
            res = molecule.search(name).only(fields)[0]
        except IndexError:
            # Defensive: indexing an empty result set must not escape as an
            # exception; treat it as "no match".
            res = None
        if res:
            return [res], "search"
    return None, None


# Example where it gets a match from molecule_dictionary.pref_name

In [3]:
# Look up a name that matches a pref_name directly; `where` reports which
# lookup step produced the match.
matches, where = name2chembl('sildenafil')

print(where)
# Bare expression so the notebook displays the list of matching records.
matches

pref_name


[{'max_phase': 4,
  'molecule_chembl_id': 'CHEMBL192',
  'molecule_structures': {'canonical_smiles': 'CCCc1nn(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12',
   'molfile': '\n     RDKit          2D\n\n 33 36  0  0  0  0  0  0  0  0999 V2000\n    2.1000   -0.0042    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    2.1000    0.7000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.5375   -0.0042    0.0000 S   0  0  0  0  0  0  0  0  0  0  0  0\n    1.4917   -0.3667    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    0.8792   -0.0042    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    2.8042    0.9083    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    1.4917    1.0625    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.8792    0.6833    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    3.2042    0.3458    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    2.8042   -0.2417    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2875   -0.3750    0.0000 C   0  0

# Example where it gets matches from molecule_synonyms.synonyms

In [4]:
# Look up a trade name: it has no pref_name match, so the synonyms step
# is the one that returns the compound.
matches, where = name2chembl('viagra')

print(where)
# Bare expression so the notebook displays the list of matching records.
matches

synonyms


[{'max_phase': 4,
  'molecule_chembl_id': 'CHEMBL192',
  'molecule_structures': {'canonical_smiles': 'CCCc1nn(C)c2c(=O)[nH]c(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)nc12',
   'molfile': '\n     RDKit          2D\n\n 33 36  0  0  0  0  0  0  0  0999 V2000\n    2.1000   -0.0042    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    2.1000    0.7000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.5375   -0.0042    0.0000 S   0  0  0  0  0  0  0  0  0  0  0  0\n    1.4917   -0.3667    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    0.8792   -0.0042    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    2.8042    0.9083    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    1.4917    1.0625    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.8792    0.6833    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    3.2042    0.3458    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    2.8042   -0.2417    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2875   -0.3750    0.0000 C   0  0

# Example where a match is only found using the search feature (without it, nothing is returned)

In [5]:
# Neither pref_name nor synonyms match this name, and use_search defaults to
# False, so the function reports no match (both values are None).
matches, where = name2chembl('Azaguanine-8')

print(where)
matches

None


## Elasticsearch always tries to retrieve results, so matches obtained with `use_search=True` should be manually curated

In [6]:
# Same name with the search fallback enabled: only the first search hit is
# returned, wrapped in a one-element list.
matches, where = name2chembl('Azaguanine-8', use_search=True)

print(where)
# Bare expression so the notebook displays the list of matching records.
matches

search


[{'max_phase': 0,
  'molecule_chembl_id': 'CHEMBL374107',
  'molecule_structures': {'canonical_smiles': 'Nc1nc(O)c2[nH]nnc2n1',
   'molfile': '\n     RDKit          2D\n\n 11 12  0  0  0  0  0  0  0  0999 V2000\n   -0.4152    0.4905    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2993    0.9030    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    1.0137    0.4905    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    1.7282    0.9030    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    1.0137   -0.3345    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2993   -0.7470    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.2993   -1.5720    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n   -0.4152   -0.3345    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.1998   -0.5894    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.6848    0.0780    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.1998    0.7455    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n  1  2 