In [30]:
import numpy as np
import pandas as pd
from chembl_webresource_client.new_client import new_client

### Get data from ChEMBL

Filter them directly when requesting them from the server.

In [17]:
dr = new_client.drug.filter(first_approval__isnull=False).order_by('first_approval')
drdf = pd.DataFrame(dr)
drdf.head()
len(drdf)

2172

In [18]:
print(drdf.columns)
print(drdf.shape)

Index(['applicants', 'atc_classification', 'availability_type',
       'development_phase', 'drug_type', 'first_approval', 'first_in_class',
       'helm_notation', 'indication_class', 'molecule_chembl_id',
       'molecule_properties', 'molecule_structures', 'molecule_synonyms',
       'ob_patent', 'oral', 'parenteral', 'prodrug', 'research_codes',
       'rule_of_five', 'sc_patent', 'synonyms', 'topical', 'usan_stem',
       'usan_stem_definition', 'usan_stem_substem', 'usan_year',
       'withdrawn_class', 'withdrawn_country', 'withdrawn_flag',
       'withdrawn_reason', 'withdrawn_year', 'atc_code_description'],
      dtype='object')
(2172, 36)


In [19]:
drdf2012 = drdf.loc[drdf.first_approval >= 2012.0]
print(drdf2012.shape)

(446, 36)


Get activities and link drugs to targets.

In [15]:
acts = new_client.activity.filter(molecule_chembl_id__in=list(drdf.molecule_chembl_id)).only(
    ['molecule_chembl_id', 'target_chembl_id'])
targets_id = [a['target_chembl_id'] for a in acts]

In [20]:
acdf = pd.DataFrame(acts)

In [31]:
counts = np.array(acdf.molecule_chembl_id.value_counts())
np.median(counts)

55.0

In [8]:
print(len(targets_id))
targets_id_unique = list(set(targets_id))
print(len(targets_id_unique))

60437
2605


In [9]:
targs = new_client.target.filter(targets_chembl_id__in=targets_id_unique)
tgdf = pd.DataFrame(targs)
tgdf.head()
tgdf.shape

(14855, 8)

<span style="color:red">I do not get why this dataframe has over 14'000 entries when I filtered them according to a target ID list of length 2600.</span>

In [10]:
accessions = []
missing = 0
for t in targs:
    if len(t['target_components']) > 0:
        accessions.append(t['target_components'][0]['accession'])
    else:
        missing += 1

In [11]:
print(len(accessions))
accessions_unique = list(set(accessions))
print(len(accessions_unique))

10269
8951


<span style="color:red">Why are there duplicates in accession numbers?</span>

### Get data from UniProt

In [12]:
import requests, sys

In [76]:
requestURLs = ["https://www.ebi.ac.uk/proteins/api/proteins/{}".format(a) for a in accessions if a != None]
# TODO: resquest for all accession numbers instead of just the first ones once it works

keywords = []

for url in requestURLs:
    r = requests.get(url, headers={"Accept": "application/json"})
    if not r.ok:
        continue
    responseBody = r.json()
    a = responseBody['accession']
    try:
        k = [w['value'] for w in responseBody['keywords']]
    except KeyError:
        print('{} has no keywords associated'.format(a))
        k = ['']

    keywords.append([a, k])

Q99P97 has no keywords associated
O30412 has no keywords associated
Q3HTL5 has no keywords associated
Q5PY51 has no keywords associated
Q56D22 has no keywords associated
Q50H31 has no keywords associated
Q83TT7 has no keywords associated
B0M1D1 has no keywords associated
Q39829 has no keywords associated
Q9ZHK8 has no keywords associated
Q93SP7 has no keywords associated
O40947 has no keywords associated
A9X4R8 has no keywords associated
O86157 has no keywords associated
Q15950 has no keywords associated
Q3B792 has no keywords associated
Q95US7 has no keywords associated
Q962H3 has no keywords associated
B5U2Z5 has no keywords associated
C1KUY1 has no keywords associated
C1KUY2 has no keywords associated
C1KUY3 has no keywords associated
Q76129 has no keywords associated
Q8W433 has no keywords associated
A0A072ZLE6 has no keywords associated
Q5U9J1 has no keywords associated
Q9L4Q3 has no keywords associated
Q86MC2 has no keywords associated
Q5L478 has no keywords associated
Q6DRH7 has

In [80]:
kwdf = pd.DataFrame(data=keywords, columns=['accession', 'keywords'])
display(kwdf.head())
kwdf.shape

Unnamed: 0,accession,keywords
0,O43451,"[3D-structure, Alternative splicing, Cell memb..."
1,O60706,"[Alternative splicing, ATP-binding, Atrial fib..."
2,O76074,"[3D-structure, Allosteric enzyme, Alternative ..."
3,O95180,"[Alternative splicing, Calcium, Calcium channe..."
4,O96760,"[Cell junction, Cell membrane, Disulfide bond,..."


(10225, 2)

<span style="color:red">Again, why does the shape not correspond to the length of accession numbers I found...?</span>

In [87]:
kwdf_tidy = pd.DataFrame([
    [a, k]
    for a, k_, in kwdf.values
    for k in k_
], columns=['accession', 'keyword'])
kwdf_tidy

Unnamed: 0,accession,keyword
0,O43451,3D-structure
1,O43451,Alternative splicing
2,O43451,Cell membrane
3,O43451,Direct protein sequencing
4,O43451,Disulfide bond
...,...,...
133848,O60669,Reference proteome
133849,O60669,Symport
133850,O60669,Transmembrane
133851,O60669,Transmembrane helix


In [88]:
kwdf_tidy.keyword.value_counts()

Reference proteome                                           9022
3D-structure                                                 5423
Membrane                                                     4974
Phosphoprotein                                               4860
Alternative splicing                                         3725
                                                             ... 
Inhibition of host proteasome antigen processing by virus       1
Inhibition of host adaptive immune response by virus            1
Inhibition of host IRF7 by virus                                1
G0/G1 host cell cycle checkpoint dysregulation by virus         1
Organic radical                                                 1
Name: keyword, Length: 894, dtype: int64