## Import necessary libraries

In [3]:
from chembl_webresource_client.new_client import new_client
import pandas as pd

## Find target proteins

In [4]:
# there are 4 species of malaria that cause issues in humans, so we'll search plasmodium

target_query = new_client.target.search('Plasmodium')
plasmodium_targets = pd.DataFrame.from_dict(target_query)

# find single_protein bc complex/organism is not in the scope
# pd.set_option('display.max_columns', 100)
single_protein_targets = plasmodium_targets[plasmodium_targets['target_type'] == "SINGLE PROTEIN"]
single_protein_targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Duffy antigen/chemokine receptor,15.0,False,CHEMBL2321626,"[{'accession': 'Q16570', 'component_descriptio...",SINGLE PROTEIN,9606
21,"[{'xref_id': 'P13922', 'xref_name': None, 'xre...",Plasmodium falciparum K1,Dihydrofolate reductase,7.0,False,CHEMBL1939,"[{'accession': 'P13922', 'component_descriptio...",SINGLE PROTEIN,5839
22,"[{'xref_id': 'Q02768', 'xref_name': None, 'xre...",Plasmodium falciparum,Cytochrome b,7.0,False,CHEMBL1777,"[{'accession': 'Q02768', 'component_descriptio...",SINGLE PROTEIN,5833
23,"[{'xref_id': 'P05227', 'xref_name': None, 'xre...",Plasmodium falciparum,Histidine-rich protein,7.0,False,CHEMBL1923,"[{'accession': 'P05227', 'component_descriptio...",SINGLE PROTEIN,5833
24,"[{'xref_id': 'Q25704', 'xref_name': None, 'xre...",Plasmodium falciparum,Dihydropteroate synthetase,7.0,False,CHEMBL2013,"[{'accession': 'Q25704', 'component_descriptio...",SINGLE PROTEIN,5833
...,...,...,...,...,...,...,...,...,...
130,[],Plasmodium falciparum (isolate 3D7),Plasmepsin X,7.0,False,CHEMBL4523390,"[{'accession': 'Q8IAS0', 'component_descriptio...",SINGLE PROTEIN,36329
131,[],Plasmodium falciparum (isolate 3D7),Casein kinase I,7.0,False,CHEMBL4523391,"[{'accession': 'Q8IHZ9', 'component_descriptio...",SINGLE PROTEIN,36329
132,[],Plasmodium falciparum,Glutamine amidotransferase,7.0,False,CHEMBL4523484,"[{'accession': 'Q8IJR9', 'component_descriptio...",SINGLE PROTEIN,5833
133,[],Plasmodium falciparum,P-type ATPase,7.0,False,CHEMBL4630875,"[{'accession': 'Q27724', 'component_descriptio...",SINGLE PROTEIN,5833


Now, I looked through found the specific protein that to use for the machine learning. 

I chose Dihydrofolate reductase (DHFR) for a variety of reasons, but the main ones include:
- it has been targeted before in malaria drug treatment
- it is crucial to malaria spreading
- it is in all 4 parasites

In [5]:
target_protein = single_protein_targets.target_chembl_id[21]
target_protein

'CHEMBL1939'

## Get activity data 

What's important to note is the "standard_type". We already filtered out those which are not standard_type "IC50". IC50 is a measure for how much this protein inhibits DHFR's biological process (in this case). There are other types like EC50 & Ki; however, it doesn't make sense to use those for this project.

In [16]:
activity = new_client.activity
activity_data = activity.filter(target_chembl_id=target_protein).filter(standard_type='IC50')
dhfr_data = pd.DataFrame.from_dict(activity_data)

dhfr_data

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,188717,[],CHEMBL769665,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,0.33
1,,,188718,[],CHEMBL769666,In vitro antimalarial activity relative to tri...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,0.05
2,,,188719,[],CHEMBL769492,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,10.16
3,,,188720,[],CHEMBL769664,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,0.07
4,,,188721,[],CHEMBL769484,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,3.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,,,18465879,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,67.9
369,,,18465880,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,70.1
370,,,18465881,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,185.4
371,,,18465882,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,225.5


See the columns in the dataframe to get a sense for what is measured here. 

In [17]:
print(dhfr_data.columns)

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')
330.0
50.0
10160.0
70.0
3400.0
30.0
15910.0
130.0
17480.0


There are values were standard_value is not available. See the output below for proof.

In [None]:
for value in dhfr_data['standard_value']:
    print(value)

I can just filter them out here, so there's no issues with the machine learning.

In [21]:
dhfr_data = dhfr_data[dhfr_data.standard_value.notna()]

for value in dhfr_data['standard_value']:
    print(value)

330.0
50.0
10160.0
70.0
3400.0
30.0
15910.0
130.0
17480.0
2620.0
3830.0
30.0
5730.0
50.0
50000.0
250.0
9.3
73.0
3120.0
470.0
14580.0
110.0
6750.0
60.0
8130.0
40.0
15150.0
2270.0
17140.0
130.0
15260.0
130.0
21490.0
110.0
22730.0
3410.0
100000.0
730.0
90810.0
760.0
100000.0
500.0
150.0
20.0
4290.0
30.0
7700.0
60.0
9750.0
50.0
15220.0
2280.0
3520.0
30.0
3330.0
30.0
12750.0
60.0
19080.0
2860.0
3690.0
30.0
3517.0
30.0
29610.0
150.0
3570.0
540.0
4180.0
30.0
3460.0
30.0
3020.0
20.0
2.7
12.7
680.0
100.0
19880.0
150.0
19850.0
170.0
21120.0
110.0
5060.0
760.0
16260.0
120.0
5370.0
50.0
11300.0
60.0
2400.0
360.0
50000.0
370.0
50000.0
420.0
50000.0
250.0
4660.0
700.0
100000.0
730.0
25610.0
210.0
38020.0
190.0
310.0
50.0
27760.0
130.0
30580.0
260.0
26080.0
130.0
70.0
10.0
5010.0
40.0
13150.0
110.0
19450.0
100.0
6840.0
1030.0
25240.0
180.0
2850.0
20.0
4820.0
20.0
4480.0
670.0
4380.0
30.0
2640.0
20.0
3110.0
20.0
470.0
70.0
16590.0
120.0
14360.0
120.0
16470.0
80.0
17500.0
4300.0
19900.0
17400.0
30900.0

## Get descriptors/fingerprint data for them