## Import necessary libraries

In [1]:
from chembl_webresource_client.new_client import new_client
import pandas as pd

## Find target proteins

In [2]:
# there are 4 species of malaria that cause issues in humans, so we'll search plasmodium

target_query = new_client.target.search('Plasmodium')
plasmodium_targets = pd.DataFrame.from_dict(target_query)

# find single_protein bc complex/organism is not in the scope
# pd.set_option('display.max_columns', 100)
single_protein_targets = plasmodium_targets[plasmodium_targets['target_type'] == "SINGLE PROTEIN"]
single_protein_targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Duffy antigen/chemokine receptor,15.0,False,CHEMBL2321626,"[{'accession': 'Q16570', 'component_descriptio...",SINGLE PROTEIN,9606
21,"[{'xref_id': 'P13922', 'xref_name': None, 'xre...",Plasmodium falciparum K1,Dihydrofolate reductase,7.0,False,CHEMBL1939,"[{'accession': 'P13922', 'component_descriptio...",SINGLE PROTEIN,5839
22,"[{'xref_id': 'Q02768', 'xref_name': None, 'xre...",Plasmodium falciparum,Cytochrome b,7.0,False,CHEMBL1777,"[{'accession': 'Q02768', 'component_descriptio...",SINGLE PROTEIN,5833
23,"[{'xref_id': 'P05227', 'xref_name': None, 'xre...",Plasmodium falciparum,Histidine-rich protein,7.0,False,CHEMBL1923,"[{'accession': 'P05227', 'component_descriptio...",SINGLE PROTEIN,5833
24,"[{'xref_id': 'Q25704', 'xref_name': None, 'xre...",Plasmodium falciparum,Dihydropteroate synthetase,7.0,False,CHEMBL2013,"[{'accession': 'Q25704', 'component_descriptio...",SINGLE PROTEIN,5833
...,...,...,...,...,...,...,...,...,...
130,[],Plasmodium falciparum (isolate 3D7),Plasmepsin X,7.0,False,CHEMBL4523390,"[{'accession': 'Q8IAS0', 'component_descriptio...",SINGLE PROTEIN,36329
131,[],Plasmodium falciparum (isolate 3D7),Casein kinase I,7.0,False,CHEMBL4523391,"[{'accession': 'Q8IHZ9', 'component_descriptio...",SINGLE PROTEIN,36329
132,[],Plasmodium falciparum,Glutamine amidotransferase,7.0,False,CHEMBL4523484,"[{'accession': 'Q8IJR9', 'component_descriptio...",SINGLE PROTEIN,5833
133,[],Plasmodium falciparum,P-type ATPase,7.0,False,CHEMBL4630875,"[{'accession': 'Q27724', 'component_descriptio...",SINGLE PROTEIN,5833


Now, I looked through found the specific protein that to use for the machine learning. 

I chose Dihydrofolate reductase (DHFR) for a variety of reasons, but the main ones include:
- it has been targeted before in malaria drug treatment
- it is crucial to malaria spreading
- it is in all 4 parasites

In [3]:
target_protein = single_protein_targets.target_chembl_id[21]
target_protein

'CHEMBL1939'

## Get activity data 

What's important to note is the "standard_type". We already filtered out those which are not standard_type "IC50". IC50 is a measure for how much this protein inhibits DHFR's biological process (in this case). There are other types like EC50 & Ki; however, it doesn't make sense to use those for this project.

In [4]:
activity = new_client.activity
activity_data = activity.filter(target_chembl_id=target_protein).filter(standard_type='IC50')
dhfr_data = pd.DataFrame.from_dict(activity_data)

dhfr_data

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,188717,[],CHEMBL769665,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,0.33
1,,,188718,[],CHEMBL769666,In vitro antimalarial activity relative to tri...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,0.05
2,,,188719,[],CHEMBL769492,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,10.16
3,,,188720,[],CHEMBL769664,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,0.07
4,,,188721,[],CHEMBL769484,In vitro antimalarial activity against Plasmod...,F,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,uM,UO_0000065,,3.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,,,18465879,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,67.9
369,,,18465880,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,70.1
370,,,18465881,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,185.4
371,,,18465882,[],CHEMBL4181680,Inhibition of Plasmodium falciparum DHFR using...,B,,,BAO_0000190,...,Plasmodium falciparum K1,Dihydrofolate reductase,5839,,,IC50,nM,UO_0000065,,225.5


See the columns in the dataframe to get a sense for what is measured here. 

In [5]:
print(dhfr_data.columns)

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')


There are values where standard_value is not available. See the output below for proof.

In [6]:
for value in dhfr_data['standard_value']:
    print(value)

330.0
50.0
10160.0
70.0
3400.0
30.0
15910.0
130.0
17480.0
2620.0
3830.0
30.0
5730.0
50.0
50000.0
250.0
9.3
73.0
3120.0
470.0
14580.0
110.0
6750.0
60.0
8130.0
40.0
15150.0
2270.0
17140.0
130.0
15260.0
130.0
21490.0
110.0
22730.0
3410.0
100000.0
730.0
90810.0
760.0
100000.0
500.0
150.0
20.0
4290.0
30.0
7700.0
60.0
9750.0
50.0
15220.0
2280.0
3520.0
30.0
3330.0
30.0
12750.0
60.0
19080.0
2860.0
3690.0
30.0
3517.0
30.0
29610.0
150.0
3570.0
540.0
4180.0
30.0
3460.0
30.0
3020.0
20.0
2.7
12.7
680.0
100.0
19880.0
150.0
19850.0
170.0
21120.0
110.0
5060.0
760.0
16260.0
120.0
5370.0
50.0
11300.0
60.0
2400.0
360.0
50000.0
370.0
50000.0
420.0
50000.0
250.0
4660.0
700.0
100000.0
730.0
25610.0
210.0
38020.0
190.0
310.0
50.0
27760.0
130.0
30580.0
260.0
26080.0
130.0
70.0
10.0
5010.0
40.0
13150.0
110.0
19450.0
100.0
6840.0
1030.0
25240.0
180.0
2850.0
20.0
4820.0
20.0
4480.0
670.0
4380.0
30.0
2640.0
20.0
3110.0
20.0
470.0
70.0
16590.0
120.0
14360.0
120.0
16470.0
80.0
17500.0
4300.0
19900.0
17400.0
30900.0

I can just filter them out here, so there's no issues with the machine learning.

In [7]:
dhfr_data = dhfr_data[dhfr_data.standard_value.notna()]

for value in dhfr_data['standard_value']:
    print(value)

330.0
50.0
10160.0
70.0
3400.0
30.0
15910.0
130.0
17480.0
2620.0
3830.0
30.0
5730.0
50.0
50000.0
250.0
9.3
73.0
3120.0
470.0
14580.0
110.0
6750.0
60.0
8130.0
40.0
15150.0
2270.0
17140.0
130.0
15260.0
130.0
21490.0
110.0
22730.0
3410.0
100000.0
730.0
90810.0
760.0
100000.0
500.0
150.0
20.0
4290.0
30.0
7700.0
60.0
9750.0
50.0
15220.0
2280.0
3520.0
30.0
3330.0
30.0
12750.0
60.0
19080.0
2860.0
3690.0
30.0
3517.0
30.0
29610.0
150.0
3570.0
540.0
4180.0
30.0
3460.0
30.0
3020.0
20.0
2.7
12.7
680.0
100.0
19880.0
150.0
19850.0
170.0
21120.0
110.0
5060.0
760.0
16260.0
120.0
5370.0
50.0
11300.0
60.0
2400.0
360.0
50000.0
370.0
50000.0
420.0
50000.0
250.0
4660.0
700.0
100000.0
730.0
25610.0
210.0
38020.0
190.0
310.0
50.0
27760.0
130.0
30580.0
260.0
26080.0
130.0
70.0
10.0
5010.0
40.0
13150.0
110.0
19450.0
100.0
6840.0
1030.0
25240.0
180.0
2850.0
20.0
4820.0
20.0
4480.0
670.0
4380.0
30.0
2640.0
20.0
3110.0
20.0
470.0
70.0
16590.0
120.0
14360.0
120.0
16470.0
80.0
17500.0
4300.0
19900.0
17400.0
30900.0

Save it to a csv file so this data is here permanently.

In [8]:
dhfr_data.to_csv('dhfr.csv', index=False)

## Get the PubChem Fingerprints

The PubChem fingerprints are a closer view/closer description of the molecule. This is what we use for the model to learn off of.

I used/rewrote some of this code from one of the links in my project notebook. I could likely use PubChem's online tool for fetching data (https://pubchem.ncbi.nlm.nih.gov/docs/programmatic-access) ; however, this seems to generate it way quicker.

Below are the commands I ran in the terminal to get the PaDEL tools for this. They are on the .gitignore in my repository.
!wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
!wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh
!unzip padel.zip

All we need for this script to work is just the canocial_smiles (aka the chemical structure of it) and the molecule's ChEMBL id. 

In [16]:
selection = ['canonical_smiles','molecule_chembl_id']
dhfr_selection = dhfr_data[selection]
dhfr_selection.to_csv('dhfr.smi', sep='\t', index=False, header=False)

Make sure the format is correct. Show the first 5 lines of this .smi file.

In [15]:
!cat dhfr.smi | head -5

CCCCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1cc(OC)c(OC)c(OC)c1	CHEMBL416373
CCCCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1cc(OC)c(OC)c(OC)c1	CHEMBL416373
CCCCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1cc(OC)c(OC)c(OC)c1	CHEMBL416373
CCCCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1cc(OC)c(OC)c(OC)c1	CHEMBL416373
CCCCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1cc(OC)c(OC)c(OC)c1	CHEMBL416373


In [17]:
!bash padel.sh

Processing CHEMBL416373 in dhfr.smi (1/366). 
Processing CHEMBL416373 in dhfr.smi (2/366). 
Processing CHEMBL416373 in dhfr.smi (3/366). 
Processing CHEMBL416373 in dhfr.smi (4/366). 
Processing CHEMBL416373 in dhfr.smi (5/366). 
Processing CHEMBL416373 in dhfr.smi (6/366). 
Processing CHEMBL416373 in dhfr.smi (7/366). 
Processing CHEMBL416373 in dhfr.smi (8/366). 
Processing CHEMBL291931 in dhfr.smi (13/366). Average speed: 1.08 s/mol.
Processing CHEMBL291931 in dhfr.smi (12/366). Average speed: 1.08 s/mol.
Processing CHEMBL291931 in dhfr.smi (9/366). Average speed: 2.11 s/mol.
Processing CHEMBL291931 in dhfr.smi (11/366). Average speed: 1.08 s/mol.
Processing CHEMBL291931 in dhfr.smi (10/366). Average speed: 1.07 s/mol.
Processing CHEMBL291931 in dhfr.smi (16/366). Average speed: 0.31 s/mol.
Processing CHEMBL29773 in dhfr.smi (17/366). Average speed: 0.31 s/mol.
Processing CHEMBL291931 in dhfr.smi (14/366). Average speed: 0.44 s/mol.
Processing CHEMBL291931 in dhfr.smi (15/366). Aver

## Machine Learning

All imports for machine learning. I am just going to use a lazypredict to test every model, then select one and try to tune the hyperparameters for it.

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
import lazypredict
from lazypredict.Supervised import LazyRegressor

# other random imports supposedly

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV

Get the inputs (the pubchem fingerprints) and get the outputs, the IC50 values.

In [86]:
inputs = pd.read_csv("descriptors_output.csv")
inputs = inputs.drop('Name', axis=1)
# inputs = inputs.drop_duplicates(subset='Name')

inputs

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
362,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
363,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
364,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [87]:
outputs = dhfr_data['standard_value']
# outputs = dhfr_data[['molecule_chembl_id', 'standard_value']]
# outputs = outputs.rename(columns={'molecule_chembl_id': 'Name'})
# outputs = outputs.drop_duplicates(subset='Name')

outputs

0        330.0
1         50.0
2      10160.0
3         70.0
4       3400.0
        ...   
368       67.9
369       70.1
370      185.4
371      225.5
372      289.9
Name: standard_value, Length: 366, dtype: object

If you can't tell the numbers for the ChEMBL_ID's don't add up, so we have to merge the dataframes.

In [51]:
# combined = pd.merge(inputs, outputs, on='Name')

# combined

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,standard_value
0,CHEMBL416373,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,330.0
1,CHEMBL291931,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17480.0
2,CHEMBL29773,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.3
3,CHEMBL22138,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3120.0
4,CHEMBL56146,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,CHEMBL4203015,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,70.1
95,CHEMBL4211317,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,185.4
96,CHEMBL4213293,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,67.9
97,CHEMBL4212687,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,225.5


In [88]:
input_train, input_test, output_train, output_test = train_test_split(inputs, outputs, test_size=0.2)

input_train
# input_train.shape, output_train.shape, input_test.shape, output_test.shape

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
167,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
267,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
139,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
365,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
107,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
157,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
211,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [89]:
removed_regressors = [
    "TheilSenRegressor",
    "ARDRegression", 
    "CCA", 
    "IsotonicRegression", 
    "StackingRegressor",
    "MultiOutputRegressor", 
    "MultiTaskElasticNet", 
    "MultiTaskElasticNetCV", 
    "MultiTaskLasso", 
    "MultiTaskLassoCV", 
    "PLSCanonical", 
    "PLSRegression", 
    "QuantileRegressor", 
    "RadiusNeighborsRegressor", 
    "RegressorChain", 
    "VotingRegressor", 
]

REGRESSORS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and (est[0] not in removed_regressors))
]

clf = LazyRegressor(custom_metric=None, verbose=1, ignore_warnings=False, regressors = REGRESSORS)
models, predictions = clf.fit(input_train, input_test, output_train, output_test)

'tuple' object has no attribute '__name__'
Invalid Regressor(s)


  0%|                                                                                                                                                                                                  | 0/39 [00:00<?, ?it/s]

{'Model': 'AdaBoostRegressor', 'R-Squared': -0.30430671759110073, 'Adjusted R-Squared': 1.117839592059592, 'RMSE': 17208.217579094104, 'Time taken': 0.03360486030578613}
{'Model': 'BaggingRegressor', 'R-Squared': 0.00643869487574833, 'Adjusted R-Squared': 1.0897648208837505, 'RMSE': 15019.080938019191, 'Time taken': 0.05010390281677246}


  8%|██████████████▎                                                                                                                                                                           | 3/39 [00:00<00:09,  3.86it/s]

{'Model': 'BayesianRidge', 'R-Squared': 0.04573159198524723, 'Adjusted R-Squared': 1.086214843793412, 'RMSE': 14719.10136844529, 'Time taken': 0.6926538944244385}
{'Model': 'DecisionTreeRegressor', 'R-Squared': -0.033534707211789305, 'Adjusted R-Squared': 1.0933762792406692, 'RMSE': 15318.228929141835, 'Time taken': 0.023640155792236328}
{'Model': 'DummyRegressor', 'R-Squared': -0.006101866916066623, 'Adjusted R-Squared': 1.0908978171842485, 'RMSE': 15113.567865895458, 'Time taken': 0.01454305648803711}


 15%|████████████████████████████▌                                                                                                                                                             | 6/39 [00:01<00:06,  5.37it/s]

{'Model': 'ElasticNet', 'R-Squared': 0.08208069640901638, 'Adjusted R-Squared': 1.0829308281709675, 'RMSE': 14436.046576452785, 'Time taken': 0.35772109031677246}


 21%|██████████████████████████████████████▏                                                                                                                                                   | 8/39 [00:02<00:10,  2.91it/s]

{'Model': 'ElasticNetCV', 'R-Squared': 0.0483039151729413, 'Adjusted R-Squared': 1.0859824433073952, 'RMSE': 14699.249597576245, 'Time taken': 1.11411714553833}
{'Model': 'ExtraTreeRegressor', 'R-Squared': -0.03353452615889907, 'Adjusted R-Squared': 1.0933762628831678, 'RMSE': 15318.227587430849, 'Time taken': 0.07844901084899902}


 23%|██████████████████████████████████████████▉                                                                                                                                               | 9/39 [00:02<00:10,  2.84it/s]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.041969206571062845, 'Adjusted R-Squared': 1.0865547622776144, 'RMSE': 14748.089257844415, 'Time taken': 0.3736257553100586}
{'Model': 'GammaRegressor', 'R-Squared': 0.03838914651826464, 'Adjusted R-Squared': 1.086878208297236, 'RMSE': 14775.619589542193, 'Time taken': 0.06814384460449219}


 28%|████████████████████████████████████████████████████▏                                                                                                                                    | 11/39 [00:03<00:08,  3.43it/s]

{'Model': 'GaussianProcessRegressor', 'R-Squared': -0.01885392862052182, 'Adjusted R-Squared': 1.0920499217689332, 'RMSE': 15209.046414850669, 'Time taken': 0.32767200469970703}


 31%|████████████████████████████████████████████████████████▉                                                                                                                                | 12/39 [00:03<00:08,  3.32it/s]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.04121617346851192, 'Adjusted R-Squared': 1.0866227962089092, 'RMSE': 14753.884279253856, 'Time taken': 0.31364989280700684}


 36%|██████████████████████████████████████████████████████████████████▍                                                                                                                      | 14/39 [00:04<00:08,  2.98it/s]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.0848082848612528, 'Adjusted R-Squared': 1.0826844000063471, 'RMSE': 14414.582332060805, 'Time taken': 0.700263261795044}
{'Model': 'HuberRegressor', 'R-Squared': 0.013605721392578762, 'Adjusted R-Squared': 1.0891173048741853, 'RMSE': 14964.813037035707, 'Time taken': 0.15190792083740234}
KNeighborsRegressor model failed to execute
unsupported operand type(s) for /: 'str' and 'int'


 44%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                        | 17/39 [00:05<00:06,  3.61it/s]

{'Model': 'KernelRidge', 'R-Squared': -0.2947897353945341, 'Adjusted R-Squared': 1.1169797656977736, 'RMSE': 17145.322044666384, 'Time taken': 0.4328489303588867}
Lars model failed to execute
Input contains NaN.


 46%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 18/39 [00:06<00:11,  1.90it/s]

{'Model': 'LarsCV', 'R-Squared': -0.0049732086643914375, 'Adjusted R-Squared': 1.0907958468224017, 'RMSE': 15105.088187884081, 'Time taken': 1.2698261737823486}


 49%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 19/39 [00:06<00:09,  2.15it/s]

{'Model': 'Lasso', 'R-Squared': 0.038770775346782194, 'Adjusted R-Squared': 1.0868437294550555, 'RMSE': 14772.687342082232, 'Time taken': 0.29418301582336426}


 51%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                          | 20/39 [00:21<01:20,  4.25s/it]

{'Model': 'LassoCV', 'R-Squared': 0.022256715317247378, 'Adjusted R-Squared': 1.0883357175517834, 'RMSE': 14899.04541570127, 'Time taken': 14.2811861038208}


 59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 23/39 [00:21<00:28,  1.76s/it]

{'Model': 'LassoLars', 'R-Squared': 0.05983587870308826, 'Adjusted R-Squared': 1.0849405703646962, 'RMSE': 14609.920739662033, 'Time taken': 0.1799299716949463}
{'Model': 'LassoLarsCV', 'R-Squared': 0.021124496021590056, 'Adjusted R-Squared': 1.0884380096416137, 'RMSE': 14907.669410494495, 'Time taken': 0.08769893646240234}
LassoLarsIC model failed to execute
You are using LassoLarsIC in the case where the number of samples is smaller than the number of features. In this setting, getting a good estimate for the variance of the noise is not possible. Provide an estimate of the noise variance in the constructor.


 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 25/39 [00:21<00:15,  1.08s/it]

{'Model': 'LinearRegression', 'R-Squared': -1.2496150484740302e+24, 'Adjusted R-Squared': 1.1289838928045074e+23, 'RMSE': 1.6843570022342726e+16, 'Time taken': 0.32819485664367676}
{'Model': 'LinearSVR', 'R-Squared': -0.21609676625759544, 'Adjusted R-Squared': 1.109870128634659, 'RMSE': 16616.137979218503, 'Time taken': 0.0732119083404541}


 74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 29/39 [00:23<00:05,  1.75it/s]

{'Model': 'MLPRegressor', 'R-Squared': -0.16941991701248482, 'Adjusted R-Squared': 1.105653037056821, 'RMSE': 16294.133349339949, 'Time taken': 1.433480978012085}
{'Model': 'NuSVR', 'R-Squared': -0.08526387418882608, 'Adjusted R-Squared': 1.0980498302175548, 'RMSE': 15696.892947105123, 'Time taken': 0.0700528621673584}
{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': -0.20868979525322628, 'Adjusted R-Squared': 1.1092009344721356, 'RMSE': 16565.458117862123, 'Time taken': 0.018214941024780273}
{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.039330202264717395, 'Adjusted R-Squared': 1.0867931871716283, 'RMSE': 14768.38792966532, 'Time taken': 0.04466891288757324}


 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 31/39 [00:23<00:03,  2.34it/s]

{'Model': 'PassiveAggressiveRegressor', 'R-Squared': -0.09648890711677716, 'Adjusted R-Squared': 1.0990639730439662, 'RMSE': 15777.861675125992, 'Time taken': 0.11257410049438477}
{'Model': 'PoissonRegressor', 'R-Squared': 0.01104846560309769, 'Adjusted R-Squared': 1.0893483440729874, 'RMSE': 14984.19883662284, 'Time taken': 0.17439532279968262}
RANSACRegressor model failed to execute
`min_samples` may not be larger than number of samples: n_samples = 292.


 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 33/39 [00:24<00:02,  2.80it/s]

{'Model': 'RandomForestRegressor', 'R-Squared': 0.0333307052796189, 'Adjusted R-Squared': 1.0873352209338958, 'RMSE': 14814.431323263118, 'Time taken': 0.39101696014404297}
{'Model': 'Ridge', 'R-Squared': 0.014162110755612911, 'Adjusted R-Squared': 1.089067037023317, 'RMSE': 14960.59188660862, 'Time taken': 0.08199310302734375}


 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 37/39 [00:25<00:00,  3.72it/s]

{'Model': 'RidgeCV', 'R-Squared': 0.08807395479431945, 'Adjusted R-Squared': 1.0823893580445727, 'RMSE': 14388.841640136301, 'Time taken': 0.5742430686950684}
{'Model': 'SGDRegressor', 'R-Squared': -41177741982667.03, 'Adjusted R-Squared': 3720266292989.572, 'RMSE': 96689048643.84653, 'Time taken': 0.04585599899291992}
{'Model': 'SVR', 'R-Squared': -0.08903969277162105, 'Adjusted R-Squared': 1.0983909623419905, 'RMSE': 15724.175324947939, 'Time taken': 0.12430572509765625}


 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 38/39 [00:25<00:00,  3.81it/s]

{'Model': 'TransformedTargetRegressor', 'R-Squared': -1.2496150484740302e+24, 'Adjusted R-Squared': 1.1289838928045074e+23, 'RMSE': 1.6843570022342726e+16, 'Time taken': 0.2237539291381836}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:25<00:00,  1.52it/s]

{'Model': 'TweedieRegressor', 'R-Squared': 0.0786989235090344, 'Adjusted R-Squared': 1.083236359633466, 'RMSE': 14462.614570362386, 'Time taken': 0.35759711265563965}





In [90]:
predictions = predictions[['R-Squared', 'RMSE']]
predictions

Unnamed: 0_level_0,R-Squared,RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearRegression,-1.24961504847403e+24,1.6843570022342726e+16
TransformedTargetRegressor,-1.24961504847403e+24,1.6843570022342726e+16
SGDRegressor,-41177741982667.03,96689048643.85
AdaBoostRegressor,-0.3,17208.22
KernelRidge,-0.29,17145.32
LinearSVR,-0.22,16616.14
OrthogonalMatchingPursuit,-0.21,16565.46
MLPRegressor,-0.17,16294.13
PassiveAggressiveRegressor,-0.1,15777.86
SVR,-0.09,15724.18
