# Python Notebook 2: Fitting of Regression Algorithms with Target and Feature Selection 

In [1]:
# Import Pandas and Numpy
import pandas as pd
import numpy as np

# Import fingerprint generator 
from padelpy import padeldescriptor

# Import feature reducing modules
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

# Import Train-test split
from sklearn.model_selection import train_test_split

# Import Standard Scaler for target scaling
from sklearn.preprocessing import StandardScaler 

# Import metrics for model evaluation
from sklearn.metrics import mean_squared_error, r2_score

# Import lazy predict regressor
from lazypredict.Supervised import LazyRegressor

# Set full view of columns
pd.set_option('display.max_columns', None)

# Choose directory for importing/exporting CSV.
# The original analysis folder stays the default, so existing runs behave exactly as before;
# set the CAPSTONE_ANALYSIS_DIR environment variable to run the notebook on another machine
# without editing this cell.
import os
ANALYSIS_DIR = os.environ.get("CAPSTONE_ANALYSIS_DIR", "/Users/danigeiger/Desktop/Capstone/Analysis")
os.chdir(ANALYSIS_DIR)  # every relative CSV/.smi path in later cells resolves against this folder
os.getcwd()



'/Users/danigeiger/Desktop/Capstone/Analysis'

## Part 1: Import Cleaned Data of Full (538 Instances) and Small Molecules (525 Instances)

In [2]:
# Import full set 538 rows (file is read relative to the working directory chosen in the first cell)
full_set = pd.read_csv("Part4_IC50_cleansed.csv")

# Import set without large peptides
small_molecule = pd.read_csv("Part10_low_mw.csv")

# Report the row count of each set as a quick sanity check on what was loaded
print(f'The full_set includes {full_set.shape[0]} molecules.')
print(f'The small_molecule set includes {small_molecule.shape[0]} molecules. The reason for this is a potential bias for parts of the larger molecules might not actually come in contact with the CGRP receptor protein.')



The full_set includes 538 molecules.
The small_molecule set includes 525 molecules. The reason for this is a potential bias for parts of the larger molecules might not actually come in contact with the CGRP receptor protein.


In [3]:
# View head of full set: first five rows, to confirm the identifier, SMILES and standard_value columns loaded
full_set.head(5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL329678,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2cccc([N+](=O)[O-...,41000.0
1,CHEMBL89589,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2cccc(N)c2)nc2ccc...,100000.0
2,CHEMBL316211,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2ccccc2)nc2ccc(OC...,96000.0
3,CHEMBL88111,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2ccc3c(c2)OCO3)nc...,74000.0
4,CHEMBL88196,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2ccsc2)nc2ccc(OC)...,100000.0


In [4]:
# View head of small molecule set: first five rows, to confirm it has the same columns as the full set
small_molecule.head(5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL329678,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2cccc([N+](=O)[O-...,41000.0
1,CHEMBL89589,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2cccc(N)c2)nc2ccc...,100000.0
2,CHEMBL316211,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2ccccc2)nc2ccc(OC...,96000.0
3,CHEMBL88111,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2ccc3c(c2)OCO3)nc...,74000.0
4,CHEMBL88196,CCC1CN2CCC1CC2[C@H](O)c1cc(-c2ccsc2)nc2ccc(OC)...,100000.0


## Part 2: Convert Standard Values into Logarithmic Scale

### View Distribution Before Transformation

In [5]:
# Force standard_value to a numeric dtype; entries that cannot be parsed become NaN (errors='coerce')
numeric_standard_value = pd.to_numeric(full_set['standard_value'], errors='coerce')
full_set['standard_value'] = numeric_standard_value

# Summary statistics before any transformation (compare mean against the 50% row, i.e. the median)
full_set['standard_value'].describe()

count      538.00
mean      7127.79
std      33307.99
min          0.01
25%          2.75
50%         60.00
75%        645.00
max     300000.00
Name: standard_value, dtype: float64

### View Distribution After Transformation

In [6]:
# Keep an untouched copy of the raw nanomolar values the first time this cell runs.
# The transform below always starts from this copy, so re-running the cell cannot apply
# -log10 to already-transformed values.
if 'standard_value_nM' not in full_set.columns:
    full_set['standard_value_nM'] = full_set['standard_value']

# Convert values from nanomolar to molar
standard_value_molar = full_set['standard_value_nM']*10**-9

# Take the negative log of the molar amount
full_set['standard_value'] = -np.log10(standard_value_molar)

# Re-evaluate distribution
full_set['standard_value'].describe()


count   538.00
mean      7.30
std       1.63
min       3.52
25%       6.19
50%       7.22
75%       8.56
max      11.00
Name: standard_value, dtype: float64

### Repeat Transformation on Small Molecule Set

In [7]:
# Keep a numeric copy of the raw nanomolar values the first time this cell runs
# (unparseable entries become NaN). The transform below always starts from this copy,
# so re-running the cell cannot apply -log10 to already-transformed values.
if 'standard_value_nM' not in small_molecule.columns:
    small_molecule['standard_value_nM'] = pd.to_numeric(small_molecule['standard_value'], errors='coerce')

# convert values from nanomolar to molar
standard_value_molar_small = small_molecule['standard_value_nM']*10**-9

# take the negative log of the molar amount
small_molecule['standard_value'] = -np.log10(standard_value_molar_small)

# view distribution
small_molecule['standard_value'].describe()

count   525.00
mean      7.27
std       1.64
min       3.52
25%       6.16
50%       7.19
75%       8.52
max      11.00
Name: standard_value, dtype: float64

## Part 3: Converting Canonical SMILES to PubChem Fingerprints via PaDEL (Feature Extraction)

In [8]:
# The .smi input for PaDEL has two columns: the SMILES string first, then the molecule identifier
smi_columns = ['canonical_smiles', 'molecule_chembl_id']
df_smiles_full_set = full_set[smi_columns]
df_smiles_small_molecule = small_molecule[smi_columns]

# Write both sets as tab-separated files with no header row and no index column
smi_format = {'sep': '\t', 'index': False, 'header': False}
df_smiles_full_set.to_csv('molecules_full.smi', **smi_format)
df_smiles_small_molecule.to_csv('molecules_small.smi', **smi_format)

### Calculate Fingerprints Using PaDel-Descriptor Function 

In [9]:
# Compute fingerprints for the full set first, then the small-molecule set, with identical settings.
# note: this module only accepts two columns: the first column has to be a canonical smile
# and the (optional) second column a molecular identifier
for smi_path, fingerprint_csv in (
    ('molecules_full.smi', 'fingerprints_full_output.csv'),
    ('molecules_small.smi', 'fingerprints_small_output.csv'),
):
    padeldescriptor(
        mol_dir=smi_path,
        d_file=fingerprint_csv,
        fingerprints=True,   # binary data: 1 represents the presence of some characteristic, 0 means lacking it
        retainorder=True,    # keeps input row order so labels/targets can be tied back to the correct row
    )

## Part 4: Merge Fingerprints with Targets

In [10]:
# Load fingerprint data (engineered features) as DataFrames
fingerprints_full = pd.read_csv('fingerprints_full_output.csv')
fingerprints_small = pd.read_csv('fingerprints_small_output.csv')

# pd.concat(axis=1) aligns on the index and pads any mismatch with NaN instead of failing,
# so confirm every molecule produced exactly one fingerprint row before attaching targets.
assert len(fingerprints_full) == len(full_set), (
    f'fingerprints_full has {len(fingerprints_full)} rows but full_set has {len(full_set)}'
)
assert len(fingerprints_small) == len(small_molecule), (
    f'fingerprints_small has {len(fingerprints_small)} rows but small_molecule has {len(small_molecule)}'
)

# Combine fingerprints with target values (rows are matched by position/index)
fingerprints_with_targets_full_df = pd.concat([fingerprints_full, full_set['standard_value']], axis=1)
fingerprints_with_targets_small_df = pd.concat([fingerprints_small, small_molecule['standard_value']], axis=1)

# Export processed data for sharing and transparency
fingerprints_with_targets_full_df.to_csv("fingerprints_with_targets_full.csv")
fingerprints_with_targets_small_df.to_csv("fingerprints_with_targets_small.csv")

Refer to the official PubChem specifications for Pubchem Fingerprinting: https://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.pdf

## Part 5: Split Data into Feature and Target Sets

In [11]:
# Columns that are not fingerprint features: the molecule name and the regression target
non_feature_columns = ['Name', 'standard_value']

# Full molecule set: fingerprint matrix (X) and target vector (y)
X_full_set = fingerprints_with_targets_full_df.drop(columns=non_feature_columns)
y_full_set = fingerprints_with_targets_full_df['standard_value']

# Small molecule set: fingerprint matrix (X) and target vector (y)
X_small_set = fingerprints_with_targets_small_df.drop(columns=non_feature_columns)
y_small_set = fingerprints_with_targets_small_df['standard_value']

In [12]:
# Confirm the full-set target vector has one value per molecule
y_full_set.shape

(538,)

In [13]:
# Confirm the small-molecule target vector has one value per molecule
y_small_set.shape

(525,)

## Part 6: Perform VarianceThreshold Reduction Using Bernoulli Variance Formula

In [14]:
# Bernoulli variance p * (1 - p) evaluated at p = 0.8 (about 0.16) is used as the cut-off
bernoulli_p = 0.8
variance_cutoff = bernoulli_p * (1 - bernoulli_p)
full_set_selection = VarianceThreshold(threshold=variance_cutoff)

# Learn per-feature variances on the full set, then drop the low-variance fingerprint columns
X_reduced_full_set = full_set_selection.fit(X_full_set).transform(X_full_set)

# Display the shape of the reduced feature set
X_reduced_full_set.shape


(538, 133)

In [15]:
# Bernoulli variance p * (1 - p) evaluated at p = 0.8 (about 0.16) is used as the cut-off
bernoulli_p = 0.8
variance_cutoff = bernoulli_p * (1 - bernoulli_p)
small_set_selection = VarianceThreshold(threshold=variance_cutoff)

# Learn per-feature variances on the small-molecule set, then drop the low-variance fingerprint columns
X_reduced_small_set = small_set_selection.fit(X_small_set).transform(X_small_set)

# Display the shape of the reduced feature set
X_reduced_small_set.shape

(525, 132)

## Part 7: Use Principal Component Analysis (PCA) to Optimize Feature Reduction for Models Affected by Feature-to-row Ratio

### Calculate PCA Components for a 10:1 Row-to-feature Ratio

In [16]:
# Compute the number of principal components so that components = 10% of the training rows
# (a 10:1 row-to-feature ratio); 80% of the data is used for training
TRAIN_FRACTION = 0.8
FEATURE_TO_ROW_RATIO = 0.1

# Row counts are read from the data rather than typed in, so they cannot drift from the
# sets actually loaded (538 rows in the full set, 525 in the small-molecule set).
n_rows_full = X_reduced_full_set.shape[0]
n_rows_small = X_reduced_small_set.shape[0]

# Full set: rows * 80% training split * 10% feature-to-row ratio
pca_full_set = round(n_rows_full * TRAIN_FRACTION * FEATURE_TO_ROW_RATIO)

# Small set: rows * 80% training split * 10% feature-to-row ratio
pca_small_set = round(n_rows_small * TRAIN_FRACTION * FEATURE_TO_ROW_RATIO)

# Print the calculated number of principal components
print(pca_full_set)
print(pca_small_set)


43
42


### Apply PCA to Reduce the Feature Dimensions for Both Datasets

In [17]:
# Seed PCA like every other stochastic step in this notebook (random_state=42): with the default
# svd_solver='auto', scikit-learn can choose the randomized SVD solver for inputs of this size,
# which would make the components vary from run to run.
pca_538 = PCA(n_components=pca_full_set, random_state=42)
pca_525 = PCA(n_components=pca_small_set, random_state=42)

# NOTE(review): PCA is fitted on all rows before the train/test split, so the test rows influence
# the components. Consider fitting on the training split only and transforming the test split.
X_pca_full_set = pca_538.fit_transform(X_reduced_full_set)
X_pca_small_set = pca_525.fit_transform(X_reduced_small_set)

## Part 8: Screen Regression Models with Lazy Predict

In the following sections, we will perform Lazy Predict calculations to evaluate various machine learning models on two datasets with differing feature representations. Specifically, we will:

1. Utilize the full set of 880 PaDEL fingerprints for feature representation. Tree based models tend to be resistant to noisy data so using the full set of features is okay.
2. Employ 133/132 features (full set / small molecule set) selected using a variance threshold to retain only those features which meet the binomial variance heuristic, i.e. whose variance exceeds p(1 − p) = 0.16 for p = 0.8.
3. Use 43/42 features (full set / small molecule set) obtained by applying PCA on the variance-threshold-reduced sets, which allows us to get a 10:1 instance:feature ratio, thus reducing the curse of dimensionality for models which are sensitive to it.

For each of these three feature variations, we will analyze two datasets: one containing 538 data points and the other containing 525 data points. Each combination is evaluated twice, once with unstandardized and once with standardized target values. This results in a total of twelve Lazy Predict runs (3 feature representations × 2 datasets × 2 target scalings). The outputs will allow us to compare the performance of different models across datasets and feature representations, guiding our selection of the top four models we will use in the next notebook.



## Part 8A: Evaluate Full Set with Unstandardized Target Values

### 1. Full Data Set: No Dimensionality Reduction, Unstandardized Targets

In [18]:
# Run 1: full set, all fingerprint columns, targets on their unscaled (-log10) scale.
# 80/20 hold-out split; the fixed seed keeps the partition repeatable.
X_full_train, X_full_test, y_train, y_test = train_test_split(
    X_full_set,
    y_full_set,
    test_size=0.2,
    random_state=42,
)

# Fit and score every LazyPredict regressor on this split (verbose=3 prints each model's metrics)
lazy = LazyRegressor(verbose=3, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_full_train, X_full_test, y_train, y_test)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Full Set'),
    ('Feature Reduction', 'None'),
    ('Target Variable', 'Unstandardized'),
):
    models[column] = label
models1 = models


  2%|▏         | 1/42 [00:00<00:08,  4.91it/s]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.7028409862587189, 'Adjusted R-Squared': 1.0410801220546733, 'RMSE': 0.9077564572735639, 'Time taken': 0.2035069465637207}
{'Model': 'BaggingRegressor', 'R-Squared': 0.8008272973148904, 'Adjusted R-Squared': 1.0275342108363137, 'RMSE': 0.7431727912932062, 'Time taken': 0.08947014808654785}


  7%|▋         | 3/42 [00:00<00:06,  6.49it/s]

{'Model': 'BayesianRidge', 'R-Squared': 0.7900295355030016, 'Adjusted R-Squared': 1.0290269246785255, 'RMSE': 0.7630517580292112, 'Time taken': 0.18335199356079102}
{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.6943660987422389, 'Adjusted R-Squared': 1.0422517150317576, 'RMSE': 0.9206099305675031, 'Time taken': 0.027623891830444336}
{'Model': 'DummyRegressor', 'R-Squared': -0.0725985595666443, 'Adjusted R-Squared': 1.1482791290356988, 'RMSE': 1.7246203148475063, 'Time taken': 0.015770912170410156}
{'Model': 'ElasticNet', 'R-Squared': 0.2662796451306825, 'Adjusted R-Squared': 1.1014316252855516, 'RMSE': 1.4263962109168835, 'Time taken': 0.01955699920654297}


 17%|█▋        | 7/42 [00:13<01:20,  2.29s/it]

{'Model': 'ElasticNetCV', 'R-Squared': 0.7668002199556458, 'Adjusted R-Squared': 1.0322382124867517, 'RMSE': 0.8041535083459672, 'Time taken': 13.155580997467041}
{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.7728168071750472, 'Adjusted R-Squared': 1.0314064620571963, 'RMSE': 0.7937120837260538, 'Time taken': 0.026468276977539062}


 21%|██▏       | 9/42 [00:14<00:55,  1.70s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.7368897998858286, 'Adjusted R-Squared': 1.0363731155196594, 'RMSE': 0.8541688614062958, 'Time taken': 0.8976089954376221}
{'Model': 'GammaRegressor', 'R-Squared': 0.7701860202818736, 'Adjusted R-Squared': 1.0317701496509555, 'RMSE': 0.7982944585154587, 'Time taken': 0.06976199150085449}


 26%|██▌       | 11/42 [00:14<00:36,  1.19s/it]

{'Model': 'GaussianProcessRegressor', 'R-Squared': -12.248623774300668, 'Adjusted R-Squared': 2.8315280928296787, 'RMSE': 6.061223195046545, 'Time taken': 0.13689494132995605}


 29%|██▊       | 12/42 [00:15<00:30,  1.02s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.8111105045485938, 'Adjusted R-Squared': 1.02611263050814, 'RMSE': 0.7237337007317854, 'Time taken': 0.2631371021270752}


 33%|███▎      | 14/42 [00:15<00:21,  1.32it/s]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.8103218422916406, 'Adjusted R-Squared': 1.0262216574609748, 'RMSE': 0.725243014161773, 'Time taken': 0.7041690349578857}
{'Model': 'HuberRegressor', 'R-Squared': 0.7108923969537289, 'Adjusted R-Squared': 1.0399670717389549, 'RMSE': 0.895374350862813, 'Time taken': 0.1132662296295166}
{'Model': 'KNeighborsRegressor', 'R-Squared': 0.7891790812391217, 'Adjusted R-Squared': 1.0291444939372274, 'RMSE': 0.7645955104818908, 'Time taken': 0.04518699645996094}
{'Model': 'KernelRidge', 'R-Squared': -19.228803288071777, 'Adjusted R-Squared': 3.7964883098497157, 'RMSE': 7.489623533129245, 'Time taken': 0.027331829071044922}


 40%|████      | 17/42 [00:16<00:10,  2.33it/s]

{'Model': 'Lars', 'R-Squared': -1.4274730920963714e+37, 'Adjusted R-Squared': 1.9733801143967925e+36, 'RMSE': 6.291567466657986e+18, 'Time taken': 0.20903611183166504}


 43%|████▎     | 18/42 [00:17<00:13,  1.80it/s]

{'Model': 'LarsCV', 'R-Squared': 0.28023342188380806, 'Adjusted R-Squared': 1.0995026148041764, 'RMSE': 1.4127676153797561, 'Time taken': 1.084650993347168}
{'Model': 'Lasso', 'R-Squared': -0.0725985595666443, 'Adjusted R-Squared': 1.1482791290356988, 'RMSE': 1.7246203148475063, 'Time taken': 0.018681049346923828}


 48%|████▊     | 20/42 [00:29<00:55,  2.51s/it]

{'Model': 'LassoCV', 'R-Squared': 0.763826729372433, 'Adjusted R-Squared': 1.0326492764304258, 'RMSE': 0.8092640809881101, 'Time taken': 12.171034097671509}
{'Model': 'LassoLars', 'R-Squared': -0.0725985595666443, 'Adjusted R-Squared': 1.1482791290356988, 'RMSE': 1.7246203148475063, 'Time taken': 0.01990222930908203}


 52%|█████▏    | 22/42 [00:29<00:34,  1.71s/it]

{'Model': 'LassoLarsCV', 'R-Squared': 0.7329555790878465, 'Adjusted R-Squared': 1.036916993588631, 'RMSE': 0.8605312521685621, 'Time taken': 0.2444751262664795}


 57%|█████▋    | 24/42 [00:29<00:21,  1.19s/it]

{'Model': 'LinearRegression', 'R-Squared': 0.19601160146388075, 'Adjusted R-Squared': 1.1111456830017632, 'RMSE': 1.4931374343701476, 'Time taken': 0.174821138381958}


 60%|█████▉    | 25/42 [00:30<00:17,  1.02s/it]

{'Model': 'LinearSVR', 'R-Squared': 0.6165585477250723, 'Adjusted R-Squared': 1.0530080560638466, 'RMSE': 1.031156118877077, 'Time taken': 0.2711820602416992}


 69%|██████▉   | 29/42 [00:31<00:07,  1.79it/s]

{'Model': 'MLPRegressor', 'R-Squared': 0.6212577400742458, 'Adjusted R-Squared': 1.052358426113767, 'RMSE': 1.024818073432458, 'Time taken': 1.000809907913208}
{'Model': 'NuSVR', 'R-Squared': 0.7735819158082802, 'Adjusted R-Squared': 1.0313006912254703, 'RMSE': 0.7923744228729049, 'Time taken': 0.06066703796386719}
{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': 0.6891081352259532, 'Adjusted R-Squared': 1.0429785911250944, 'RMSE': 0.9284950049785017, 'Time taken': 0.026768207550048828}
{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.707401299401555, 'Adjusted R-Squared': 1.0404496911680021, 'RMSE': 0.9007641421719195, 'Time taken': 0.07471609115600586}


 76%|███████▌  | 32/42 [00:31<00:03,  2.75it/s]

{'Model': 'PassiveAggressiveRegressor', 'R-Squared': 0.7720482226525973, 'Adjusted R-Squared': 1.031512713405907, 'RMSE': 0.7950535556308604, 'Time taken': 0.05189204216003418}
{'Model': 'PoissonRegressor', 'R-Squared': 0.7880821129427968, 'Adjusted R-Squared': 1.0292961420092, 'RMSE': 0.766582146378591, 'Time taken': 0.04485297203063965}
{'Model': 'QuantileRegressor', 'R-Squared': -0.06235138784763272, 'Adjusted R-Squared': 1.1468625303613653, 'RMSE': 1.7163623826872967, 'Time taken': 0.13001394271850586}


 90%|█████████ | 38/42 [00:32<00:00,  4.58it/s]

{'Model': 'RandomForestRegressor', 'R-Squared': 0.8164539784784406, 'Adjusted R-Squared': 1.0253739332077607, 'RMSE': 0.7134234501566789, 'Time taken': 0.690788745880127}
{'Model': 'Ridge', 'R-Squared': 0.6796912132815398, 'Adjusted R-Squared': 1.0442804136677974, 'RMSE': 0.9424521705482316, 'Time taken': 0.02559828758239746}
{'Model': 'RidgeCV', 'R-Squared': 0.768660437380575, 'Adjusted R-Squared': 1.0319810506463547, 'RMSE': 0.8009397499584697, 'Time taken': 0.0507659912109375}
{'Model': 'SGDRegressor', 'R-Squared': -2.9798015055827526e+17, 'Adjusted R-Squared': 4.119363838467113e+16, 'RMSE': 909009789.64748, 'Time taken': 0.020926952362060547}
{'Model': 'SVR', 'R-Squared': 0.7864633947414865, 'Adjusted R-Squared': 1.0295199182980115, 'RMSE': 0.7695043156801874, 'Time taken': 0.062293052673339844}


 95%|█████████▌| 40/42 [00:32<00:00,  5.42it/s]

{'Model': 'TransformedTargetRegressor', 'R-Squared': 0.19601160146388075, 'Adjusted R-Squared': 1.1111456830017632, 'RMSE': 1.4931374343701476, 'Time taken': 0.12764596939086914}
{'Model': 'TweedieRegressor', 'R-Squared': 0.7683403645241329, 'Adjusted R-Squared': 1.032025298444338, 'RMSE': 0.801493633738146, 'Time taken': 0.027670860290527344}


100%|██████████| 42/42 [00:32<00:00,  1.27it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 430, number of used features: 260
[LightGBM] [Info] Start training from score 7.389417
{'Model': 'LGBMRegressor', 'R-Squared': 0.804118054515754, 'Adjusted R-Squared': 1.0270792870372278, 'RMSE': 0.7370078220518114, 'Time taken': 0.07360005378723145}





### 2. Full Data Set: Variance Threshold Dimensionality Reduction, Unstandardized Targets

In [19]:
# Run 2: full set, variance-threshold-reduced features, unscaled targets.
# Same 80/20 split settings (random_state=42) as the other full-set runs.
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_reduced_full_set,
    y_full_set,
    test_size=0.2,
    random_state=42,
)

# Fit and score every LazyPredict regressor on this split
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_reduced, X_test_reduced, y_train, y_test)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Full Set'),
    ('Feature Reduction', 'VarThres'),
    ('Target Variable', 'Unstandardized'),
):
    models[column] = label
models2 = models

100%|██████████| 42/42 [00:05<00:00,  7.53it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 399
[LightGBM] [Info] Number of data points in the train set: 430, number of used features: 133
[LightGBM] [Info] Start training from score 7.389417





### 3. Full Data Set: Variance Threshold and PCA Dimensionality Reduction, Unstandardized Targets

In [20]:
# Run 3: full set, variance threshold followed by PCA, unscaled targets.
# Same 80/20 split settings (random_state=42) as the other full-set runs.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(
    X_pca_full_set,
    y_full_set,
    test_size=0.2,
    random_state=42,
)

# Fit and score every LazyPredict regressor on this split
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_pca, X_test_pca, y_train, y_test)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Full Set'),
    ('Feature Reduction', 'VarThres and PCA'),
    ('Target Variable', 'Unstandardized'),
):
    models[column] = label
models3 = models


100%|██████████| 42/42 [00:02<00:00, 14.35it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5804
[LightGBM] [Info] Number of data points in the train set: 430, number of used features: 43
[LightGBM] [Info] Start training from score 7.389417





## Part 8B: Evaluate Full Set with Standardized Target Values

### Standardize Full Set Target Values

In [21]:
# Standardize the full-set targets: mean and standard deviation are learned from the training
# targets only and then applied unchanged to the test targets.
# y_train / y_test are whatever the most recently executed split produced; every full-set split
# above uses y_full_set with test_size=0.2 and random_state=42, so run the cells in order.
scaler_target = StandardScaler()

# StandardScaler works on 2-D input, so reshape each target vector into a single column
train_target_column = y_train.values.reshape(-1, 1)
test_target_column = y_test.values.reshape(-1, 1)

# Scale, then flatten back to 1-D for the regressors
y_train_scaled = scaler_target.fit_transform(train_target_column).flatten()
y_test_scaled = scaler_target.transform(test_target_column).flatten()

### 4. Full Data Set: No Dimensionality Reduction, Standardized Targets

In [22]:
# Run 4: full set, all fingerprint columns, standardized targets.
# Reuses the X_full_train / X_full_test split created in run 1.
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_full_train, X_full_test, y_train_scaled, y_test_scaled)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Full Set'),
    ('Feature Reduction', 'None'),
    ('Target Variable', 'Standardized'),
):
    models[column] = label
models4 = models

100%|██████████| 42/42 [00:33<00:00,  1.26it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 780
[LightGBM] [Info] Number of data points in the train set: 430, number of used features: 260
[LightGBM] [Info] Start training from score 0.000000





### 5. Full Data Set: Variance Threshold Dimensionality Reduction, Standardized Targets

In [23]:
# Run 5: full set, variance-threshold-reduced features, standardized targets.
# Reuses the X_train_reduced / X_test_reduced split from run 2; those names are reassigned
# later for the small-molecule set, so this cell must run before the small-set cells.
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_reduced, X_test_reduced, y_train_scaled, y_test_scaled)

# Tag the score table. The label is 'Full Set' (capital S) like every other full-set run;
# a differently-cased label would become a separate category in the combined results.
models['Data Set'] = 'Full Set'
models['Feature Reduction'] = 'VarThres'
models['Target Variable'] = 'Standardized'
models5 = models

100%|██████████| 42/42 [00:05<00:00,  7.63it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 399
[LightGBM] [Info] Number of data points in the train set: 430, number of used features: 133
[LightGBM] [Info] Start training from score 0.000000





### 6. Full Data Set: Variance Threshold and PCA Dimensionality Reduction, Standardized Targets

In [24]:
# Run 6: full set, variance threshold followed by PCA, standardized targets.
# Reuses the X_train_pca / X_test_pca split created in run 3.
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_pca, X_test_pca, y_train_scaled, y_test_scaled)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Full Set'),
    ('Feature Reduction', 'VarThres and PCA'),
    ('Target Variable', 'Standardized'),
):
    models[column] = label
models6 = models

100%|██████████| 42/42 [00:02<00:00, 14.44it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5804
[LightGBM] [Info] Number of data points in the train set: 430, number of used features: 43
[LightGBM] [Info] Start training from score 0.000000





## Part 8C: Evaluate the Smaller Molecule Set with Unstandardized Targets to Assess Impact of Removing Larger Molecules

### 7. Small Molecule Data Set: No Dimensionality Reduction, Unstandardized Targets

In [25]:
# Run 7: small-molecule set, all fingerprint columns, unscaled targets.
# 80/20 hold-out split; the fixed seed keeps the partition repeatable.
X_small_train, X_small_test, y_train, y_test = train_test_split(
    X_small_set,
    y_small_set,
    test_size=0.2,
    random_state=42,
)

# Fit and score every LazyPredict regressor on this split
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_small_train, X_small_test, y_train, y_test)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Small Set'),
    ('Feature Reduction', 'None'),
    ('Target Variable', 'Unstandardized'),
):
    models[column] = label
models7 = models

100%|██████████| 42/42 [00:28<00:00,  1.49it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001917 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 768
[LightGBM] [Info] Number of data points in the train set: 420, number of used features: 256
[LightGBM] [Info] Start training from score 7.359582





### 8. Small Molecule Data Set: Variance Threshold Dimensionality Reduction, Unstandardized Targets

In [26]:
# Run 8: small-molecule set, variance-threshold-reduced features, unscaled targets.
# X_train_reduced / X_test_reduced are reassigned here and now hold small-set rows.
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_reduced_small_set,
    y_small_set,
    test_size=0.2,
    random_state=42,
)

# Fit and score every LazyPredict regressor on this split
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_reduced, X_test_reduced, y_train, y_test)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Small Set'),
    ('Feature Reduction', 'VarThres'),
    ('Target Variable', 'Unstandardized'),
):
    models[column] = label
models8 = models


100%|██████████| 42/42 [00:05<00:00,  8.04it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 420, number of used features: 132
[LightGBM] [Info] Start training from score 7.359582





### 9. Small Molecule Data Set: Variance Threshold and PCA Dimensionality Reduction, Unstandardized Targets

In [27]:
# Run 9: small-molecule set, variance threshold followed by PCA, unscaled targets.
# X_train_pca / X_test_pca are reassigned here and now hold small-set rows.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(
    X_pca_small_set,
    y_small_set,
    test_size=0.2,
    random_state=42,
)

# Fit and score every LazyPredict regressor on this split
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_pca, X_test_pca, y_train, y_test)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Small Set'),
    ('Feature Reduction', 'VarThres and PCA'),
    ('Target Variable', 'Unstandardized'),
):
    models[column] = label
models9 = models


100%|██████████| 42/42 [00:02<00:00, 15.35it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5472
[LightGBM] [Info] Number of data points in the train set: 420, number of used features: 42
[LightGBM] [Info] Start training from score 7.359582





## Part 8D: Evaluate the Smaller Molecule Set with Standardized Target Values

### Standardize Small Molecule Set Target Values

In [28]:
# Standardize the small-molecule targets: mean and standard deviation are learned from the
# training targets only and then applied unchanged to the test targets.
# y_train / y_test are whatever the most recently executed split produced; every small-set split
# above uses y_small_set with test_size=0.2 and random_state=42, so run the cells in order.
scaler_target = StandardScaler()

# StandardScaler works on 2-D input, so reshape each target vector into a single column
train_target_column = y_train.values.reshape(-1, 1)
test_target_column = y_test.values.reshape(-1, 1)

# Scale, then flatten back to 1-D for the regressors
y_train_scaled = scaler_target.fit_transform(train_target_column).flatten()
y_test_scaled = scaler_target.transform(test_target_column).flatten()

### 10. Small Molecule Data Set: No Dimensionality Reduction, Standardized Targets

In [29]:
# Run 10: small-molecule set, all fingerprint columns, standardized targets.
# Reuses the X_small_train / X_small_test split created in run 7.
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_small_train, X_small_test, y_train_scaled, y_test_scaled)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Small Set'),
    ('Feature Reduction', 'None'),
    ('Target Variable', 'Standardized'),
):
    models[column] = label
models10 = models

100%|██████████| 42/42 [00:28<00:00,  1.50it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 768
[LightGBM] [Info] Number of data points in the train set: 420, number of used features: 256
[LightGBM] [Info] Start training from score 0.000000





### 11. Small Molecule Data Set: Variance Threshold Dimensionality Reduction, Standardized Targets

In [30]:
# Run 11: small-molecule set, variance-threshold-reduced features, standardized targets.
# Reuses the X_train_reduced / X_test_reduced split created in run 8.
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_reduced, X_test_reduced, y_train_scaled, y_test_scaled)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Small Set'),
    ('Feature Reduction', 'VarThres'),
    ('Target Variable', 'Standardized'),
):
    models[column] = label
models11 = models

100%|██████████| 42/42 [00:05<00:00,  8.26it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000807 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 420, number of used features: 132
[LightGBM] [Info] Start training from score 0.000000





### 12. Small Molecule Data Set: Variance Threshold and PCA Dimensionality Reduction, Standardized Targets

In [31]:
# Run 12: small-molecule set, variance threshold followed by PCA, standardized targets.
# Reuses the X_train_pca / X_test_pca split created in run 9.
lazy = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(X_train_pca, X_test_pca, y_train_scaled, y_test_scaled)

# Tag the score table so this run stays identifiable once all runs are concatenated
for column, label in (
    ('Data Set', 'Small Set'),
    ('Feature Reduction', 'VarThres and PCA'),
    ('Target Variable', 'Standardized'),
):
    models[column] = label
models12 = models


100%|██████████| 42/42 [00:02<00:00, 15.08it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5472
[LightGBM] [Info] Number of data points in the train set: 420, number of used features: 42
[LightGBM] [Info] Start training from score 0.000000





## Part 9: Evaluating 472 Model Combinations for Optimal R-Squared, RMSE, and Execution Time

In [32]:
# Stack the twelve tagged score tables row-wise into a single comparison table
all_runs = [
    models1, models2, models3, models4, models5, models6,
    models7, models8, models9, models10, models11, models12,
]
results = pd.concat(all_runs, axis=0)
print(results.shape)

# Keep the 20 model/data-set combinations with the highest R-Squared
best_options = results.nlargest(20, 'R-Squared')

best_options


(472, 7)


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken,Data Set,Feature Reduction,Target Variable
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BaggingRegressor,1.7,0.83,0.69,0.04,Full Set,VarThres,Unstandardized
BaggingRegressor,1.72,0.83,0.43,0.05,Full set,VarThres,Standardized
RandomForestRegressor,1.02,0.83,0.44,0.69,Small Set,,Standardized
RandomForestRegressor,1.02,0.82,0.72,0.69,Small Set,,Unstandardized
RandomForestRegressor,1.74,0.82,0.44,0.3,Full set,VarThres,Standardized
RandomForestRegressor,1.74,0.82,0.71,0.3,Full Set,VarThres,Unstandardized
RandomForestRegressor,1.03,0.82,0.44,0.69,Full Set,,Standardized
RandomForestRegressor,1.03,0.82,0.71,0.69,Full Set,,Unstandardized
GradientBoostingRegressor,1.02,0.81,0.73,0.26,Small Set,,Unstandardized
GradientBoostingRegressor,1.77,0.81,0.45,0.12,Full set,VarThres,Standardized


### Export Top Twenty Algorithm and Data Set Combinations as CSV

In [33]:
# Write the top-20 table to the working directory; the index is written too (default to_csv behaviour)
best_options.to_csv("lazy_predict_best_models.csv")

## Credits

### I would like to express my sincere gratitude to Chanin Nantasenamat for his YouTube video, which provided valuable guidance in curating my data, performing feature engineering using PaDELpy and suggesting LazyPredict. I would also like to express my gratitude to Shankar Pandala for building the Lazy Predict module.

References
  
Nantasenamat, C. (2024, February 19). *Bioinformatics project from scratch - Drug discovery part 1 (Data collection and pre-processing)* [Video]. YouTube. [https://www.youtube.com/watch?v=plVLRashaA8](https://www.youtube.com/watch?v=plVLRashaA8)


Pandala, S. (n.d.). *LazyPredict: A fast way to compare machine learning models*. GitHub. Retrieved February 25, 2025, from [https://github.com/shankarpandala/lazypredict/blob/dev/lazypredict/Supervised.py](https://github.com/shankarpandala/lazypredict/blob/dev/lazypredict/Supervised.py)

