In [1]:
#!pip install pandas scikit-learn

In [2]:
import os
import random

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
# --- Notebook configuration: remote/local data locations and random seeds ---

# Base URL used to fetch raw files from the workshop repository.
# Note that it ends with "/", so it must not be joined with a path that also
# starts with "/" without stripping one of the two.
github_location = "https://github.com/enveda/modbioterp-enveda/raw/refs/heads/main/"
#data_folder = "content"
# Local folder that holds the workshop data files read by later cells.
# The default is a machine-specific absolute path; set the WORKSHOP_DATA_DIR
# environment variable to point at another copy without editing the notebook.
data_folder = os.environ.get(
    "WORKSHOP_DATA_DIR",
    "/home/antonio.gomes/modbioterp-enveda/workshop_data",
)
# Strip the trailing "/" from the base URL before joining so the result does
# not contain "main//workshop_data".
PDB_FILE_LOCATION = f"{github_location.rstrip('/')}/workshop_data/cotb2_pp_mg.pdb"
#!wget $PDB_FILE_LOCATION -O /content/cotb2_pp_mg.pdb
# One seed shared by Python's `random`, NumPy's legacy global RNG, and the
# `random_state` arguments used further down.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [4]:
# Load the regression targets from a header-less, space-separated text file in
# the workshop repository. The two columns are named "ligand" and "energy";
# the "energy" column becomes `labels`.
# github_location already ends with "/", so strip it before joining to avoid a
# doubled slash ("main//workshop_data") in the URL.
energies = f"{github_location.rstrip('/')}/workshop_data/lig_1_energy.txt"
loaded_ligands = pd.read_csv(energies, header=None, sep=" ")
# Assigning exactly two names raises if the file does not have two columns,
# which surfaces a malformed file here rather than later during training.
loaded_ligands.columns = ["ligand", "energy"]
labels = loaded_ligands["energy"]

In [5]:
# Read the precomputed table (used below as the model's input matrix) from the
# local data folder, then preview its first five rows.
features_path = f"{data_folder}/data_for_ML.parquet"
data_for_ML = pd.read_parquet(features_path)
data_for_ML.head()

Unnamed: 0,PMI1,PMI2,PMI3,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,Eccentricity,Asphericity,SpherocityIndex,...,blosum62_6151,blosum62_6152,blosum62_6153,blosum62_6154,blosum62_6155,blosum62_6156,blosum62_6157,blosum62_6158,blosum62_6159,blosum62_6160
0,858.038436,1583.425,2184.329171,0.392816,0.724902,3.102941,0.000845,0.919617,0.247343,0.166761,...,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,0.0,-3.0,-2.0,0.0
1,849.158235,1572.291482,2074.488795,0.409334,0.757918,3.059078,0.000893,0.912385,0.225252,0.231516,...,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,0.0,-3.0,-2.0,0.0
2,790.542596,1615.00898,2102.130286,0.376067,0.768273,3.063071,0.000972,0.926592,0.259586,0.201936,...,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,0.0,-3.0,-2.0,0.0
3,882.532834,1438.45873,2037.900117,0.43306,0.705853,3.012094,0.0008,0.901365,0.21087,0.194837,...,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,0.0,-3.0,-2.0,0.0
4,776.232018,1621.67934,2083.545226,0.372553,0.778327,3.054148,0.001003,0.928011,0.262621,0.210445,...,-1.0,-1.0,-1.0,-2.0,-1.0,1.0,0.0,-3.0,-2.0,0.0


## Model fitting and evaluation

In [6]:
# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
# NOTE(review): this relies on data_for_ML and labels being in the same row
# order, since they are loaded from two separate files -- confirm.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_for_ML, labels, random_state=RANDOM_SEED, test_size=0.2
)

# Fit the scaler on the training rows only, then apply the same transformation
# to the test rows. From here on train_data/test_data hold the scaled values.
scaler = StandardScaler()
train_data, test_data = scaler.fit_transform(train_data), scaler.transform(test_data)

# Extra-trees ensemble of 100 trees, seeded for reproducibility, fitted on the
# scaled training rows.
model = ExtraTreesRegressor(random_state=RANDOM_SEED, n_estimators=100)
model.fit(train_data, train_labels)

# Predict the held-out rows.
test_predictions = model.predict(test_data)

# Score the predictions against the held-out labels.
mse = mean_squared_error(test_labels, test_predictions)
r2 = r2_score(test_labels, test_predictions)
# Careful: `mae` holds the *median* absolute error, not the mean.
mae = median_absolute_error(test_labels, test_predictions)
# Both correlation results are indexed with [0] below to get the coefficient.
pearson = pearsonr(test_labels, test_predictions)
spearman = spearmanr(test_labels, test_predictions)

# Report all metrics, one per line.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2}",
    f"Median Absolute Error: {mae}",
    f"Pearson Correlation: {pearson[0]}",
    f"Spearman Correlation: {spearman[0]}",
    sep="\n",
)

Mean Squared Error: 1.2994492197914325
R-squared: 0.9950488293205808
Median Absolute Error: 0.2439351239999965
Pearson Correlation: 0.9975214265962451
Spearman Correlation: 0.9701498035374507
