In [29]:
import pandas as pd
import sklearn as sk
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [30]:
# Read the processed Delaney CSV (path is relative to the notebook's working
# directory) and show which columns it provides.
df = pd.read_csv(filepath_or_buffer='delaney-processed.csv')
print(f'Columns: {df.columns}')

Columns: Index(['Compound ID', 'ESOL predicted log solubility in mols per litre',
       'Minimum Degree', 'Molecular Weight', 'Number of H-Bond Donors',
       'Number of Rings', 'Number of Rotatable Bonds', 'Polar Surface Area',
       'measured log solubility in mols per litre', 'smiles'],
      dtype='object')


In [31]:
# Feature matrix: six descriptor columns taken straight from the CSV.
# The 'ESOL predicted log solubility in mols per litre' column is deliberately
# not among them; the target is the measured log solubility.
x = df[[
    'Minimum Degree',
    'Molecular Weight',
    'Number of H-Bond Donors',
    'Number of Rings',
    'Number of Rotatable Bonds',
    'Polar Surface Area',
]]
y = df['measured log solubility in mols per litre']

# Stage 1: hold out 30% of the rows (x_temp / y_temp); the remaining 70% is
# the training set.
x_train, x_temp, y_train, y_temp = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Stage 2: cut the hold-out in half -> validation and test (about 15% each of
# the full table). Both stages use the same fixed seed so the split is
# repeatable.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    shuffle=True,
)

In [32]:
# NOTE: despite the variable name `rf`, this is an XGBRegressor, not the
# RandomForestRegressor that is imported at the top of the notebook. The name
# is kept because the test-set cell below reads `rf`.
rf = XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=4, random_state=42)

# Train on the training split only; the validation split is used just for scoring.
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)

# Validation metrics (true values first, predictions second).
rmse, mae, r2 = (
    root_mean_squared_error(y_val, y_pred),
    mean_absolute_error(y_val, y_pred),
    r2_score(y_val, y_pred),
)

print("Root Mean Squared Error (RMSE):", rmse)
print("MAE Score:", mae)
print("R² Score:", r2)

Root Mean Squared Error (RMSE): 0.8395822278674843
MAE Score: 0.5804746670641843
R² Score: 0.8600569122272508


In [33]:
# Score the already-fitted model on the test split.
# This rebinds `rmse`, `mae` and `r2`, so after this cell runs they hold the
# test-set values rather than the validation-set ones.
y_test_pred = rf.predict(x_test)

rmse, mae, r2 = (
    root_mean_squared_error(y_test, y_test_pred),
    mean_absolute_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

print("Root Mean Squared Error (RMSE):", rmse)
print("MAE Score:", mae)
print("R² Score:", r2)

Root Mean Squared Error (RMSE): 0.7977824373020789
MAE Score: 0.5503186083499123
R² Score: 0.8442102077573959
