In [None]:
import os
import numpy as np  # 1.23.5
import pandas as pd  # 1.5.0
import seaborn as sns  # 0.12.0

from rdkit import Chem  # 2023.03.3
from rdkit.Chem import rdFingerprintGenerator

from sklearn.ensemble import RandomForestRegressor  # 1.2.2
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr  # 1.9.1
from lightgbm import LGBMRegressor  # 4.0.0

from tqdm.auto import tqdm

# The data folder is expected next to (not inside) the current working directory.
DATA_DIRECTORY_PATH = os.path.join(os.path.dirname(os.getcwd()), "data")

# 1) Feature generation

##### 1a) Read data

In [None]:
# Load the FIA49k table from the data directory and show its dimensions.

fia49k_path = os.path.join(DATA_DIRECTORY_PATH, "FIA49k.csv.gz")
df = pd.read_csv(fia49k_path)
df.shape

##### 1b) Calculate fingerprints

In [None]:
# Generate Morgan fingerprints with RDKit.
# Each SMILES string is parsed into a mol object, explicit hydrogen atoms are
# added, and the fingerprint is then computed on the hydrogen-complete molecule.

RADIUS = 3
FPSIZE = 2048

# The generator only depends on RADIUS and FPSIZE, so it is built once and
# reused for every molecule instead of being re-created on each iteration.
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=RADIUS, fpSize=FPSIZE)

fingerprints = {}
# Passing `total` lets tqdm render a real progress bar (iterrows has no length).
for _, data in tqdm(df.iterrows(), total=len(df)):
    mol = Chem.MolFromSmiles(data.la_smiles)
    if mol is None:
        # MolFromSmiles returns None for SMILES it cannot parse; fail here with
        # a message naming the compound rather than with an opaque error in AddHs.
        raise ValueError(
            f"Could not parse SMILES for compound {data.Compound!r}: {data.la_smiles!r}"
        )
    mol = Chem.AddHs(mol)

    fingerprints[data.Compound] = fpgen.GetFingerprintAsNumPy(mol)

In [None]:
# Arrange the fingerprints as one row per compound with named bit columns
# (fp_bit_1 ... fp_bit_<FPSIZE>) and expose the compound key as a regular column.

bit_column_names = [f"fp_bit_{bit}" for bit in range(1, FPSIZE + 1)]

# The dict maps compound -> bit vector, so the raw frame has one column per
# compound; transpose to get one row per compound.
per_compound = pd.DataFrame(fingerprints).T.astype("int32")
per_compound.columns = bit_column_names

fingerprint_df = per_compound.reset_index(drop=False, names="Compound")
fingerprint_df.shape

In [None]:
# Persist the fingerprint table as CSV in the working directory (row index omitted).

fingerprint_csv = "morgan_fp_fia49k.csv"
fingerprint_df.to_csv(fingerprint_csv, index=False)

# 2) RandomForestRegressors

##### 2a) Load and prepare data

In [None]:
# Load the saved fingerprint table and the original FIA49k table, then show
# the dimensions of both.

final_df = pd.read_csv("morgan_fp_fia49k.csv")

fia49k_path = os.path.join(DATA_DIRECTORY_PATH, "FIA49k.csv.gz")
df2 = pd.read_csv(fia49k_path)

(final_df.shape, df2.shape)

In [None]:
# Attach the split label and both FIA target columns to the fingerprint table,
# matching rows on the "Compound" key.

label_columns = ["set_assignment", "fia_gas-DSDBLYP", "fia_solv-DSDBLYP"]

# Drop the label columns first if they are already present. On a fresh run this
# is a no-op; on a re-run it prevents pandas from producing suffixed duplicates
# (e.g. set_assignment_x / set_assignment_y) that would break the cells below.
final_df = final_df.drop(columns=label_columns, errors="ignore").merge(
    df2[["Compound", *label_columns]],
    on="Compound",
)
final_df.shape

In [None]:
# Partition the merged table into train / validation / test subsets using the
# "set_assignment" label stored in the data.

split_label = final_df["set_assignment"]

X_train = final_df.loc[split_label == "train"]
X_validate = final_df.loc[split_label == "validate"]
X_test = final_df.loc[split_label == "test"]

# Targets for each subset: one series per FIA column.
y_train_gas, y_train_solv = X_train["fia_gas-DSDBLYP"], X_train["fia_solv-DSDBLYP"]
y_validate_gas, y_validate_solv = X_validate["fia_gas-DSDBLYP"], X_validate["fia_solv-DSDBLYP"]
y_test_gas, y_test_solv = X_test["fia_gas-DSDBLYP"], X_test["fia_solv-DSDBLYP"]

# Compound identifiers per subset, kept because the key column is dropped from
# the feature frames in the next cell.
compounds_train = list(X_train["Compound"])
compounds_validate = list(X_validate["Compound"])
compounds_test = list(X_test["Compound"])

# Every column that is not a fingerprint bit (key, split label, targets).
to_be_droped = [name for name in final_df.columns if "fp_bit_" not in name]

In [None]:
# Keep only the fingerprint-bit columns as model inputs, then report the
# dimensions of every feature frame and target series.

X_train = X_train.drop(columns=to_be_droped)
X_validate = X_validate.drop(columns=to_be_droped)
X_test = X_test.drop(columns=to_be_droped)

print("Data shapes:")
for label, features, target_gas, target_solv in (
    ("train:      ", X_train, y_train_gas, y_train_solv),
    ("validate:   ", X_validate, y_validate_gas, y_validate_solv),
    ("test:       ", X_test, y_test_gas, y_test_solv),
):
    print(label, features.shape, target_gas.shape, target_solv.shape)

##### 2b) FIA_gas model

In [None]:
# Train RandomForestRegressor with FIA_gas.

rf_gas = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    # Fixed seed: the forest draws random bootstrap samples, so without it the
    # fitted model and the metrics reported below change on every run.
    random_state=42,
    verbose=2
)

rf_gas.fit(X_train, y_train_gas)

In [None]:
# Score the FIA_gas random forest on the test split and plot its predictions
# against the reference values.

predictions_gas = rf_gas.predict(X_test)

mae_gas = mean_absolute_error(y_test_gas, predictions_gas)
# "r2" here is the squared Pearson correlation between reference and prediction.
pearson_r_gas = pearsonr(y_test_gas, predictions_gas)[0]

print(f"MAE_gas: {round(mae_gas, 3)} kJ/mol")
print(f"r2_gas: {round(pearson_r_gas**2, 4)}")

fig = sns.scatterplot(x=y_test_gas, y=predictions_gas)
fig.set(
    xlabel="FIA calculated by DFT [kJ/mol]",
    ylabel="FIA prediction [kJ/mol]",
    title="Predicted FIA vs. DFT FIA",
)
fig

##### 2c) FIA_solv model

In [None]:
# Train RandomForestRegressor with FIA_solv.

rf_solv = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    # Fixed seed: the forest draws random bootstrap samples, so without it the
    # fitted model and the metrics reported below change on every run.
    random_state=42,
    verbose=2
)

rf_solv.fit(X_train, y_train_solv)

In [None]:
# Score the FIA_solv random forest on the test split and plot its predictions
# against the reference values.

predictions_solv = rf_solv.predict(X_test)

mae_solv = mean_absolute_error(y_test_solv, predictions_solv)
# "r2" here is the squared Pearson correlation between reference and prediction.
pearson_r_solv = pearsonr(y_test_solv, predictions_solv)[0]

print(f"MAE_solv: {round(mae_solv, 3)} kJ/mol")
print(f"r2_solv: {round(pearson_r_solv**2, 4)}")

fig = sns.scatterplot(x=y_test_solv, y=predictions_solv)
fig.set(
    xlabel="FIA calculated by DFT [kJ/mol]",
    ylabel="FIA prediction [kJ/mol]",
    title="Predicted FIA vs. DFT FIA",
)
fig

# 3) LGBMRegressor

##### 3a) Load and prepare data

In [None]:
# Run all cells in section 2a) above first; the models below reuse the train, validation, and test data prepared there.

##### 3b) FIA_gas model

In [None]:
# Train LGBMRegressor with FIA_gas.
# Up to 2000 boosting rounds are allowed; training stops early once the
# validation metric has not improved for 15 consecutive rounds.

lgbm_gas = LGBMRegressor(
    n_estimators=2000,
    early_stopping_round=15,
    verbose=2
)

lgbm_gas.fit(
    X_train,
    y_train_gas,
    eval_set=[(X_validate, y_validate_gas)],
    # NOTE(review): this was "r2", which is not in LightGBM's list of built-in
    # metric names; it appears to be skipped silently, leaving the regression
    # default (L2) to drive early stopping. The metric is now named explicitly.
    # To really monitor R^2, pass a callable eval metric instead.
    eval_metric="l2"
)

In [None]:
# Score the FIA_gas LightGBM model on the test split and plot its predictions
# against the reference values.

predictions_gas = lgbm_gas.predict(X_test)

mae_gas = mean_absolute_error(y_test_gas, predictions_gas)
# "r2" here is the squared Pearson correlation between reference and prediction.
pearson_r_gas = pearsonr(y_test_gas, predictions_gas)[0]

print(f"MAE_gas: {round(mae_gas, 3)} kJ/mol")
print(f"r2_gas: {round(pearson_r_gas**2, 4)}")

fig = sns.scatterplot(x=y_test_gas, y=predictions_gas)
fig.set(
    xlabel="FIA calculated by DFT [kJ/mol]",
    ylabel="FIA prediction [kJ/mol]",
    title="Predicted FIA vs. DFT FIA",
)
fig

##### 3c) FIA_solv model

In [None]:
# Train LGBMRegressor with FIA_solv.
# Up to 2000 boosting rounds are allowed; training stops early once the
# validation metric has not improved for 15 consecutive rounds.

lgbm_solv = LGBMRegressor(
    n_estimators=2000,
    early_stopping_round=15,
    verbose=2
)

lgbm_solv.fit(
    X_train,
    y_train_solv,
    eval_set=[(X_validate, y_validate_solv)],
    # NOTE(review): this was "r2", which is not in LightGBM's list of built-in
    # metric names; it appears to be skipped silently, leaving the regression
    # default (L2) to drive early stopping. The metric is now named explicitly.
    # To really monitor R^2, pass a callable eval metric instead.
    eval_metric="l2"
)

In [None]:
# Score the FIA_solv LightGBM model on the test split and plot its predictions
# against the reference values.

predictions_solv = lgbm_solv.predict(X_test)

mae_solv = mean_absolute_error(y_test_solv, predictions_solv)
# "r2" here is the squared Pearson correlation between reference and prediction.
pearson_r_solv = pearsonr(y_test_solv, predictions_solv)[0]

print(f"MAE_solv: {round(mae_solv, 3)} kJ/mol")
print(f"r2_solv: {round(pearson_r_solv**2, 4)}")

fig = sns.scatterplot(x=y_test_solv, y=predictions_solv)
fig.set(
    xlabel="FIA calculated by DFT [kJ/mol]",
    ylabel="FIA prediction [kJ/mol]",
    title="Predicted FIA vs. DFT FIA",
)
fig