In [1]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split

## Benchmark 1. AqSolDB

**Description:** AqSolDB: A curated reference set of aqueous solubility and 2D descriptors for a diverse set of compounds.

**Data source**: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OVHAW8

1. Download the ``curated-solubility-dataset.csv`` file from the data source link above into the working directory, then load it.

In [2]:
# Read the AqSolDB CSV (looked up relative to the current working directory)
# into a DataFrame.
orig_data = pd.read_csv(filepath_or_buffer="curated-solubility-dataset.csv")

2. Prepare the data in a suitable format (smiles and property columns)

In [3]:
# Keep only the "SMILES" and "Solubility" columns, in that order.
selected_data = orig_data.loc[:, ["SMILES", "Solubility"]]

3. Reproduce the original train/test splitting, if any. Split randomly otherwise.

In [4]:
# Random split holding out 20% of the rows for testing; random_state is fixed
# so the same partition is produced on every run.
train_data, test_data = train_test_split(
    selected_data,
    random_state=42,
    test_size=0.2,
)

4. Save the prepared data in csv format (smiles and property columns, without index and header)

In [5]:
data_folder = "AqSolDB"

# Create the output folder. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError once the folder already exists
# (e.g. on the second "Restart Kernel -> Run All").
os.makedirs(data_folder, exist_ok=True)

# Write both splits without the index column and without a header row.
# header=False is the documented way to suppress the header.
train_data.to_csv(os.path.join(data_folder, "train.csv"), index=False, header=False)
test_data.to_csv(os.path.join(data_folder, "test.csv"), index=False, header=False)

## Benchmark 2. Flashpoint

**Description:** Sun et al. collected a dataset of the flashpoints of 10575 molecules from academic papers, the Gelest chemical catalogue, the DIPPR database, Lange's Handbook of Chemistry, the Hazardous Chemicals Handbook, and the PubChem database.

**Data source**: https://github.com/cheminfo/molecule-features/blob/main/data/flashpoint/data.csv

1. Download the ``data.csv`` file from the data source link above into the working directory, then load it.

In [6]:
# Read the Flashpoint CSV (looked up relative to the current working directory)
# into a DataFrame.
orig_data = pd.read_csv(filepath_or_buffer="data.csv")

2. Prepare the data in a suitable format (smiles and property columns)

In [7]:
# Keep only the "smiles" and "flashpoint" columns, in that order.
selected_data = orig_data.loc[:, ["smiles", "flashpoint"]]

3. Reproduce the original train/test splitting, if any. Split randomly otherwise.

In [8]:
# Random split holding out 20% of the rows for testing; random_state is fixed
# so the same partition is produced on every run.
train_data, test_data = train_test_split(
    selected_data,
    random_state=42,
    test_size=0.2,
)

4. Save the prepared data in csv format (smiles and property columns, without index and header)

In [9]:
data_folder = "Flashpoint"

# Create the output folder. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError once the folder already exists
# (e.g. on the second "Restart Kernel -> Run All").
os.makedirs(data_folder, exist_ok=True)

# Write both splits without the index column and without a header row.
# header=False is the documented way to suppress the header.
train_data.to_csv(os.path.join(data_folder, "train.csv"), index=False, header=False)
test_data.to_csv(os.path.join(data_folder, "test.csv"), index=False, header=False)