# 1 часть модели, способная предсказывать необходимый параметр по SMILES.
**Примечание:** количество SMILES для одного масла может быть произвольным.

In [63]:
import deepchem as dc
import polars as pl
import pandas as pd
import numpy as np

from rdkit import Chem, RDLogger

from typing import Optional

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


def read(n: str, **kwargs):
    """Load ``../data/<n>`` as CSV with polars and return it as a pandas frame.

    Extra keyword arguments are forwarded unchanged to ``pl.read_csv``.
    """
    frame = pl.read_csv(f"../data/{n}", **kwargs)
    return frame.to_pandas()


# Turn off every RDKit log channel matching "rdApp.*".
RDLogger.DisableLog("rdApp.*")

# Data used below to fit and evaluate the model.
private = read("public/data.csv")

# Check set: keep the first row of each blend_id and renumber rows from 0.
check = (
    read("public/check.csv")
    .drop_duplicates(subset=["blend_id"])
    .reset_index(drop=True)
)

In [64]:
# Module-level fingerprint featurizer shared by preprocess(); size=1024 is the
# number of fingerprint entries produced per molecule (one CFin_<i> column each).
featurizer = dc.feat.CircularFingerprint(size=1024)


def preprocess(
    data: pd.DataFrame, preprocessor: Optional[ColumnTransformer] = None
) -> tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[ColumnTransformer]]:
    """Turn raw blend rows into a numeric feature table.

    Every row's SMILES string is converted to a fingerprint with the
    module-level ``featurizer`` (columns ``CFin_<i>``) and ``blend_id`` is
    one-hot encoded (column names come from the transformer).

    A row is dropped when ``smiles`` or ``blend_id`` is missing, when RDKit
    cannot parse the SMILES, or when the target value is missing. Surviving
    rows keep their index labels from ``data`` so results can be mapped back
    to the input rows. The input frame itself is not modified.

    Args:
        data: Frame with ``blend_id`` and ``smiles`` columns and, optionally,
            the target column ``oil_property_param_value``.
        preprocessor: An already fitted transformer to reuse. When ``None``
            a new one is built and fitted on ``data``.

    Returns:
        ``(X, y, preprocessor)`` where ``X`` holds the fingerprint and
        one-hot columns, ``y`` is a one-column frame with the target (or
        ``None`` when the target column is absent) and ``preprocessor`` is
        the transformer that produced the one-hot columns.
    """
    target_col = "oil_property_param_value"
    categorical_cols = ["blend_id"]

    wanted = ["blend_id", "smiles"]
    if target_col in data.columns:
        wanted.append(target_col)
    # Selecting columns and dropna() both return new objects, so the caller's
    # frame is left untouched without an explicit copy.
    data = data[wanted].dropna(subset=["smiles", "blend_id"])

    n_bits = featurizer.size

    def compute_descriptors(smiles: str):
        """Return the fingerprint vector for *smiles*, or None if unparsable."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        feature = featurizer.featurize([mol])[0]
        # Fall back to an all-zero vector when the featurizer yields nothing
        # usable (None, or a vector of the wrong length).
        if feature is None or np.size(feature) != n_bits:
            feature = np.zeros(n_bits)
        return np.asarray(feature, dtype=float)

    fingerprints = [compute_descriptors(smiles) for smiles in data["smiles"]]
    parsed = np.array([fp is not None for fp in fingerprints], dtype=bool)

    # Filter by position and reuse the surviving labels for every derived
    # frame. Building the feature frames with a fresh 0..n-1 index and then
    # concatenating on axis=1 would align by label and attach fingerprints to
    # the wrong rows as soon as any row had been dropped.
    data = data.loc[parsed].drop(columns="smiles")
    rows = data.index

    if parsed.any():
        matrix = np.vstack([fp for fp in fingerprints if fp is not None])
    else:
        matrix = np.empty((0, n_bits))
    descriptors_df = pd.DataFrame(
        matrix,
        index=rows,
        columns=[f"CFin_{i}" for i in range(n_bits)],
    )

    if preprocessor is None:
        # Always fit when no transformer is supplied; there is nothing to
        # fall back to, so a transform-only path would fail on None.
        categorical_transformer = Pipeline(
            steps=[
                (
                    "imputer",
                    SimpleImputer(strategy="constant", fill_value="missing"),
                ),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )
        preprocessor = ColumnTransformer(
            transformers=[
                ("cat", categorical_transformer, categorical_cols),
            ],
            remainder="passthrough",
        )
        preprocessor.fit(data[categorical_cols])

    encoded = preprocessor.transform(data[categorical_cols])
    # The transformer returns either a sparse matrix or a dense array
    # depending on output density; only the former has toarray().
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    encoding_data = pd.DataFrame(
        encoded,
        index=rows,
        columns=preprocessor.get_feature_names_out(categorical_cols),
    )

    # All three frames share the same index, so this is a plain column stack.
    data = pd.concat(
        [data.drop(columns=categorical_cols), descriptors_df, encoding_data],
        axis=1,
    )
    # Removes rows with a missing target (the feature columns have no NaN).
    data = data.dropna()

    if target_col in data.columns:
        y = data.pop(target_col)
        return data, y.to_frame(), preprocessor

    return data, None, preprocessor


# Build the training features and target; no preprocessor is passed, so the
# returned one is created inside preprocess() and reused later for the check set.
X, y, preprocessor = preprocess(private)
# Last expression of the cell: displays the first feature row.
X.head(1)

Unnamed: 0,CFin_0,CFin_1,CFin_2,CFin_3,CFin_4,CFin_5,CFin_6,CFin_7,CFin_8,CFin_9,...,cat__blend_id_f9a8223a-0cf1-11ed-87b0-005056921581,cat__blend_id_fb57b4fe-632f-11ec-8eb2-005056921581,cat__blend_id_fb583e86-4de6-11ed-9a6d-005056921581,cat__blend_id_fbc44952-0f2d-11ed-9900-005056921581,cat__blend_id_fc68e5c8-b25f-11ec-8ff7-005056921581,cat__blend_id_fd893206-5e3a-11ec-803b-005056921581,cat__blend_id_fd96a6a2-b0fb-11ec-9d4e-005056921581,cat__blend_id_fe2c1c0a-4f84-11ed-93f9-005056921581,cat__blend_id_fe497da4-4ac5-11ed-8f31-005056921581,cat__blend_id_fe8635a0-1262-11ed-8f47-005056921581
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Wrap features and target as plain ndarrays (the check-set cell converts its
# features with to_numpy() as well).
dataset = dc.data.NumpyDataset(
    X=X.to_numpy(), y=y.oil_property_param_value.to_numpy()
)

# 60/20/20 random split. The seed is fixed so that re-running the notebook
# yields the same partitions and the validation/test scores stay comparable.
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset, frac_train=0.6, frac_valid=0.2, frac_test=0.2, seed=42
)

In [58]:
from catboost import CatBoostRegressor, Pool

# MAE serves both as the training objective and as the reported metric.
cm = CatBoostRegressor(loss_function="MAE", eval_metric="MAE")

# Validation split, handed to fit() as the evaluation set.
valid_pool = Pool(data=valid_dataset.X, label=valid_dataset.y)

# Fit on the training split; the trailing semicolon keeps the notebook from
# echoing the returned model object.
cm.fit(
    X=train_dataset.X,
    y=train_dataset.y,
    eval_set=valid_pool,
);

0:	learn: 56702.7755663	test: 55230.3527668	best: 55230.3527668 (0)	total: 2.28ms	remaining: 2.28s
1:	learn: 56676.5266752	test: 55227.0390676	best: 55227.0390676 (1)	total: 4.61ms	remaining: 2.3s
2:	learn: 56642.4560059	test: 55188.0165536	best: 55188.0165536 (2)	total: 6.69ms	remaining: 2.22s
3:	learn: 56606.0628056	test: 55154.2737874	best: 55154.2737874 (3)	total: 8.74ms	remaining: 2.18s
4:	learn: 56589.1279852	test: 55125.4380881	best: 55125.4380881 (4)	total: 10.9ms	remaining: 2.18s
5:	learn: 56565.3272242	test: 55103.3461231	best: 55103.3461231 (5)	total: 13.4ms	remaining: 2.23s
6:	learn: 56532.7169468	test: 55079.2245661	best: 55079.2245661 (6)	total: 15.6ms	remaining: 2.22s
7:	learn: 56510.5736740	test: 55075.4855808	best: 55075.4855808 (7)	total: 17.7ms	remaining: 2.19s
8:	learn: 56484.4809070	test: 55056.8161914	best: 55056.8161914 (8)	total: 19.7ms	remaining: 2.17s
9:	learn: 56460.1566266	test: 55012.3270545	best: 55012.3270545 (9)	total: 21.7ms	remaining: 2.15s
10:	learn: 

In [59]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error of the fitted model on the validation and test splits.
valid_score, test_score = (
    mae(split.y, cm.predict(split.X))
    for split in (valid_dataset, test_dataset)
)

print(f"Valid score: {valid_score:.2f}, Test score: {test_score:.2f}")

Valid score: 45855.45, Test score: 53075.31


In [60]:
# Featurize the check set with the encoder that was fitted on the training data.
X_check, _, _ = preprocess(check, preprocessor)
X_check_dataset = dc.data.NumpyDataset(X=X_check.to_numpy())

# Index the predictions by the row labels that survived preprocess(): it can
# drop rows, and a fresh 0..n-1 index would then pair predictions with the
# wrong blend_id in the label-aligned concat below. Rows without a prediction
# end up with NaN in "results" instead of a shifted value.
predictions = pd.DataFrame(
    cm.predict(X_check_dataset.X),
    columns=["results"],
    index=X_check.index,
)
predictions = pd.concat([check.blend_id, predictions], axis=1)
predictions.head(1)

Unnamed: 0,blend_id,results
0,d272c9a6-3332-11ed-9685-005056921581,55839.832224


In [61]:
# Write the predictions to ../predictions.csv without the index column.
predictions.to_csv("../predictions.csv", index=False)

In [62]:
# Shell escape: add this notebook and the predictions file to ../solution.zip with 7z.
!7z a ../solution.zip catboost-smiles2target.ipynb ../predictions.csv


7-Zip 23.01 (x64) : Copyright (c) 1999-2023 Igor Pavlov : 2023-06-20

Open archive: ../solution.zip
--
Path = ../solution.zip
Type = zip
Physical Size = 34454

Scanning the drive:
2 files, 147705 bytes (145 KiB)

Updating archive: ../solution.zip

Add new data to archive: 2 files, 147705 bytes (145 KiB)


Files read from disk: 2
Archive size: 31983 bytes (32 KiB)
Everything is Ok
