In [20]:
!pip install numpy pandas scikit-learn ipykernel openbabel-wheel rdkit tqdm xgboost optuna --quiet

In [31]:
import numpy as np
import pandas as pd

from rdkit.Chem import AllChem, Descriptors, rdFingerprintGenerator
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit import DataStructs, Chem

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [22]:
# Training data. final_s_data.csv was assembled from several sources and
# cleaned of invalid SMILES and duplicates; the notebook in which that
# was done has been lost.
train_data = pd.read_csv('/kaggle/input/final-d/final_s_data.csv')

# Competition test set.
test_data = pd.read_csv('/kaggle/input/siburdata/sibur_element_119_final_test_data80.csv')

# Work on copies so the frames as loaded stay available (their shape is
# compared against the processed frame further down).
train_df = train_data.copy()
test_df = test_data.copy()

In [23]:
def smiles_to_mol(smiles):
    """Parse a SMILES string with RDKit and return the resulting molecule.

    Raises:
        ValueError: if ``Chem.MolFromSmiles`` returns ``None`` for ``smiles``,
            i.e. the string could not be turned into a molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return parsed
    # Fail loudly instead of letting a None molecule flow into featurization.
    raise ValueError(f"Невалидный SMILES: {smiles}")

In [24]:
# Light data preparation: discard rows without a SMILES string, then parse
# every remaining SMILES into a molecule object.
train_df = train_df.dropna(subset=["SMILES"]).copy()
test_df = test_df.dropna(subset=["SMILES"]).copy()

train_df["mol"] = train_df["SMILES"].apply(smiles_to_mol)
test_df["mol"] = test_df["SMILES"].apply(smiles_to_mol)

# Drop rows with a missing molecule and renumber the rows from zero so that
# feature frames built later line up with these frames by position.
train_df = train_df.dropna(subset=["mol"]).reset_index(drop=True)
test_df = test_df.dropna(subset=["mol"]).reset_index(drop=True)

# Compare the shape as loaded with the shape after processing.
print(train_data.shape, train_df.shape)
train_df

(29987, 2) (29987, 3)


Unnamed: 0,SMILES,LogP,mol
0,N=c1[nH]onc1-c1ccccc1,3.093000,<rdkit.Chem.rdchem.Mol object at 0x7c1c28d81930>
1,O=C(Cc1ccccc1)Nc1ccc(Br)cc1,5.245000,<rdkit.Chem.rdchem.Mol object at 0x7c1c28975cb0>
2,CCc1cccs1,4.294000,<rdkit.Chem.rdchem.Mol object at 0x7c1c25a44580>
3,CC(C)c1cccs1,2.841948,<rdkit.Chem.rdchem.Mol object at 0x7c1c2890a570>
4,CCCc1cccs1,2.795413,<rdkit.Chem.rdchem.Mol object at 0x7c1c2890a3b0>
...,...,...,...
29982,CC(C)OPN,0.000000,<rdkit.Chem.rdchem.Mol object at 0x7c1c25483370>
29983,CCSCSCO,1.400000,<rdkit.Chem.rdchem.Mol object at 0x7c1c254833e0>
29984,[2H]C([2H])(O)C([2H])([2H])C([2H])([2H])Br,0.700000,<rdkit.Chem.rdchem.Mol object at 0x7c1c25483450>
29985,CI(C)CSO,2.000000,<rdkit.Chem.rdchem.Mol object at 0x7c1c254834c0>


In [25]:
# Feature extraction: take the name of every descriptor registered in
# RDKit's Descriptors module and build one calculator for all of them.
descriptor_names = [name for name, *_ in Descriptors._descList]
calc = MolecularDescriptorCalculator(descriptor_names)

def calc_desc(mol):
    """Return the values of all ``descriptor_names`` descriptors for ``mol``."""
    descriptor_values = calc.CalcDescriptors(mol)
    return descriptor_values

# One row of descriptor values per molecule, one column per descriptor name.
train_desc = pd.DataFrame(
    [calc_desc(mol) for mol in train_df["mol"]],
    columns=descriptor_names,
)
test_desc = pd.DataFrame(
    [calc_desc(mol) for mol in test_df["mol"]],
    columns=descriptor_names,
)

In [26]:
# Morgan fingerprint generator (radius 3, 2048 bits). For background on
# molecular fingerprints see https://habr.com/ru/companies/hsespb/articles/565650/
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(fpSize=2048, radius=3)

def mol_to_fp_new(mol):
    """Return the Morgan fingerprint of ``mol`` as a NumPy integer array."""
    # Placeholder buffer handed to RDKit, which writes the fingerprint into it
    # (presumably resizing it to the fingerprint length - confirm against RDKit docs).
    bits = np.zeros((1,), dtype=int)
    DataStructs.ConvertToNumpyArray(morgan_generator.GetFingerprint(mol), bits)
    return bits

# Stack the per-molecule fingerprint arrays into 2-D matrices (one row per molecule).
train_fp = np.stack([mol_to_fp_new(mol) for mol in train_df["mol"]])
test_fp = np.stack([mol_to_fp_new(mol) for mol in test_df["mol"]])

# Wrap the matrices in DataFrames whose columns are named fp_0, fp_1, ...
train_fp = pd.DataFrame(train_fp).add_prefix('fp_')
test_fp = pd.DataFrame(test_fp).add_prefix('fp_')

In [27]:
# Feature matrices: descriptor columns followed by fingerprint columns.
# Infinite values are first turned into NaN, then every NaN is replaced by 0.
X = pd.concat([train_desc, train_fp], axis=1)
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

X_test_final = pd.concat([test_desc, test_fp], axis=1)
X_test_final = X_test_final.replace([np.inf, -np.inf], np.nan).fillna(0)

# Regression target.
y = train_df["LogP"].values

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fp_2038,fp_2039,fp_2040,fp_2041,fp_2042,fp_2043,fp_2044,fp_2045,fp_2046,fp_2047
8528,10.072162,10.072162,0.168250,-1.466582,0.491029,31.391304,327.377000,302.177,327.168188,130,...,0,0,0,0,0,0,0,0,0,0
618,9.962453,9.962453,0.461481,-0.674905,0.583961,9.000000,131.175000,118.071,131.094629,54,...,0,0,0,0,0,0,0,0,0,0
20920,7.895833,7.895833,0.630401,-1.930556,0.448840,18.500000,82.526204,75.474,82.015446,26,...,0,0,0,0,0,0,0,0,0,0
22587,8.697917,8.697917,0.092269,-0.300926,0.534357,14.142857,104.149000,92.053,104.083730,44,...,1,0,0,1,0,0,0,0,0,0
22276,5.167299,5.167299,0.785278,0.785278,0.331327,9.000000,123.246000,114.174,123.017641,38,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,3.187500,3.187500,0.412037,-0.412037,0.522757,14.666667,107.222000,94.118,107.076870,40,...,0,0,0,0,0,0,0,0,0,0
5390,11.443477,11.443477,0.173682,-0.190499,0.894689,10.052632,262.273000,248.161,262.117824,100,...,0,0,0,0,0,0,0,0,0,0
860,13.030734,13.030734,0.331049,-1.423684,0.730624,17.555556,373.472000,345.248,373.216555,146,...,0,0,0,0,0,0,0,0,1,0
15795,9.050981,9.050981,0.051520,-0.111312,0.610277,24.588235,252.299000,240.203,252.068097,90,...,0,0,0,0,0,0,0,0,0,0


In [34]:
def objective(trial):
    """Optuna objective: mean cross-validated RMSE of an XGBRegressor.

    Hyperparameters are sampled from ``trial``; the model is then scored with
    5-fold shuffled cross-validation on the module-level ``X`` and ``y``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean RMSE across the folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'random_state': 77
    }

    # NOTE(review): both the model and cross_val_score request all cores
    # (n_jobs=-1); this may oversubscribe the CPU - confirm it is intended.
    model = XGBRegressor(**params, n_jobs=-1)

    # Cross-validate on the full training set so no data is held out of the search.
    kf = KFold(n_splits=5, shuffle=True, random_state=77)
    scores = cross_val_score(model, X, y, cv=kf,
                             scoring='neg_root_mean_squared_error',
                             n_jobs=-1)

    # The scorer returns negated RMSE; flip the sign so the study minimizes RMSE.
    return -np.mean(scores)

# Hyperparameter search: TPE sampler with a fixed seed, 10 trials, minimizing
# the value returned by `objective`.
sampler = optuna.samplers.TPESampler(seed=77)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=10)

# Best sampled parameters plus the fixed seed, which is not part of the search space.
best_params = {**study.best_params, "random_state": 77}
print("Лучшие параметры:", best_params)

# Refit on the full training set with the best parameters and predict the test set.
final_model = XGBRegressor(n_jobs=-1, **best_params)
final_model.fit(X, y)
test_preds = final_model.predict(X_test_final)

[I 2025-04-24 22:25:08,383] A new study created in memory with name: no-name-5d37b3e8-b307-4cff-86e9-54f0bdfff64f
[I 2025-04-24 22:30:03,870] Trial 0 finished with value: 0.36662648260594083 and parameters: {'n_estimators': 928, 'max_depth': 9, 'learning_rate': 0.07362691168996059, 'subsample': 0.5696572840513675, 'colsample_bytree': 0.5436597739948, 'gamma': 0.02013800592468076, 'reg_alpha': 4.066375651748906e-06, 'reg_lambda': 0.00021307994293669834}. Best is trial 0 with value: 0.36662648260594083.
[I 2025-04-24 22:33:31,974] Trial 1 finished with value: 0.4182707202778702 and parameters: {'n_estimators': 316, 'max_depth': 8, 'learning_rate': 0.009822500842608337, 'subsample': 0.8575959436398785, 'colsample_bytree': 0.9183399705405975, 'gamma': 0.0005103276981163463, 'reg_alpha': 2.3401112920290767e-06, 'reg_lambda': 1.7706858846048125e-06}. Best is trial 0 with value: 0.36662648260594083.
[I 2025-04-24 22:40:00,452] Trial 2 finished with value: 0.8515926804714933 and parameters: {'

Лучшие параметры: {'n_estimators': 928, 'max_depth': 9, 'learning_rate': 0.07362691168996059, 'subsample': 0.5696572840513675, 'colsample_bytree': 0.5436597739948, 'gamma': 0.02013800592468076, 'reg_alpha': 4.066375651748906e-06, 'reg_lambda': 0.00021307994293669834, 'random_state': 77}


In [36]:
# Pair every test ID with its predicted LogP and write the submission file.
submission_columns = {"ID": test_df["ID"], "LogP": test_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

In [38]:
# Gain-based feature importance from the fitted booster (requested by Kirill),
# sorted from the highest score to the lowest.
importance = sorted(
    final_model.get_booster().get_score(importance_type='gain').items(),
    key=lambda item: item[1],
    reverse=True,
)

for feature, score in importance:
    print(f"{feature}: {score:.3f}")

fr_benzene: 288.963
NumAromaticCarbocycles: 277.589
MolLogP: 124.616
PEOE_VSA6: 112.448
fp_1171: 105.154
NumHDonors: 78.826
fp_1380: 68.129
NHOHCount: 52.530
fr_Al_OH_noTert: 36.242
fr_Al_OH: 23.970
fr_NH2: 23.148
MolMR: 22.209
TPSA: 20.886
VSA_EState6: 20.415
SlogP_VSA7: 17.383
LabuteASA: 16.832
SMR_VSA7: 16.572
fr_C_O: 15.031
fp_935: 14.150
fr_halogen: 11.242
NumAromaticRings: 10.543
PEOE_VSA7: 10.327
HeavyAtomCount: 9.811
fp_1917: 9.150
Chi4n: 8.366
fr_phenol: 8.245
fp_1516: 7.346
fr_C_O_noCOO: 7.328
NOCount: 7.254
SMR_VSA5: 6.837
fp_188: 5.653
fp_1750: 5.349
fp_45: 5.086
fr_alkyl_halide: 5.082
Chi0: 5.044
SMR_VSA4: 5.034
fp_717: 4.852
NumHeteroatoms: 4.850
SlogP_VSA5: 4.821
NumHAcceptors: 4.732
fp_327: 4.546
fp_1967: 4.479
EState_VSA1: 4.302
fr_amide: 4.287
fp_2010: 4.282
fp_229: 4.214
fr_unbrch_alkane: 4.122
VSA_EState10: 3.986
fp_1612: 3.907
fr_quatN: 3.789
fp_140: 3.744
fp_2011: 3.645
VSA_EState9: 3.626
NumSaturatedCarbocycles: 3.620
NumAmideBonds: 3.517
NumAtomStereoCenters: 3.

Итого, после данного ноутбука мы поняли, что лучше всего использовать оригинальный датасет от СИБУРа, который следует предварительно почистить.
И еще одна важная мысль — использование дескрипторов не приведет нас к победе, поскольку есть гораздо более удачные подходы (это мы поняли, уже когда начали искать другие решения).