In [1]:
import os

import polars as pl
from rdkit import Chem
from tqdm import tqdm

In [2]:
# Load the molecule table from CSV. The rendered frame below shows the columns
# smiles, zinc_id, mwt, logp, reactive, purchasable and tranche_name.
data = pl.read_csv('dataset/zinc15_250k_2D.csv')
data

smiles,zinc_id,mwt,logp,reactive,purchasable,tranche_name
str,str,f64,f64,i64,i64,str
"""CCN(CCSC)C(=O)N[C@@](C)(CC)C(F…","""ZINC000933510105""",286.363,3.112,0,20,"""CGAD"""
"""CC1(C)CN(C(=O)Nc2cc3ccccc3nn2)…","""ZINC000888615590""",342.399,2.432,0,20,"""EEAD"""
"""CC[C@H](NC(C)=O)C(=O)NCC1(NC(=…","""ZINC001432326999""",337.38,-0.4,0,20,"""EBAD"""
"""O=C(N[C@@H]1CC[C@H](F)C1)[C@H]…","""ZINC000345651486""",305.349,2.568,0,20,"""DFAD"""
"""COCC(=O)N(C)CC(=O)NCC1(Nc2nccn…","""ZINC001110378235""",348.367,-1.315,0,20,"""EAAD"""
…,…,…,…,…,…,…
"""Cc1ccn(C)c1C(=O)OCCc1cn(Cc2ccc…","""ZINC000835980608""",324.384,2.373,0,20,"""DEAD"""
"""Nc1nc(Cl)cc2ncn(-c3cccc(C(=O)N…","""ZINC001174169784""",341.802,2.892,0,20,"""EFAD"""
"""Cc1c(C(=O)N[C@H](C)Cn2cccn2)cn…","""ZINC000042501218""",309.373,2.196,0,20,"""DEAD"""
"""C[C@@H]1CN(CCS(C)(=O)=O)C[C@H]…","""ZINC001208492797""",305.444,-0.581,0,20,"""DBAD"""


In [3]:
from rdkit.Chem.Descriptors import CalcMolDescriptors

In [4]:
# Compute every RDKit descriptor for each molecule and persist the table.
# NOTE: this is the slow step of the notebook (the recorded run took about
# 21 minutes for 250,000 rows).
out_path = 'data/zinc/descriptors.csv'
# Create the output directory before the loop so a missing folder cannot make
# write_csv fail after all descriptors have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

row = []
unparsed = []  # zinc_ids whose SMILES could not be parsed
# `i` (the SMILES string) stays the loop-variable name because a later cell
# reads it after the loop; `zid` replaces the original `id`, which shadowed
# the builtin for the rest of the session.
for i, zid in tqdm(data[['smiles', 'zinc_id']].iter_rows(), total=len(data)):
    mol = Chem.MolFromSmiles(i)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; skip the molecule
        # instead of crashing inside CalcMolDescriptors.
        unparsed.append(zid)
        continue
    # Identifier columns first, then one column per descriptor.
    row.append({'zinc_id': zid, 'smiles': i, **CalcMolDescriptors(mol)})
if unparsed:
    print(f'Skipped {len(unparsed)} unparseable SMILES, e.g. {unparsed[:5]}')
zinc = pl.DataFrame(row)
zinc.write_csv(out_path)

100%|██████████| 250000/250000 [21:05<00:00, 197.48it/s]


In [21]:
# Show the installed RDKit version (the recorded run reports 2025.03.3).
import rdkit
rdkit.__version__

'2025.03.3'

In [20]:
# Number of descriptors RDKit computes for one molecule. The first SMILES in
# `data` is used so this cell does not depend on a loop variable left over
# from an earlier cell.
len(CalcMolDescriptors(Chem.MolFromSmiles(data['smiles'][0])))

217

In [17]:
# Count the descriptor columns: everything after the two leading identifier
# columns ('zinc_id', 'smiles').
len(zinc.columns[2:])

217

In [5]:
import os
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Module-level accumulators that plot_column_distribution appends to.
drop_cols = []  # columns with a single unique value
continuous_cols = []  # float columns containing non-integer values
categorical_cols = []  # remaining columns whose unique-value count is not two
binary_cols = []  # remaining columns with exactly two unique values

def is_regression_column(series: pl.Series) -> bool:
    """Return True when a float column holds at least one non-integer value.

    Args:
        series: Column to inspect.

    Returns:
        False for any dtype other than Float32/Float64. For float columns,
        True if some finite value has a fractional part, otherwise False.
    """
    if series.dtype not in [pl.Float32, pl.Float64]:
        return False
    # Compare against floor() instead of casting to Int64: the strict cast
    # fails on NaN/inf and on magnitudes outside the int64 range, while this
    # comparison works for any finite float. Nulls and non-finite values are
    # left out of the check.
    finite = series.filter(series.is_finite())
    return bool((finite != finite.floor()).any())

def plot_column_distribution(df: pl.DataFrame, save_dir: str = 'data/zinc/plots'):
    """Plot each column's distribution and sort the columns into type buckets.

    The first two columns of ``df`` are skipped. Every other column lands in
    exactly one module-level list:

    * ``drop_cols``        - a single unique value (no plot is produced)
    * ``continuous_cols``  - ``is_regression_column`` is true (histogram + KDE)
    * ``binary_cols``      - exactly two unique values (histogram)
    * ``categorical_cols`` - everything else (count plot)

    One PNG per plotted column is written to ``<save_dir>/<type>/<column>.png``
    with spaces in the column name replaced by underscores.

    Args:
        df: Table whose columns from index 2 onward are inspected.
        save_dir: Root directory for the images; created when missing.

    Returns:
        ``(continuous_cols, categorical_cols, binary_cols, drop_cols)`` - the
        module-level lists themselves.
    """
    # The buckets live at module level; empty them first so a repeated call
    # does not append every column name a second time.
    for bucket in (continuous_cols, categorical_cols, binary_cols, drop_cols):
        bucket.clear()

    os.makedirs(save_dir, exist_ok=True)
    for col in tqdm(df.columns[2:]):
        series = df[col]
        if series.n_unique() == 1:
            # Constant column: nothing to plot.
            drop_cols.append(col)
            continue
        values = series.to_numpy()

        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            if is_regression_column(series):
                sns.histplot(values, bins=30, kde=True, color="steelblue", ax=ax)
                ax.set_title(f"[Continuous] Histogram for '{col}'")
                continuous_cols.append(col)
                tp = 'Continuous'
            elif len(np.unique(values)) == 2:
                sns.histplot(values, bins=30, kde=False, color="lightcoral", ax=ax)
                ax.set_title(f"[Binary/Other] Histogram for '{col}'")
                binary_cols.append(col)
                tp = 'Binary'
            else:
                sns.countplot(x=values, palette="Set2", ax=ax)
                ax.set_title(f"[Categorical] Countplot for '{col}'")
                categorical_cols.append(col)
                tp = 'Categorical'

            ax.set_xlabel(col)
            ax.set_ylabel("Count")
            fig.tight_layout()

            # Images are grouped in one sub-directory per detected type.
            type_dir = os.path.join(save_dir, tp)
            os.makedirs(type_dir, exist_ok=True)
            fig.savefig(os.path.join(type_dir, f"{col.replace(' ', '_')}.png"))
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)

    # columns
    print(f'Continuous ({len(continuous_cols)}):', continuous_cols)
    print(f'Categorical ({len(categorical_cols)}):', categorical_cols)
    print(f'Binary ({len(binary_cols)}):', binary_cols)
    print(f'Drop ({len(drop_cols)}):', drop_cols)
    return continuous_cols, categorical_cols, binary_cols, drop_cols

In [6]:
# Plot the columns of `zinc` (all but the first two) and bind the four column
# lists to what the function returns.
continuous_cols, categorical_cols, binary_cols, drop_cols = plot_column_distribution(zinc)

100%|██████████| 217/217 [02:29<00:00,  1.45it/s]

Continuous (105): ['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2'




In [47]:
# Hand-written list of descriptor columns to exclude, grouped by family.
# NOTE(review): the name suggests these were picked by eyeballing the saved
# plots - confirm.
eye_drops = [
    # BCUT2D
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',
    'BCUT2D_MWHI',
    # EState_VSA
    'EState_VSA9',
    'EState_VSA11',
    # PEOE_VSA
    'PEOE_VSA3',
    'PEOE_VSA4',
    'PEOE_VSA5',
    'PEOE_VSA11',
    'PEOE_VSA12',
    'PEOE_VSA13',
    'PEOE_VSA14',
    # SlogP_VSA
    'SlogP_VSA7',
    'SlogP_VSA8',
    'SlogP_VSA10',
    'SlogP_VSA11',
    'SlogP_VSA12',
    # SMR_VSA
    'SMR_VSA2',
    'SMR_VSA4',
    'SMR_VSA9',
    'SMR_VSA10',
    # VSA_EState
    'VSA_EState4',
    'VSA_EState5',
    'VSA_EState9',
    'VSA_EState10',
    # Count-style descriptors
    'NumAromaticCarbocycles',
    'NumBridgeheadAtoms',
    'NumSpiroAtoms',
    'NumUnspecifiedAtomStereoCenters',
]
len(eye_drops)

29

In [48]:
# Size of each column bucket: (continuous, categorical, binary, dropped).
tuple(len(bucket) for bucket in (continuous_cols, categorical_cols, binary_cols, drop_cols))

(105, 94, 10, 8)

In [49]:
# Continuous columns that are not on the exclusion list.
regression_float = [name for name in continuous_cols if name not in eye_drops]

# Non-excluded categorical columns whose name contains 'Num' or 'Count'.
regression_int = [
    name
    for name in categorical_cols
    if name not in eye_drops and ('Num' in name or 'Count' in name)
]

# Every categorical column that is neither excluded nor an integer target.
classification_col = [
    name
    for name in categorical_cols
    if not (name in regression_int or name in eye_drops)
]
len(regression_int)

20

In [50]:
# Keep only classification targets that are not dominated by zeros: a column
# is discarded when more than MAX_ZERO_FRACTION of its values equal 0.
MAX_ZERO_FRACTION = 0.7

classification_col_filtered = []
for c in classification_col:
    cz = zinc[c]
    # Count the zeros inside polars instead of converting the whole column to
    # a Python list for every candidate column.
    zeros = (cz == 0).sum()
    if zeros / len(cz) <= MAX_ZERO_FRACTION:
        classification_col_filtered.append(c)
len(classification_col_filtered), classification_col_filtered

(10,
 ['fr_Ar_N',
  'fr_C_O',
  'fr_C_O_noCOO',
  'fr_NH0',
  'fr_NH1',
  'fr_Ndealkylation2',
  'fr_amide',
  'fr_aryl_methyl',
  'fr_benzene',
  'fr_ether'])

In [51]:
# Number of targets per group: (float regression, integer regression, classification).
tuple(map(len, (regression_float, regression_int, classification_col_filtered)))

(80, 20, 10)

In [52]:
# Bundle the three target groups into one mapping keyed by task type.
targets = dict(
    Regression_float=regression_float,
    Regression_int=regression_int,
    Classification=classification_col_filtered,
)

In [53]:
import pickle

# Persist the target-column mapping to disk as a pickle file.
with open('data/zinc/target_cols.pkl', 'wb') as out:
    pickle.dump(targets, out)

In [54]:
import pickle

# Read the mapping back from disk and display it as a round-trip check.
# NOTE: unpickling can execute arbitrary code; do not point this at a file
# from an untrusted source.
with open('data/zinc/target_cols.pkl', 'rb') as src:
    target_cols = pickle.loads(src.read())

target_cols

{'Regression_float': ['MaxAbsEStateIndex',
  'MaxEStateIndex',
  'MinAbsEStateIndex',
  'MinEStateIndex',
  'qed',
  'SPS',
  'MolWt',
  'HeavyAtomMolWt',
  'ExactMolWt',
  'MaxPartialCharge',
  'MinPartialCharge',
  'MaxAbsPartialCharge',
  'MinAbsPartialCharge',
  'FpDensityMorgan1',
  'FpDensityMorgan2',
  'FpDensityMorgan3',
  'BCUT2D_MWLOW',
  'BCUT2D_CHGHI',
  'BCUT2D_CHGLO',
  'BCUT2D_LOGPHI',
  'BCUT2D_LOGPLOW',
  'AvgIpc',
  'BalabanJ',
  'BertzCT',
  'Chi0',
  'Chi0n',
  'Chi0v',
  'Chi1',
  'Chi1n',
  'Chi1v',
  'Chi2n',
  'Chi2v',
  'Chi3n',
  'Chi3v',
  'Chi4n',
  'Chi4v',
  'HallKierAlpha',
  'Ipc',
  'Kappa1',
  'Kappa2',
  'Kappa3',
  'LabuteASA',
  'PEOE_VSA1',
  'PEOE_VSA10',
  'PEOE_VSA2',
  'PEOE_VSA6',
  'PEOE_VSA7',
  'PEOE_VSA8',
  'PEOE_VSA9',
  'SMR_VSA1',
  'SMR_VSA3',
  'SMR_VSA5',
  'SMR_VSA6',
  'SMR_VSA7',
  'SlogP_VSA1',
  'SlogP_VSA2',
  'SlogP_VSA3',
  'SlogP_VSA4',
  'SlogP_VSA5',
  'SlogP_VSA6',
  'TPSA',
  'EState_VSA1',
  'EState_VSA10',
  'EState_V