## Import

In [39]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
%reload_ext autoreload

In [41]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install -r requirements.txt
!pip install deepchem
!pip install rdkit
!pip install pycm
!pip install pytorch-lightning wandb rdkit ogb
!pip install torch_geometric
!pip install optuna
!pip install seaborn

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import deepchem as dc
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import Draw, rdFingerprintGenerator, AllChem
from rdkit.Chem.Descriptors import MolWt, TPSA, NumHDonors, NumHAcceptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

In [43]:
df=pd.read_csv('/Users/suki/Chemoinformatics_project/data/raw/sider.csv')

In [44]:
df.head()

Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,C(CNCCNCCNCCN)N,1,1,0,0,1,1,1,0,0,...,0,0,1,1,0,0,1,1,1,0
1,CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...,0,1,0,0,1,1,1,0,0,...,0,1,1,0,0,0,1,0,1,0
2,CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...,0,1,0,1,1,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
3,CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34,1,1,0,1,1,1,1,0,1,...,1,1,1,1,1,1,0,0,1,1
4,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,1,1,0,1,1,1,1,0,1,...,0,1,1,1,0,0,1,0,1,0


## Pre-processing

In [None]:
from src.sider_preprocessing import sider_preprocessing

df_cleaned = sider_preprocessing(df)

## Featurizer

In [46]:
from src.sider_featurizer import featurizer

#use rdkit_all=True to get all rdkit descriptors, false to get 7 of listed in the function

feature_dfs = featurizer(df=df_cleaned, methods=None, mol_col='Molecule',smiles='canonical_smiles', rdkit_all=True,fpSize=2048)



df with RDKit features shape: (1427, 234)




df with MACCS features shape: (1427, 196)
df with Morgan Fingerprint features shape: (1427, 2078)


In [20]:
feature_dfs.keys()

dict_keys(['rdkit', 'maccs', 'MorganFP'])

In [21]:
feature_dfs['rdkit'].head()

Unnamed: 0,canonical_smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,RDKit_207,RDKit_208,RDKit_209,RDKit_210,RDKit_211,RDKit_212,RDKit_213,RDKit_214,RDKit_215,RDKit_216
0,NCCNCCNCCNCCN,1,1,0,0,1,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248876
1,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c...,0,1,0,0,1,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.568603
2,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H...,0,1,0,1,1,0,1,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.54096
3,C#C[C@]1(O)CCC2C3CCC4=CC(=O)CCC4C3C(=C)CC21CC,1,1,0,1,1,1,1,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.583491
4,NC(=O)N1c2ccccc2CC(O)c2ccccc21,1,1,0,1,1,1,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.758119


In [56]:
concatenated_features_df = featurizer(df=df_cleaned, methods=None, mol_col='Molecule',smiles='canonical_smiles', rdkit_all=True,concatenate=True,fpSize=2048)



df with RDKit features shape: (1427, 234)




df with MACCS features shape: (1427, 196)
df with Morgan Fingerprint features shape: (1427, 2078)
Feature engineering complete. Final DataFrame shape: (1427, 2450)
Total features added: 217 rdkit+ 167 maccs + 2048 Morgan


In [57]:
concatenated_features_df.isnull().sum().sum()

np.int64(0)

In [58]:
concatenated_features_df.loc[concatenated_features_df.isna().any(axis=1)]

Unnamed: 0,canonical_smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,Morgan_2038,Morgan_2039,Morgan_2040,Morgan_2041,Morgan_2042,Morgan_2043,Morgan_2044,Morgan_2045,Morgan_2046,Morgan_2047


In [59]:
concatenated_features_df.iloc[:,:29]

Unnamed: 0,canonical_smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications",Molecule
0,NCCNCCNCCNCCN,1,1,0,0,1,1,1,0,0,...,0,1,1,0,0,1,1,1,0,<rdkit.Chem.rdchem.Mol object at 0x3937b84a0>
1,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c...,0,1,0,0,1,1,1,0,0,...,1,1,0,0,0,1,0,1,0,<rdkit.Chem.rdchem.Mol object at 0x3937a3ed0>
2,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H...,0,1,0,1,1,0,1,0,1,...,0,0,1,0,0,0,0,1,0,<rdkit.Chem.rdchem.Mol object at 0x3937a38b0>
3,C#C[C@]1(O)CCC2C3CCC4=CC(=O)CCC4C3C(=C)CC21CC,1,1,0,1,1,1,1,0,1,...,1,1,1,1,1,0,0,1,1,<rdkit.Chem.rdchem.Mol object at 0x3937d2ea0>
4,NC(=O)N1c2ccccc2CC(O)c2ccccc21,1,1,0,1,1,1,1,0,1,...,1,1,1,0,0,1,0,1,0,<rdkit.Chem.rdchem.Mol object at 0x3937d35a0>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422,C[C@H]1CN(C[C@H](Cc2ccccc2)C(=O)NCC(=O)O)CC[C@...,0,1,0,0,0,1,1,0,0,...,0,0,0,1,0,0,0,0,0,<rdkit.Chem.rdchem.Mol object at 0x393801f50>
1423,CC[C@H]1OC(=O)[C@H](C)C(=O)[C@H](C)[C@@H](O[C@...,1,1,0,1,1,1,1,0,1,...,1,1,1,1,0,1,1,1,1,<rdkit.Chem.rdchem.Mol object at 0x393801fc0>
1424,CCOc1ccc(Cc2cc([C@@H]3O[C@H](CO)[C@@H](O)[C@H]...,1,1,0,0,1,1,1,0,1,...,1,0,0,1,0,0,1,1,1,<rdkit.Chem.rdchem.Mol object at 0x393802030>
1425,O=c1[nH]c2ccccc2n1C1CCN(CCCC(c2ccc(F)cc2)c2ccc...,0,1,0,1,1,1,1,0,0,...,0,0,1,1,0,0,1,1,1,<rdkit.Chem.rdchem.Mol object at 0x3938020a0>


In [60]:
concatenated_features_df.columns.get_loc('Molecule')

28

In [61]:
concatenated_features_df.isna().sum().sum()

np.int64(0)

## Models

In [30]:
from src import sider_baseline_models
results_dict = {}
for feat_name, df_feat in feature_dfs.items():

    X = df_feat.iloc[:, 29:].copy()
    y = df_feat.iloc[:, 1:28]

    X = X.select_dtypes(include=np.number)
    print(X.dtypes)

    # Apply VarianceThreshold to remove zero-variance features
    """selector = VarianceThreshold(threshold=0.0)
    X_cleaned_array = selector.fit_transform(X)
    X = pd.DataFrame(X_cleaned_array, columns=X.columns[selector.get_support()])"""

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    #results_df, trained_models = sider_baseline_models.train_and_evaluate_models(X_train, X_test, y_train, y_test, feat_name)
    results_dict[feat_name]=sider_baseline_models.train_and_evaluate_models(X_train, X_test, y_train, y_test, feat_name)



RDKit_0      float64
RDKit_9      float64
RDKit_10     float64
RDKit_11     float64
RDKit_12     float64
              ...   
RDKit_212    float64
RDKit_213    float64
RDKit_214    float64
RDKit_215    float64
RDKit_216    float64
Length: 205, dtype: object
Applied StandardScaler (continuous features detected).

--- Hyperparameter search: Random Forest... ---
RF - Best Hyperparameters: {'estimator__max_depth': 10, 'estimator__n_estimators': 100}

--- Hyperparameter search: XGBoost ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


--- Hyperparameter search: Logistic Regression ---

--- Hyperparameter search: SVM ---


--- EVALUATION AND RESULTS COLLECTION ---

                     MODEL PERFORMANCE SUMMARY
                     Features: rdkit
                    Features Macro AUC-ROC                                               Hyperparameters
Random Forest          rdkit        0.6331  {'estimator__max_depth': 10, 'estimator__n_estimators': 100}
XGBoost                rdkit        0.6213                                                      Defaults
Logistic Regression    rdkit        0.6121                             C=1.0 (Default), Solver=liblinear
SVM                    rdkit        0.6107                                Kernel=linear, C=1.0 (Default)

--- TOP 5 LABELS ---
                                   Label  Random Forest  Logistic Regression    SVM  XGBoost
Reproductive system and breast disorders         0.7242               0.7016 0.6795   0.6942
                       Cardiac disorders         0

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


--- Hyperparameter search: Logistic Regression ---

--- Hyperparameter search: SVM ---


--- EVALUATION AND RESULTS COLLECTION ---

                     MODEL PERFORMANCE SUMMARY
                     Features: maccs
                    Features Macro AUC-ROC                                               Hyperparameters
Random Forest          maccs        0.6396  {'estimator__max_depth': 10, 'estimator__n_estimators': 300}
XGBoost                maccs        0.6301                                                      Defaults
Logistic Regression    maccs        0.6102                             C=1.0 (Default), Solver=liblinear
SVM                    maccs        0.5979                                Kernel=linear, C=1.0 (Default)

--- TOP 5 LABELS ---
                               Label  Random Forest  Logistic Regression    SVM  XGBoost
             Hepatobiliary disorders         0.7179               0.7049 0.6898   0.6827
Blood and lymphatic system disorders         0.7191       

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


--- Hyperparameter search: Logistic Regression ---

--- Hyperparameter search: SVM ---


--- EVALUATION AND RESULTS COLLECTION ---

                     MODEL PERFORMANCE SUMMARY
                     Features: MorganFP
                     Features Macro AUC-ROC                                               Hyperparameters
XGBoost              MorganFP        0.6480                                                      Defaults
Logistic Regression  MorganFP        0.6454                             C=1.0 (Default), Solver=liblinear
Random Forest        MorganFP        0.6436  {'estimator__max_depth': 25, 'estimator__n_estimators': 200}
SVM                  MorganFP        0.6238                                Kernel=linear, C=1.0 (Default)

--- TOP 5 LABELS ---
                                   Label  Random Forest  Logistic Regression    SVM  XGBoost
                       Cardiac disorders         0.7469               0.7313 0.7283   0.7183
Reproductive system and breast disorders  

In [36]:
from src.sider_baseline_models import cross_validate_multilabel_models

In [47]:
from sklearn.feature_selection import VarianceThreshold

#running 10 flod cross-validation for all feature sets

cv_results_dict = {}

for feat_name, df_feat in feature_dfs.items():
    print("\n" + "="*100)
    print(f"Running 10-fold cross-validation for feature set: {feat_name}")
    print("="*100)

    # --- Prepare feature matrix (X) and multi-label target (y) ---
    X = df_feat.iloc[:, 29:].copy()
    y = df_feat.iloc[:, 1:28]

    # Keep only numeric columns
    X = X.select_dtypes(include=np.number)
    print(f"\nNumeric feature columns detected: {X.shape[1]}")

    # Optional: remove zero-variance features
    selector = VarianceThreshold(threshold=0.0)
    X_cleaned_array = selector.fit_transform(X)
    X = pd.DataFrame(X_cleaned_array, columns=X.columns[selector.get_support()])

    print(f"After VarianceThreshold: {X.shape[1]} features remain.")

    # --- Run 10-Fold CV pipeline ---
    results = cross_validate_multilabel_models(X, y, feat=feat_name, verbose=True, n_splits=10)
    cv_results_dict[feat_name] = results

print("\n\n==================== ALL FEATURE SETS COMPLETED ====================")



Running 10-fold cross-validation for feature set: rdkit

Numeric feature columns detected: 205
After VarianceThreshold: 199 features remain.

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

                     MODEL PERFORMANCE SUMMARY
                     Features: rdkit
                    Features  Macro AUC-ROC
Random Forest          rdkit         0.6689
XGBoost                rdkit         0.6524
Logistic Regression    rdkit         0.6375
SVM                    rdkit         

In [48]:
cv_results_dict.keys()

dict_keys(['rdkit', 'maccs', 'MorganFP'])

## Run classifiers combining all features

In [62]:
from src import sider_baseline_models

from rdkit import DataStructs


def bitvect_to_numpy(X):
    """
    Convert a DataFrame/Series of RDKit ExplicitBitVect fingerprints
    to a numeric DataFrame.
    """
    if isinstance(X, pd.Series):
        X = X.to_frame()

    numeric_arrays = []
    for col in X.columns:
        # convert each column (assume contains ExplicitBitVect)
        numeric_arrays.append(np.array([DataStructs.ConvertToNumpyArray(fp, np.zeros((fp.GetNumBits(),), dtype=int)) or np.zeros(fp.GetNumBits()) for fp in X[col]]))
    
    # If multiple columns, concatenate horizontally
    X_numeric = np.hstack(numeric_arrays)
    return pd.DataFrame(X_numeric)

In [71]:
feat='all'
X_all_features = concatenated_features_df.iloc[:, 29:].copy()
y_all_features  = concatenated_features_df.iloc[:, 1:28].copy()

X_all_features=X_all_features.select_dtypes(include=np.number)
all_features_results_dict = {}
X_train_all_features, X_test_all_features, y_train_all_features, y_test_all_features = train_test_split(X_all_features, y_all_features, test_size=0.2, random_state=42)

#results_df, trained_models = sider_baseline_models.train_and_evaluate_models(X_train, X_test, y_train, y_test, feat_name)
all_features_results_dict[feat]=sider_baseline_models.train_and_evaluate_models(X_train_all_features, X_test_all_features, y_train_all_features, y_test_all_features, feat='all')


Applied StandardScaler (continuous features detected).

--- Hyperparameter search: Random Forest... ---
RF - Best Hyperparameters: {'estimator__max_depth': 20, 'estimator__n_estimators': 300}

--- Hyperparameter search: XGBoost ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


--- Hyperparameter search: Logistic Regression ---

--- Hyperparameter search: SVM ---


--- EVALUATION AND RESULTS COLLECTION ---

                     MODEL PERFORMANCE SUMMARY
                     Features: all
                    Features Macro AUC-ROC                                               Hyperparameters
Random Forest            all        0.6450  {'estimator__max_depth': 20, 'estimator__n_estimators': 300}
XGBoost                  all        0.6230                                                      Defaults
Logistic Regression      all        0.5944                             C=1.0 (Default), Solver=liblinear
SVM                      all        0.5816                                Kernel=linear, C=1.0 (Default)

--- TOP 5 LABELS ---
                                                              Label  Random Forest  Logistic Regression    SVM  XGBoost
                                                  Cardiac disorders         0.7656               0.7056 0.6606   0.651

In [72]:
all_features_results_dict

{'all': {'Features': 'all',
  'Random Forest': MultiOutputClassifier(estimator=RandomForestClassifier(max_depth=20,
                                                         n_estimators=300,
                                                         random_state=0),
                        n_jobs=-1),
  'XGBoost': MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                                callbacks=None,
                                                colsample_bylevel=None,
                                                colsample_bynode=None,
                                                colsample_bytree=None,
                                                device=None,
                                                early_stopping_rounds=None,
                                                enable_categorical=False,
                                                eval_metric='logloss',
                                                f

In [73]:
import joblib
import os

#pick XGboost model from trained models as it's the best performer
trained_model= all_features_results_dict['all']['Random Forest']

# Specify save path
save_dir = "saved_models/"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "all_features_rf_max_depth_20_n_estimators_300.joblib")


joblib.dump(trained_model, save_path)
print(f"Radom Forest model saved to: {save_path}")


Radom Forest model saved to: saved_models/all_features_rf_max_depth_20_n_estimators_300.joblib


In [74]:
print("\n" + "="*100)
print(f"Running 10-fold cross-validation for combined features set: {feat}")
print("="*100)
# --- Run 10-Fold CV pipeline ---
results = cross_validate_multilabel_models(X_all_features, y_all_features, feat=feat, verbose=True, n_splits=10)

print("\n==================== FEATURE SET COMPLETED ====================")


Running 10-fold cross-validation for combined features set: all

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

Applied StandardScaler (continuous features detected).

                     MODEL PERFORMANCE SUMMARY
                     Features: all
                    Features  Macro AUC-ROC
Random Forest            all         0.6914
XGBoost                  all         0.6607
SVM                      all         0.6208
Logistic Regression      all         0.6207

--- TOP 5 LABELS ---
                                                  

In [77]:
results['TrainedModels']

{'RF': MultiOutputClassifier(estimator=RandomForestClassifier(max_depth=15,
                                                        n_estimators=200,
                                                        n_jobs=-1,
                                                        random_state=0)),
 'XGB': MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                               callbacks=None,
                                               colsample_bylevel=None,
                                               colsample_bynode=None,
                                               colsample_bytree=None,
                                               device=None,
                                               early_stopping_rounds=None,
                                               enable_categorical=False,
                                               eval_metric='logloss',
                                               feature_types=None,
     

In [78]:
trained_model_cv=results['TrainedModels']['RF']
# Specify save path
save_dir = "saved_models/"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "all_features_cv_random_foresr.joblib")


joblib.dump(trained_model_cv, save_path)
print(f"Cross_alidated Radom Forest model saved to: {save_path}")


Cross_alidated Radom Forest model saved to: saved_models/all_features_cv_random_foresr.joblib


##SVM Optimization

In [None]:
from src.sider_svm_optimization import optimize_svm

scaler=StandardScaler()
X_train_all_scaled=scaler.fit_transform(X_train_all_features)
X_test_all_scaled=scaler.transform(X_test_all_features)

# RandomizedSearchCV on non RBF SVM
best_svm, test_macro_auc, test_detailed = optimize_svm(X_train_all_scaled, y_train_all_features, X_test_all_scaled, y_test_all_features)