In [1]:
from scipy.io import arff
import pandas as pd

# Parse the ARFF file at the relative path 'dataset'; loadarff returns the
# parsed records together with the header metadata.
data, meta = arff.loadarff('dataset')
df = pd.DataFrame(data)
# Print only the first row as a quick look at the column layout.
print(df.head(1))

   FCFP6_1024_0  FCFP6_1024_1  FCFP6_1024_2  FCFP6_1024_3  FCFP6_1024_4  \
0           1.0           1.0           0.0           1.0           0.0   

   FCFP6_1024_5  FCFP6_1024_6  FCFP6_1024_7  FCFP6_1024_8  FCFP6_1024_9  ...  \
0           0.0           0.0           0.0           1.0           0.0  ...   

   FCFP6_1024_1015  FCFP6_1024_1016  FCFP6_1024_1017  FCFP6_1024_1018  \
0              0.0              0.0              1.0              0.0   

   FCFP6_1024_1019  FCFP6_1024_1020  FCFP6_1024_1021  FCFP6_1024_1022  \
0              0.0              0.0              0.0              0.0   

   FCFP6_1024_1023  MEDIAN_PXC50  
0              0.0         5.699  

[1 rows x 1025 columns]


In [2]:
# Summarize the already-loaded frame `df` (created by the ARFF loading cell):
# shape, dtypes, fingerprint sanity checks, missing values, sparsity, and a
# preview of the ARFF header.
# NOTE: df.shape[1] counts every column of the frame, so "features" below
# includes any non-fingerprint column as well (e.g. MEDIAN_PXC50, presumably
# the regression target - confirm).
print("Number of features:", df.shape[1])
print("Number of samples:", df.shape[0], "\n")

# Check for columns with unexpected dtypes
non_float_columns = {col: dtype for col, dtype in df.dtypes.items() if dtype != 'float64'}

if not non_float_columns:
    print("✅ All columns are of type float64.", "\n")
else:
    print(f"⚠️ Found {len(non_float_columns)} columns not of type float64:")
    for col, dtype in non_float_columns.items():
        print(f"{col}: {dtype}")

# Fingerprint bit columns. Selected once and reused by both the binary check
# and the sparsity statistics further down.
fcfp_columns = [col for col in df.columns if col.startswith('FCFP6_1024_')]
fcfp_data = df[fcfp_columns]

# Check for non-binary values in each column (NaN also counts as non-binary,
# because it is not a member of {0.0, 1.0}).
non_binary = {}
for col in fcfp_columns:
    unique_vals = df[col].unique()
    if not set(unique_vals).issubset({0.0, 1.0}):
        non_binary[col] = unique_vals

# Report
if not non_binary:
    print("✅ All FCFP features are binary (0/1).\n")
else:
    print(f"⚠️ Found non-binary values in {len(non_binary)} columns.\n")
    for col, vals in list(non_binary.items())[:5]:  # Show only a few as example
        print(f"{col}: {vals[:5]}")

# Missing value report (only columns with missing values)
missing = df.isnull().sum()
missing = missing[missing > 0]

if missing.empty:
    print("✅ No missing values found in any column.\n")
else:
    print(f"\n⚠️ Found missing values in {len(missing)} columns:\n")
    print(missing)
    print()

# Count active bits (1s) per row
active_bits_per_row = fcfp_data.sum(axis=1)

# Compute average and sparsity (share of fingerprint bits that are 0 on average)
avg_active_bits = active_bits_per_row.mean()
sparsity_percent = 100 * (1 - avg_active_bits / len(fcfp_columns))

# Report
print(f"Average active bits per molecule: {avg_active_bits:.2f}")
print(f"Average sparsity: {sparsity_percent:.2f}%", "\n")


# Read ARFF header: echo every line up to and including the @data marker.
# An explicit encoding keeps the preview independent of the platform default;
# undecodable bytes are replaced rather than aborting a purely informational dump.
print("\nARFF Header Preview:")
with open("dataset", "r", encoding="utf-8", errors="replace") as f:
    for line in f:
        stripped = line.strip()
        print(stripped)
        # Test the stripped line so an indented "@DATA" marker still ends the preview.
        if stripped.lower().startswith("@data"):
            break  # Stop at data section

Number of features: 1025
Number of samples: 5742 

✅ All columns are of type float64. 

✅ All FCFP features are binary (0/1).

✅ No missing values found in any column.

Average active bits per molecule: 64.91
Average sparsity: 93.66% 


ARFF Header Preview:
%
% This dataset was curated for [TabArena](https://tabarena.ai/) by the TabArena team
% as part of the [TabArena Tabular ML IID Study](https://tabarena.ai/data-tabular-ml-iid-study).
% For more details on the study, see our [paper](https://tabarena.ai/paper-tabular-ml-iid-study).
%
% **Dataset Focus**: This dataset shall be used for evaluating predictive machine
% learning models for independent and identically distributed tabular data. The
% intended task is regression.
%
% ---
% #### Dataset Metadata
% - **Licence:** Public
% - **Original Data Source:** https://www.openml.org/search?type=data&sort=runs&status=active&id=3050
% - **Reference (please cite)**: Olier, Ivan, et al. 'Meta-QSAR: a large-scale application of meta-learning

In [1]:
import openml

def load_dataset(dataset_id_or_name):
    """Fetch an OpenML task together with the dataset it is defined on.

    Args:
        dataset_id_or_name: Either a value accepted by ``int()`` or a dataset
            name. NOTE: despite the parameter name, a numeric value is handed
            to ``openml.tasks.get_task`` and is therefore interpreted as a
            *task* ID. Any other string is compared for exact equality with
            the ``name`` column of the OpenML dataset listing; the first
            listed task of the first matching dataset is then used.

    Returns:
        A ``(task, dataset)`` tuple, where ``dataset`` is ``task.get_dataset()``.

    Raises:
        ValueError: If no dataset carries the given name, or the matched
            dataset has no tasks. Errors raised by ``get_task`` itself are
            propagated unchanged.
    """
    # Only the int() conversion decides between the two lookup paths. Keeping
    # get_task outside this try prevents a ValueError raised during task
    # retrieval from being mistaken for "input is a dataset name".
    try:
        task_id = int(dataset_id_or_name)
    except ValueError:
        task_id = None

    if task_id is None:
        dataset_list = openml.datasets.list_datasets(output_format="dataframe")
        matching = dataset_list[dataset_list['name'] == dataset_id_or_name]
        if matching.empty:
            raise ValueError(f"No OpenML dataset found with name '{dataset_id_or_name}'")
        # Cast numpy integers coming out of the listing frame to plain ints.
        dataset_id = int(matching.iloc[0]['did'])
        # `data_id` is the documented filter for restricting tasks to one dataset.
        task_list = openml.tasks.list_tasks(output_format="dataframe", data_id=dataset_id)
        if task_list.empty:
            raise ValueError(f"No tasks found for dataset '{dataset_id_or_name}'")
        task_id = int(task_list.iloc[0]['tid'])  # first matching task

    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()
    return task, dataset


In [2]:
# load_dataset() converts numeric input with int() and passes it to
# openml.tasks.get_task, so 363677 is used as a task ID here.
t, d = load_dataset(363677)

In [3]:
t

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 363677
Task URL.............: https://www.openml.org/t/363677
Estimation Procedure.: crossvalidation
Target Feature.......: CompoundActivity
# of Classes.........: 3
Cost Matrix..........: Available

In [6]:
X, y = t.get_X_and_y(dataset_format="dataframe")


In [7]:
X

Unnamed: 0,molecule_structure_property_1,molecule_structure_property_2,molecule_structure_property_3,molecule_structure_property_4,molecule_structure_property_5,molecule_structure_property_6,molecule_structure_property_7,molecule_structure_property_8,molecule_structure_property_9,molecule_structure_property_10,...,molecule_structure_property_1608,molecule_structure_property_1609,molecule_structure_property_1610,molecule_structure_property_1611,molecule_structure_property_1612,molecule_structure_property_1613,molecule_structure_property_1614,molecule_structure_property_1615,molecule_structure_property_1616,molecule_structure_property_1617
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3841,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3842,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3843,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
y

0       CM
1       CI
2       CI
3       CI
4       CI
        ..
3840    CI
3841    CI
3842    CI
3843    CI
3844    CI
Name: CompoundActivity, Length: 3845, dtype: category
Categories (3, object): ['CA', 'CI', 'CM']

In [9]:
def summarize_column_dtypes(X, logger=None, name="X"):
    """Count how many columns of ``X`` have each dtype and optionally log a report.

    Args:
        X: A pandas DataFrame, or anything ``pd.DataFrame(...)`` accepts.
        logger: Optional object with an ``info(str)`` method. When given (and
            truthy), the per-dtype counts are logged, followed by either one
            line per non-numeric column (dtype plus up to five distinct
            non-null sample values) or a confirmation that all columns are
            numeric. When omitted, nothing is logged.
        name: Label used for ``X`` in the log messages.

    Returns:
        dict mapping each dtype found in ``X`` to its number of columns.
    """
    import pandas as pd

    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    dtype_counts = X.dtypes.value_counts().to_dict()

    if logger:
        logger.info(f"--- Column dtype summary for {name} ---")
        for dtype, count in dtype_counts.items():
            logger.info(f"{dtype}: {count} columns")

        non_numeric = X.select_dtypes(exclude=["number"])
        # Inspect the column axis rather than `.empty`: `.empty` is also True
        # for a frame with zero rows, which would hide non-numeric columns.
        if len(non_numeric.columns) > 0:
            logger.info(f"\n⚠️ Non-numeric columns in {name}:")
            for col in non_numeric.columns:
                sample_values = X[col].dropna().unique()[:5]
                logger.info(f"  - {col} (dtype: {X[col].dtype}, sample values: {sample_values})")
        else:
            logger.info(f"✅ All columns in {name} are numeric.")

    return dtype_counts


In [10]:
summarize_column_dtypes(X)

{CategoricalDtype(categories=['0', '1'], ordered=False, categories_dtype=object): 1617}

In [14]:
t1, d1 = load_dataset(363620)

In [15]:
X1, y1 = t1.get_X_and_y(dataset_format="dataframe")


In [16]:
summarize_column_dtypes(X1)

{dtype('float64'): 942, dtype('uint8'): 834}

In [13]:
summarize_column_dtypes(X1)

{dtype('uint8'): 1024}

In [10]:
from autogluon.tabular.models.catboost.catboost_model import CatBoostModel
import pandas as pd

# Fit a default-configured CatBoostModel. X_train / y_train are not defined in
# this cell and must already exist in the kernel.
model = CatBoostModel()
model.fit(X=pd.DataFrame(X_train), y=pd.Series(y_train))
# NOTE: this prints only the *name* of the model's eval metric; no validation
# score is computed or shown here.
print(f"Validation {model.eval_metric.name}")

  from pkg_resources import parse_version  # pylint: disable=import-outside-toplevel
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


Validation root_mean_squared_error


In [3]:
from autogluon.tabular.models.tabpfn.tabpfn_model import TabPFNModel

import pandas as pd

# Fit a default-configured TabPFNModel. X_train / y_train are not defined in
# this cell and must already exist in the kernel.
model = TabPFNModel()
model.fit(X=pd.DataFrame(X_train), y=pd.Series(y_train))
# NOTE: this prints only the *name* of the model's eval metric; no validation
# score is computed or shown here.
print(f"Validation {model.eval_metric.name}")

We have to download the TabPFN, as there is no checkpoint at  /home/matusd/.conda/envs/tabpfn/lib/python3.11/site-packages/tabpfn/models_diff/prior_diff_real_checkpoint_n_0_epoch_42.cpkt
It has about 100MB, so this might take a moment.
Validation accuracy




In [5]:
from sklearn.metrics import accuracy_score, root_mean_squared_error

def score_for_task_type(
    y_test: pd.Series, y_pred: pd.Series | pd.DataFrame, *, task_type: str
) -> float:
    """Score (higher is better) the predictions for a given task type."""
    if task_type in ["binary", "multiclass"]:
        score = accuracy_score(y_test, y_pred)
        print("Accuracy:", score)
    elif task_type == "regression":
        score = -root_mean_squared_error(y_test, y_pred)
        print("Negative RMSE:", score)
    else:
        raise ValueError("Invalid task type")

    return score

In [11]:
import numpy as np
import pandas as pd

# Small synthetic data set (uniform values in [0, 1)) for smoke-testing models:
# 55 training rows and 20 test rows with 4 features each.
# A seeded generator makes the cell reproducible across kernel restarts.
rng = np.random.default_rng(42)
X_train = pd.DataFrame(rng.random((55, 4)))
y_train = pd.Series(rng.random(55))
X_test = pd.DataFrame(rng.random((20, 4)))
# Targets are Series for both splits, matching y_train and the
# `y_test: pd.Series` annotation of score_for_task_type.
y_test = pd.Series(rng.random(20))


In [None]:
from external.tabrepo.tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_model import TabPFNV2Model
from sklearn.metrics import roc_auc_score

# Fit TabPFNv2 configured for a binary problem on whatever X_train / y_train
# currently live in the kernel (they are not defined in this cell).
clf = TabPFNV2Model(problem_type="binary")
clf.fit(X=X_train, y=y_train)

# Predict and score
# NOTE(review): the raw predict_proba output is passed straight to
# roc_auc_score - presumably it holds positive-class scores for a binary
# y_test; confirm against the data actually loaded when this cell runs.
prediction_probabilities = clf.predict_proba(X=X_test)
print("ROC AUC:", roc_auc_score(y_test, prediction_probabilities))



ROC AUC: 0.5660760210006783


In [None]:
from external.tabrepo.tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_model import TabPFNV2Model
from sklearn.metrics import roc_auc_score, log_loss

# Fit TabPFNv2 configured for a multiclass problem on whatever X_train / y_train
# currently live in the kernel (they are not defined in this cell).
clf = TabPFNV2Model(problem_type="multiclass")
clf.fit(X=X_train, y=y_train)

# Predict and score
prediction_probabilities = clf.predict_proba(X=X_test)
# The value printed is a log loss (lower is better), so label it as such
# instead of the copy-pasted "ROC AUC". `labels` fixes the class order to 0, 1, 2.
print("Log loss:", log_loss(y_test, prediction_probabilities, labels=[0,1,2]))

In [5]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
from external.tabrepo.tabrepo.benchmark.models.ag.tabpfnv2.tabpfnv2_model import TabPFNV2Model
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error

# Fit TabPFNv2 as a regressor on the X_train / y_train currently in the kernel,
# on the torch `device` selected in an earlier cell.
clf = TabPFNV2Model(path="models", name="TabPFNv2", problem_type="regression", eval_metric="root_mean_squared_error")
clf.fit(X=X_train, y=y_train, device=device)

# Predict and score
# The variable keeps its historical name because a later cell displays it; for
# this regression model it holds the output of predict_proba, not probabilities
# in any classification sense.
prediction_probabilities = clf.predict_proba(X=X_test)
# The value printed is a mean squared error (not square-rooted, unlike the
# model's eval_metric), so label it as MSE instead of the copy-pasted "ROC AUC".
print("MSE:", mean_squared_error(y_test, prediction_probabilities))



ROC AUC: 0.07993240654468536


In [31]:
prediction_probabilities
y_test

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [15]:
from external.tabrepo.tabrepo.models.utils import get_configs_generator_from_name
# Model name passed to get_configs_generator_from_name.
model_to_run = "CatBoost"

# Look up the config generator for that name, then take its model class and
# the first entry of its manual configs.
model_meta = get_configs_generator_from_name(model_name=model_to_run)
model_cls = model_meta.model_cls
model_config = model_meta.manual_configs[0]

In [16]:
model_config

{}

In [17]:
model=model_cls(problem_type="regression", **model_config)
model.fit(X=X_train, y=y_train)

<autogluon.tabular.models.catboost.catboost_model.CatBoostModel at 0x7fb7b1965e90>

In [18]:
# Predict with the `model` fitted in an earlier cell and score the result.
y_pred = model.predict(X=pd.DataFrame(X_test))
task_type = "regression"

# For "regression" score_for_task_type prints and returns the negated RMSE,
# so values closer to zero are better.
score_for_task_type(y_test=y_test, y_pred=y_pred, task_type=task_type)

Negative RMSE: -0.3899732232093811


-0.3899732232093811

In [14]:
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
from sklearn.model_selection import train_test_split
def get_example_data_for_task_type(
    *, task_type: str
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Load a scikit-learn toy dataset for ``task_type`` and split it 50/50.

    "binary" uses the breast cancer data, "multiclass" the iris data and
    "regression" the diabetes data. The split is seeded (``random_state=42``).

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``task_type`` is not one of the three supported values.
    """
    # Choose the loader first; the selected dataset is loaded exactly once below.
    if task_type == "binary":
        loader = load_breast_cancer
    elif task_type == "multiclass":
        loader = load_iris
    elif task_type == "regression":
        loader = load_diabetes
    else:
        raise ValueError("Invalid task type")

    features, target = loader(return_X_y=True, as_frame=True)
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size=0.5, random_state=42
    )
    return train_X, test_X, train_y, test_y

In [31]:
"""Run a standalone TabArena model on any task."""

from __future__ import annotations

from autogluon.core.data import LabelCleaner
from autogluon.core.models import BaggedEnsembleModel
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

from external.tabrepo.tabrepo.models.utils import get_configs_generator_from_name

task_type = "binary"
"""Task type for the model to run on.
Either "binary", "multiclass", or "regression".
"""
cross_validation_bagging = True
"""If True, will use cross-validation bagging for the model.
This is the default on TabArena and recommended for most tasks.
"""
model_to_run = "CatBoost"
"""Select a model to run, which we automatically load in the code below.

Note: not all models are available for all task types.

The recommended options are:
    - "RealMLP"
    - "TabM"
    - "LightGBM"
    - "CatBoost"
    - "XGBoost"
    - "ModernNCA"
    - "TabPFNv2"
    - "TabICL"
    - "TorchMLP"
    - "TabDPT"
    - "EBM"
    - "FastaiMLP"
    - "ExtraTrees"
    - "RandomForest"
    - "KNN"
    - "Linear"

You can also import it manually from TabArena / AutoGluon, which we recommend
for practical applications, for example:
 - RealMLP: from tabrepo.benchmark.models.ag.realmlp.realmlp_model import RealMLPModel
 - Catboost: from autogluon.tabular.models.catboost.catboost_model import CatBoostModel
"""
# Look up the config generator for `model_to_run`, then take its model class
# and the first entry of its manual configs.
model_meta = get_configs_generator_from_name(model_name=model_to_run)
model_cls = model_meta.model_cls
model_config = model_meta.manual_configs[0]



In [32]:
model_config

{}

In [29]:
model_meta

<tabrepo.utils.config_utils.CustomAGConfigGenerator at 0x7f608e375d90>

In [30]:
model_config = {}

In [37]:
# Start from a fresh example split for the configured task type; the names are
# rebound below to their preprocessed versions.
X_train, X_test, y_train, y_test = get_example_data_for_task_type(task_type=task_type)

# --- Using a TabArena Model: Preprocessing, Train, and Predict:
print(f"Running TabArena model {model_to_run} on task type {task_type}...")
feature_generator, label_cleaner = (
    AutoMLPipelineFeatureGenerator(),
    LabelCleaner.construct(problem_type=task_type, y=y_train),
)
# The feature generator is fitted on the training features only; the label
# cleaner was constructed from y_train above.
X_train, y_train = (
    feature_generator.fit_transform(X_train),
    label_cleaner.transform(y_train),
)
# Test data only goes through transform(), never fit.
X_test, y_test = feature_generator.transform(X_test), label_cleaner.transform(y_test)


# model_cls / model_config come from the config lookup cell.
model = model_cls(problem_type=task_type, **model_config)
# This variant passes num_gpus=1 to fit().
model.fit(X=X_train, y=y_train, num_gpus=1)
y_pred = model.predict(X=X_test)

# score_for_task_type(y_test=y_test, y_pred=y_pred, task_type=task_type)

Running TabArena model CatBoost on task type binary...


In [None]:
# Start from a fresh example split for the configured task type; the names are
# rebound below to their preprocessed versions.
X_train, X_test, y_train, y_test = get_example_data_for_task_type(task_type=task_type)

# --- Using a TabArena Model: Preprocessing, Train, and Predict:
print(f"Running TabArena model {model_to_run} on task type {task_type}...")
feature_generator, label_cleaner = (
    AutoMLPipelineFeatureGenerator(),
    LabelCleaner.construct(problem_type=task_type, y=y_train),
)
# The feature generator is fitted on the training features only; the label
# cleaner was constructed from y_train above.
X_train, y_train = (
    feature_generator.fit_transform(X_train),
    label_cleaner.transform(y_train),
)
# Test data only goes through transform(), never fit.
X_test, y_test = feature_generator.transform(X_test), label_cleaner.transform(y_test)


# model_cls / model_config come from the config lookup cell.
model = model_cls(problem_type=task_type, **model_config)
# This variant calls fit() without any extra resource arguments.
model.fit(X=X_train, y=y_train)
y_pred = model.predict(X=X_test)

# score_for_task_type(y_test=y_test, y_pred=y_pred, task_type=task_type)

In [35]:
sc=model.score(X_test,y_test)

In [36]:
sc

0.968421052631579