In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
# Silence DeprecationWarning and RuntimeWarning for the rest of the kernel session.
# NOTE(review): the RuntimeWarning filter also hides numeric warnings such as
# divide-by-zero or "mean of empty slice" raised by later cells — confirm this is intended.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv
/kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl


In [2]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


# Data Loading and Preprocessing Pipeline

In [3]:
# === Imports ===
import pandas as pd
import numpy as np
from rdkit import Chem

# === Config ===
# Directory holding train.csv, test.csv and the train_supplement/ folder.
BASE_PATH = '/kaggle/input/neurips-open-polymer-prediction-2025/'
# Target columns, in the order used for training and for the submission columns.
TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
# Substrings that make clean_and_validate_smiles reject a SMILES string outright.
# The check is a plain substring test, so the bare 'R1'..'R5' entries already cover
# the bracketed '[R1]'..'[R5]' and '([R1])'/'([R2])' variants listed here.
BAD_PATTERNS = ['[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]',
                "[R']", '[R"]', 'R1', 'R2', 'R3', 'R4', 'R5',
                '([R])', '([R1])', '([R2])']

# === SMILES Cleaner ===
def clean_and_validate_smiles(smiles):
    """Return the canonical RDKit SMILES for ``smiles``, or ``None`` if it is unusable.

    A value is rejected (``None`` is returned) when it is not a non-empty string,
    when it contains any substring listed in ``BAD_PATTERNS``, or when RDKit
    cannot parse it.

    Args:
        smiles: Candidate SMILES value; any type is accepted.

    Returns:
        The canonical SMILES string, or ``None``.
    """
    if not isinstance(smiles, str) or not smiles:
        return None
    if any(pattern in smiles for pattern in BAD_PATTERNS):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best effort: any RDKit failure means "invalid". Catching Exception (not a
        # bare except) keeps KeyboardInterrupt/SystemExit working during long applies.
        return None
    return None

# === Load Train/Test ===
def _read_and_clean(csv_name):
    """Read ``csv_name`` from BASE_PATH, canonicalize its SMILES column and drop rows
    whose SMILES could not be cleaned (the surviving rows keep their original index labels)."""
    frame = pd.read_csv(BASE_PATH + csv_name)
    frame['SMILES'] = frame['SMILES'].apply(clean_and_validate_smiles)
    return frame.dropna(subset=['SMILES'])


train = _read_and_clean('train.csv')
test = _read_and_clean('test.csv')

# === Load External Datasets (excluding dataset2) ===
external_datasets = []

def load_external(path, target, rename_map=None):
    """Read one supplementary CSV and register it in ``external_datasets``.

    Columns are optionally renamed with ``rename_map``. If both 'SMILES' and
    ``target`` are present, the frame is reduced to those two columns, rows with
    missing values are dropped, and ``(target, frame)`` is appended to the
    module-level ``external_datasets`` list. Otherwise the file is skipped.
    Any exception is reported on stdout and swallowed (best-effort loading).

    Args:
        path: Location handed to ``pd.read_csv``.
        target: Name of the target column expected after renaming.
        rename_map: Optional ``{old_name: new_name}`` column mapping.
    """
    try:
        frame = pd.read_csv(path)
        if rename_map:
            frame = frame.rename(columns=rename_map)

        # Guard clause: both required columns must exist.
        if not {'SMILES', target}.issubset(frame.columns):
            print(f"⚠️ Skipped {path}: required columns missing")
            return

        frame = frame[['SMILES', target]].dropna()
        external_datasets.append((target, frame))
        print(f"✅ Loaded {path} ({len(frame)} entries for {target})")
    except Exception as err:
        print(f"⚠️ Failed to load {path}: {err}")

# Supplement files to register: (file name, target column, optional column renames).
# dataset1 exposes its values under 'TC_mean', which is renamed to 'Tc' while loading.
for supplement_file, supplement_target, supplement_renames in (
    ('dataset1.csv', 'Tc', {'TC_mean': 'Tc'}),
    ('dataset3.csv', 'Tg', None),
    ('dataset4.csv', 'FFV', None),
):
    load_external(
        BASE_PATH + 'train_supplement/' + supplement_file,
        supplement_target,
        rename_map=supplement_renames,
    )

# === Merge External Data ===
def merge_external(train_df, ext_df, target):
    """Merge one external dataset for ``target`` into the training frame.

    The external SMILES are cleaned with ``clean_and_validate_smiles``, rows with a
    missing SMILES or target are dropped, and duplicate SMILES are averaged. Then:

    * rows of ``train_df`` whose SMILES appears in the external data and whose
      ``target`` is missing receive the external value (existing values are kept);
    * SMILES not present in ``train_df`` are appended as new rows in which every
      other column of ``TARGETS`` is NaN.

    Args:
        train_df: Frame with a 'SMILES' column and the ``TARGETS`` columns. Missing
            ``target`` values are filled in place on this object.
        ext_df: Frame with 'SMILES' and ``target`` columns; it is not modified.
        target: Name of the target column supplied by ``ext_df``.

    Returns:
        A new frame (fresh 0..n-1 index) containing the rows of ``train_df``
        followed by the newly added rows, with columns ['SMILES'] + TARGETS.
    """
    # Work on a copy so the caller's frame is left untouched.
    ext_df = ext_df.copy()
    ext_df['SMILES'] = ext_df['SMILES'].apply(clean_and_validate_smiles)
    ext_df = ext_df.dropna(subset=['SMILES', target])
    ext_df = ext_df.groupby('SMILES', as_index=False)[target].mean()

    # Fill missing target values in existing rows.
    # After the groupby every SMILES is unique, so a single lookup Series replaces
    # a per-row scan of the whole training frame.
    existing_smiles = set(train_df['SMILES'])
    lookup = ext_df.set_index('SMILES')[target]
    fill_mask = train_df[target].isna() & train_df['SMILES'].isin(lookup.index)
    train_df.loc[fill_mask, target] = train_df.loc[fill_mask, 'SMILES'].map(lookup)

    # Add new rows
    new_rows = ext_df[~ext_df['SMILES'].isin(existing_smiles)].copy()
    for col in TARGETS:
        if col not in new_rows.columns:
            new_rows[col] = np.nan
    return pd.concat([train_df, new_rows[['SMILES'] + TARGETS]], ignore_index=True)

# === Apply Merges ===
# Start from the SMILES + target columns of the cleaned training data and fold in
# each registered external dataset in turn.
train_extended = train[['SMILES'] + TARGETS].copy()
for target, ext in external_datasets:
    train_extended = merge_external(train_extended, ext, target)

# === Final Clean-Up ===
# Infinite values become NaN, rows without any target are removed, and only the
# first row per SMILES is kept.
train_extended = (
    train_extended
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=TARGETS, how='all')
    .drop_duplicates(subset=['SMILES'])
    .reset_index(drop=True)
)

# === Summary ===
print("\n📊 Final Summary:")
print(f"Train: {len(train)} | Extended: {len(train_extended)}")
for name in TARGETS:
    n_base = train[name].notna().sum()
    n_total = train_extended[name].notna().sum()
    print(f"• {name:<8}: {n_total} total ({n_total - n_base:+} from supplements)")

print("\n✅ Data loading and preprocessing complete.")


✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv (874 entries for Tc)
✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv (46 entries for Tg)
✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv (862 entries for FFV)

📊 Final Summary:
Train: 7973 | Extended: 8972
• Tg      : 557 total (+46 from supplements)
• FFV     : 7892 total (+862 from supplements)
• Tc      : 866 total (+129 from supplements)
• Density : 613 total (+0 from supplements)
• Rg      : 614 total (+0 from supplements)

✅ Data loading and preprocessing complete.


In [4]:
# Snapshot of the SMILES column, taken before the re-clean below; it is not used
# again inside this cell.
smiles_list = train_extended['SMILES'].tolist()
# Clean SMILES column robustly
# NOTE(review): both the train and the external SMILES were already passed through
# clean_and_validate_smiles while loading/merging, so this second pass looks
# redundant — confirm before removing.
train_extended['SMILES'] = train_extended['SMILES'].apply(clean_and_validate_smiles)
# Bare expression that is not the last line of the cell, so it displays nothing.
train_extended.shape
# Last expression: the notebook renders the (truncated) frame.
train_extended

Unnamed: 0,SMILES,Tg,FFV,Tc,Density,Rg
0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,
2,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,
3,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,
...,...,...,...,...,...,...
8967,*c1cccc(OCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5...,,0.349095,,,
8968,*c1cccc(OCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C...,,0.350892,,,
8969,*c1cccc(OCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(...,,0.345386,,,
8970,*c1cccc(Oc2cccc(Oc3cccc(N4C(=O)c5ccc(Oc6ccc(Sc...,,0.362224,,,


# Feature Engineering Pipeline

In [5]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
import networkx as nx
from tqdm import tqdm

# === Canonicalize SMILES ===
def canonicalize_smiles(smiles_list):
    """Canonicalize every entry of ``smiles_list`` with RDKit.

    Args:
        smiles_list: Iterable of SMILES values (list, pandas Series, ...).

    Returns:
        A list of the same length and order. Each element is the canonical SMILES
        string, or ``None`` when the entry is not a string (e.g. None/NaN) or
        RDKit cannot parse it.
    """
    canonical = []
    for smi in smiles_list:
        # Non-string entries (missing values) are reported as invalid instead of
        # being handed to RDKit, which only accepts strings.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol:
            canonical.append(Chem.MolToSmiles(mol, canonical=True))
        else:
            canonical.append(None)
    return canonical

# === All RDKit Descriptors ===
def compute_rdkit_descriptors(mol):
    """Evaluate every descriptor in ``Descriptors.descList`` on ``mol``.

    Args:
        mol: RDKit molecule.

    Returns:
        Dict mapping descriptor name to its value; a descriptor whose function
        raises is recorded as NaN so one failure does not lose the whole molecule.
    """
    descs = {}
    for name, func in Descriptors.descList:
        try:
            descs[name] = func(mol)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts a long run.
            descs[name] = np.nan
    return descs

# === Graph Features ===
def compute_graph_descriptors(mol):
    """Compute graph-topology descriptors of ``mol`` with networkx.

    The graph has one edge per bond; atoms that take part in no bond therefore do
    not appear as nodes.

    Args:
        mol: RDKit molecule.

    Returns:
        Dict with the keys 'graph_diameter', 'avg_shortest_path' (both 0 when the
        graph is disconnected or the computation fails), 'num_cycles',
        'betweenness_mean', 'betweenness_std', 'closeness_mean', 'max_degree',
        'eigenvector_mean', 'katz_centrality_std' (NaN on failure), 'ring_4'
        (0 on failure) and 'heteroatom_ratio' (NaN on failure).
    """
    descriptors = {}
    g = nx.Graph()
    g.add_edges_from([(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()])

    try:
        # Connectivity is checked once and reused for both path-based descriptors.
        connected = nx.is_connected(g)
        descriptors['graph_diameter'] = nx.diameter(g) if connected else 0
        descriptors['avg_shortest_path'] = nx.average_shortest_path_length(g) if connected else 0
    except Exception:
        descriptors['graph_diameter'] = 0
        descriptors['avg_shortest_path'] = 0

    descriptors['num_cycles'] = len(nx.cycle_basis(g))

    try:
        # Betweenness is the most expensive centrality here; compute it once and
        # derive both statistics from the same values.
        betweenness = list(nx.betweenness_centrality(g).values())
        closeness = list(nx.closeness_centrality(g).values())
        descriptors['betweenness_mean'] = np.mean(betweenness)
        descriptors['betweenness_std'] = np.std(betweenness)
        descriptors['closeness_mean'] = np.mean(closeness)
        descriptors['max_degree'] = max(dict(g.degree()).values())
    except Exception:
        # All four are reset together so the group is either complete or all-NaN.
        descriptors['betweenness_mean'] = np.nan
        descriptors['betweenness_std'] = np.nan
        descriptors['closeness_mean'] = np.nan
        descriptors['max_degree'] = np.nan

    try:
        ec = nx.eigenvector_centrality_numpy(g)
        descriptors['eigenvector_mean'] = np.mean(list(ec.values()))
    except Exception:
        descriptors['eigenvector_mean'] = np.nan

    try:
        katz = nx.katz_centrality_numpy(g)
        descriptors['katz_centrality_std'] = np.std(list(katz.values()))
    except Exception:
        descriptors['katz_centrality_std'] = np.nan

    try:
        ring_info = mol.GetRingInfo().AtomRings()
        descriptors['ring_4'] = sum(1 for r in ring_info if len(r) == 4)
    except Exception:
        descriptors['ring_4'] = 0

    try:
        # Fraction of atoms whose atomic number is neither 1 (H) nor 6 (C).
        # NOTE(review): wildcard '*' atoms presumably report atomic number 0 and would
        # then be counted as heteroatoms here — confirm this is intended.
        descriptors['heteroatom_ratio'] = sum(1 for a in mol.GetAtoms() if a.GetAtomicNum() not in [1, 6]) / mol.GetNumAtoms()
    except Exception:
        descriptors['heteroatom_ratio'] = np.nan

    return descriptors

# === Final Combined Feature Computation ===
def compute_all_features(smiles_list, verbose=True):
    """Compute RDKit and graph descriptors for every SMILES in ``smiles_list``.

    The input is canonicalized with ``canonicalize_smiles``; each valid molecule
    gets the union of ``compute_rdkit_descriptors`` and
    ``compute_graph_descriptors``.

    Args:
        smiles_list: Iterable of SMILES values.
        verbose: When true, print a short summary after processing.

    Returns:
        ``(feature_dict, valid_idx)`` where ``feature_dict`` maps each feature name
        to a list with exactly one entry per input, in input order, and
        ``valid_idx`` lists the positions that were parsed successfully. Positions
        that could not be parsed hold NaN for every feature, so entry ``i`` of
        every list always describes ``smiles_list[i]``.
    """
    smiles_list = canonicalize_smiles(smiles_list)
    total = len(smiles_list)

    # One slot per input position; None marks a molecule that could not be parsed.
    rows = [None] * total
    valid_idx = []
    failed_idx = []

    for idx, smi in enumerate(tqdm(smiles_list, desc="Computing Features")):
        mol = Chem.MolFromSmiles(smi) if smi is not None else None
        if mol is None:
            failed_idx.append(idx)
            continue

        valid_idx.append(idx)
        feats = {}
        feats.update(compute_rdkit_descriptors(mol))
        feats.update(compute_graph_descriptors(mol))
        rows[idx] = feats

    # Feature names in first-seen order, then one aligned column per feature.
    # Failed positions get NaN in place, so later rows are never shifted upwards.
    feature_names = list(dict.fromkeys(
        name for row in rows if row is not None for name in row
    ))
    feature_dict = {
        name: [np.nan if row is None else row.get(name, np.nan) for row in rows]
        for name in feature_names
    }

    if verbose:
        # Guard against an empty result (no input, or nothing parsed).
        vector_length = len(feature_dict[feature_names[0]]) if feature_names else 0
        print("\n--- Feature Engineering Summary ---")
        print(f"Total SMILES: {total}")
        print(f"Valid molecules: {len(valid_idx)}")
        print(f"Invalid molecules: {len(failed_idx)}")
        print(f"Number of computed features: {len(feature_dict)}")
        print(f"Feature vector length per molecule: {vector_length}")
        print("-----------------------------------")

    return feature_dict, valid_idx



In [6]:
from rdkit import RDLogger
import pandas as pd

# Suppress RDKit warnings
# Turns off every RDKit log channel matching 'rdApp.*' for the rest of the session.
RDLogger.DisableLog('rdApp.*')

# Descriptor columns to drop from the feature frames.
# Source: various public notebooks published throughout the competition.
useless_cols = [   
    'MaxPartialCharge', 
    'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
    'NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'fr_barbitur',
    'fr_benzodiazepine', 'fr_dihydropyridine', 'fr_epoxide', 'fr_isothiocyan',
    'fr_lactam', 'fr_nitroso', 'fr_prisulfonamd', 'fr_thiocyan',
    'MaxEStateIndex', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons',
    'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Kappa1',
    'LabuteASA', 'HeavyAtomCount', 'MolMR', 'Chi3n', 'BertzCT', 'Chi2v',
    'Chi4n', 'HallKierAlpha', 'Chi3v', 'Chi4v', 'MinAbsPartialCharge',
    'MinPartialCharge', 'MaxAbsPartialCharge', 'FpDensityMorgan2',
    'FpDensityMorgan3', 'Phi', 'Kappa3', 'fr_nitrile', 'SlogP_VSA6',
    'NumAromaticCarbocycles', 'NumAromaticRings', 'fr_benzene', 'VSA_EState6',
    'NOCount', 'fr_C_O', 'fr_C_O_noCOO', 'NumHDonors', 'fr_amide',
    'fr_Nhpyrrole', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_COO2',
    'fr_halogen', 'fr_diazo', 'fr_nitro_arom', 'fr_phos_ester'
]

# === Compute Train / Test Features ===
def _descriptor_frame(smiles):
    """Run compute_all_features on ``smiles`` and return
    ``(feature_dict, valid_idx, frame)``, where ``frame`` is the feature table
    with every column listed in ``useless_cols`` removed."""
    feature_dict, valid_idx = compute_all_features(smiles, verbose=True)
    frame = pd.DataFrame(feature_dict).reset_index(drop=True)
    present = [col for col in useless_cols if col in frame.columns]
    return feature_dict, valid_idx, frame.drop(columns=present)


feature_dict_train, valid_idx_train, features_train = _descriptor_frame(train_extended["SMILES"])
feature_dict_test, valid_idx_test, features_test = _descriptor_frame(test["SMILES"])

# === Output Summary ===
for label, frame in (
    ("Train features shape:", features_train),
    ("Test features shape:", features_test),
    ("Training dataframe Shape:", train_extended),
    ("Test dataframe Shape:", test),
):
    print(label, frame.shape)


Computing Features: 100%|██████████| 8972/8972 [05:15<00:00, 28.46it/s]



--- Feature Engineering Summary ---
Total SMILES: 8972
Valid molecules: 8972
Invalid molecules: 0
Number of computed features: 228
Feature vector length per molecule: 8972
-----------------------------------


Computing Features: 100%|██████████| 3/3 [00:00<00:00, 23.90it/s]


--- Feature Engineering Summary ---
Total SMILES: 3
Valid molecules: 3
Invalid molecules: 0
Number of computed features: 228
Feature vector length per molecule: 3
-----------------------------------
Train features shape: (8972, 159)
Test features shape: (3, 159)
Training dataframe Shape: (8972, 6)
Test dataframe Shape: (3, 2)





# Preprocessing features

In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# === Preprocessing Utilities ===

def preprocess_features(df, df_name=""):
    """Sanitize a feature table and report each step on stdout.

    Steps, in order: ±inf values become NaN, columns that are entirely NaN are
    dropped, and the remaining NaNs are replaced by the mean of their column.

    Args:
        df: Numeric feature frame; it is not modified.
        df_name: Label used in the progress messages.

    Returns:
        A new, cleaned DataFrame.
    """
    print(f"\n📦 Preprocessing {df_name} features")

    # Step 1: infinities are treated as missing values.
    n_inf = np.isinf(df.values).sum()
    print(f"  - Replacing {n_inf} ±inf values with NaN...")
    cleaned = df.replace([np.inf, -np.inf], np.nan)

    # Step 2: remove columns without a single observed value.
    empty_columns = [name for name, is_empty in cleaned.isna().all().items() if is_empty]
    print(f"  - Dropping {len(empty_columns)} all-NaN columns...")
    cleaned = cleaned.dropna(axis=1, how='all')

    # Step 3: impute what is left with the per-column mean.
    n_missing = cleaned.isna().sum().sum()
    print(f"  - Filling {n_missing} remaining NaNs with column means...")
    column_means = cleaned.mean()
    return cleaned.fillna(column_means)


def detect_outliers(df, threshold=1e10):
    """Find columns whose extremes lie beyond ``±threshold``.

    Args:
        df: Numeric feature frame.
        threshold: Positive magnitude limit.

    Returns:
        ``(too_large, too_small)``: a Series of column maxima greater than
        ``threshold`` and a Series of column minima less than ``-threshold``,
        each indexed by column name.
    """
    col_max, col_min = df.max(), df.min()
    above = col_max.loc[col_max.gt(threshold)]
    below = col_min.loc[col_min.lt(-threshold)]
    return above, below


def remove_low_variance(df, threshold=1e-5):
    """Drop features whose variance does not exceed ``threshold``.

    Uses scikit-learn's ``VarianceThreshold`` fitted on ``df``.

    Args:
        df: Numeric feature frame.
        threshold: Variance cut-off passed to ``VarianceThreshold``.

    Returns:
        A new DataFrame holding only the retained columns. It keeps the row index
        of ``df`` so label-based alignment with other frames stays valid.
    """
    print(f"\n🧹 Applying VarianceThreshold (threshold={threshold})...")
    selector = VarianceThreshold(threshold=threshold)
    reduced = selector.fit_transform(df)
    kept_cols = df.columns[selector.get_support()]
    removed_count = df.shape[1] - len(kept_cols)
    print(f"  - Removed {removed_count} low-variance features.")
    # fit_transform returns a bare array; re-attach the original row labels as well
    # as the surviving column names.
    return pd.DataFrame(reduced, columns=kept_cols, index=df.index)


def clip_outliers(df, lower=-1e6, upper=1e6):
    """Clip every value of ``df`` into ``[lower, upper]``.

    Detection uses the same bounds as the clipping, so the result depends only on
    ``lower``/``upper`` and frames processed separately (e.g. train and test) are
    treated identically.

    Args:
        df: Numeric feature frame; it is not modified.
        lower: Smallest value allowed after clipping.
        upper: Largest value allowed after clipping.

    Returns:
        The clipped frame, or ``df`` itself when no value lies outside the range.
    """
    print(f"\n🧯 Clipping outliers to range [{lower}, {upper}]...")
    # Number of features with at least one value beyond each bound.
    n_too_large = int((df.max() > upper).sum())
    n_too_small = int((df.min() < lower).sum())
    if n_too_large or n_too_small:
        print(f"  - Clipping {n_too_large} overly large and {n_too_small} overly small features.")
        df = df.clip(lower, upper)
    else:
        print("  - No extreme outliers found.")
    return df


# === Apply Preprocessing ===

# Make sure your features_train and features_test already exist
# Each frame is cleaned independently, so NaNs in the test features are filled with
# means computed on the test frame itself, not with the training means.
features_train_clean = preprocess_features(features_train, df_name="Train")
features_test_clean = preprocess_features(features_test, df_name="Test")

# Align both datasets
# Keep only the columns that survived cleaning in BOTH frames (a column that is
# all-NaN in either frame is therefore removed from both).
common_cols = features_train_clean.columns.intersection(features_test_clean.columns)
features_train_clean = features_train_clean[common_cols].copy()
features_test_clean = features_test_clean[common_cols].copy()

# Remove near-zero variance features
# The selection is fitted on the training features only; the test frame is then
# restricted to the same columns in the same order.
features_train_clean = remove_low_variance(features_train_clean)
features_test_clean = features_test_clean[features_train_clean.columns]  # Align

# Clip extreme outliers
# Train and test are clipped by separate calls, each deciding on its own data.
features_train_clean = clip_outliers(features_train_clean)
features_test_clean = clip_outliers(features_test_clean)

# === Summary ===
print("\n✅ Final Preprocessing Summary:")
print(f"  - Train shape: {features_train_clean.shape}")
print(f"  - Test shape:  {features_test_clean.shape}")
print(f"  - Common features retained: {len(features_train_clean.columns)}")






📦 Preprocessing Train features
  - Replacing 0 ±inf values with NaN...
  - Dropping 0 all-NaN columns...
  - Filling 0 remaining NaNs with column means...

📦 Preprocessing Test features
  - Replacing 0 ±inf values with NaN...
  - Dropping 0 all-NaN columns...
  - Filling 0 remaining NaNs with column means...

🧹 Applying VarianceThreshold (threshold=1e-05)...
  - Removed 0 low-variance features.

🧯 Clipping outliers to range [-1000000.0, 1000000.0]...
  - Clipping 1 overly large and 0 overly small features.

🧯 Clipping outliers to range [-1000000.0, 1000000.0]...
  - Clipping 1 overly large and 0 overly small features.

✅ Final Preprocessing Summary:
  - Train shape: (8972, 159)
  - Test shape:  (3, 159)
  - Common features retained: 159


# ExtraTrees Pipeline 

In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# === Configuration ===
# Targets to model; one set of cross-validated models is trained per entry.
# (Same five names as the TARGETS list defined in the data-loading cell.)
target_cols = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
# Number of KFold splits used for every target.
n_splits = 5
# Seed shared by the KFold shuffling and every ExtraTreesRegressor.
random_seed = 42

# === wMAE Computation (aligned with competition metric) ===
def compute_wmae(oof_df, true_df, target_cols):
    """Compute a weighted mean absolute error over several sparsely labelled targets.

    For each target, the unnormalized weight is ``(1 / sqrt(n)) / value_range``,
    where ``n`` is the number of labelled rows in ``true_df`` and ``value_range``
    is max - min of those labels. Weights are then normalized to sum to 1 and
    applied to the per-target MAE computed on the labelled rows only.

    Targets that have no labels, or whose labels are all identical (zero range),
    cannot be weighted; they receive weight 0 and do not contribute to the score.

    NOTE(review): the weights are normalized after dividing by the range, and the
    ranges are taken from ``true_df`` — verify against the official metric
    definition before comparing this value with leaderboard scores.

    Args:
        oof_df: Frame of predictions, index-aligned with ``true_df``.
        true_df: Frame of ground-truth values; NaN marks a missing label.
        target_cols: Names of the target columns to evaluate.

    Returns:
        ``(wmae, weights)``: the weighted MAE (NaN when no target is usable) and
        a dict mapping every target to its normalized weight.
    """
    masks = {}
    unnormalized_weights = {}

    for col in target_cols:
        mask = ~true_df[col].isna()
        masks[col] = mask
        available = int(mask.sum())
        if available == 0:
            unnormalized_weights[col] = 0.0
            continue

        y_true = true_df.loc[mask, col]
        value_range = y_true.max() - y_true.min()
        if not np.isfinite(value_range) or value_range <= 0:
            # Constant (or non-finite) labels would cause a division by zero.
            unnormalized_weights[col] = 0.0
            continue

        # Inverse sqrt of availability, scaled by the inverse of the value range.
        unnormalized_weights[col] = (1 / np.sqrt(available)) / value_range

    total_weight = sum(unnormalized_weights.values())
    if total_weight == 0:
        return np.nan, {col: 0.0 for col in target_cols}

    weights = {col: w / total_weight for col, w in unnormalized_weights.items()}

    # Weighted sum of the per-target MAEs, skipping targets that carry no weight.
    wmae = 0
    for col in target_cols:
        if weights[col] == 0:
            continue
        mask = masks[col]
        error = np.abs(oof_df.loc[mask, col] - true_df.loc[mask, col])
        wmae += weights[col] * error.mean()

    return wmae, weights

# === Cross-validation Training ===
# Containers for the results: one column per target is added inside the loop.
# test_preds holds fold-averaged test predictions, oof_preds the out-of-fold
# predictions for the labelled training rows (unlabelled rows stay NaN).
test_preds = pd.DataFrame(index=features_test_clean.index)
oof_preds = pd.DataFrame(index=features_train_clean.index)

# The same KFold object (and seed) is reused for every target; the actual folds
# differ per target because each target has a different set of labelled rows.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

for target in target_cols:
    print(f"\n🎯 Training ExtraTrees for target: {target}")

    # Restrict to rows that have a label for this target.
    # The boolean mask is built on train_extended and applied to
    # features_train_clean by index label — assumes both frames share the same
    # index; TODO confirm if either frame is ever re-indexed.
    mask = ~train_extended[target].isna()
    X = features_train_clean.loc[mask]
    y = train_extended.loc[mask, target]

    # Positional buffers: kf.split yields positions within X, not index labels.
    oof_pred = np.zeros(X.shape[0])
    test_fold_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"  🧪 Fold {fold + 1}/{n_splits}")

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # A fresh model per fold, always with the same seed and hyper-parameters.
        model = ExtraTreesRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features='sqrt',
            bootstrap=False,
            random_state=random_seed,
            n_jobs=-1
        )

        model.fit(X_train, y_train)

        # Predict the held-out fold and the complete test set with this fold's model.
        val_pred = model.predict(X_val)
        test_pred = model.predict(features_test_clean)

        oof_pred[val_idx] = val_pred
        test_fold_preds.append(test_pred)

        fold_mae = mean_absolute_error(y_val, val_pred)
        print(f"     🔍 Fold MAE: {fold_mae:.5f}")

    # Write the out-of-fold predictions back to the labelled rows and average the
    # per-fold test predictions into the final prediction for this target.
    oof_preds.loc[mask, target] = oof_pred
    test_preds[target] = np.mean(test_fold_preds, axis=0)

print("\n✅ All models trained.")

# === Submission ===
# `test` keeps the row labels it had when it was read (rows with an invalid SMILES
# may have been dropped since), whereas `test_preds` uses the fresh 0..n-1 index of
# the feature frame. concat(axis=1) aligns on index labels, so both sides are put
# on a positional index first to guarantee that every id is paired with the
# prediction computed for that row.
assert len(test) == len(test_preds), "test rows and test predictions are out of sync"
submission = pd.concat(
    [test[['id']].reset_index(drop=True), test_preds.reset_index(drop=True)],
    axis=1,
)
submission.to_csv("submission.csv", index=False)
print("📁 Submission saved as submission.csv")

# === Evaluation ===
# Score the out-of-fold predictions against the labels in train_extended and
# report the overall value together with the weight assigned to each target.
wmae_score, weight_map = compute_wmae(oof_preds, train_extended, target_cols)

report_lines = [
    "\n📊 Final Evaluation — Weighted MAE (wMAE):",
    f"✅ wMAE: {wmae_score:.6f}",
]
report_lines += [f"   • {t}: weight = {weight_map[t]:.6f}" for t in target_cols]
print("\n".join(report_lines))



🎯 Training ExtraTrees for target: Tg
  🧪 Fold 1/5
     🔍 Fold MAE: 57.25228
  🧪 Fold 2/5
     🔍 Fold MAE: 56.72785
  🧪 Fold 3/5
     🔍 Fold MAE: 51.50944
  🧪 Fold 4/5
     🔍 Fold MAE: 45.37280
  🧪 Fold 5/5
     🔍 Fold MAE: 53.54996

🎯 Training ExtraTrees for target: FFV
  🧪 Fold 1/5
     🔍 Fold MAE: 0.00656
  🧪 Fold 2/5
     🔍 Fold MAE: 0.00629
  🧪 Fold 3/5
     🔍 Fold MAE: 0.00627
  🧪 Fold 4/5
     🔍 Fold MAE: 0.00653
  🧪 Fold 5/5
     🔍 Fold MAE: 0.00685

🎯 Training ExtraTrees for target: Tc
  🧪 Fold 1/5
     🔍 Fold MAE: 0.02888
  🧪 Fold 2/5
     🔍 Fold MAE: 0.02835
  🧪 Fold 3/5
     🔍 Fold MAE: 0.03994
  🧪 Fold 4/5
     🔍 Fold MAE: 0.02966
  🧪 Fold 5/5
     🔍 Fold MAE: 0.03736

🎯 Training ExtraTrees for target: Density
  🧪 Fold 1/5
     🔍 Fold MAE: 0.03314
  🧪 Fold 2/5
     🔍 Fold MAE: 0.05058
  🧪 Fold 3/5
     🔍 Fold MAE: 0.02847
  🧪 Fold 4/5
     🔍 Fold MAE: 0.02528
  🧪 Fold 5/5
     🔍 Fold MAE: 0.03345

🎯 Training ExtraTrees for target: Rg
  🧪 Fold 1/5
     🔍 Fold MAE: 2.01267
 

In [9]:
# Display the submission frame (ids plus one prediction column per target) as a final check.
submission

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,154.77232,0.373127,0.202526,1.147388,20.096992
1,1422188626,149.139589,0.374912,0.238176,1.094483,19.666001
2,2032016830,105.683468,0.350509,0.25766,1.11886,19.975709
