In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install -q hillclimbers

[33m  DEPRECATION: Building 'hillclimbers' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'hillclimbers'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

In [4]:
import numpy as np
import pandas as pd
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import hashlib
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import KFold
from hillclimbers import climb_hill, partial

# ==========================================
# 0. CONFIGURATION
# ==========================================
test_req = False       # Set to True for fast execution, False for final submission
skip_hillclimb = False  
PATH = "/kaggle/input/s6e1-models/" 
TARGET = 'exam_score'

# ==========================================
# 1. SETUP & DATA LOADING
# ==========================================
print(f"--- Initializing Data Loading (Mode: {'TEST' if test_req else 'FULL'}) ---")
oof_files = sorted(glob.glob(os.path.join(PATH, "**/*_oof.csv"), recursive=True))
if not oof_files:
    oof_files = sorted(glob.glob("/kaggle/input/**/*_oof.csv", recursive=True))

sub_files = [f.replace("_oof.csv", "_sub.csv") for f in oof_files]
model_names = [os.path.basename(f).replace("_oof.csv", "") for f in oof_files]

train_df = pd.read_csv("/Users/kavuk/Desktop/GitHub/Kaggle/KaggleData/playground-series-s6e1/train.csv")
y_true = train_df[TARGET].values 

# --- Phase 0: Deduplication ---
unique_subs = {}
indices_to_keep = []

for i, (s_file, name) in enumerate(zip(sub_files, model_names)):
    temp_sub = pd.read_csv(s_file)[TARGET].values
    sub_hash = hashlib.md5(temp_sub.tobytes()).hexdigest()
    if sub_hash not in unique_subs:
        unique_subs[sub_hash] = name
        indices_to_keep.append(i)
    else:
        print(f"‚ö†Ô∏è Dropping duplicate: {name}")

oof_files = [oof_files[i] for i in indices_to_keep]
sub_files = [sub_files[i] for i in indices_to_keep]
model_names = [model_names[i] for i in indices_to_keep]

oofs = np.stack([pd.read_csv(f)[TARGET].values for f in oof_files], axis=1)
subs = np.stack([pd.read_csv(f)[TARGET].values for f in sub_files], axis=1)

print(f"‚úÖ Data loaded. Models: {len(model_names)}")

# ==========================================
# 2. PHASE 1: HILL CLIMBING
# ==========================================
df_oof_indexed = pd.DataFrame(oofs, columns=model_names)
df_sub_indexed = pd.DataFrame(subs, columns=model_names)

hc_precision = 0.01 if test_req else 0.001
hc_negative = False if test_req else True

if test_req:
    np.random.seed(42)
    sample_idx = np.random.choice(len(train_df), size=int(len(train_df)*0.2), replace=False)
    hc_train_subset = train_df.iloc[sample_idx].reset_index(drop=True)
    hc_oof_subset = df_oof_indexed.iloc[sample_idx].reset_index(drop=True)
    print(f"üìâ Test Mode: Downsampled HC to {len(hc_train_subset)} rows.")
else:
    hc_train_subset = train_df
    hc_oof_subset = df_oof_indexed

if not skip_hillclimb:
    print(f"\nüöÄ Initiating Hill Climbing (Precision: {hc_precision}, Neg Weights: {hc_negative})")
    
    # HC returns numpy arrays when return_oof_preds=True
    hc_test, hc_oof = climb_hill(
        train=hc_train_subset, 
        target=TARGET, 
        objective='minimize', 
        eval_metric=partial(root_mean_squared_error),
        oof_pred_df=hc_oof_subset, 
        test_pred_df=df_sub_indexed,
        plot_hill=True,
        plot_hist=False, 
        precision=hc_precision,
        negative_weights=hc_negative,
        return_oof_preds=True
    )
    
    # Handle the AttributeError by treating the blended output as a single feature for RidgeCV
    # Note: If test_req is True, hc_oof is only the length of the sample. 
    # To keep stacking consistent on the FULL dataset, we use the HC result as the new input.
    if test_req:
        print("‚ö†Ô∏è Note: Using library output for Phase 2 (Sampled OOF).")
        X_train_hc = hc_oof.reshape(-1, 1)
        y_true_stacking = hc_train_subset[TARGET].values
    else:
        X_train_hc = hc_oof.reshape(-1, 1)
        y_true_stacking = y_true
        
    X_test_hc = hc_test.reshape(-1, 1)
    selected_model_names = ['HC_Blended_Feature']
else:
    print("\n‚ö†Ô∏è Skipping Hill Climbing.")
    X_train_hc = oofs
    y_true_stacking = y_true
    X_test_hc = subs
    selected_model_names = model_names

# ==========================================
# 3. PHASE 2: RIDGE CV (STACKING)
# ==========================================
print(f"\n--- Phase 2: Training RidgeCV Meta-Model ---")
kf_splits = 3 if test_req else 10 
kf = KFold(n_splits=kf_splits, shuffle=True, random_state=42)
alphas = np.logspace(-2, 4, 15) if test_req else np.logspace(-2, 7, 50)

oof_final_preds = np.zeros(len(y_true_stacking))
sub_final_preds = np.zeros(X_test_hc.shape[0]) 

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_hc)):
    X_tr, y_tr = X_train_hc[train_idx], y_true_stacking[train_idx]
    X_va, y_va = X_train_hc[val_idx], y_true_stacking[val_idx]
    
    model = RidgeCV(alphas=alphas, scoring='neg_root_mean_squared_error')
    model.fit(X_tr, y_tr)
    
    oof_final_preds[val_idx] = model.predict(X_va)
    sub_final_preds += model.predict(X_test_hc) / kf_splits
    print(f"Fold {fold+1}/{kf_splits} complete. Alpha: {model.alpha_:.2f}")

# ==========================================
# 4. FINAL PERFORMANCE & EXPORT
# ==========================================
final_rmse = root_mean_squared_error(y_true_stacking, oof_final_preds)
print(f"\n" + "="*35)
print(f"‚úÖ FINAL ENSEMBLE RMSE: {final_rmse:.6f}")
print("="*35)

final_sub_preds = np.clip(sub_final_preds, train_df[TARGET].min(), train_df[TARGET].max())
sub_template = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")
sub_template[TARGET] = final_sub_preds
sub_file = f"submission_rmse_{final_rmse:.6f}.csv"
sub_template.to_csv(sub_file, index=False)

print(f"üöÄ Saved to: {sub_file}")

--- Initializing Data Loading (Mode: FULL) ---


ValueError: need at least one array to stack