<a href="https://colab.research.google.com/github/darshlukkad/AutoGluon/blob/main/california_housing_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os

# ---- CONFIG ----
# Everything this notebook downloads or trains is placed under DATA_DIR.
KAGGLE_COMPETITION = "california-house-prices"
DATA_DIR = "/content/data"

# Two sub-folders of DATA_DIR: the extracted competition files and the saved models.
DATASET, AUTOGLUON_SAVE_PATH = (
    os.path.join(DATA_DIR, leaf) for leaf in (KAGGLE_COMPETITION, "AutoGluonModels")
)

# Echo the resolved settings, one per line, so they are recorded in the cell output.
print(
    f"Competition: {KAGGLE_COMPETITION}",
    f"DATA_DIR: {DATA_DIR}",
    f"AUTOGLUON_SAVE_PATH: {AUTOGLUON_SAVE_PATH}",
    sep="\n",
)

# ---- Install slim AutoGluon + Kaggle CLI ----
!pip install -q kaggle autogluon.tabular scikit-learn

# ---- Kaggle auth ----
# The API token is looked up at /content/kaggle.json first (the location this
# cell has always copied from) and then at the root of the mounted Google Drive,
# so a token stored in either place is picked up.
import shutil
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

_kaggle_dir = os.path.expanduser("~/.kaggle")
_kaggle_token = os.path.join(_kaggle_dir, "kaggle.json")
os.makedirs(_kaggle_dir, exist_ok=True)

_token_sources = ("/content/kaggle.json", "/content/drive/MyDrive/kaggle.json")
_token_src = next((p for p in _token_sources if os.path.isfile(p)), None)
if _token_src is not None:
    shutil.copy(_token_src, _kaggle_token)
elif not os.path.isfile(_kaggle_token):
    # No token to copy and none installed earlier: stop here with a clear
    # message instead of continuing to the kaggle commands without credentials.
    raise FileNotFoundError(
        "kaggle.json not found; looked in: " + ", ".join(_token_sources)
    )

# Restrict the token to owner read/write (same as `chmod 600`).
os.chmod(_kaggle_token, 0o600)

# ---- Download + extract competition files ----
!mkdir -p "{DATA_DIR}" "{AUTOGLUON_SAVE_PATH}"
!kaggle competitions download -c "{KAGGLE_COMPETITION}" -p "{DATA_DIR}" --force
!unzip -o -q "{DATA_DIR}/{KAGGLE_COMPETITION}.zip" -d "{DATA_DIR}/{KAGGLE_COMPETITION}"
!rm -f "{DATA_DIR}/{KAGGLE_COMPETITION}.zip"
!ls -lh "{DATA_DIR}/{KAGGLE_COMPETITION}"

Competition: california-house-prices
DATA_DIR: /content/data
AUTOGLUON_SAVE_PATH: /content/data/AutoGluonModels
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.3/487.3 kB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.0/71.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m156.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Downloading califor

In [7]:
# === FAST MODE (Academic demo): California Housing (Optimized) ===
import os, time, numpy as np, pandas as pd, random
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

# ---- Run settings (DATASET and AUTOGLUON_SAVE_PATH must already be defined) ----
SEED = 42
N_TRAIN = 20_000                       # upper bound on training rows, for speed
TL = 300                               # fit time budget in seconds (demo-sized)
THREADS = max(1, os.cpu_count() or 1)  # all available cores; 1 if the count is unknown

# Seed both global RNGs used by this notebook.
random.seed(SEED)
np.random.seed(SEED)

# Locations of the extracted competition files.
COMP_DIR = DATASET
TRAIN_PATH, TEST_PATH, SUB_PATH = (
    os.path.join(COMP_DIR, fname)
    for fname in ("train.csv", "test.csv", "sample_submission.csv")
)

# Name of the label column.
TARGET = "Sold Price"

# ---- Read the three competition CSVs ----
# low_memory=False makes pandas read each file in one pass rather than in chunks.
train_full, test_df, sub_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (TRAIN_PATH, TEST_PATH, SUB_PATH)
)

# ---- Cap the training set at N_TRAIN rows (seeded, so the sample is repeatable) ----
train_df = (
    train_full.sample(n=N_TRAIN, random_state=SEED)
    if len(train_full) > N_TRAIN
    else train_full
).reset_index(drop=True)

# The full frame is not used after this point.
del train_full

# ---- Quick preprocess ----
def _log1p_float32(frame, cols):
    """Return ``log1p`` of ``frame[cols]`` with negatives clipped to 0, cast to float32."""
    return np.log1p(frame[cols].clip(lower=0)).astype("float32")


# Drop obvious ID columns if present (reassign rather than mutate in place).
id_cols = [c for c in ("Id", "id") if c in train_df.columns]
train_df = train_df.drop(columns=id_cols)
id_cols_test = [c for c in ("Id", "id") if c in test_df.columns]
test_df = test_df.drop(columns=id_cols_test)

# Numeric feature columns are decided once, from the training frame; the target
# is excluded here and transformed separately below.
num_cols_train = [
    c for c in train_df.select_dtypes(include="number").columns if c != TARGET
]

if num_cols_train:
    train_df[num_cols_train] = _log1p_float32(train_df, num_cols_train)
# The target is log1p-transformed too but keeps its float precision; predictions
# therefore come back on the log1p scale.
train_df[TARGET] = np.log1p(train_df[TARGET].clip(lower=0))

# Apply the transform to exactly the same feature columns in test. Selecting by
# test dtypes instead could transform a column on one side only when pandas
# infers different dtypes for the two files. Values that cannot be parsed as
# numbers become NaN.
num_cols_test = [c for c in num_cols_train if c in test_df.columns]
if num_cols_test:
    _test_numeric = test_df[num_cols_test].apply(pd.to_numeric, errors="coerce")
    test_df[num_cols_test] = _log1p_float32(_test_numeric, num_cols_test)

# ---- Three-way split ----
# First carve off 10% as the final holdout, then take 10% of the remainder as the dev set.
train_full_split, holdout = train_test_split(
    train_df, random_state=SEED, test_size=0.10
)
train_split, dev_split = train_test_split(
    train_full_split, random_state=SEED, test_size=0.10
)
print(
    "Train: {}, Dev: {}, Holdout: {}".format(
        train_split.shape, dev_split.shape, holdout.shape
    )
)

# ---- Model configuration: one LightGBM ("GBM") entry ----
# Every seed-like option is set to SEED and the thread count to THREADS.
hyperparameters = {
    "GBM": [
        dict(
            num_boost_round=200,
            learning_rate=0.1,
            num_leaves=31,
            early_stopping_rounds=50,   # intended to cut training short when it stalls
            random_state=SEED,
            bagging_seed=SEED,
            feature_fraction_seed=SEED,
            data_random_seed=SEED,
            num_threads=THREADS,
        )
    ],
}
# The remaining model families are given empty config lists.
hyperparameters.update(
    {family: [] for family in ("CAT", "XGB", "RF", "XT", "NN_TORCH")}
)

# Extra fit arguments handed to predictor.fit via `ag_args_fit`.
ag_args_fit = dict(random_seed=SEED, num_cpus=THREADS)

# ---- Where this run is saved ----
# Each run gets its own timestamped folder under the shared model directory.
BASE_PATH = AUTOGLUON_SAVE_PATH
os.makedirs(BASE_PATH, exist_ok=True)
RUN_NAME = "california_house_" + time.strftime("%Y%m%d_%H%M%S")
MODEL_PATH = os.path.join(BASE_PATH, RUN_NAME)

# Predictor for the TARGET column, scored with RMSE, saving into MODEL_PATH.
predictor = TabularPredictor(
    path=MODEL_PATH,
    label=TARGET,
    eval_metric="rmse",
    verbosity=2,
)

# ---- Train ----
# `trained` stays False unless fit() returns normally; the evaluation step checks it.
trained = False
try:
    predictor.fit(
        train_data=train_split,
        tuning_data=dev_split,        # dev split is passed as the tuning set
        time_limit=TL,
        # NOTE(review): the AutoGluon docs list "good_quality" as a higher-accuracy
        # preset than "medium_quality", not a slimmer one — confirm this is the
        # preset intended for a fast demo.
        presets="good_quality",
        hyperparameters=hyperparameters,
        ag_args_fit=ag_args_fit,
        num_bag_folds=0,
        num_stack_levels=0,
        keep_only_best=True,
    )
except AssertionError as err:
    # Only AssertionError is handled; any other exception still propagates.
    print("✅ Fit exited early (likely tight time_limit):", err)
else:
    trained = True

# ---- Evaluate on never-seen holdout + export submission ----
# Runs only when fit() completed and the predictor holds a trainer.
if trained and getattr(predictor, "_trainer", None):
    holdout_metrics = predictor.evaluate(holdout)
    print("Holdout metrics (log-scale RMSE):", holdout_metrics)

    # Dollar-scale RMSE: undo the log1p applied to the target before comparing.
    try:
        y_true = np.expm1(holdout[TARGET].to_numpy())
        y_pred = np.expm1(predictor.predict(holdout).to_numpy())
        rmse_dollars = np.sqrt(np.mean((y_true - y_pred) ** 2))
        print(f"Holdout RMSE (original $): {rmse_dollars:,.2f}")
    except Exception as e:
        # Best-effort diagnostic only; a failure here must not block the export.
        print("Skipping dollar-scale RMSE:", e)

    # Predict test (log1p scale), convert back to the original scale, and save.
    pred_log = predictor.predict(test_df)
    pred     = np.expm1(pred_log)

    # Assigning a Series aligns on index, so a row-count mismatch would silently
    # produce NaN or dropped rows in the submission. Check explicitly and assign
    # by position instead.
    if len(pred) != len(sub_df):
        raise ValueError(
            f"Prediction count ({len(pred)}) does not match "
            f"sample submission rows ({len(sub_df)})"
        )
    sub      = sub_df.copy()
    sub[TARGET] = np.asarray(pred)

    out_path = os.path.join(MODEL_PATH, "submission.csv")
    sub.to_csv(out_path, index=False)
    print("✅ Saved submission:", out_path)
else:
    print("⏸️ Training didn’t complete — consider increasing TL (≥300 s).")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          8
Memory Avail:       49.21 GB / 50.99 GB (96.5%)
Disk Space Avail:   195.85 GB / 235.68 GB (83.1%)
Presets specified: ['good_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=0, num_bag_sets=1


Train: (16200, 40), Dev: (1800, 40), Holdout: (2000, 40)


Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "/content/data/AutoGluonModels/california_house_20251023_192505"
Train Data Rows:    16200
Train Data Columns: 39
Tuning Data Rows:    1800
Tuning Data Columns: 39
Label Column:       Sold Price
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (17.909855136853043, 11.51792295668052, 13.73669, 0.79778)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    50431.20 MB
	Train Data (Original)  Memory Usage: 35.83 MB (0.1% of available memory)
	

Holdout metrics (log-scale RMSE): {'root_mean_squared_error': np.float64(-0.1884125617346258), 'mean_squared_error': -0.03549929341940418, 'mean_absolute_error': -0.09152310644919366, 'r2': 0.9436067064340019, 'pearsonr': 0.9715009371785128, 'median_absolute_error': np.float64(-0.04761857970012873)}
Holdout RMSE (original $): 795,050.60
✅ Saved submission: /content/data/AutoGluonModels/california_house_20251023_192505/submission.csv


In [8]:
import glob, os

candidates = sorted(
    glob.glob(os.path.join(AUTOGLUON_SAVE_PATH, "*/submission.csv")),
    key=os.path.getmtime,
    reverse=True
)
if not candidates:
    raise FileNotFoundError("No submission.csv found under AutoGluonModels/*/")
SUBMISSION_FILE = candidates[0]
print("Using submission file:", SUBMISSION_FILE)

!kaggle competitions submit -c "california-house-prices" -f "$SUBMISSION_FILE" -m "submit"
!kaggle competitions submissions -c "california-house-prices" | head -n 20

Using submission file: /content/data/AutoGluonModels/california_house_20251023_192505/submission.csv
100% 483k/483k [00:01<00:00, 295kB/s]
Successfully submitted to California House PricesfileName        date                        description  status                    publicScore  privateScore  
--------------  --------------------------  -----------  ------------------------  -----------  ------------  
submission.csv  2025-10-23 19:37:03.333000  submit       SubmissionStatus.PENDING                             
