In [1]:
# Block 1: Setup and Load All Processed Features

from google.colab import drive
from scipy.sparse import vstack, load_npz, csr_matrix
import numpy as np
import glob
import os
from tqdm import tqdm

# --- Mount Google Drive ---
print("--> Mounting Google Drive...")
drive.mount('/content/drive')
print("--> Google Drive mounted successfully!")


# --- Load All Saved Feature Chunks from Google Drive ---

# This path MUST match where you saved your features
FEATURES_DIR = '/content/drive/MyDrive/ML_Competition/processed_features'

all_X = []
all_y = []
# Find all the .npz files in your features directory
feature_files = sorted(glob.glob(os.path.join(FEATURES_DIR, "*.npz")))

if not feature_files:
    print("CRITICAL ERROR: No feature files found in the 'processed_features' directory.")
    print("Please make sure the path is correct and the previous step completed successfully.")
else:
    print(f"\n--> Loading {len(feature_files)} saved feature chunks from Google Drive...")

    for filename in tqdm(feature_files):
        with np.load(filename, allow_pickle=True) as loaded:
            # Reconstruct the sparse matrix for this chunk
            X_chunk = csr_matrix((loaded['data'], loaded['indices'], loaded['indptr']), shape=loaded['shape'])
            all_X.append(X_chunk)
            all_y.append(loaded['labels'])

    # --- Combine all chunks into one final dataset ---
    X_final_features = vstack(all_X)
    y_final = np.concatenate(all_y)

    print("\n--> All features loaded and combined successfully!")
    print(f"--> Final feature matrix shape: {X_final_features.shape}")
    print(f"--> Final labels array shape: {y_final.shape}")

--> Mounting Google Drive...
Mounted at /content/drive
--> Google Drive mounted successfully!

--> Loading 30 saved feature chunks from Google Drive...


100%|██████████| 30/30 [00:43<00:00,  1.45s/it]



--> All features loaded and combined successfully!
--> Final feature matrix shape: (75000, 22049)
--> Final labels array shape: (75000,)


In [None]:
# Block 2: Tune and Train the Definitive Model

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

if 'X_final_features' not in locals():
    print("Please run Block 1 first to load the data.")
else:
    # --- Tune Hyperparameters on the Full Dataset ---
    print("\n--> Starting hyperparameter tuning on the full dataset...")
    print("(This is the final long step, it may take several hours)")

    param_dist = {
        'n_estimators': randint(200, 1500),
        'learning_rate': uniform(0.01, 0.05),
        'num_leaves': randint(31, 100),
        'max_depth': [-1, 20, 30],
        'reg_alpha': uniform(0, 0.5), # L1 regularization
        'reg_lambda': uniform(0, 0.5), # L2 regularization
    }

    lgbm = lgb.LGBMRegressor(random_state=42, n_jobs=-1)

    # We'll run 15-20 iterations of search. More is better but takes longer.
    random_search = RandomizedSearchCV(
        lgbm, param_distributions=param_dist, n_iter=20,
        cv=3, scoring='neg_root_mean_squared_error', random_state=42, verbose=2
    )

    random_search.fit(X_final_features, y_final)

    print("\n--> Tuning complete.")
    print(f"--> Best parameters found: {random_search.best_params_}")

    # --- Train the definitive model using the best found parameters ---
    print("\n--> Training the final model on the entire dataset with the best parameters...")
    final_model = lgb.LGBMRegressor(**random_search.best_params_, random_state=42, n_jobs=-1)
    final_model.fit(X_final_features, y_final)

    print("\n\n*** YOUR COMPETITION MODEL IS TRAINED AND READY! ***")
    print("The variable 'final_model' now holds your fully trained and optimized model.")


--> Starting hyperparameter tuning on the full dataset...
(This is the final long step, it may take several hours)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.118772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395253
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17031
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.028727005942368128, max_depth=-1, n_estimators=1494, num_leaves=91, reg_alpha=0.2984250789732435, reg_lambda=0.22291637642679557; total time=66.2min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.091373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401511
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17402
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.028727005942368128, max_depth=-1, n_estimators=1494, num_leaves=91, reg_alpha=0.2984250789732435, reg_lambda=0.22291637642679557; total time=67.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.187392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403444
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17380
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.028727005942368128, max_depth=-1, n_estimators=1494, num_leaves=91, reg_alpha=0.2984250789732435, reg_lambda=0.22291637642679557; total time=60.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.541787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395253
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17031
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.014998745790900145, max_depth=30, n_estimators=287, num_leaves=54, reg_alpha=0.32544423647442644, reg_lambda=0.028205789513550128; total time=12.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.452143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401511
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17402
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.014998745790900145, max_depth=30, n_estimators=287, num_leaves=54, reg_alpha=0.32544423647442644, reg_lambda=0.028205789513550128; total time=13.0min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.733055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403444
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17380
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.014998745790900145, max_depth=30, n_estimators=287, num_leaves=54, reg_alpha=0.32544423647442644, reg_lambda=0.028205789513550128; total time=11.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.049651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395253
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17031
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.04609993861334124, max_depth=20, n_estimators=1005, num_leaves=32, reg_alpha=0.09091248360355031, reg_lambda=0.09170225492671691; total time=21.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 21.190533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401511
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17402
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.04609993861334124, max_depth=20, n_estimators=1005, num_leaves=32, reg_alpha=0.09091248360355031, reg_lambda=0.09170225492671691; total time=21.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.904954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403444
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17380
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.04609993861334124, max_depth=20, n_estimators=1005, num_leaves=32, reg_alpha=0.09091248360355031, reg_lambda=0.09170225492671691; total time=19.8min


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7886687319e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


Auto-choosing col-wise multi-threading, the overhead of testing was 22.284802 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395253
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17031
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.02521211214797689, max_depth=20, n_estimators=452, num_leaves=79, reg_alpha=0.2623873301291946, reg_lambda=0.19993048585762774; total time=20.5min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.632008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401511
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17402
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.02521211214797689, max_depth=20, n_estimators=452, num_leaves=79, reg_alpha=0.2623873301291946, reg_lambda=0.19993048585762774; total time=20.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.664217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403444
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17380
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.02521211214797689, max_depth=20, n_estimators=452, num_leaves=79, reg_alpha=0.2623873301291946, reg_lambda=0.19993048585762774; total time=18.4min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.469565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395253
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17031
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.012333283160680771, max_depth=30, n_estimators=389, num_leaves=92, reg_alpha=0.3925879806965068, reg_lambda=0.09983689107917987; total time=23.8min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.934086 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401511
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17402
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.012333283160680771, max_depth=30, n_estimators=389, num_leaves=92, reg_alpha=0.3925879806965068, reg_lambda=0.09983689107917987; total time=24.2min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.830296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403444
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17380
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.012333283160680771, max_depth=30, n_estimators=389, num_leaves=92, reg_alpha=0.3925879806965068, reg_lambda=0.09983689107917987; total time=21.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.487601 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395253
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17031
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.03571172192068058, max_depth=-1, n_estimators=1354, num_leaves=81, reg_alpha=0.34015376929388985, reg_lambda=0.2252496259847715; total time=59.0min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.442280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401511
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17402
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.03571172192068058, max_depth=-1, n_estimators=1354, num_leaves=81, reg_alpha=0.34015376929388985, reg_lambda=0.2252496259847715; total time=56.3min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.284936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403444
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17380
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.03571172192068058, max_depth=-1, n_estimators=1354, num_leaves=81, reg_alpha=0.34015376929388985, reg_lambda=0.2252496259847715; total time=48.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 21.069480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395253
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17031
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.010663248057993327, max_depth=-1, n_estimators=515, num_leaves=44, reg_alpha=0.40419867405823057, reg_lambda=0.15230688458668534; total time=20.0min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.226698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401511
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 17402
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.010663248057993327, max_depth=-1, n_estimators=515, num_leaves=44, reg_alpha=0.40419867405823057, reg_lambda=0.15230688458668534; total time=19.8min
