In [1]:
%%writefile dataloader.py
import numpy as np
import pandas as pd 

import warnings
warnings.filterwarnings('ignore')

def load_data(train_path, test_path):
    """
    Read the training and testing CSV files into Pandas DataFrames.

    Any exception raised while reading (or while printing the summary) is
    caught, reported on stdout, and signalled to the caller by returning
    ``(None, None)`` instead of propagating.

    Parameters:
    - train_path (str): Path to the training CSV file.
    - test_path (str): Path to the testing CSV file.

    Returns:
    - train_df (pd.DataFrame or None): Training dataset, None on failure.
    - test_df (pd.DataFrame or None): Testing dataset, None on failure.
    """
    try:
        # The generator is consumed in order: train is read first, then test.
        train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
        print(f"✅ Data loaded successfully!\n - Train shape: {train_df.shape}\n - Test shape: {test_df.shape}")
    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return None, None
    else:
        return train_df, test_df

Writing dataloader.py


In [2]:
%%writefile preprocessing.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import KFold, GroupKFold
import statistics
import math
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class TimeSeriesFeatureEngineer:
    """
    Builds lag, lead and first-difference features over the date-ordered
    concatenation of a train and a test frame, then splits the result back.
    """

    def __init__(self, date_col='DATE', target_col='Target', diff_cols=None, n_lags=2):
        """
        Parameters:
        - date_col: str, name of the column both frames are ordered by
        - target_col: str, name of the column that is shifted into lag/lead features
        - diff_cols: list, columns that receive a first-order difference feature
        - n_lags: int, how many lag AND lead steps to generate (1..n_lags)
        """
        self.date_col = date_col
        self.target_col = target_col
        self.diff_cols = diff_cols or []
        self.n_lags = n_lags

    def transform(self, train_df, test_df):
        """
        Return ``(train_processed, test_processed)`` with the engineered columns.

        The inputs are not modified: each is sorted into a fresh frame before
        the temporary 'Set' marker is attached. The returned frames keep the
        row index of the combined, date-sorted dataset.
        """
        # Order each frame by date and remember where every row came from.
        tagged_frames = []
        for frame, origin in ((train_df, 'train'), (test_df, 'test')):
            ordered = frame.sort_values(self.date_col).reset_index(drop=True)
            ordered['Set'] = origin
            tagged_frames.append(ordered)

        # One shared timeline, so shifts/diffs cross the train/test boundary.
        combined = (
            pd.concat(tagged_frames, axis=0)
            .sort_values(self.date_col)
            .reset_index(drop=True)
        )

        # Target shifted backwards (lag) and forwards (lead) along the timeline.
        target_series = combined[self.target_col]
        for step in range(1, self.n_lags + 1):
            combined[f'{self.target_col}_Lag_{step}'] = target_series.shift(step)
            combined[f'{self.target_col}_Lead_{step}'] = target_series.shift(-step)

        # Row-to-row change of the selected columns.
        for name in self.diff_cols:
            combined[f'{name}_diff1'] = combined[name].diff()

        # Separate the two sets again and discard the helper marker.
        train_processed = combined[combined['Set'] == 'train'].drop(columns=['Set'])
        test_processed = combined[combined['Set'] == 'test'].drop(columns=['Set'])

        return train_processed, test_processed


def prepare_features(train_df, test_df, target_col='Target', drop_cols=None):
    """
    Split the processed frames into model inputs and the training target.

    Parameters:
    - train_df: pd.DataFrame, training dataset (must contain target_col)
    - test_df: pd.DataFrame, test dataset (must contain every kept feature column)
    - target_col: str, name of target column
    - drop_cols: list, extra columns to exclude from the features; names that
      are absent from train_df are silently ignored

    Returns:
    - X: pd.DataFrame, training features
    - y: pd.Series, training target
    - X_test: pd.DataFrame, test features restricted to X's columns, in X's order
    """
    extra_cols = [] if drop_cols is None else drop_cols

    # The target is always excluded from the features; the set removes duplicates.
    excluded = list({*(extra_cols + [target_col])})

    y = train_df[target_col]
    X = train_df.drop(columns=excluded, errors='ignore')

    # Selecting by X.columns keeps train and test feature layouts identical.
    return X, y, test_df[X.columns]


Writing preprocessing.py


In [3]:
%%writefile models.py
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import KFold, GroupKFold
import statistics
import math
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


class CatBoostCV:
    """
    Cross-validation trainer for CatBoostRegressor with a tqdm progress bar.

    One model is trained per fold. After ``fit`` the instance holds the fitted
    fold models, the out-of-fold (OOF) predictions for the training rows, the
    fold-averaged test predictions and the per-fold RMSE values.
    """

    def __init__(self,
                 n_splits=10,
                 random_state=42,
                 use_groups=False,
                 model_params=None):
        """
        Parameters:
        - n_splits: int, number of folds
        - random_state: int, seed for the shuffled KFold and the default model params
        - use_groups: bool, use GroupKFold (requires `groups` in fit) instead of KFold
        - model_params: dict, CatBoost hyperparameters; when empty/None a default
          RMSE configuration with early stopping is used
        """
        self.n_splits = n_splits
        self.random_state = random_state
        self.use_groups = use_groups
        self.model_params = model_params if model_params else {
            'iterations': 1000,
            'loss_function': 'RMSE',
            'eval_metric': 'RMSE',
            'early_stopping_rounds': 100,
            'verbose': 0,
            'random_seed': random_state
        }
        self.models = []
        self.oof_preds = None
        self.test_preds = None
        self.rmse_list = []

    def fit(self, X, y, X_test, groups=None):
        """
        Fits one model per fold and stores OOF/test predictions.

        Results of any previous ``fit`` call on this instance are discarded.

        Parameters:
        - X: pd.DataFrame, training features
        - y: pd.Series, target
        - X_test: pd.DataFrame, test features
        - groups: pd.Series or array, group labels for GroupKFold (if used);
          must be in the same row order as X

        Returns:
        - self

        Raises:
        - ValueError: if use_groups is True and groups is None
        """
        # Choose splitter
        if self.use_groups:
            if groups is None:
                raise ValueError("Groups must be provided for GroupKFold.")
            splitter = GroupKFold(n_splits=self.n_splits)
            split_data = splitter.split(X, y, groups)
        else:
            splitter = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
            split_data = splitter.split(X, y)

        # Start from a clean slate so a second fit() does not mix in the
        # models and scores of the previous run.
        self.models = []
        self.rmse_list = []
        self.oof_preds = np.zeros(len(X))
        self.test_preds = np.zeros(len(X_test))

        # The test pool does not depend on the fold, so build it once.
        test_pool = Pool(X_test)

        for train_idx, val_idx in tqdm(split_data, total=self.n_splits, desc="Training Folds"):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            train_pool = Pool(X_tr, y_tr)
            val_pool = Pool(X_val, y_val)

            # The validation fold doubles as the eval set used to pick the best iteration.
            model = CatBoostRegressor(**self.model_params)
            model.fit(train_pool, eval_set=val_pool, use_best_model=True)
            self.models.append(model)

            # OOF predictions
            fold_preds = model.predict(X_val)
            self.oof_preds[val_idx] = fold_preds

            # sqrt(MSE) rather than mean_squared_error(squared=False): the
            # `squared` argument is deprecated/removed in recent scikit-learn.
            rmse = np.sqrt(mean_squared_error(y_val, fold_preds))
            self.rmse_list.append(rmse)

            # Each fold contributes an equal share to the test prediction.
            self.test_preds += model.predict(test_pool) / self.n_splits

        print(f"\nMean CV RMSE: {np.mean(self.rmse_list):.4f}")
        return self

    def get_oof(self):
        """Return the out-of-fold predictions (None before fit)."""
        return self.oof_preds

    def get_test_preds(self):
        """Return the fold-averaged test predictions (None before fit)."""
        return self.test_preds

    def get_models(self):
        """Return the list of fitted fold models from the latest fit."""
        return self.models

class LightGBMCV:
    """
    Cross-validation trainer for LightGBM (``lgb.train``) with a tqdm progress bar.

    One booster is trained per fold. After ``fit`` the instance holds the
    fitted boosters, the out-of-fold (OOF) predictions for the training rows,
    the fold-averaged test predictions and the per-fold RMSE values.
    """

    def __init__(self,
                 n_splits=10,
                 random_state=42,
                 use_groups=False,
                 model_params=None,
                 num_boost_round=1000):
        """
        Parameters:
        - n_splits: int, number of folds
        - random_state: int, seed for the shuffled KFold and the default model params
        - use_groups: bool, use GroupKFold (requires `groups` in fit) instead of KFold
        - model_params: dict, LightGBM hyperparameters passed to lgb.train as-is;
          when empty/None a default RMSE regression configuration is used
        - num_boost_round: int, max boosting rounds
        """
        self.n_splits = n_splits
        self.random_state = random_state
        self.use_groups = use_groups
        self.num_boost_round = num_boost_round
        self.model_params = model_params if model_params else {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'learning_rate': 0.05,
            'num_leaves': 64,
            'max_depth': -1,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'random_state': random_state,
            'early_stopping_rounds': 100,
        }
        self.models = []
        self.oof_preds = None
        self.test_preds = None
        self.rmse_list = []

    def fit(self, X, y, X_test, groups=None):
        """
        Fits one booster per fold and stores OOF/test predictions.

        Results of any previous ``fit`` call on this instance are discarded.

        Parameters:
        - X: pd.DataFrame, training features
        - y: pd.Series, target
        - X_test: pd.DataFrame, test features
        - groups: pd.Series or array, group labels for GroupKFold (if used);
          must be in the same row order as X

        Returns:
        - self

        Raises:
        - ValueError: if use_groups is True and groups is None
        """
        # Choose splitter
        if self.use_groups:
            if groups is None:
                raise ValueError("Groups must be provided for GroupKFold.")
            splitter = GroupKFold(n_splits=self.n_splits)
            split_data = splitter.split(X, y, groups)
        else:
            splitter = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
            split_data = splitter.split(X, y)

        # Start from a clean slate so a second fit() does not mix in the
        # boosters and scores of the previous run.
        self.models = []
        self.rmse_list = []
        self.oof_preds = np.zeros(len(X))
        self.test_preds = np.zeros(len(X_test))

        for train_idx, val_idx in tqdm(split_data, total=self.n_splits, desc="Training Folds"):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            train_set = lgb.Dataset(X_tr, label=y_tr)
            val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

            model = lgb.train(
                self.model_params,
                train_set,
                num_boost_round=self.num_boost_round,
                valid_sets=[train_set, val_set],
                valid_names=['train', 'valid'],
            )
            self.models.append(model)

            # OOF predictions, using the iteration count LightGBM recorded as best.
            fold_preds = model.predict(X_val, num_iteration=model.best_iteration)
            self.oof_preds[val_idx] = fold_preds

            # sqrt(MSE) rather than mean_squared_error(squared=False): the
            # `squared` argument is deprecated/removed in recent scikit-learn.
            rmse = np.sqrt(mean_squared_error(y_val, fold_preds))
            self.rmse_list.append(rmse)

            # Each fold contributes an equal share to the test prediction.
            self.test_preds += model.predict(X_test, num_iteration=model.best_iteration) / self.n_splits

        print(f"\nMean CV RMSE: {np.mean(self.rmse_list):.4f} ± {np.std(self.rmse_list):.4f}")
        return self

    def get_oof(self):
        """Return the out-of-fold predictions (None before fit)."""
        return self.oof_preds

    def get_test_preds(self):
        """Return the fold-averaged test predictions (None before fit)."""
        return self.test_preds

    def get_models(self):
        """Return the list of fitted fold boosters from the latest fit."""
        return self.models
        

class StackingRegressor:
    """
    Second-level model that learns how to combine base-model predictions.

    The meta-model is trained with K-fold cross-validation on the base models'
    out-of-fold predictions; test predictions are the average over the fold
    meta-models.
    """

    def __init__(self, meta_model=None, n_splits=10, random_state=42):
        """
        Parameters:
        - meta_model: sklearn-style regressor exposing get_params/fit/predict
          (default: Ridge regression)
        - n_splits: int, number of CV folds for stacking
        - random_state: int, reproducibility
        """
        # Compare against None explicitly: truthiness of an estimator is not
        # reliable (some define __len__).
        self.meta_model = meta_model if meta_model is not None else Ridge(alpha=1.0)
        self.n_splits = n_splits
        self.random_state = random_state
        self.oof_preds = None
        self.test_preds = None
        self.rmse_list = []
        self.models = []

    def fit(self, oof_list, y, test_list):
        """
        Fits the stacking model.

        Results of any previous ``fit`` call on this instance are discarded.

        Parameters:
        - oof_list: list of arrays (OOF predictions from base models)
        - y: Series or array, true target values, in the same row order as the
          OOF arrays
        - test_list: list of arrays (test predictions from base models)

        Returns:
        - self
        """
        # Combine OOF & test predictions: one column per base model.
        X_stack = np.column_stack(oof_list)
        X_test_stack = np.column_stack(test_list)

        # Work on a plain array so fold indices are always positional; indexing
        # a pandas Series with an integer array would look labels up instead.
        y = np.asarray(y)

        # KFold for meta-model training
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)

        # Start from a clean slate so a second fit() does not accumulate state.
        self.models = []
        self.rmse_list = []
        self.oof_preds = np.zeros(len(y))
        self.test_preds = np.zeros(len(X_test_stack))

        for train_idx, val_idx in tqdm(kf.split(X_stack, y), total=self.n_splits, desc="Stacking Folds"):
            X_tr, X_val = X_stack[train_idx], X_stack[val_idx]
            y_tr, y_val = y[train_idx], y[val_idx]

            model = self._clone_model()
            model.fit(X_tr, y_tr)
            self.models.append(model)

            # OOF predictions
            fold_preds = model.predict(X_val)
            self.oof_preds[val_idx] = fold_preds

            # sqrt(MSE) rather than mean_squared_error(squared=False): the
            # `squared` argument is deprecated/removed in recent scikit-learn.
            rmse = np.sqrt(mean_squared_error(y_val, fold_preds))
            self.rmse_list.append(rmse)

            # Each fold contributes an equal share to the test prediction.
            self.test_preds += model.predict(X_test_stack) / self.n_splits

        print(f"\nStacking CV RMSE: {np.mean(self.rmse_list):.4f} ± {np.std(self.rmse_list):.4f}")
        return self

    def _clone_model(self):
        """Build a fresh, unfitted meta-model with the same constructor parameters."""
        import copy

        # deep=False returns only the constructor arguments; the deep form adds
        # nested "<component>__<param>" keys that a constructor would reject.
        # Values are deep-copied so fold models never share mutable parameters.
        params = self.meta_model.get_params(deep=False)
        return type(self.meta_model)(**{name: copy.deepcopy(value) for name, value in params.items()})

    def get_oof(self):
        """Return the meta-model's out-of-fold predictions (None before fit)."""
        return self.oof_preds

    def get_test_preds(self):
        """Return the fold-averaged stacked test predictions (None before fit)."""
        return self.test_preds

Writing models.py


In [4]:
%%writefile utils.py
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import KFold, GroupKFold
import statistics
import math
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

def ensemble_predictions(pred_list, weights=None):
    """
    Combines multiple prediction arrays into one weighted-average prediction.

    Parameters:
    - pred_list: non-empty sequence of array-likes, each containing model predictions
    - weights: sequence of floats, one per entry of pred_list (must sum to 1).
      If None, equal weights are used.

    Returns:
    - np.array: ensembled predictions (float)

    Raises:
    - ValueError: if pred_list is empty, if the number of weights differs from
      the number of prediction arrays, or if the weights do not sum to 1
    """
    preds = [np.asarray(p) for p in pred_list]
    if not preds:
        raise ValueError("pred_list must contain at least one prediction array.")

    if weights is None:
        weights = [1 / len(preds)] * len(preds)
    else:
        weights = list(weights)
        # Without this check zip() below would silently drop the surplus
        # predictions (or weights) and return a wrong ensemble.
        if len(weights) != len(preds):
            raise ValueError(
                f"Expected {len(preds)} weights (one per prediction array), got {len(weights)}."
            )
        if not np.isclose(sum(weights), 1.0):
            raise ValueError("Weights must sum to 1.")

    ensemble = np.zeros_like(preds[0], dtype=float)
    for p, w in zip(preds, weights):
        ensemble += w * p
    return ensemble

Writing utils.py


In [5]:
%%writefile config.py

# Locations of the competition CSV files (absolute Kaggle input paths).
TRAIN_PATH = "/kaggle/input/localised-precipitation-forecasting/Train_data.csv"
TEST_PATH = "/kaggle/input/localised-precipitation-forecasting/Test_data.csv"

# Columns that receive a first-order difference feature (passed as `diff_cols`
# to TimeSeriesFeatureEngineer in train.py).
DIFF_COLUMNS = ['RH2M', 'QV2M', 'T2MDEW', 'PS']
# Columns excluded from the feature matrix of each model family (passed as
# `drop_cols` to prepare_features, which ignores names that are not present).
# The CatBoost list additionally drops 'DY'.
DROP_COLUMNS_CAT = ['Set','DATE', 'ID', 'DY']
DROP_COLUMNS_LGB = ['Set','DATE', 'ID']

# Number of cross-validation folds used by the base models and the stacker.
N_SPLITS = 10

Writing config.py


In [6]:
%%writefile train.py
import joblib
import pandas as pd
import numpy as np
from dataloader import load_data
from preprocessing import TimeSeriesFeatureEngineer, prepare_features
from models import CatBoostCV, LightGBMCV, StackingRegressor
from utils import ensemble_predictions
from config import *
from sklearn.linear_model import Ridge
import os

def run_training():
    """
    Train the full pipeline and persist every artifact under 'artifacts/'.

    Steps: load the CSVs, build lag/lead/diff features, fit CatBoost and
    LightGBM each with plain KFold and with GroupKFold, then fit a Ridge
    stacker on the averaged out-of-fold predictions. The fitted CV objects
    (which carry their test predictions) are dumped with joblib.

    Raises:
    - RuntimeError: if the train/test data could not be loaded
    """
    # Ensure artifacts directory exists
    os.makedirs("artifacts", exist_ok=True)

    # Load data; load_data reports failures by returning (None, None).
    train_df, test_df = load_data(TRAIN_PATH, TEST_PATH)
    if train_df is None or test_df is None:
        raise RuntimeError("Training aborted: the train/test data could not be loaded.")

    # Feature engineering
    fe = TimeSeriesFeatureEngineer(date_col='DATE', target_col='Target', diff_cols=DIFF_COLUMNS, n_lags=2)
    train_processed, test_processed = fe.transform(train_df, test_df)
    joblib.dump(fe, "artifacts/fe.pkl")
    print("artifacts/fe.pkl")

    # Group labels must come from the SAME frame the features are built from:
    # fe.transform re-sorts the rows by date, and the splitter pairs groups
    # with feature rows by position, so taking 'YEAR' from the raw train_df
    # would mislabel rows whenever the CSV is not already in date order.
    groups = train_processed.get('YEAR')

    # CatBoost
    X_cb, y_cb, X_test_cb = prepare_features(train_processed, test_processed, target_col='Target', drop_cols=DROP_COLUMNS_CAT)
    cb_plain = CatBoostCV(n_splits=N_SPLITS, use_groups=False)
    cb_plain.fit(X_cb, y_cb, X_test_cb)
    cb_group = CatBoostCV(n_splits=N_SPLITS, use_groups=True)
    cb_group.fit(X_cb, y_cb, X_test_cb, groups=groups)
    cb_preds = ensemble_predictions([cb_plain.get_test_preds(), cb_group.get_test_preds()])
    joblib.dump((cb_plain, cb_group), "artifacts/cb_model.pkl")

    # LightGBM
    X_lgb, y_lgb, X_test_lgb = prepare_features(train_processed, test_processed, target_col='Target', drop_cols=DROP_COLUMNS_LGB)
    lgb_plain = LightGBMCV(n_splits=N_SPLITS, use_groups=False)
    lgb_plain.fit(X_lgb, y_lgb, X_test_lgb)
    lgb_group = LightGBMCV(n_splits=N_SPLITS, use_groups=True)
    lgb_group.fit(X_lgb, y_lgb, X_test_lgb, groups=groups)
    lgb_preds = ensemble_predictions([lgb_plain.get_test_preds(), lgb_group.get_test_preds()])
    joblib.dump((lgb_plain, lgb_group), "artifacts/lgb_model.pkl")

    # Stacking: one meta-feature per model family, each the mean of the
    # plain-KFold and GroupKFold variants (OOF for training, test for scoring).
    oof_list = [(cb_plain.get_oof() + cb_group.get_oof()) / 2, (lgb_plain.get_oof() + lgb_group.get_oof()) / 2]
    test_list = [cb_preds, lgb_preds]
    stacker = StackingRegressor(meta_model=Ridge(alpha=1), n_splits=N_SPLITS)
    stacker.fit(oof_list, y_lgb.values, test_list)
    joblib.dump(stacker, "artifacts/stacker.pkl")

    print("✅ Training complete. Models saved in 'artifacts/'.")

def run_inference(output_path="predictions.csv"):
    """
    Write the submission file from the artifacts produced by run_training.

    No model is re-scored here: the stacked test predictions were computed and
    stored on the stacker during training. This function rebuilds the
    processed test frame only to obtain the 'ID' column in the same
    (date-sorted) row order, clips the predictions at zero and saves them.

    Parameters:
    - output_path: str, destination CSV path

    Raises:
    - RuntimeError: if the train/test data could not be loaded, or the stacker
      carries no test predictions
    - ValueError: if the stored predictions do not match the number of test rows
    """
    # Load data; load_data reports failures by returning (None, None).
    train_df, test_df = load_data(TRAIN_PATH, TEST_PATH)
    if train_df is None or test_df is None:
        raise RuntimeError("Inference aborted: the train/test data could not be loaded.")

    # Load preprocessing and the stacker. joblib uses pickle under the hood,
    # so only load artifacts produced by a trusted training run.
    fe = joblib.load("artifacts/fe.pkl")
    stacker = joblib.load("artifacts/stacker.pkl")

    # Feature engineering (reuse training logic); only the test part is needed.
    _, test_processed = fe.transform(train_df, test_df)

    # Final stacked predictions stored at training time.
    stacked_preds = stacker.get_test_preds()
    if stacked_preds is None:
        raise RuntimeError("The loaded stacker has no test predictions; run training first.")
    stacked_preds = np.asarray(stacked_preds)
    if len(stacked_preds) != len(test_processed):
        raise ValueError(
            f"Stored predictions ({len(stacked_preds)}) do not match the number of "
            f"test rows ({len(test_processed)}); the artifacts are stale for this test file."
        )

    # Clip negatives to zero. An open upper bound avoids clip(x, 0, x.max())
    # returning negative values when every prediction is below zero.
    submission = pd.DataFrame({'ID': test_processed['ID'], 'Target': np.clip(stacked_preds, 0, None)})
    submission.to_csv(output_path, index=False)
    print(f"✅ Inference complete. Predictions saved to {output_path}")

    print(submission)

Writing train.py


In [7]:
%%writefile main.py
import argparse
from train import run_training, run_inference

def main():
    """
    Command-line entry point.

    Parses ``--mode`` (required: 'train' or 'infer') and ``--output``
    (default 'predictions.csv'), then dispatches to run_training() or
    run_inference(output).
    """
    cli = argparse.ArgumentParser(description="Run ML pipeline")
    cli.add_argument(
        "--mode",
        required=True,
        choices=["train", "infer"],
        help="Mode: 'train' to train models, 'infer' to generate predictions",
    )
    cli.add_argument(
        "--output",
        default="predictions.csv",
        type=str,
        help="Path to save predictions (default: predictions.csv)",
    )
    options = cli.parse_args()

    # argparse already restricts --mode to the two choices handled below.
    if options.mode == "infer":
        print("🔮 Running inference...")
        run_inference(options.output)
        return

    if options.mode == "train":
        print("🚀 Starting training...")
        run_training()

if __name__ == "__main__":
    main()

Writing main.py


In [8]:
!python main.py --mode train

🚀 Starting training...
✅ Data loaded successfully!
 - Train shape: (3579, 13)
 - Test shape: (1535, 12)
artifacts/fe.pkl
Training Folds: 100%|███████████████████████████| 10/10 [00:08<00:00,  1.20it/s]

Mean CV RMSE: 6.7329
Training Folds: 100%|███████████████████████████| 10/10 [00:08<00:00,  1.19it/s]

Mean CV RMSE: 6.4441
Training Folds: 100%|███████████████████████████| 10/10 [00:06<00:00,  1.61it/s]

Mean CV RMSE: 6.7136 ± 1.1918
Training Folds: 100%|███████████████████████████| 10/10 [00:05<00:00,  1.86it/s]

Mean CV RMSE: 6.4691 ± 1.7824
Stacking Folds: 100%|██████████████████████████| 10/10 [00:00<00:00, 180.18it/s]

Stacking CV RMSE: 6.6452 ± 1.1880
✅ Training complete. Models saved in 'artifacts/'.


In [9]:
!python main.py --mode infer --output predictions.csv

🔮 Running inference...
✅ Data loaded successfully!
 - Train shape: (3579, 13)
 - Test shape: (1535, 12)
✅ Inference complete. Predictions saved to predictions.csv
                      ID     Target
8     ID_AjcgJM_20100109   2.776282
12    ID_1FILm2_20100113   9.799237
14    ID_uRUSlU_20100115   3.835452
17    ID_cJcBK7_20100118   3.669714
19    ID_zEuY13_20100120   2.130194
...                  ...        ...
5096  ID_0y8uNw_20231215   4.008837
5104  ID_Qj20ey_20231223   3.984340
5110  ID_BgUL19_20231229  10.963587
5112  ID_ow4iio_20231231   2.724987
5113  ID_V5zPUV_20240101   3.361575

[1535 rows x 2 columns]
