# Model Ensemble Approach

Cleaner workflow:
1. Split data into train/validation
2. Tune hyperparameters for each model type (XGBoost, LightGBM, CatBoost)
3. Generate EWGM (Exponentially Weighted Geometric Mean) predictions
4. Find optimal ensemble weights across all 4 models
5. Retrain everything on full data
6. Predict on test set with weighted ensemble

In [34]:
# Quick test mode - reduces iterations/models for faster iteration
# Set to False for final submission run
TEST_MODE = False

## Imports

In [35]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

print(f"Running in {'TEST' if TEST_MODE else 'PRODUCTION'} mode")

Running in PRODUCTION mode


## EWGM (Exponentially Weighted Geometric Mean) Implementation

A statistical model that uses exponentially weighted geometric means to predict future values based on historical patterns. Works particularly well for capturing seasonality effects like December spikes.

In [36]:
def build_month_codes():
    """Map month abbreviations to numbers"""
    return {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
        'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
        'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
    }

def split_test_id_column(df):
    """Split test ID into month_text and sector"""
    parts = df.id.str.split('_', expand=True)
    df['month_text'] = parts[0]
    df['sector'] = parts[1]
    return df

def add_time_and_sector_fields(df, month_codes):
    """Add time and sector_id fields"""
    # Extract sector_id from sector column
    if 'sector' in df.columns and 'sector_id' not in df.columns:
        if df['sector'].dtype == 'object' and 'sector' in df['sector'].iloc[0]:
            df['sector_id'] = df.sector.str.slice(7, None).astype(int)
        else:
            # If sector is just the ID, use it directly
            df['sector_id'] = df['sector'].astype(int) if df['sector'].dtype != 'int64' else df['sector']
    
    # Handle month_text column - create it if it doesn't exist
    if 'month_text' not in df.columns:
        # If we have 'month' column in format '2019-Jan', use it
        if 'month' in df.columns:
            df['month_text'] = df['month']
    
    # Extract month number and year
    if 'month' not in df.columns or df['month'].dtype == 'object':
        # Parse from month_text
        df['month_num'] = df['month_text'].str.slice(5, None).map(month_codes)
        df['year'] = df['month_text'].str.slice(0, 4).astype(int)
    else:
        # Already have numeric month
        df['month_num'] = df['month']
        if 'year' not in df.columns:
            df['year'] = df['month_text'].str.slice(0, 4).astype(int)
    
    df['time'] = (df['year'] - 2019) * 12 + df['month_num'] - 1
    
    return df

def build_amount_matrix(train_nht, month_codes):
    """
    Build a matrix of transaction amounts: rows=time, columns=sector_id
    Missing sector-month combinations are filled with 0
    """
    train_nht = train_nht.copy()
    
    # Ensure we have month_text column
    if 'month_text' not in train_nht.columns and 'month' in train_nht.columns:
        train_nht['month_text'] = train_nht['month']
    
    train_nht = add_time_and_sector_fields(train_nht, month_codes)
    
    # Pivot: time as index, sector_id as columns
    pivot = train_nht.set_index(['time', 'sector_id']).amount_new_house_transactions.unstack()
    pivot = pivot.fillna(0)
    
    # Ensure all 96 sectors exist
    all_sectors = np.arange(1, 97)
    for s in all_sectors:
        if s not in pivot.columns:
            pivot[s] = 0
    
    pivot = pivot[all_sectors]
    return pivot

def compute_december_multipliers(a_tr, eps=1e-9, min_dec_obs=1, clip_low=0.8, clip_high=1.5):
    """
    Compute sector-specific December multipliers
    December months have time % 12 == 11
    """
    is_december = (a_tr.index.values % 12) == 11
    dec_counts = a_tr[is_december].astype(bool).sum(axis=0)
    
    dec_means = a_tr[is_december].mean(axis=0)
    nondec_means = a_tr[~is_december].mean(axis=0)
    
    raw_mult = dec_means / (nondec_means + eps)
    overall_mult = float(dec_means.mean() / (nondec_means.mean() + eps))
    
    # Use overall multiplier where insufficient December observations
    raw_mult = raw_mult.where(dec_counts >= min_dec_obs, overall_mult)
    
    # Replace inf/-inf with 1.0
    raw_mult = raw_mult.replace([np.inf, -np.inf], 1.0).fillna(1.0)
    
    # Clip to reasonable range
    clipped_mult = raw_mult.clip(lower=clip_low, upper=clip_high)
    
    return clipped_mult.to_dict()

def apply_december_bump(a_pred, sector_to_mult):
    """Apply December multipliers to predictions"""
    dec_rows = [t for t in a_pred.index.values if (t % 12) == 11]
    
    if len(dec_rows) == 0:
        return a_pred
    
    for sector in a_pred.columns:
        m = sector_to_mult.get(sector, 1.0)
        a_pred.loc[dec_rows, sector] = a_pred.loc[dec_rows, sector] * m
    
    return a_pred

def ewgm_per_sector(a_tr, sector, n_lags, alpha):
    """
    Calculate Exponentially Weighted Geometric Mean for one sector
    
    EWGM = exp(sum(weights * log(values)))
    
    Parameters:
    - a_tr: training amount matrix
    - sector: sector ID
    - n_lags: number of historical values to use
    - alpha: decay parameter (0-1), higher = more weight on recent
    """
    # Calculate exponential weights
    weights = np.array([alpha**(n_lags - 1 - i) for i in range(n_lags)], dtype=float)
    weights = weights / weights.sum()  # Normalize
    
    # Get historical values
    values = a_tr[sector].values
    
    # Take last n_lags observations
    if len(values) < n_lags:
        values = np.pad(values, (n_lags - len(values), 0), constant_values=0)
    else:
        values = values[-n_lags:]
    
    # Add epsilon to avoid log(0)
    values = values + 1e-10
    
    # Compute EWGM
    log_values = np.log(values)
    ewgm = np.exp(np.dot(weights, log_values))
    
    return ewgm

def generate_ewgm_predictions(train_data, test_data, n_lags=6, alpha=0.9):
    """
    Generate EWGM predictions for test set
    
    Parameters:
    - train_data: training DataFrame with amount_new_house_transactions
    - test_data: test DataFrame with id column
    - n_lags: number of lags to use (default: 6)
    - alpha: exponential decay parameter (default: 0.9)
    
    Returns:
    - predictions: array of predictions matching test_data order
    """
    month_codes = build_month_codes()
    
    # Build amount matrix from training data
    a_tr = build_amount_matrix(train_data, month_codes)
    
    # Compute December multipliers
    sector_to_mult = compute_december_multipliers(a_tr)
    
    # Prepare test data
    test_df = test_data.copy()
    test_df = split_test_id_column(test_df)
    test_df = add_time_and_sector_fields(test_df, month_codes)
    
    # Generate predictions
    predictions = []
    
    for idx, row in test_df.iterrows():
        sector_id = row['sector_id']
        time = row['time']
        
        # Calculate EWGM for this sector
        ewgm_pred = ewgm_per_sector(a_tr, sector_id, n_lags, alpha)
        
        # Apply December bump if applicable
        if (time % 12) == 11:  # December
            mult = sector_to_mult.get(sector_id, 1.0)
            ewgm_pred = ewgm_pred * mult
        
        predictions.append(ewgm_pred)
    
    return np.array(predictions)

print("EWGM functions defined")

EWGM functions defined


## Competition Metric Implementation

In [37]:
def competition_metric(y_true, y_pred, eps=1e-10, verbose=True):
    """
    Two-stage metric from competition rules:
    - Stage 1: If >30% samples have APE>100%, score=0
    - Stage 2: Otherwise, calculate scaled MAPE on samples with APE<=100%
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    
    # Absolute percentage error for each sample
    ape = np.abs(y_true - y_pred) / np.maximum(y_true, eps)
    
    # Stage 1 check
    extreme_errors = (ape > 1.0).sum() / len(ape)
    if verbose:
        print(f"  APE > 100%: {extreme_errors*100:.1f}%")
    
    if extreme_errors > 0.3:
        if verbose:
            print("  FAILED Stage 1")
        return 0.0
    
    # Stage 2: scaled MAPE on good predictions only
    valid_mask = ape <= 1.0
    if valid_mask.sum() == 0:
        return 0.0
    
    mape = ape[valid_mask].mean()
    fraction_valid = valid_mask.sum() / len(ape)
    scaled_mape = mape / fraction_valid
    score = max(0, 1 - scaled_mape)
    
    if verbose:
        print(f"  Valid samples: {fraction_valid*100:.1f}%")
        print(f"  MAPE (valid): {mape:.4f}")
        print(f"  Scaled MAPE: {scaled_mape:.4f}")
    
    return score

## Load Data

In [38]:
file_path = "./data/"

# All the training data files
new_house = pd.read_csv(file_path + 'train/new_house_transactions.csv')
new_house_nearby = pd.read_csv(file_path + 'train/new_house_transactions_nearby_sectors.csv')
pre_owned = pd.read_csv(file_path + 'train/pre_owned_house_transactions.csv')
pre_owned_nearby = pd.read_csv(file_path + 'train/pre_owned_house_transactions_nearby_sectors.csv')
land_trans = pd.read_csv(file_path + 'train/land_transactions.csv')
land_trans_nearby = pd.read_csv(file_path + 'train/land_transactions_nearby_sectors.csv')
sector_poi = pd.read_csv(file_path + 'train/sector_POI.csv')
city_indexes = pd.read_csv(file_path + 'train/city_indexes.csv')
search_index = pd.read_csv(file_path + 'train/city_search_index.csv')
test = pd.read_csv(file_path + 'test.csv')
test[['month', 'sector']] = test['id'].str.split('_', n=1, expand=True)

print(f"Training samples: {len(new_house)}")
print(f"Test samples: {len(test)}")

Training samples: 5433
Test samples: 1152


## Feature Engineering Function

Putting this in a function so I can reuse it for train/val/test without code duplication

In [39]:
def create_base_features(df, target='amount_new_house_transactions'):
    """
    Merge all auxiliary datasets and create basic features
    (everything except lag features which need special handling)
    """
    data = df.copy()
    
    # Merge all the datasets
    data = data.merge(new_house_nearby, on=['month', 'sector'], how='left')
    data = data.merge(pre_owned, on=['month', 'sector'], how='left')
    data = data.merge(pre_owned_nearby, on=['month', 'sector'], how='left')
    data = data.merge(land_trans, on=['month', 'sector'], how='left')
    data = data.merge(land_trans_nearby, on=['month', 'sector'], how='left')
    data = data.merge(sector_poi, on='sector', how='left')
    
    # DateTime features
    data['month_date'] = pd.to_datetime(data['month'], format='%Y-%b' if 'id' not in data.columns else '%Y %b')
    data['year'] = data['month_date'].dt.year
    data['month_num'] = data['month_date'].dt.month
    data['quarter'] = data['month_date'].dt.quarter
    
    # City-level economic indicators
    data = data.merge(city_indexes, left_on='year', right_on='city_indicator_data_year', how='left')
    
    # Search volume aggregates by month
    search_agg = search_index.groupby('month').agg({
        'search_volume': ['sum', 'mean', 'max', 'std']
    }).reset_index()
    search_agg.columns = ['month', 'search_volume_sum', 'search_volume_mean', 
                          'search_volume_max', 'search_volume_std']
    data = data.merge(search_agg, on='month', how='left')
    
    # Cyclic encoding for seasonality - helps model understand Dec is close to Jan
    data['month_sin'] = np.sin(2 * np.pi * data['month_num'] / 12)
    data['month_cos'] = np.cos(2 * np.pi * data['month_num'] / 12)
    
    return data


def create_lag_features(data, target='amount_new_house_transactions'):
    """
    Create time-based lag features
    Must be called AFTER sorting by sector and date
    """
    data = data.sort_values(['sector', 'month_date']).copy()
    
    # Target encoding - expanding mean (prevents leakage with shift)
    data['sector_target_encoded'] = data.groupby('sector')[target].transform(
        lambda x: x.shift(1).expanding().mean()
    )
    
    # Simple lags - what happened N months ago?
    for lag in [1, 2, 3, 4, 6, 12]:
        data[f'amount_lag_{lag}'] = data.groupby('sector')[target].shift(lag)
    
    # Exponential moving averages - recent values weighted more
    for span in [3, 6, 12]:
        data[f'amount_ewm_{span}'] = data.groupby('sector')[target].transform(
            lambda x: x.shift(1).ewm(span=span, adjust=False).mean()
        )
    
    # Rolling statistics - trends and volatility
    for window in [3, 6, 12]:
        data[f'amount_rolling_mean_{window}'] = data.groupby('sector')[target].transform(
            lambda x: x.shift(1).rolling(window, min_periods=1).mean()
        )
        data[f'amount_rolling_std_{window}'] = data.groupby('sector')[target].transform(
            lambda x: x.shift(1).rolling(window, min_periods=1).std()
        )
    
    # Year-over-year growth rate
    data['amount_lag_12_growth'] = (
        data.groupby('sector')[target].shift(1) - data.groupby('sector')[target].shift(13)
    ) / (data.groupby('sector')[target].shift(13) + 1)
    
    return data


def add_sector_stats(data, train_data, target='amount_new_house_transactions'):
    """
    Add sector-level statistics calculated from training data only
    """
    sector_stats = train_data.groupby('sector')[target].agg(['mean', 'std']).reset_index()
    sector_stats.columns = ['sector', 'sector_mean', 'sector_std']
    data = data.merge(sector_stats, on='sector', how='left')
    return data


print("Feature engineering functions defined")

Feature engineering functions defined


## STEP 1: Create Train/Validation Split

Using last 6 months as validation to simulate the actual test scenario

In [40]:
target = 'amount_new_house_transactions'

# Changed: build dense sector×month grid including zero targets
months = sorted(set(new_house['month']) | set(pre_owned['month']) | set(land_trans['month']))
sectors = sorted(set(new_house['sector']) | set(pre_owned['sector']) | set(land_trans['sector']))
train_grid = pd.DataFrame([(m, s) for m in months for s in sectors], columns=['month', 'sector'])

# Attach target (zeros when missing)
train_grid = train_grid.merge(new_house[['month', 'sector', target]], on=['month', 'sector'], how='left')
train_grid[target] = train_grid[target].fillna(0)

# Start with base features on the grid (not just observed rows)
all_data = create_base_features(train_grid, target=target)

# Time-based split - last 6 months for validation
val_months = 6
val_cutoff = all_data['month_date'].max() - pd.DateOffset(months=val_months)

train_data = all_data[all_data['month_date'] < val_cutoff].copy()
val_data = all_data[all_data['month_date'] >= val_cutoff].copy()

print(f"Train: {len(train_data)} samples, {train_data['month_date'].min()} to {train_data['month_date'].max()}")
print(f"Val:   {len(val_data)} samples, {val_data['month_date'].min()} to {val_data['month_date'].max()}")

# Create lag features separately for train and val
# For train: only use train data
train_data = create_lag_features(train_data, target=target)

# For val: concatenate train+val, create lags, then extract val portion
# This way validation lags can use recent training data
combined = pd.concat([train_data, val_data]).sort_values(['sector', 'month_date'])
combined = create_lag_features(combined, target=target)
val_data = combined[combined['month_date'] >= val_cutoff].copy()

# Add sector statistics (calculated only from training data)
train_data = add_sector_stats(train_data, train_data, target=target)
val_data = add_sector_stats(val_data, train_data, target=target)

# Define features to use (exclude target and metadata)
exclude_cols = [
    'month', 'sector', 'month_date', target, 'city_indicator_data_year',
    'num_new_house_transactions', 'area_new_house_transactions', 'price_new_house_transactions',
    'area_per_unit_new_house_transactions', 'total_price_per_unit_new_house_transactions',
    'num_new_house_available_for_sale', 'area_new_house_available_for_sale',
    'period_new_house_sell_through'
]
feature_cols = [col for col in train_data.columns
                if col not in exclude_cols and train_data[col].dtype in ['int64', 'float64']]

X_train = train_data[feature_cols].fillna(0)
y_train = train_data[target]
X_val = val_data[feature_cols].fillna(0)
y_val = val_data[target]

print(f"\nFeatures: {len(feature_cols)}")
print(f"X_train: {X_train.shape}, X_val: {X_val.shape}")


Train: 6912 samples, 2019-01-01 00:00:00 to 2023-12-01 00:00:00
Val:   672 samples, 2024-01-01 00:00:00 to 2024-07-01 00:00:00

Features: 264
X_train: (6912, 264), X_val: (672, 264)


## STEP 2: Hyperparameter Tuning

Grid search for each model type using validation set

In [41]:
# Reduce search space in test mode
if TEST_MODE:
    print("[TEST MODE] Using reduced hyperparameter search")
    xgb_param_grid = {
        'n_estimators': [500],
        'max_depth': [7],
        'learning_rate': [0.02]
    }
    lgb_param_grid = {
        'n_estimators': [500],
        'max_depth': [7],
        'learning_rate': [0.02]
    }
    cat_param_grid = {
        'iterations': [500],
        'depth': [7],
        'learning_rate': [0.03]
    }
else:
    print("[PRODUCTION MODE] Using full hyperparameter search")
    xgb_param_grid = {
        'n_estimators': [1500, 2000, 2500],
        'max_depth': [7, 8, 9],
        'learning_rate': [0.01, 0.015, 0.02]
    }
    lgb_param_grid = {
        'n_estimators': [1500, 2000, 2500],
        'max_depth': [7, 8, 9],
        'learning_rate': [0.01, 0.015, 0.02]
    }
    cat_param_grid = {
        'iterations': [1000, 1500, 2000],
        'depth': [7, 8, 9],
        'learning_rate': [0.02, 0.03, 0.04]
    }

# Tune XGBoost
print("\nTuning XGBoost...")
best_xgb_score = -1
best_xgb_params = None

for n_est in xgb_param_grid['n_estimators']:
    for depth in xgb_param_grid['max_depth']:
        for lr in xgb_param_grid['learning_rate']:
            model = xgb.XGBRegressor(
                n_estimators=n_est,
                max_depth=depth,
                learning_rate=lr,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1
            )
            model.fit(X_train, y_train, verbose=False)
            pred = model.predict(X_val)
            score = competition_metric(y_val, pred, verbose=False)
            
            if score > best_xgb_score:
                best_xgb_score = score
                best_xgb_params = {'n_estimators': n_est, 'max_depth': depth, 'learning_rate': lr}
                print(f"  New best: {best_xgb_params}, score: {best_xgb_score:.4f}")

print(f"\nBest XGBoost: {best_xgb_params}, score: {best_xgb_score:.4f}")

# Tune LightGBM
print("\nTuning LightGBM...")
best_lgb_score = -1
best_lgb_params = None

for n_est in lgb_param_grid['n_estimators']:
    for depth in lgb_param_grid['max_depth']:
        for lr in lgb_param_grid['learning_rate']:
            model = lgb.LGBMRegressor(
                n_estimators=n_est,
                max_depth=depth,
                learning_rate=lr,
                num_leaves=2**depth,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1,
                verbose=-1
            )
            model.fit(X_train, y_train)
            pred = model.predict(X_val)
            score = competition_metric(y_val, pred, verbose=False)
            
            if score > best_lgb_score:
                best_lgb_score = score
                best_lgb_params = {'n_estimators': n_est, 'max_depth': depth, 'learning_rate': lr}
                print(f"  New best: {best_lgb_params}, score: {best_lgb_score:.4f}")

print(f"\nBest LightGBM: {best_lgb_params}, score: {best_lgb_score:.4f}")

# Tune CatBoost
print("\nTuning CatBoost...")
best_cat_score = -1
best_cat_params = None

for n_est in cat_param_grid['iterations']:
    for depth in cat_param_grid['depth']:
        for lr in cat_param_grid['learning_rate']:
            model = CatBoostRegressor(
                iterations=n_est,
                depth=depth,
                learning_rate=lr,
                l2_leaf_reg=3.0,
                random_seed=42,
                loss_function='MAE',
                verbose=False
            )
            model.fit(X_train, y_train)
            pred = model.predict(X_val)
            score = competition_metric(y_val, pred, verbose=False)
            
            if score > best_cat_score:
                best_cat_score = score
                best_cat_params = {'iterations': n_est, 'depth': depth, 'learning_rate': lr}
                print(f"  New best: {best_cat_params}, score: {best_cat_score:.4f}")

print(f"\nBest CatBoost: {best_cat_params}, score: {best_cat_score:.4f}")

[PRODUCTION MODE] Using full hyperparameter search

Tuning XGBoost...
  New best: {'n_estimators': 1500, 'max_depth': 7, 'learning_rate': 0.01}, score: 0.0000

Best XGBoost: {'n_estimators': 1500, 'max_depth': 7, 'learning_rate': 0.01}, score: 0.0000

Tuning LightGBM...
  New best: {'n_estimators': 1500, 'max_depth': 7, 'learning_rate': 0.01}, score: 0.0000

Best LightGBM: {'n_estimators': 1500, 'max_depth': 7, 'learning_rate': 0.01}, score: 0.0000

Tuning CatBoost...
  New best: {'iterations': 1000, 'depth': 7, 'learning_rate': 0.02}, score: 0.5070
  New best: {'iterations': 1000, 'depth': 8, 'learning_rate': 0.02}, score: 0.5088
  New best: {'iterations': 1000, 'depth': 9, 'learning_rate': 0.04}, score: 0.5117
  New best: {'iterations': 2000, 'depth': 7, 'learning_rate': 0.02}, score: 0.5122

Best CatBoost: {'iterations': 2000, 'depth': 7, 'learning_rate': 0.02}, score: 0.5122


## STEP 3: Train Models with Best Params

Create ensemble of models using best hyperparameters found above

In [42]:
# Reduce ensemble size in test mode
if TEST_MODE:
    print("[TEST MODE] Training 2 models per algorithm")
    seeds = [42, 123]
else:
    print("[PRODUCTION MODE] Training 5 models per algorithm")
    seeds = [42, 123, 456, 789, 2024]

# Train XGBoost ensemble
print("\nTraining XGBoost models...")
xgb_models = []
for seed in seeds:
    model = xgb.XGBRegressor(
        n_estimators=best_xgb_params['n_estimators'],
        max_depth=best_xgb_params['max_depth'],
        learning_rate=best_xgb_params['learning_rate'],
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=seed,
        n_jobs=-1
    )
    model.fit(X_train, y_train, verbose=False)
    xgb_models.append(model)
    print(f"  XGB seed {seed} trained")

# Train LightGBM ensemble
print("\nTraining LightGBM models...")
lgb_models = []
for seed in seeds:
    model = lgb.LGBMRegressor(
        n_estimators=best_lgb_params['n_estimators'],
        max_depth=best_lgb_params['max_depth'],
        learning_rate=best_lgb_params['learning_rate'],
        num_leaves=2**best_lgb_params['max_depth'],
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=seed,
        n_jobs=-1,
        verbose=-1
    )
    model.fit(X_train, y_train)
    lgb_models.append(model)
    print(f"  LGB seed {seed} trained")

# Train CatBoost ensemble
print("\nTraining CatBoost models...")
cat_models = []
for seed in seeds:
    model = CatBoostRegressor(
        iterations=best_cat_params['iterations'],
        depth=best_cat_params['depth'],
        learning_rate=best_cat_params['learning_rate'],
        l2_leaf_reg=3.0,
        random_seed=seed,
        loss_function='MAE',
        verbose=False
    )
    model.fit(X_train, y_train)
    cat_models.append(model)
    print(f"  CAT seed {seed} trained")

print(f"\nTotal models: {len(xgb_models)} XGB, {len(lgb_models)} LGB, {len(cat_models)} CAT")

[PRODUCTION MODE] Training 5 models per algorithm

Training XGBoost models...
  XGB seed 42 trained
  XGB seed 123 trained
  XGB seed 456 trained
  XGB seed 789 trained
  XGB seed 2024 trained

Training LightGBM models...
  LGB seed 42 trained
  LGB seed 123 trained
  LGB seed 456 trained
  LGB seed 789 trained
  LGB seed 2024 trained

Training CatBoost models...
  CAT seed 42 trained
  CAT seed 123 trained
  CAT seed 456 trained
  CAT seed 789 trained
  CAT seed 2024 trained

Total models: 5 XGB, 5 LGB, 5 CAT


## STEP 4: Optimize Ensemble Weights (Including EWGM)

Find the best weighted combination of all 4 model types using validation set

In [43]:
# Get predictions from each model type (average across seeds)
xgb_val_preds = np.mean([m.predict(X_val) for m in xgb_models], axis=0)
lgb_val_preds = np.mean([m.predict(X_val) for m in lgb_models], axis=0)
cat_val_preds = np.mean([m.predict(X_val) for m in cat_models], axis=0)

# Generate EWGM predictions for validation set
print("Generating EWGM validation predictions...")
# Need to use only training data for EWGM (not validation)
ewgm_val_preds = generate_ewgm_predictions(
    train_data=new_house,  # Original training data
    test_data=val_data[['month', 'sector']].assign(id=lambda x: x['month'] + '_' + x['sector']),
    n_lags=6,
    alpha=0.9
)
print(f"EWGM predictions: min={ewgm_val_preds.min():.2f}, max={ewgm_val_preds.max():.2f}, mean={ewgm_val_preds.mean():.2f}")

print("\nSearching for optimal ensemble weights (including EWGM)...")

# Grid search over weight combinations
# w_xgb + w_lgb + w_cat + w_ewgm = 1.0
if TEST_MODE:
    grid = np.linspace(0, 1, 6)  # Even coarser grid with 4 models
else:
    grid = np.linspace(0, 1, 11)  # Finer grid in production

best_score = -1
best_weights = None

for w_xgb in grid:
    for w_lgb in grid:
        for w_cat in grid:
            w_ewgm = 1.0 - w_xgb - w_lgb - w_cat
            
            # Skip invalid weight combinations
            if w_ewgm < 0 or w_ewgm > 1:
                continue
            
            # Weighted ensemble
            ensemble_pred = (w_xgb * xgb_val_preds + 
                           w_lgb * lgb_val_preds + 
                           w_cat * cat_val_preds + 
                           w_ewgm * ewgm_val_preds)
            score = competition_metric(y_val, ensemble_pred, verbose=False)
            
            if score > best_score:
                best_score = score
                best_weights = (w_xgb, w_lgb, w_cat, w_ewgm)
                print(f"  New best: XGB={w_xgb:.2f}, LGB={w_lgb:.2f}, CAT={w_cat:.2f}, EWGM={w_ewgm:.2f}, score={score:.4f}")

print(f"\n=== Best Ensemble Weights ===")
print(f"XGBoost:  {best_weights[0]:.3f}")
print(f"LightGBM: {best_weights[1]:.3f}")
print(f"CatBoost: {best_weights[2]:.3f}")
print(f"EWGM:     {best_weights[3]:.3f}")
print(f"Val Score: {best_score:.4f}")

# Validate the final ensemble
final_val_pred = (
    best_weights[0] * xgb_val_preds + 
    best_weights[1] * lgb_val_preds + 
    best_weights[2] * cat_val_preds +
    best_weights[3] * ewgm_val_preds
)

print("\n=== Validation Performance ===")
competition_metric(y_val, final_val_pred)
print(f"MAE: {mean_absolute_error(y_val, final_val_pred):.2f}")

Generating EWGM validation predictions...
EWGM predictions: min=0.00, max=117040.84, mean=21391.46

Searching for optimal ensemble weights (including EWGM)...
  New best: XGB=0.00, LGB=0.00, CAT=0.00, EWGM=1.00, score=0.4933
  New best: XGB=0.00, LGB=0.00, CAT=0.10, EWGM=0.90, score=0.5299
  New best: XGB=0.00, LGB=0.00, CAT=0.20, EWGM=0.80, score=0.5385
  New best: XGB=0.00, LGB=0.00, CAT=0.30, EWGM=0.70, score=0.5487
  New best: XGB=0.00, LGB=0.10, CAT=0.20, EWGM=0.70, score=0.5513

=== Best Ensemble Weights ===
XGBoost:  0.000
LightGBM: 0.100
CatBoost: 0.200
EWGM:     0.700
Val Score: 0.5513

=== Validation Performance ===
  APE > 100%: 25.9%
  Valid samples: 74.1%
  MAPE (valid): 0.3325
  Scaled MAPE: 0.4487
MAE: 8386.62


## STEP 5: Retrain on ALL Data

Now that we have best params and weights, retrain everything on full dataset for final submission

In [44]:
print("=== RETRAINING ON FULL DATA ===")

# Changed: rebuild full dense grid and features
months = sorted(set(new_house['month']) | set(pre_owned['month']) | set(land_trans['month']))
sectors = sorted(set(new_house['sector']) | set(pre_owned['sector']) | set(land_trans['sector']))
full_grid = pd.DataFrame([(m, s) for m in months for s in sectors], columns=['month', 'sector'])
full_grid = full_grid.merge(new_house[['month', 'sector', target]], on=['month', 'sector'], how='left')
full_grid[target] = full_grid[target].fillna(0)

full_data = create_base_features(full_grid, target=target)
full_data = create_lag_features(full_data, target=target)
full_data = add_sector_stats(full_data, full_data, target=target)

X_full = full_data[feature_cols].fillna(0)
y_full = full_data[target]

print(f"Full training set: {X_full.shape}")

# Calculate clipping bounds from full data
hist_q99 = y_full.quantile(0.99)
print(f"Target range: {y_full.min():.2f} to {y_full.max():.2f}")
print(f"Q99: {hist_q99:.2f}")

# Changed: per-sector caps from recent history (last 6 months), fallback to sector max
last_train_month = full_data['month_date'].max()
window_start = last_train_month - pd.DateOffset(months=6)
recent = full_data[full_data['month_date'] >= window_start]
sector_recent_max = recent.groupby('sector')[target].max()
sector_all_max = full_data.groupby('sector')[target].max()
sector_cap_series = (1.2 * sector_recent_max).fillna(1.2 * sector_all_max)
sector_caps = sector_cap_series.to_dict()

# Global fallback cap
global_cap = float(hist_q99 * 1.5)
print(f"Will cap predictions per sector using recent history; global cap {global_cap:.2f}")

# Train final XGBoost models
print("\nTraining final XGBoost models...")
final_xgb_models = []
for seed in seeds:
    model = xgb.XGBRegressor(
        n_estimators=best_xgb_params['n_estimators'],
        max_depth=best_xgb_params['max_depth'],
        learning_rate=best_xgb_params['learning_rate'],
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=seed,
        n_jobs=-1
    )
    model.fit(X_full, y_full, verbose=False)
    final_xgb_models.append(model)
    print(f"  XGB seed {seed} ✓")

# Train final LightGBM models
print("\nTraining final LightGBM models...")
final_lgb_models = []
for seed in seeds:
    model = lgb.LGBMRegressor(
        n_estimators=best_lgb_params['n_estimators'],
        max_depth=best_lgb_params['max_depth'],
        learning_rate=best_lgb_params['learning_rate'],
        num_leaves=2**best_lgb_params['max_depth'],
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=seed,
        n_jobs=-1,
        verbose=-1
    )
    model.fit(X_full, y_full)
    final_lgb_models.append(model)
    print(f"  LGB seed {seed} ✓")

# Train final CatBoost models
print("\nTraining final CatBoost models...")
final_cat_models = []
for seed in seeds:
    model = CatBoostRegressor(
        iterations=best_cat_params['iterations'],
        depth=best_cat_params['depth'],
        learning_rate=best_cat_params['learning_rate'],
        l2_leaf_reg=3.0,
        random_seed=seed,
        loss_function='MAE',
        verbose=False
    )
    model.fit(X_full, y_full)
    final_cat_models.append(model)


=== RETRAINING ON FULL DATA ===
Full training set: (7584, 264)
Target range: 0.00 to 606407.64
Q99: 224087.58
Will cap predictions per sector using recent history; global cap 336131.37

Training final XGBoost models...
  XGB seed 42 ✓
  XGB seed 123 ✓
  XGB seed 456 ✓
  XGB seed 789 ✓
  XGB seed 2024 ✓

Training final LightGBM models...
  LGB seed 42 ✓
  LGB seed 123 ✓
  LGB seed 456 ✓
  LGB seed 789 ✓
  LGB seed 2024 ✓

Training final CatBoost models...


## STEP 6: Prepare Test Data

Apply same feature engineering to test set

In [45]:
# Base features for test
test_data = create_base_features(test, target=target)

# For lag features, need to concatenate with training data
# so test can use recent history from training
test_combined = pd.concat([
    full_data[['month', 'sector', 'month_date', target]],
    test_data[['month', 'sector', 'month_date']].assign(**{target: np.nan})
]).sort_values(['sector', 'month_date']).reset_index(drop=True)

# Create lag features on combined data
test_combined = create_lag_features(test_combined, target=target)

# Extract just the test portion
test_features = test_combined[test_combined[target].isna()].reset_index(drop=True)

# Merge lag features back to test_data
lag_cols = ['sector_target_encoded'] + \
           [f'amount_lag_{lag}' for lag in [1,2,3,4,6,12]] + \
           [f'amount_ewm_{span}' for span in [3,6,12]] + \
           [f'amount_rolling_mean_{w}' for w in [3,6,12]] + \
           [f'amount_rolling_std_{w}' for w in [3,6,12]] + \
           ['amount_lag_12_growth']

for col in lag_cols:
    test_data[col] = test_features[col].values

# Add sector stats from full training data
test_data = add_sector_stats(test_data, full_data, target=target)

# Create feature matrix
X_test = test_data[feature_cols].fillna(0)

print(f"Test data: {X_test.shape}")
print(f"Features match: {list(X_full.columns) == list(X_test.columns)}")

Test data: (1152, 264)
Features match: True


In [46]:
# Average predictions from each model type
xgb_test_preds = np.mean([m.predict(X_test) for m in final_xgb_models], axis=0)
lgb_test_preds = np.mean([m.predict(X_test) for m in final_lgb_models], axis=0)
cat_test_preds = np.mean([m.predict(X_test) for m in final_cat_models], axis=0)

# Generate EWGM predictions for test set
print("Generating EWGM test predictions...")
ewgm_test_preds = generate_ewgm_predictions(
    train_data=new_house,  # Use full training data
    test_data=test,
    n_lags=6,
    alpha=0.9
)
print(f"EWGM predictions: min={ewgm_test_preds.min():.2f}, max={ewgm_test_preds.max():.2f}, mean={ewgm_test_preds.mean():.2f}")

# Weighted ensemble with optimized weights (including EWGM)
test_pred = (
    best_weights[0] * xgb_test_preds + 
    best_weights[1] * lgb_test_preds + 
    best_weights[2] * cat_test_preds +
    best_weights[3] * ewgm_test_preds
)

# Changed: non-negativity and sector-wise clipping
# Non-negativity first
test_pred = np.maximum(test_pred, 0)

# Sector-wise caps (fallback to global)
caps = np.array([sector_caps.get(s, global_cap) for s in test['sector']])
test_pred = np.minimum(test_pred, caps)

# Changed: zero-history heuristic — if recent lags are all zero/missing, predict zero
lag_check_cols = [f'amount_lag_{k}' for k in [1,2,3,6]]
zero_hist_mask = test_data[lag_check_cols].fillna(0).sum(axis=1) == 0
test_pred[zero_hist_mask.values] = 0

# Zero out sectors that weren't in training data
train_sectors = set(full_data['sector'].unique())
unseen_mask = ~test['sector'].isin(train_sectors)
test_pred[unseen_mask] = 0

unseen_sectors = sorted([s for s in test['sector'].unique() if s not in train_sectors])
print(f"\nUnseen sectors (set to 0): {unseen_sectors}")
print(f"\nFinal Predictions:")
print(f"  Range: {test_pred.min():.2f} to {test_pred.max():.2f}")
print(f"  Mean: {test_pred.mean():.2f}")
print(f"  Median: {np.median(test_pred):.2f}")
print(f"  Zeros: {(test_pred == 0).sum()}")

Generating EWGM test predictions...
EWGM predictions: min=0.00, max=167256.95, mean=21926.91

Unseen sectors (set to 0): []

Final Predictions:
  Range: 0.00 to 151855.81
  Mean: 8566.48
  Median: 0.00
  Zeros: 684


## STEP 7: Generate Predictions

Use final models and optimal weights to create submission

## STEP 8: Create Submission File

In [47]:
submission = pd.DataFrame({
    'id': test['id'],
    'new_house_transaction_amount': test_pred
})

# Verify submission format
orig_test = pd.read_csv(file_path + 'test.csv')
assert submission['id'].tolist() == orig_test['id'].tolist(), "ID order mismatch!"
assert np.isfinite(submission['new_house_transaction_amount']).all(), "Non-finite predictions!"

submission.to_csv('submission.csv', index=False)
print("✅ Submission saved to submission.csv")

print("\nFirst 10 predictions:")
print(submission.head(10))

print("\nLast 10 predictions:")
print(submission.tail(10))

print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Mode: {'TEST' if TEST_MODE else 'PRODUCTION'}")
print(f"Best XGBoost params: {best_xgb_params}")
print(f"Best LightGBM params: {best_lgb_params}")
print(f"Best CatBoost params: {best_cat_params}")
print(f"Ensemble weights: XGB={best_weights[0]:.3f}, LGB={best_weights[1]:.3f}, CAT={best_weights[2]:.3f}")
print(f"Validation score: {best_score:.4f}")
print("="*60)

✅ Submission saved to submission.csv

First 10 predictions:
                   id  new_house_transaction_amount
0   2024 Aug_sector 1                   7328.270841
1   2024 Aug_sector 2                   5248.022657
2   2024 Aug_sector 3                   5585.699510
3   2024 Aug_sector 4                  51257.223070
4   2024 Aug_sector 5                   2930.945051
5   2024 Aug_sector 6                  12990.197647
6   2024 Aug_sector 7                      0.000000
7   2024 Aug_sector 8                      0.000000
8   2024 Aug_sector 9                      0.000000
9  2024 Aug_sector 10                      0.000000

Last 10 predictions:
                      id  new_house_transaction_amount
1142  2025 Jul_sector 87                      0.000000
1143  2025 Jul_sector 88                      0.000000
1144  2025 Jul_sector 89                      0.000000
1145  2025 Jul_sector 90                   5160.152581
1146  2025 Jul_sector 91                      0.000000
1147  2025 Jul_s