# Ensemble Model: Weighted Geometric Mean + Seasonal Bump

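This notebook forecasts new-house transaction amounts per sector by combining two approaches:
1. Weighted geometric mean of the most recent months, with exponentially decaying weights
2. The same kind of baseline with a simple December seasonality bump applied

Final predictions are a weighted average of both methods; the weights are set in the Configuration section.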

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

## Configuration

In [None]:
# ============================================================
# CONFIGURATION SECTION
# ============================================================

# Root directory of the competition data
DATA_PATH = Path("/kaggle/input/china-real-estate-demand-prediction")

# Method 1: Weighted Geometric Mean settings
CONFIG_METHOD1 = dict(
    n_lags=6,         # Number of months to look back
    alpha=0.5,        # Exponential decay parameter (0 < alpha < 1)
    t2=6,             # Months to check for baseline condition (zero-handling)
)

# Method 2: Seasonality Bump settings
CONFIG_METHOD2 = dict(
    n_lags=7,         # Number of months to look back
    alpha=0.5,        # Exponential decay parameter
    t2=6,             # Months to check for baseline condition
    clip_low=0.85,    # Lower bound for December multiplier
    clip_high=1.40,   # Upper bound for December multiplier
)

# Ensemble settings
CONFIG_ENSEMBLE = dict(
    weight_method1=0.30,    # Weight for Weighted Geometric Mean
    weight_method2=0.70,    # Weight for Seasonality Bump
)

# Name of the submission file written at the end of the notebook
OUTPUT_FILENAME = 'submission.csv'

# Echo the active configuration, one titled section per settings dict
_RULE = "=" * 60
print(_RULE)
print("ENSEMBLE MODEL CONFIGURATION")
print(_RULE)
for _title, _config in (
    ("\nMethod 1 - Weighted Geometric Mean:", CONFIG_METHOD1),
    ("\nMethod 2 - Seasonality Bump:", CONFIG_METHOD2),
    ("\nEnsemble Weights:", CONFIG_ENSEMBLE),
):
    print(_title)
    for key, value in _config.items():
        print(f"  {key}: {value}")

print("\nOutput:")
print(f"  filename: {OUTPUT_FILENAME}")
print(_RULE)

## 1. Load Data

In [None]:
# Read the transaction history and the rows that need a prediction
train_nht = pd.read_csv(DATA_PATH / "train" / "new_house_transactions.csv")
test = pd.read_csv(DATA_PATH / "test.csv")

# Month strings -> timestamps
train_nht['month'] = pd.to_datetime(train_nht['month'])

# A test id is split on '_' into a month part and a sector part
test_id = test['id'].str.split('_', expand=True)
test['month_text'] = test_id[0]
test['sector'] = test_id[1]

# Month abbreviation -> calendar month number (Jan=1 ... Dec=12)
month_codes = {
    abbr: number
    for number, abbr in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

# Training features: `time` counts months elapsed since January 2019 (= 0)
train_nht['year'] = train_nht['month'].dt.year
train_nht['month_num'] = train_nht['month'].dt.month
train_nht['time'] = 12 * (train_nht['year'] - 2019) + (train_nht['month_num'] - 1)
# The numeric sector id starts at character 7 of the sector label
train_nht['sector_id'] = train_nht['sector'].str[7:].astype(int)

# Same features for the test rows, derived from the id parts
test['year'] = test['month_text'].str[:4].astype(int)
test['month_abbr'] = test['month_text'].str[5:]
test['month_num'] = test['month_abbr'].map(month_codes)
test['time'] = 12 * (test['year'] - 2019) + (test['month_num'] - 1)
test['sector_id'] = test['sector'].str[7:].astype(int)

print(f"Training data: {train_nht.shape}")
print(f"Test data: {test.shape}")

## 2. Prepare Amount Matrix

In [None]:
# Pivot the long transaction table into a (time x sector_id) matrix of amounts.
# Month/sector pairs without a record become 0.
amount_matrix = (
    train_nht
    .set_index(['time', 'sector_id'])['amount_new_house_transactions']
    .unstack()
    .fillna(0)
)

# Guarantee exactly one column per sector id 1..96, in ascending order.
# reindex adds every sector that is absent from training as an all-zero column
# (the original notebook notes sector 95 is missing) and, unlike assigning
# `amount_matrix[95] = 0`, never overwrites a sector that does have history.
amount_matrix = amount_matrix.reindex(columns=np.arange(1, 97), fill_value=0)

print(f"Amount matrix shape: {amount_matrix.shape}")
print(f"Time range: {amount_matrix.index.min()} to {amount_matrix.index.max()}")

## 3. Method 1: Weighted Geometric Mean (Exponential Decay)

In [None]:
def weighted_geometric_mean_prediction(amount_matrix, n_lags=6, alpha=0.5, t2=6, forecast_index=None):
    """
    Weighted geometric mean with exponential decay

    Each sector gets a single value, the exponentially weighted geometric
    mean of its last n_lags rows, and that value is repeated for every
    forecast period.

    Parameters:
    - amount_matrix: DataFrame with one row per time step and one column per sector
    - n_lags: number of months to use
    - alpha: exponential decay parameter (0.5 works best); the newest row gets
      weight proportional to 1, the one before it alpha, then alpha**2, ...
    - t2: months to check for baseline condition
    - forecast_index: labels of the periods to predict; defaults to months 67-78

    Returns:
    - float DataFrame indexed by forecast_index with amount_matrix's columns.
      A sector is predicted as 0.0 when any of its last t2 rows equals 0, when
      its whole history sums to 0, when fewer than n_lags rows are available,
      or when none of its last n_lags values is positive.
    """
    if forecast_index is None:
        forecast_index = np.arange(67, 79)
    forecast_index = pd.Index(forecast_index)

    # Generate exponential weights (oldest row first, newest row last)
    weights = np.array([alpha**(n_lags-1-i) for i in range(n_lags)], dtype=float)
    weights = weights / weights.sum()

    print(f"Weighted Geometric Mean:")
    print(f"  Lags: {n_lags}, Alpha: {alpha}, Weights: {weights.round(3)}")

    # Slice the trailing windows once instead of once per sector
    baseline_window = amount_matrix.tail(t2)
    recent_window = amount_matrix.tail(n_lags)

    # One forecast value per sector; sectors that fail a check keep 0.0
    sector_forecast = np.zeros(len(amount_matrix.columns), dtype=float)

    for pos, sector in enumerate(amount_matrix.columns):
        # Baseline condition: a zero in the last t2 rows, or no history at all
        if (baseline_window[sector].min() == 0) or (amount_matrix[sector].sum() == 0):
            continue

        recent_vals = recent_window[sector].values
        if len(recent_vals) != n_lags:
            continue

        # Only positive values can enter a geometric mean
        positive_mask = recent_vals > 0
        if not positive_mask.any():
            continue

        # Renormalize the weights of the rows that are kept
        kept_weights = weights[positive_mask]
        kept_weights = kept_weights / kept_weights.sum()

        # Weighted geometric mean: exp(sum(wi * log(xi)))
        sector_forecast[pos] = np.exp(np.sum(kept_weights * np.log(recent_vals[positive_mask])))

    # Repeat the per-sector value for every forecast period
    return pd.DataFrame(
        np.tile(sector_forecast, (len(forecast_index), 1)),
        index=forecast_index,
        columns=amount_matrix.columns,
    )

# Run Method 1 with the settings from the configuration cell
method1_kwargs = {name: CONFIG_METHOD1[name] for name in ('n_lags', 'alpha', 't2')}
method1_predictions = weighted_geometric_mean_prediction(amount_matrix, **method1_kwargs)
print(f"Method 1 predictions shape: {method1_predictions.shape}")

## 4. Method 2: Seasonality Bump

In [None]:
def compute_december_multipliers(amount_matrix, clip_low=0.85, clip_high=1.40):
    """Return a {sector: multiplier} dict of December seasonality multipliers.

    A row counts as December when its index label modulo 12 equals 11. Each
    sector's multiplier is its December mean divided by its non-December mean,
    clipped to [clip_low, clip_high]. Sectors without any December observation
    use the overall ratio instead; infinite or missing ratios become 1.0
    before clipping.
    """
    december_rows = (amount_matrix.index.values % 12) == 11
    december = amount_matrix.loc[december_rows]
    rest_of_year = amount_matrix.loc[~december_rows]

    december_avg = december.mean(axis=0)
    other_avg = rest_of_year.mean(axis=0)
    n_december_obs = december.notna().sum(axis=0)

    # Ratio across all sectors, used when a sector has no December data
    fallback = float(december_avg.mean() / (other_avg.mean() + 1e-9))

    # The 1e-9 keeps the division defined when the non-December mean is 0
    multipliers = december_avg / (other_avg + 1e-9)
    multipliers = multipliers.where(n_december_obs >= 1, fallback)
    multipliers = multipliers.replace([np.inf, -np.inf], 1.0).fillna(1.0)

    return multipliers.clip(lower=clip_low, upper=clip_high).to_dict()

def ewgm_per_sector(amount_matrix, sector, n_lags, alpha):
    """Exponentially weighted geometric mean of one sector's last n_lags rows.

    The newest row gets weight proportional to 1, the one before it alpha,
    then alpha**2, and so on. Only strictly positive values take part, with
    their weights renormalized. Returns 0.0 when fewer than n_lags rows are
    available or when no value in the window is positive.
    """
    # Decay weights ordered oldest -> newest, normalized to sum to 1
    decay = np.array([alpha ** power for power in range(n_lags - 1, -1, -1)], dtype=float)
    decay = decay / decay.sum()

    window = amount_matrix.tail(n_lags)[sector].values
    if len(window) != n_lags:
        return 0.0
    if (window <= 0).all():
        return 0.0

    is_positive = window > 0
    if not is_positive.any():
        # Reached when the window holds no positive value but also is not
        # entirely <= 0 (e.g. NaN entries)
        return 0.0

    kept_weights = decay[is_positive]
    kept_weights = kept_weights / kept_weights.sum()

    # The small offset inside the log mirrors the original computation
    mean_log = np.sum(kept_weights * np.log(window[is_positive] + 1e-12)) / kept_weights.sum()
    return float(np.exp(mean_log))

def seasonal_bump_prediction(amount_matrix, n_lags=7, alpha=0.5, t2=6,
                             clip_low=0.85, clip_high=1.40, forecast_index=None):
    """Seasonality bump method

    Every sector gets a flat base forecast from ewgm_per_sector; the forecast
    periods that fall in December (label modulo 12 equals 11) are then scaled
    by that sector's multiplier from compute_december_multipliers.

    Parameters:
    - amount_matrix: DataFrame with one row per time step and one column per sector
    - n_lags: number of months used for the base forecast
    - alpha: exponential decay parameter of the base forecast
    - t2: months to check for baseline condition; a sector with a 0 in its
      last t2 rows, or with an all-zero history, is predicted as 0.0
    - clip_low, clip_high: bounds passed on to compute_december_multipliers
    - forecast_index: integer labels of the periods to predict; defaults to
      months 67-78

    Returns:
    - float DataFrame indexed by forecast_index with amount_matrix's columns
    """
    print(f"Seasonality Bump Method:")
    print(f"  Lags: {n_lags}, Alpha: {alpha}")

    if forecast_index is None:
        forecast_index = np.arange(67, 79)
    forecast_index = pd.Index(forecast_index)

    sectors = amount_matrix.columns
    baseline_window = amount_matrix.tail(t2)

    # Base predictions: one value per sector, 0.0 when the baseline check fails
    base = np.zeros(len(sectors), dtype=float)
    for pos, sector in enumerate(sectors):
        if (baseline_window[sector].min() == 0) or (amount_matrix[sector].sum() == 0):
            continue
        base[pos] = ewgm_per_sector(amount_matrix, sector, n_lags, alpha)

    # Repeat the base value for every forecast period
    values = np.tile(base, (len(forecast_index), 1))

    # Apply December multipliers, honouring the configured clip bounds
    dec_multipliers = compute_december_multipliers(
        amount_matrix, clip_low=clip_low, clip_high=clip_high
    )
    multiplier_row = np.array([dec_multipliers.get(sector, 1.0) for sector in sectors], dtype=float)
    is_december = (forecast_index.values % 12) == 11
    values[is_december] *= multiplier_row

    return pd.DataFrame(values, index=forecast_index, columns=sectors)

# Generate Method 2 predictions, passing every Method 2 setting from the
# configuration cell (including the December clip bounds)
method2_predictions = seasonal_bump_prediction(
    amount_matrix,
    n_lags=CONFIG_METHOD2['n_lags'],
    alpha=CONFIG_METHOD2['alpha'],
    t2=CONFIG_METHOD2['t2'],
    clip_low=CONFIG_METHOD2['clip_low'],
    clip_high=CONFIG_METHOD2['clip_high'],
)
print(f"Method 2 predictions shape: {method2_predictions.shape}")

## 5. Create Ensemble

In [None]:
def create_ensemble(method1_preds, method2_preds, weight1=0.45, weight2=0.55):
    """
    Blend two sets of predictions into a single weighted sum.

    Parameters:
    - method1_preds: predictions of method 1 (weighted geometric mean)
    - method2_preds: predictions of method 2 (seasonality bump)
    - weight1: weight for method 1
    - weight2: weight for method 2

    Returns:
    - weight1 * method1_preds + weight2 * method2_preds
    """
    report = [
        f"\nEnsemble Weights:",
        f"  Method 1 (Weighted Geom Mean): {weight1:.2f}",
        f"  Method 2 (Seasonality Bump): {weight2:.2f}",
    ]
    print("\n".join(report))

    method1_share = weight1 * method1_preds
    method2_share = weight2 * method2_preds
    return method1_share + method2_share

# Blend the two methods with the weights from the configuration cell
ensemble_weights = {
    'weight1': CONFIG_ENSEMBLE['weight_method1'],
    'weight2': CONFIG_ENSEMBLE['weight_method2'],
}
ensemble_predictions = create_ensemble(method1_predictions, method2_predictions, **ensemble_weights)

# Each statistic is taken per sector first and then across sectors
print(f"\nEnsemble statistics:")
for label, stat in (("Min", "min"), ("Max", "max"), ("Mean", "mean"), ("Median", "median")):
    per_sector = getattr(ensemble_predictions, stat)()
    print(f"  {label}: {getattr(per_sector, stat)():,.0f}")

## 6. Create Submission

In [None]:
# Convert predictions to submission format
submission = test.copy()

# Map predictions to the test set in one vectorised lookup: translate each
# row's time / sector_id label into a position in the prediction frame.
row_pos = ensemble_predictions.index.get_indexer(test['time'])
col_pos = ensemble_predictions.columns.get_indexer(test['sector_id'])

# get_indexer marks unknown labels with -1, which would otherwise silently
# select the last row/column; fail loudly like a label lookup would.
unmatched = (row_pos < 0) | (col_pos < 0)
if unmatched.any():
    raise KeyError(
        f"{int(unmatched.sum())} test rows have no matching prediction, "
        f"e.g. id {test.loc[unmatched, 'id'].iloc[0]!r}"
    )

prediction_values = ensemble_predictions.to_numpy()[row_pos, col_pos]
submission['new_house_transaction_amount'] = prediction_values

# A missing prediction would produce an invalid submission file
assert submission['new_house_transaction_amount'].notna().all(), "submission contains missing predictions"

# Save submission
submission[['id', 'new_house_transaction_amount']].to_csv(OUTPUT_FILENAME, index=False)

print(f"\n✅ Ensemble submission created successfully!")
print(f"Saved to: {OUTPUT_FILENAME}")
print(f"Submission shape: {submission.shape}")
print(f"\nFirst few predictions:")
print(submission[['id', 'new_house_transaction_amount']].head(10))

## 7. Analysis & Comparison

In [None]:
# Flatten each prediction frame into one column so the methods line up side by side
comparison = pd.DataFrame({
    label: frame.to_numpy().ravel()
    for label, frame in (
        ('Method 1 (WGM)', method1_predictions),
        ('Method 2 (Seasonal)', method2_predictions),
        ('Ensemble', ensemble_predictions),
    )
})

# Summary statistics per method
print("\nMethod Comparison:")
print(comparison.describe())

# Pairwise correlation of the flattened predictions
print(f"\nCorrelation between methods:")
print(comparison.corr())

## Summary

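This ensemble combines:
- **Method 1**: Weighted Geometric Mean with exponential decay (α=0.5)
  - Emphasizes the most recent months (recent trends)
  - Uses 6-month lag window
  - Weight: 30%

- **Method 2**: Seasonality Bump
  - Captures December peaks by scaling the December forecast with a clipped per-sector multiplier
  - Uses 7-month lag window
  - Weight: 70%

The final prediction is the weighted sum of the two methods, so the ensemble leverages the strengths of both approaches for robust predictions.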