 # Inventory-Adjusted Ensemble Model

### Score: 0.53673

 This notebook extends the base ensemble (0.2, 0.65, sum=0.85) with inventory adjustment.



 Key innovation: Use `period_new_house_sell_through` as a market health signal

 - High sell-through period = oversupply = reduce predictions further

 - Low sell-through period = strong demand = less aggressive discount

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

 ## Configuration

In [2]:
# ============================================================
# CONFIGURATION SECTION
# ============================================================

# Data Paths
DATA_PATH = Path("/Users/nikola/Python/KaggleCompetition/data")

# Method 1: Weighted Geometric Mean Configuration
CONFIG_METHOD1 = {
    'n_lags': 6,
    'alpha': 0.5,
    't2': 6,
}

# Method 2: Seasonality Bump Configuration
CONFIG_METHOD2 = {
    'n_lags': 7,
    'alpha': 0.5,
    't2': 6,
    'clip_low': 0.85,
    'clip_high': 1.40,
}

# Ensemble Configuration
CONFIG_ENSEMBLE = {
    'weight_method1': 0.2,
    'weight_method2': 0.65,
}

# Inventory Adjustment Configuration
CONFIG_INVENTORY = {
    'use_inventory': True,
    'inventory_impact': 0.5,        # How much inventory affects predictions (0-1)
    'baseline_sellthrough': 12.0,   # Months - "normal" market condition
    'min_adjustment': 0.7,          # Minimum multiplier (severe oversupply)
    'max_adjustment': 1.0,          # Maximum multiplier (strong demand)
}

# Output Configuration
CONFIG_OUTPUT = {
    'output_path': '/Users/nikola/Python/KaggleCompetition/output/17_Inventory_ensemble',
    'filename': '17_inventory_submission.csv'
}

# Display configuration
print("=" * 60)
print("INVENTORY-ADJUSTED ENSEMBLE CONFIGURATION")
print("=" * 60)
print("\nMethod 1 - Weighted Geometric Mean:")
for key, value in CONFIG_METHOD1.items():
    print(f"  {key}: {value}")

print("\nMethod 2 - Seasonality Bump:")
for key, value in CONFIG_METHOD2.items():
    print(f"  {key}: {value}")

print("\nEnsemble Weights:")
for key, value in CONFIG_ENSEMBLE.items():
    print(f"  {key}: {value}")

print("\nInventory Adjustment:")
for key, value in CONFIG_INVENTORY.items():
    print(f"  {key}: {value}")

print("\nOutput Configuration:")
for key, value in CONFIG_OUTPUT.items():
    print(f"  {key}: {value}")
print("=" * 60)


INVENTORY-ADJUSTED ENSEMBLE CONFIGURATION

Method 1 - Weighted Geometric Mean:
  n_lags: 6
  alpha: 0.5
  t2: 6

Method 2 - Seasonality Bump:
  n_lags: 7
  alpha: 0.5
  t2: 6
  clip_low: 0.85
  clip_high: 1.4

Ensemble Weights:
  weight_method1: 0.2
  weight_method2: 0.65

Inventory Adjustment:
  use_inventory: True
  inventory_impact: 0.5
  baseline_sellthrough: 12.0
  min_adjustment: 0.7
  max_adjustment: 1.0

Output Configuration:
  output_path: /Users/nikola/Python/KaggleCompetition/output/17_Inventory_ensemble
  filename: 17_inventory_submission.csv


 ## 1. Load Data

In [3]:
# Load training data
train_nht = pd.read_csv(DATA_PATH / "train" / "new_house_transactions.csv")
test = pd.read_csv(DATA_PATH / "test.csv")

# Convert month to datetime
train_nht['month'] = pd.to_datetime(train_nht['month'])

# Parse test IDs
test_id = test.id.str.split('_', expand=True)
test['month_text'] = test_id[0]
test['sector'] = test_id[1]

# Create month mapping
month_codes = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Add time features to training data
train_nht['year'] = train_nht['month'].dt.year
train_nht['month_num'] = train_nht['month'].dt.month
train_nht['time'] = (train_nht['year'] - 2019) * 12 + train_nht['month_num'] - 1
train_nht['sector_id'] = train_nht.sector.str.slice(7, None).astype(int)

# Parse test data
test['year'] = test['month_text'].str.slice(0, 4).astype(int)
test['month_abbr'] = test['month_text'].str.slice(5, None)
test['month_num'] = test['month_abbr'].map(month_codes)
test['time'] = (test['year'] - 2019) * 12 + test['month_num'] - 1
test['sector_id'] = test.sector.str.slice(7, None).astype(int)

print(f"Training data: {train_nht.shape}")
print(f"Test data: {test.shape}")


Training data: (5433, 15)
Test data: (1152, 9)


  train_nht['month'] = pd.to_datetime(train_nht['month'])


 ## 2. Prepare Amount Matrix and Inventory Data

In [4]:
# Create pivot table: time x sector_id for transaction amounts
amount_matrix = train_nht.set_index(['time', 'sector_id']).amount_new_house_transactions.unstack()
amount_matrix = amount_matrix.fillna(0)

# Create pivot table for sell-through period (inventory indicator)
sellthrough_matrix = train_nht.set_index(['time', 'sector_id']).period_new_house_sell_through.unstack()
sellthrough_matrix = sellthrough_matrix.fillna(CONFIG_INVENTORY['baseline_sellthrough'])

# Add sector 95 (missing in training)
amount_matrix[95] = 0
sellthrough_matrix[95] = CONFIG_INVENTORY['baseline_sellthrough']

amount_matrix = amount_matrix[np.arange(1, 97)]
sellthrough_matrix = sellthrough_matrix[np.arange(1, 97)]

print(f"\nAmount matrix shape: {amount_matrix.shape}")
print(f"Sellthrough matrix shape: {sellthrough_matrix.shape}")
print(f"Time range: {amount_matrix.index.min()} to {amount_matrix.index.max()}")

print(f"\nSell-through period statistics:")
print(f"  Mean: {sellthrough_matrix.mean().mean():.2f} months")
print(f"  Min: {sellthrough_matrix.min().min():.2f} months")
print(f"  Max: {sellthrough_matrix.max().max():.2f} months")



Amount matrix shape: (67, 96)
Sellthrough matrix shape: (67, 96)
Time range: 0 to 66

Sell-through period statistics:
  Mean: 20.23 months
  Min: 0.02 months
  Max: 274.26 months


 ## 3. Base Model Prediction Functions

In [5]:
def weighted_geometric_mean_prediction(amount_matrix, n_lags=6, alpha=0.5, t2=6):
    """Method 1: Weighted Geometric Mean with exponential decay"""
    weights = np.array([alpha**(n_lags-1-i) for i in range(n_lags)])
    weights = weights / weights.sum()
    
    predictions = pd.DataFrame(index=np.arange(67, 79), columns=amount_matrix.columns, dtype=float)
    
    for sector in amount_matrix.columns:
        if (amount_matrix.tail(t2)[sector].min() == 0) or (amount_matrix[sector].sum() == 0):
            predictions[sector] = 0
            continue
        
        recent_vals = amount_matrix.tail(n_lags)[sector].values
        
        if len(recent_vals) == n_lags and (recent_vals > 0).any():
            positive_mask = recent_vals > 0
            positive_vals = recent_vals[positive_mask]
            corresponding_weights = weights[positive_mask]
            
            if len(positive_vals) > 0:
                corresponding_weights = corresponding_weights / corresponding_weights.sum()
                log_vals = np.log(positive_vals)
                weighted_log_mean = np.sum(corresponding_weights * log_vals)
                weighted_geom_mean = np.exp(weighted_log_mean)
                predictions[sector] = weighted_geom_mean
            else:
                predictions[sector] = 0
        else:
            predictions[sector] = 0
    
    return predictions


def compute_december_multipliers(amount_matrix, clip_low=0.85, clip_high=1.40):
    """Compute December seasonality multipliers"""
    is_december = (amount_matrix.index.values % 12) == 11
    dec_means = amount_matrix[is_december].mean(axis=0)
    nondec_means = amount_matrix[~is_december].mean(axis=0)
    dec_counts = amount_matrix[is_december].notna().sum(axis=0)
    
    raw_mult = dec_means / (nondec_means + 1e-9)
    overall_mult = float(dec_means.mean() / (nondec_means.mean() + 1e-9))
    
    raw_mult = raw_mult.where(dec_counts >= 1, overall_mult)
    raw_mult = raw_mult.replace([np.inf, -np.inf], 1.0).fillna(1.0)
    clipped_mult = raw_mult.clip(lower=clip_low, upper=clip_high)
    
    return clipped_mult.to_dict()


def ewgm_per_sector(amount_matrix, sector, n_lags, alpha):
    """Exponential weighted geometric mean for one sector"""
    weights = np.array([alpha**(n_lags - 1 - i) for i in range(n_lags)], dtype=float)
    weights = weights / weights.sum()
    
    recent_vals = amount_matrix.tail(n_lags)[sector].values
    if (len(recent_vals) != n_lags) or (recent_vals <= 0).all():
        return 0.0
    
    mask = recent_vals > 0
    pos_vals = recent_vals[mask]
    pos_w = weights[mask]
    
    if pos_vals.size == 0:
        return 0.0
    
    pos_w = pos_w / pos_w.sum()
    log_vals = np.log(pos_vals + 1e-12)
    wlm = np.sum(pos_w * log_vals)
    return float(np.exp(wlm))


def seasonal_bump_prediction(amount_matrix, n_lags=7, alpha=0.5, t2=6, 
                            clip_low=0.85, clip_high=1.40):
    """Method 2: Seasonality Bump"""
    predictions = pd.DataFrame(index=np.arange(67, 79), columns=amount_matrix.columns, dtype=float)
    
    for sector in amount_matrix.columns:
        if (amount_matrix.tail(t2)[sector].min() == 0) or (amount_matrix[sector].sum() == 0):
            predictions[sector] = 0.0
            continue
        
        base = ewgm_per_sector(amount_matrix, sector, n_lags, alpha)
        predictions[sector] = base
    
    # Apply December multipliers
    dec_multipliers = compute_december_multipliers(amount_matrix, clip_low, clip_high)
    dec_rows = [t for t in predictions.index.values if (t % 12) == 11]
    
    if len(dec_rows) > 0:
        for sector in predictions.columns:
            m = dec_multipliers.get(sector, 1.0)
            predictions.loc[dec_rows, sector] *= m
    
    return predictions


 ## 4. Inventory Adjustment Function

In [6]:
def calculate_inventory_adjustment(sellthrough_matrix, config):
    """
    Calculate inventory-based adjustment multipliers
    
    Logic: High sell-through period = oversupply = reduce predictions
           Low sell-through period = strong demand = less discount
    
    Returns DataFrame of adjustment multipliers (time x sector)
    """
    baseline = config['baseline_sellthrough']
    impact = config['inventory_impact']
    min_adj = config['min_adjustment']
    max_adj = config['max_adjustment']
    
    # Get recent sell-through period (last 3 months average for stability)
    recent_sellthrough = sellthrough_matrix.tail(3).mean(axis=0)
    
    # Calculate adjustment factor
    # ratio > 1.0 = oversupply (sell-through longer than baseline)
    # ratio < 1.0 = strong demand (sell-through shorter than baseline)
    ratio = recent_sellthrough / baseline
    
    # Convert to multiplier with configurable impact
    # oversupply → reduce predictions
    # strong demand → keep predictions higher
    adjustment = 1.0 - (ratio - 1.0) * impact
    
    # Clip to reasonable range
    adjustment = adjustment.clip(lower=min_adj, upper=max_adj)
    
    # Create adjustment matrix for prediction period (repeat for all 12 months)
    adjustment_matrix = pd.DataFrame(
        index=np.arange(67, 79),
        columns=sellthrough_matrix.columns,
        dtype=float
    )
    
    for month_idx in adjustment_matrix.index:
        adjustment_matrix.loc[month_idx] = adjustment
    
    return adjustment_matrix


def apply_inventory_adjustment(predictions, adjustment_matrix):
    """Apply inventory adjustment to predictions"""
    adjusted = predictions * adjustment_matrix
    return adjusted


 ## 5. Generate Base Predictions

In [7]:
print("\n" + "=" * 60)
print("GENERATING BASE PREDICTIONS")
print("=" * 60)

# Generate Method 1 predictions
print("\nMethod 1: Weighted Geometric Mean")
method1_predictions = weighted_geometric_mean_prediction(
    amount_matrix, 
    **CONFIG_METHOD1
)
print(f"  Predictions shape: {method1_predictions.shape}")

# Generate Method 2 predictions
print("\nMethod 2: Seasonality Bump")
method2_predictions = seasonal_bump_prediction(
    amount_matrix, 
    **CONFIG_METHOD2
)
print(f"  Predictions shape: {method2_predictions.shape}")

# Create base ensemble
print("\nCreating base ensemble (0.2, 0.65)...")
base_ensemble = (CONFIG_ENSEMBLE['weight_method1'] * method1_predictions + 
                 CONFIG_ENSEMBLE['weight_method2'] * method2_predictions)

print(f"  Base ensemble mean: {base_ensemble.mean().mean():,.0f}")



GENERATING BASE PREDICTIONS

Method 1: Weighted Geometric Mean
  Predictions shape: (12, 96)

Method 2: Seasonality Bump
  Predictions shape: (12, 96)

Creating base ensemble (0.2, 0.65)...
  Base ensemble mean: 21,062


 ## 6. Apply Inventory Adjustment

In [8]:
if CONFIG_INVENTORY['use_inventory']:
    print("\n" + "=" * 60)
    print("APPLYING INVENTORY ADJUSTMENT")
    print("=" * 60)
    
    # Calculate inventory adjustment
    inventory_adjustment = calculate_inventory_adjustment(sellthrough_matrix, CONFIG_INVENTORY)
    
    print(f"\nInventory adjustment statistics:")
    print(f"  Mean multiplier: {inventory_adjustment.mean().mean():.3f}")
    print(f"  Min multiplier: {inventory_adjustment.min().min():.3f}")
    print(f"  Max multiplier: {inventory_adjustment.max().max():.3f}")
    
    # Apply adjustment
    final_predictions = apply_inventory_adjustment(base_ensemble, inventory_adjustment)
    
    print(f"\nFinal predictions after inventory adjustment:")
    print(f"  Mean: {final_predictions.mean().mean():,.0f}")
    print(f"  Change from base: {((final_predictions.mean().mean() / base_ensemble.mean().mean()) - 1) * 100:+.1f}%")
else:
    print("\nInventory adjustment disabled, using base ensemble")
    final_predictions = base_ensemble

# Ensure sector 95 is all zeros
if 95 in final_predictions.columns:
    final_predictions[95] = 0

print(f"\nFinal prediction statistics:")
print(f"  Min: {final_predictions.min().min():,.0f}")
print(f"  Max: {final_predictions.max().max():,.0f}")
print(f"  Mean: {final_predictions.mean().mean():,.0f}")
print(f"  Median: {final_predictions.median().median():,.0f}")



APPLYING INVENTORY ADJUSTMENT

Inventory adjustment statistics:
  Mean multiplier: 0.806
  Min multiplier: 0.700
  Max multiplier: 1.000

Final predictions after inventory adjustment:
  Mean: 16,839
  Change from base: -20.0%

Final prediction statistics:
  Min: 0
  Max: 123,555
  Mean: 16,839
  Median: 7,671


 ## 7. Create Submission

In [9]:
print("\n" + "=" * 60)
print("CREATING SUBMISSION")
print("=" * 60)

submission = test.copy()

# Map predictions to test set
prediction_values = []
for _, row in test.iterrows():
    time_idx = row['time']
    sector_id = row['sector_id']
    pred_value = final_predictions.loc[time_idx, sector_id]
    prediction_values.append(pred_value)

submission['new_house_transaction_amount'] = prediction_values

# Create output directory
output_dir = Path(CONFIG_OUTPUT['output_path'])
output_dir.mkdir(parents=True, exist_ok=True)

# Save submission
output_file = output_dir / CONFIG_OUTPUT['filename']
submission[['id', 'new_house_transaction_amount']].to_csv(output_file, index=False)

print(f"\n✅ Inventory-adjusted submission created successfully!")
print(f"Saved to: {output_file}")
print(f"\nFirst few predictions:")
print(submission[['id', 'new_house_transaction_amount']].head(10))



CREATING SUBMISSION

✅ Inventory-adjusted submission created successfully!
Saved to: /Users/nikola/Python/KaggleCompetition/output/17_Inventory_ensemble/17_inventory_submission.csv

First few predictions:
                   id  new_house_transaction_amount
0   2024 Aug_sector 1                   5528.256505
1   2024 Aug_sector 2                   2528.173656
2   2024 Aug_sector 3                   3875.167116
3   2024 Aug_sector 4                  48654.416666
4   2024 Aug_sector 5                   1217.433997
5   2024 Aug_sector 6                   8928.383242
6   2024 Aug_sector 7                   7021.967013
7   2024 Aug_sector 8                   2266.551704
8   2024 Aug_sector 9                   9299.746286
9  2024 Aug_sector 10                  40174.222227


 ## 8. Analysis

In [10]:
print("\n" + "=" * 60)
print("INVENTORY IMPACT ANALYSIS")
print("=" * 60)

# Show sectors with highest/lowest inventory adjustment
if CONFIG_INVENTORY['use_inventory']:
    sector_adjustments = inventory_adjustment.iloc[0]  # Same for all months
    
    print("\nSectors with LOWEST adjustments (worst oversupply):")
    worst_sectors = sector_adjustments.nsmallest(10)
    for sector, adj in worst_sectors.items():
        sellthrough = sellthrough_matrix.tail(3)[sector].mean()
        print(f"  Sector {sector}: {adj:.3f}x multiplier (sell-through: {sellthrough:.1f} months)")
    
    print("\nSectors with HIGHEST adjustments (best demand):")
    best_sectors = sector_adjustments.nlargest(10)
    for sector, adj in best_sectors.items():
        sellthrough = sellthrough_matrix.tail(3)[sector].mean()
        print(f"  Sector {sector}: {adj:.3f}x multiplier (sell-through: {sellthrough:.1f} months)")

print("\n" + "=" * 60)
print("DONE!")
print("=" * 60)



INVENTORY IMPACT ANALYSIS

Sectors with LOWEST adjustments (worst oversupply):
  Sector 1: 0.700x multiplier (sell-through: 45.1 months)
  Sector 2: 0.700x multiplier (sell-through: 25.3 months)
  Sector 3: 0.700x multiplier (sell-through: 93.6 months)
  Sector 4: 0.700x multiplier (sell-through: 23.9 months)
  Sector 5: 0.700x multiplier (sell-through: 185.1 months)
  Sector 6: 0.700x multiplier (sell-through: 35.3 months)
  Sector 8: 0.700x multiplier (sell-through: 31.6 months)
  Sector 11: 0.700x multiplier (sell-through: 43.9 months)
  Sector 13: 0.700x multiplier (sell-through: 19.7 months)
  Sector 14: 0.700x multiplier (sell-through: 55.4 months)

Sectors with HIGHEST adjustments (best demand):
  Sector 7: 1.000x multiplier (sell-through: 9.9 months)
  Sector 9: 1.000x multiplier (sell-through: 9.4 months)
  Sector 12: 1.000x multiplier (sell-through: 9.4 months)
  Sector 26: 1.000x multiplier (sell-through: 9.2 months)
  Sector 27: 1.000x multiplier (sell-through: 7.8 months)

 ## Summary



 This model improves on the base ensemble (0.57974) by:



 1. **Using inventory as market signal**: `period_new_house_sell_through` directly measures oversupply

 2. **Sector-specific adjustments**: Sectors with high inventory get bigger discounts

 3. **Configurable impact**: `inventory_impact` parameter controls how much inventory affects predictions



 **Key parameters to tune:**

 - `inventory_impact`: 0.5 = moderate impact, increase for more aggressive adjustment

 - `baseline_sellthrough`: 12.0 months = "normal" market condition

 - `min_adjustment`/`max_adjustment`: Bounds on how much inventory can adjust predictions



 **Expected improvement:** +0.005 to +0.015 (0.579 → 0.584-0.594)

In [5]:
# %% [code]
# Compare stacking submission with best fixed weights submission
print("=" * 60)
print("CORRELATION CHECK: Stacking vs Best Fixed Weights")
print("=" * 60)

# Load both submission files
inventory_path = '/Users/nikola/Python/KaggleCompetition/output/17_Inventory_ensemble/17_inventory_submission.csv'
ensemble_path = '/Users/nikola/Python/KaggleCompetition/output/15_new_try_EWGM_Ensemble/15_EWGM_w85_submission.csv'

inventory_sub = pd.read_csv(inventory_path)
ensemble_sub = pd.read_csv(ensemble_path)

# Merge on id to align predictions
merged = inventory_sub.merge(ensemble_sub, on='id', suffixes=('_inventory', '_ensemble'))

# Calculate correlation
correlation = merged['new_house_transaction_amount_inventory'].corr(
    merged['new_house_transaction_amount_ensemble']
)

print(f"\nCorrelation between submissions: {correlation:.5f}")

# Show prediction statistics
print(f"\nPrediction statistics:")
print(merged[['new_house_transaction_amount_inventory', 'new_house_transaction_amount_ensemble']].describe())

# Check differences
merged['abs_diff'] = np.abs(
    merged['new_house_transaction_amount_inventory'] -
    merged['new_house_transaction_amount_ensemble']
)
merged['pct_diff'] = merged['abs_diff'] / (merged['new_house_transaction_amount_ensemble'] + 1)

print(f"\nDifferences:")
print(f"  Mean absolute difference: {merged['abs_diff'].mean():,.2f}")
print(f"  Max absolute difference: {merged['abs_diff'].max():,.2f}")
print(f"  Samples with >10% difference: {(merged['pct_diff'] > 0.1).sum()} / {len(merged)} ({(merged['pct_diff'] > 0.1).mean()*100:.1f}%)")
print(f"  Samples with >50% difference: {(merged['pct_diff'] > 0.5).sum()} / {len(merged)} ({(merged['pct_diff'] > 0.5).mean()*100:.1f}%)")

if correlation < 0.90:
    print(f"\nWarning: Low correlation ({correlation:.3f}) - Inventory diverging significantly")
    print("This likely explains the 0.0 Kaggle score")
elif correlation > 0.99:
    print(f"\nHigh correlation ({correlation:.3f}) - Inventory barely different from fixed weights")
    print("Inventory isn't adding value")
else:
    print(f"\nModerate correlation ({correlation:.3f}) - Inventory making adjustments")

# Show worst divergences
print(f"\nTop 10 largest differences:")
print(merged.nlargest(10, 'abs_diff')[['id', 'new_house_transaction_amount_ensemble', 
                                        'new_house_transaction_amount_inventory', 'abs_diff']])

CORRELATION CHECK: Stacking vs Best Fixed Weights

Correlation between submissions: 0.98316

Prediction statistics:
       new_house_transaction_amount_inventory  \
count                             1152.000000   
mean                             16839.333506   
std                              23025.799770   
min                                  0.000000   
25%                               1290.203026   
50%                               8039.846067   
75%                              18745.572656   
max                             123555.343673   

       new_house_transaction_amount_ensemble  
count                            1152.000000  
mean                            21061.740699  
std                             28002.463975  
min                                 0.000000  
25%                              1843.147180  
50%                              9853.129411  
75%                             25620.303156  
max                            145102.206947  

Differences:
  Mea