<a href="https://colab.research.google.com/github/clementina-tom/Feed-to-farm-competition/blob/main/pipeline_redo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Memory-Efficient ML Pipeline for Farm To Feed Dataset (Redo)

This notebook implements a memory-efficient machine learning pipeline to predict customer purchasing behavior for 1-week and 2-week windows using pandas, gc, and LightGBM. Includes FileUpload widgets for easy file handling in Colab.

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from ipywidgets import FileUpload
import io

In [None]:
# File Upload Widgets
upload_train = FileUpload(accept='.csv', multiple=False, description='Train.csv')
upload_test = FileUpload(accept='.csv', multiple=False, description='Test.csv')
upload_customer = FileUpload(accept='.csv', multiple=False, description='customer_data.csv')
upload_sku = FileUpload(accept='.csv', multiple=False, description='sku_data.csv')

display(upload_train, upload_test, upload_customer, upload_sku)

FileUpload(value={}, accept='.csv', description='Train.csv')

FileUpload(value={}, accept='.csv', description='Test.csv')

FileUpload(value={}, accept='.csv', description='customer_data.csv')

FileUpload(value={}, accept='.csv', description='sku_data.csv')

In [None]:
# Step 1: Efficient Data Loading & Downcasting
print("Loading data...")
if upload_train.value:
    train = pd.read_csv(io.BytesIO(list(upload_train.value.values())[0]['content']))
else:
    train = pd.read_csv('Train.csv')

if upload_test.value:
    test = pd.read_csv(io.BytesIO(list(upload_test.value.values())[0]['content']))
else:
    test = pd.read_csv('Test.csv')

if upload_customer.value:
    customer = pd.read_csv(io.BytesIO(list(upload_customer.value.values())[0]['content']))
else:
    customer = pd.read_csv('customer_data.csv')

if upload_sku.value:
    sku = pd.read_csv(io.BytesIO(list(upload_sku.value.values())[0]['content']))
else:
    sku = pd.read_csv('sku_data.csv')

def downcast(df):
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        max_val = df[col].max()
        if max_val < 2**8:
            df[col] = df[col].astype('int8')
        elif max_val < 2**16:
            df[col] = df[col].astype('int16')
        else:
            df[col] = df[col].astype('int32')
    return df

train = downcast(train)
test = downcast(test)
customer = downcast(customer)
sku = downcast(sku)

train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

gc.collect()

Loading data...


NameError: name 'upload_train' is not defined

In [None]:
# Merge additional data
train = train.merge(customer, on='customer_id', how='left', suffixes=('', '_cust'))
train = train.merge(sku, on='product_unit_variant_id', how='left', suffixes=('', '_sku'))
test = test.merge(customer, on='customer_id', how='left', suffixes=('', '_cust'))
test = test.merge(sku, on='product_unit_variant_id', how='left', suffixes=('', '_sku'))

gc.collect()

In [None]:
# Step 2: Smart Grid Creation & Feature Engineering
full_df = pd.concat([train[['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week']],
                     test[['customer_id', 'product_unit_variant_id', 'week_start']].assign(qty_this_week=0.0)])
full_df = downcast(full_df)
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

print("Creating features...")
full_df['lag1_qty'] = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(1)
full_df['lag2_qty'] = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(2)
full_df['lag3_qty'] = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(3)
full_df['rolling_mean_4'] = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].rolling(4).mean().reset_index(0, drop=True)
full_df['rolling_max_4'] = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].rolling(4).max().reset_index(0, drop=True)

full_df[['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']] = full_df[['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']].fillna(0)

gc.collect()

In [None]:
# Merge features back
feature_cols = ['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']
train = train.merge(full_df[['customer_id', 'product_unit_variant_id', 'week_start'] + feature_cols],
                    on=['customer_id', 'product_unit_variant_id', 'week_start'], how='left')
test = test.merge(full_df[['customer_id', 'product_unit_variant_id', 'week_start'] + feature_cols],
                  on=['customer_id', 'product_unit_variant_id', 'week_start'], how='left')

del full_df
gc.collect()

In [None]:
# Step 3: Target Generation
print("Creating targets...")
train = train.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])
train['target_purchase_1w'] = (train.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(-1) > 0).astype(int)
train['target_qty_1w'] = train.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(-1).fillna(0)
train['target_purchase_2w'] = (train.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(-2) > 0).astype(int)
train['target_qty_2w'] = train.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(-2).fillna(0)

train = train.dropna(subset=['target_purchase_1w'])

gc.collect()

In [None]:
# Encode categoricals
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    combined = pd.concat([train[col], test[col]])
    le.fit(combined)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

features = ['selling_price', 'customer_category', 'customer_status', 'grade_name', 'unit_name'] + feature_cols

In [None]:
# Step 4: Modeling
print("Training models...")
weeks = sorted(train['week_start'].unique())
val_weeks = weeks[-2:]
val_mask = train['week_start'].isin(val_weeks)

models = {}
targets = ['target_purchase_1w', 'target_qty_1w', 'target_purchase_2w', 'target_qty_2w']
for target in targets:
    print(f"Training {target}...")
    X = train[features]
    y = train[target]
    X_train, X_val = X[~val_mask], X[val_mask]
    y_train, y_val = y[~val_mask], y[val_mask]

    if 'purchase' in target:
        model = lgb.LGBMClassifier(n_estimators=1000, early_stopping_rounds=50, verbose=-1)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='auc')
        pred_val = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, pred_val)
        print(f"AUC for {target}: {auc}")
    else:
        model = lgb.LGBMRegressor(n_estimators=1000, early_stopping_rounds=50, verbose=-1)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mae')
        pred_val = model.predict(X_val)
        mae = mean_absolute_error(y_val, pred_val)
        print(f"MAE for {target}: {mae}")

    models[target] = model
    gc.collect()

In [None]:
# Step 5: Submission
print("Generating predictions...")
test['Target_purchase_next_1w'] = models['target_purchase_1w'].predict_proba(test[features])[:, 1]
test['Target_qty_next_1w'] = models['target_qty_1w'].predict(test[features])
test['Target_purchase_next_2w'] = models['target_purchase_2w'].predict_proba(test[features])[:, 1]
test['Target_qty_next_2w'] = models['target_qty_2w'].predict(test[features])

submission = test[['ID', 'Target_purchase_next_1w', 'Target_qty_next_1w', 'Target_purchase_next_2w', 'Target_qty_next_2w']]
submission.to_csv('submission.csv', index=False)
print("Submission saved.")

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# --- 1. Load Data ---
print("Loading...")
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv') # Load customer data
sku = pd.read_csv('sku_data.csv')

# --- 2. Downcast & Date Conversion ---
def downcast(df):
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        # Adjusted int downcasting to avoid overflow for larger values
        max_val = df[col].max()
        min_val = df[col].min()
        if min_val >= np.iinfo('int8').min and max_val <= np.iinfo('int8').max:
            df[col] = df[col].astype('int8')
        elif min_val >= np.iinfo('int16').min and max_val <= np.iinfo('int16').max:
            df[col] = df[col].astype('int16')
        elif min_val >= np.iinfo('int32').min and max_val <= np.iinfo('int32').max:
            df[col] = df[col].astype('int32')
    return df

train = downcast(train)
test = downcast(test)
customer = downcast(customer) # Downcast customer data
sku = downcast(sku)

train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- Merge additional data (customer and sku) into train and test FIRST ---
# This ensures all categorical and customer_created_at columns are present in train/test
train = train.merge(customer, on='customer_id', how='left')
train = train.merge(sku, on='product_unit_variant_id', how='left')
test = test.merge(customer, on='customer_id', how='left')
test = test.merge(sku, on='product_unit_variant_id', how='left')

print("Train columns after merges:", train.columns.tolist())
print("Test columns after merges:", test.columns.tolist())

gc.collect()

# --- 3. Feature Engineering (Unified) ---
# Combine Train and Test to generate Lag Features smoothly
# Drop 'ID' column before concat as it's not a feature and could be duplicated.
test_with_qty = test.assign(qty_this_week=0.0).drop(columns=['ID'], errors='ignore')
full_df = pd.concat([train.drop(columns=['ID'], errors='ignore'), test_with_qty], ignore_index=True)

# The merges with customer and sku are now done earlier on train/test, no need to merge into full_df again here.

full_df = downcast(full_df) # Downcast full_df after concat
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

print("Full_df columns after concat and sort:", full_df.columns.tolist())

# Generate Lags
print("Generating features...")
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']
full_df['lag1'] = grp.shift(1)
full_df['lag2'] = grp.shift(2)
full_df['roll_mean_4'] = grp.rolling(4).mean().reset_index(level=[0,1], drop=True)

# Fill NaNs
full_df[['lag1', 'lag2', 'roll_mean_4']] = full_df[['lag1', 'lag2', 'roll_mean_4']].fillna(0)

gc.collect()

# --- 4. Create "Stacked" Training Data (The Grandmaster Move) ---
# We transform the data:
# Row 1: Target=Week1, Horizon=1
# Row 2: Target=Week2, Horizon=2

# Ensure all relevant columns are carried over from full_df
# Use all columns that will be features or used for stacking
relevant_cols = ['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week',
                 'customer_category_x', 'customer_status_x', 'grade_name_x', 'unit_name_x',
                 'lag1', 'lag2', 'roll_mean_4', 'customer_created_at_x'] # Use _x suffixes for these columns

print("Full_df columns before train_df/test_df split:", full_df.columns.tolist())

train_df = full_df[full_df['week_start'].isin(train['week_start'])][relevant_cols].copy()
test_df = full_df[full_df['week_start'].isin(test['week_start'])][relevant_cols].copy()

# Create Horizon 1 Subset
h1 = train_df.copy()
h1['target_qty'] = h1.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(-1)
h1['target_buy'] = (h1['target_qty'] > 0).astype(int)
h1['horizon'] = 1

# Create Horizon 2 Subset
h2 = train_df.copy()
h2['target_qty'] = h2.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week'].shift(-2)
h2['target_buy'] = (h2['target_qty'] > 0).astype(int)
h2['horizon'] = 2

# Stack them!
train_stacked = pd.concat([h1, h2], ignore_index=True)
train_stacked = train_stacked.dropna(subset=['target_qty']) # Remove rows where target is NaN (end of data)

# --- 5. Train Unified Models ---
# Add the new merged features to the list of features, including customer_category and customer_status
features = ['lag1', 'lag2', 'roll_mean_4', 'horizon',
            'customer_category_x', 'customer_status_x', 'grade_name_x', 'unit_name_x'] # Use _x suffixes

# Before training, we need to LabelEncode the categorical features
cat_cols_to_encode = ['customer_category_x', 'customer_status_x', 'grade_name_x', 'unit_name_x'] # Use _x suffixes
le = LabelEncoder()
for col in cat_cols_to_encode:
    # Fit on combined data from train_stacked, test_h1, test_h2 for consistent encoding
    # Note: test_df is used for encoding consistency across train_stacked and final test predictions
    combined_data = pd.concat([train_stacked[col], test_df[col]]).astype(str).fillna('UNKNOWN')
    le.fit(combined_data)

    train_stacked[col] = le.transform(train_stacked[col].astype(str).fillna('UNKNOWN'))
    test_df[col] = le.transform(test_df[col].astype(str).fillna('UNKNOWN'))

# Prepare Test Set for H1 and H2 BEFORE encoding customer data to ensure test_df is used correctly
test_h1 = test_df.copy()
test_h2 = test_df.copy()

# Apply encoding to test_h1 and test_h2 based on the encoded test_df (already done above, but confirm this line is still needed for clarity if test_h1/h2 were not direct copies)
# This line ensures test_h1 and test_h2 get the encoded values from test_df
test_h1[cat_cols_to_encode] = test_df[cat_cols_to_encode]
test_h2[cat_cols_to_encode] = test_df[cat_cols_to_encode]

print("Training Unified Classifier...")
clf = lgb.LGBMClassifier(n_estimators=1500, learning_rate=0.05)
clf.fit(train_stacked[features], train_stacked['target_buy'])

print("Training Unified Regressor (Tweedie)...")
reg = lgb.LGBMRegressor(objective='tweedie', n_estimators=1500, learning_rate=0.05)
reg.fit(train_stacked[features], train_stacked['target_qty'])

# --- 6. Predict ---
test_h1['horizon'] = 1
test_h2['horizon'] = 2

submission = test[['ID']].copy()
submission['Target_purchase_next_1w'] = clf.predict_proba(test_h1[features])[:, 1]
submission['Target_qty_next_1w'] = reg.predict(test_h1[features])
submission['Target_purchase_next_2w'] = clf.predict_proba(test_h2[features])[:, 1]
submission['Target_qty_next_2w'] = reg.predict(test_h2[features])

submission.to_csv('submission_horizon.csv', index=False)
print("Done!")

Loading...
Train columns after merges: ['ID', 'customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week', 'num_orders_week', 'spend_this_week', 'purchased_this_week', 'product_id', 'grade_name_x', 'unit_name_x', 'product_grade_variant_id', 'selling_price', 'customer_category_x', 'customer_status_x', 'customer_created_at_x', 'Target_qty_next_1w', 'Target_purchase_next_1w', 'Target_qty_next_2w', 'Target_purchase_next_2w', 'customer_category_y', 'customer_status_y', 'customer_created_at_y', 'product_name', 'product_grade_variant_sku', 'unit_name_y', 'grade_name_y', 'grade_active_status']
Test columns after merges: ['ID', 'customer_id', 'product_unit_variant_id', 'week_start', 'product_id', 'grade_name_x', 'unit_name_x', 'product_grade_variant_id', 'customer_category_x', 'customer_status_x', 'customer_created_at_x', 'customer_category_y', 'customer_status_y', 'customer_created_at_y', 'product_name', 'product_grade_variant_sku', 'unit_name_y', 'grade_name_y', 'grade_active_stat

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# --- 1. Load Data ---
print("Loading data...")
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')

# --- 2. Downcast & Pre-processing ---
def downcast(df):
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        if df[col].max() < 2**32:
            df[col] = df[col].astype('int32')
    return df

train = downcast(train)
test = downcast(test)
customer = downcast(customer)
sku = downcast(sku)

# Convert Dates
train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- 3. Unified Feature Engineering (The "Full Grid" Approach) ---
print("Building Full History...")

# A. Combine Train and Test FIRST
# We need the Test set to exist so that 'shift(-1)' on the last week of Train
# can actually see the first week of Test (if applicable) or handle boundaries correctly.
full_df = pd.concat([
    train[['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week']],
    test[['customer_id', 'product_unit_variant_id', 'week_start']].assign(qty_this_week=0.0)
], ignore_index=True)

full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

# B. Add "Grandmaster Features" (Seasonality & Trends)
# Fix: Handle potential NaT values before converting to int
full_df['month'] = full_df['week_start'].dt.month.fillna(0).astype(int)
full_df['week_of_year'] = full_df['week_start'].dt.isocalendar().week.fillna(0).astype(int)

# C. Global Product Trend (Shifted to prevent leakage)
# "How much is this product selling globally across all customers?"
prod_trend = full_df.groupby(['product_unit_variant_id', 'week_start'])['qty_this_week'].mean().reset_index().rename(columns={'qty_this_week': 'global_qty'})
prod_trend = prod_trend.sort_values(['product_unit_variant_id', 'week_start'])
prod_trend['global_trend_lag1'] = prod_trend.groupby('product_unit_variant_id')['global_qty'].shift(1)
full_df = full_df.merge(prod_trend[['product_unit_variant_id', 'week_start', 'global_trend_lag1']],
                        on=['product_unit_variant_id', 'week_start'], how='left')

# D. Lag Features
print("Generating lags...")
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']
full_df['lag1'] = grp.shift(1)
full_df['lag2'] = grp.shift(2)
full_df['roll_mean_4'] = grp.rolling(4).mean().reset_index(level=[0,1], drop=True)

# --- 4. Target Generation (CRITICAL FIX) ---
# We generate targets ON THE FULL DATASET so the shifts work correctly.
print("Generating Targets on Full Data...")
full_df['target_qty_1w'] = grp.shift(-1) # Next week's qty
full_df['target_qty_2w'] = grp.shift(-2) # Week after next qty

# --- 5. Merge Metadata ---
# Now we bring in Customer and SKU details
full_df = full_df.merge(customer, on='customer_id', how='left')
full_df = full_df.merge(sku, on='product_unit_variant_id', how='left')

# Encode Categoricals
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    full_df[col] = full_df[col].astype(str).fillna('UNKNOWN')
    full_df[col] = le.fit_transform(full_df[col])

# Fill Numerical NaNs
num_cols = ['lag1', 'lag2', 'roll_mean_4', 'global_trend_lag1']
full_df[num_cols] = full_df[num_cols].fillna(0)

# --- 6. Create Stacked Training Data ---
print("Stacking Data...")

# Identify Train rows vs Test rows
# Train rows are those that were in the original Train CSV
train_mask = full_df['week_start'].isin(train['week_start'].unique())
train_df_source = full_df[train_mask].copy()
test_df_source = full_df[~train_mask].copy()

# Stack 1: Horizon 1
h1 = train_df_source.copy()
h1['target_qty'] = h1['target_qty_1w']
h1['target_buy'] = (h1['target_qty'] > 0).astype(int)
h1['horizon'] = 1

# Stack 2: Horizon 2
h2 = train_df_source.copy()
h2['target_qty'] = h2['target_qty_2w']
h2['target_buy'] = (h2['target_qty'] > 0).astype(int)
h2['horizon'] = 2

# Combine
train_stacked = pd.concat([h1, h2], ignore_index=True)

# Drop rows where target is NaN (End of history)
train_stacked = train_stacked.dropna(subset=['target_qty'])

# Define Features
features = ['lag1', 'lag2', 'roll_mean_4', 'global_trend_lag1',
            'month', 'week_of_year', 'horizon',
            'customer_category', 'customer_status', 'grade_name', 'unit_name']

# --- 7. Modeling ---
print("Training Unified Models...")

# Classifier (AUC)
clf = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.03, num_leaves=31, random_state=42)
clf.fit(train_stacked[features], train_stacked['target_buy'])

# Regressor (Tweedie for MAE)
reg = lgb.LGBMRegressor(objective='tweedie', tweedie_variance_power=1.5,
                        n_estimators=2000, learning_rate=0.03, num_leaves=31, random_state=42)
reg.fit(train_stacked[features], train_stacked['target_qty'])

# --- 8. Prediction ---
print("Generating Predictions...")

# Prepare Test sets
test_h1 = test_df_source.copy()
test_h1['horizon'] = 1

test_h2 = test_df_source.copy()
test_h2['horizon'] = 2

# Predict
submission = test[['ID']].copy()
submission['Target_purchase_next_1w'] = clf.predict_proba(test_h1[features])[:, 1]
submission['Target_qty_next_1w'] = reg.predict(test_h1[features])
submission['Target_purchase_next_2w'] = clf.predict_proba(test_h2[features])[:, 1]
submission['Target_qty_next_2w'] = reg.predict(test_h2[features])

# --- 9. Consistency Check ---
# If probability of buy is very low, force quantity to 0?
# (Optional, but helps MAE). Let's be conservative.
# submission.loc[submission['Target_purchase_next_1w'] < 0.1, 'Target_qty_next_1w'] = 0
# submission.loc[submission['Target_purchase_next_2w'] < 0.1, 'Target_qty_next_2w'] = 0

submission.to_csv('submission_horizon_fixed.csv', index=False)
print("Done! Saved as submission_horizon_fixed.csv")

Loading data...
Building Full History...
Generating lags...
Generating Targets on Full Data...
Stacking Data...
Training Unified Models...


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# --- 1. Load Data ---
print("Loading data...")
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')

# --- 2. Downcast to Save Memory ---
def downcast(df):
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        if df[col].max() < 2**32:
            df[col] = df[col].astype('int32')
    return df

train = downcast(train)
test = downcast(test)
customer = downcast(customer)
sku = downcast(sku)

# Convert Dates
train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- 3. Unified Feature Engineering ---
print("Building Full History...")

# A. Combine Train and Test FIRST
# We need the full timeline to calculate trends and lags correctly
full_df = pd.concat([
    train[['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week']],
    test[['customer_id', 'product_unit_variant_id', 'week_start']].assign(qty_this_week=0.0)
], ignore_index=True)

full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

# --- GRANDMASTER FEATURES ---

# 1. Seasonality (Time)
full_df['month'] = full_df['week_start'].dt.month.fillna(0).astype(int)
full_df['week_of_year'] = full_df['week_start'].dt.isocalendar().week.fillna(0).astype(int)

# 2. Global Product Trend (The "Viral" Factor)
# Calculates: "How much is this specific product selling across ALL customers right now?"
# We shift by 1 week so we don't peek into the future.
prod_trend = full_df.groupby(['product_unit_variant_id', 'week_start'])['qty_this_week'].mean().reset_index().rename(columns={'qty_this_week': 'global_qty'})
prod_trend = prod_trend.sort_values(['product_unit_variant_id', 'week_start'])
prod_trend['global_trend_lag1'] = prod_trend.groupby('product_unit_variant_id')['global_qty'].shift(1)
prod_trend['global_trend_roll4'] = prod_trend.groupby('product_unit_variant_id')['global_qty'].transform(lambda x: x.shift(1).rolling(4).mean())

full_df = full_df.merge(prod_trend[['product_unit_variant_id', 'week_start', 'global_trend_lag1', 'global_trend_roll4']],
                        on=['product_unit_variant_id', 'week_start'], how='left')

# 3. Lag Features (Individual History)
print("Generating lags...")
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']
full_df['lag1'] = grp.shift(1)
full_df['lag2'] = grp.shift(2)
full_df['lag3'] = grp.shift(3)
full_df['roll_mean_4'] = grp.rolling(4).mean().reset_index(level=[0,1], drop=True)
full_df['roll_max_4'] = grp.rolling(4).max().reset_index(level=[0,1], drop=True)

# --- 4. Target Generation (THE FIX) ---
# Generate targets on the full timeline BEFORE splitting
print("Generating Targets...")
full_df['target_qty_1w'] = grp.shift(-1) # Next week's qty
full_df['target_qty_2w'] = grp.shift(-2) # Week after next qty

# --- 5. Merge Metadata & Encode ---
full_df = full_df.merge(customer, on='customer_id', how='left')
full_df = full_df.merge(sku, on='product_unit_variant_id', how='left')

# Customer Tenure (How long have they been with us?)
full_df['customer_tenure_days'] = (full_df['week_start'] - full_df['customer_created_at']).dt.days

# Encode Categoricals
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    full_df[col] = full_df[col].astype(str).fillna('UNKNOWN')
    full_df[col] = le.fit_transform(full_df[col])

# Fill NaNs in features with 0
num_cols = ['lag1', 'lag2', 'lag3', 'roll_mean_4', 'roll_max_4', 'global_trend_lag1', 'global_trend_roll4', 'customer_tenure_days']
full_df[num_cols] = full_df[num_cols].fillna(0)

# --- 6. Create Stacked Training Data ---
print("Stacking Data...")

# Filter back to Train and Test sets
train_mask = full_df['week_start'].isin(train['week_start'].unique())
train_df_source = full_df[train_mask].copy()
test_df_source = full_df[~train_mask].copy()

# Stack 1: Horizon 1 (Predicting Next Week)
h1 = train_df_source.copy()
h1['target_qty'] = h1['target_qty_1w']
h1['target_buy'] = (h1['target_qty'] > 0).astype(int)
h1['horizon'] = 1

# Stack 2: Horizon 2 (Predicting 2 Weeks out)
h2 = train_df_source.copy()
h2['target_qty'] = h2['target_qty_2w']
h2['target_buy'] = (h2['target_qty'] > 0).astype(int)
h2['horizon'] = 2

# Combine Stacks
train_stacked = pd.concat([h1, h2], ignore_index=True)
train_stacked = train_stacked.dropna(subset=['target_qty']) # Valid targets only

# Define Features
features = [
    'lag1', 'lag2', 'lag3', 'roll_mean_4', 'roll_max_4', # Individual behavior
    'global_trend_lag1', 'global_trend_roll4',           # Market behavior
    'month', 'week_of_year', 'horizon',                  # Time context
    'customer_tenure_days',                              # Loyalty
    'customer_category', 'customer_status', 'grade_name', 'unit_name' # Metadata
]

# --- 7. Modeling (High Precision) ---
print("Training Unified Models...")

# Classifier (Optimized for AUC)
# Lower learning rate (0.02) + More trees (2500) = Better Pattern Recognition
clf = lgb.LGBMClassifier(
    n_estimators=2500,
    learning_rate=0.02,
    num_leaves=40,
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8
)
clf.fit(train_stacked[features], train_stacked['target_buy'])
print("Classifier Trained.")

# Regressor (Optimized for Tweedie/MAE)
reg = lgb.LGBMRegressor(
    objective='tweedie',
    tweedie_variance_power=1.5,
    n_estimators=2500,
    learning_rate=0.02,
    num_leaves=40,
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8
)
reg.fit(train_stacked[features], train_stacked['target_qty'])
print("Regressor Trained.")

# --- 8. Prediction ---
print("Generating Predictions...")

# Prepare Test sets for both horizons
test_h1 = test_df_source.copy()
test_h1['horizon'] = 1

test_h2 = test_df_source.copy()
test_h2['horizon'] = 2

submission = test[['ID']].copy()

# Horizon 1 Predictions
submission['Target_purchase_next_1w'] = clf.predict_proba(test_h1[features])[:, 1]
submission['Target_qty_next_1w'] = reg.predict(test_h1[features])

# Horizon 2 Predictions
submission['Target_purchase_next_2w'] = clf.predict_proba(test_h2[features])[:, 1]
submission['Target_qty_next_2w'] = reg.predict(test_h2[features])

# Final Cleanup (No negative quantities)
submission['Target_qty_next_1w'] = submission['Target_qty_next_1w'].clip(lower=0)
submission['Target_qty_next_2w'] = submission['Target_qty_next_2w'].clip(lower=0)

submission.to_csv('submission_horizon_promax.csv', index=False)
print("Done! Saved as submission_horizon_promax.csv")

Loading data...
Building Full History...
Generating lags...
Generating Targets...
Stacking Data...
Training Unified Models...
Classifier Trained.
Regressor Trained.
Generating Predictions...


In [None]:
# --- 7. Validation & Modeling (With Scores!) ---
print("--- Starting Validation ---")

# A. Create a Time-Based Split for Validation
# We use the last 4 weeks of the training data to check our score
weeks = sorted(train_stacked['week_start'].unique())
val_start_week = weeks[-4]

train_subset = train_stacked[train_stacked['week_start'] < val_start_week]
val_subset = train_stacked[train_stacked['week_start'] >= val_start_week]

# B. Train & Score Classifier (AUC)
print("Validating Classifier...")
clf_val = lgb.LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=40,
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8
)
clf_val.fit(
    train_subset[features],
    train_subset['target_buy'],
    eval_set=[(val_subset[features], val_subset['target_buy'])],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

# Calculate Validation AUC
val_preds_prob = clf_val.predict_proba(val_subset[features])[:, 1]
from sklearn.metrics import roc_auc_score, mean_absolute_error
val_auc = roc_auc_score(val_subset['target_buy'], val_preds_prob)
print(f"✅ LOCAL VALIDATION AUC: {val_auc:.5f}")

# C. Train & Score Regressor (MAE)
print("Validating Regressor...")
reg_val = lgb.LGBMRegressor(
    objective='tweedie',
    tweedie_variance_power=1.5,
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=40,
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8
)
reg_val.fit(
    train_subset[features],
    train_subset['target_qty'],
    eval_set=[(val_subset[features], val_subset['target_qty'])],
    eval_metric='mae',
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

# Calculate Validation MAE
val_preds_qty = reg_val.predict(val_subset[features])
val_mae = mean_absolute_error(val_subset['target_qty'], val_preds_qty)
print(f"✅ LOCAL VALIDATION MAE: {val_mae:.5f}")

# --- 8. Final Retraining & Prediction ---
print("\n--- Retraining on FULL Dataset for Submission ---")
# Now that we know the score, we train on EVERYTHING (Train + Validation) to get max performance
clf_full = lgb.LGBMClassifier(
    n_estimators=2500,
    learning_rate=0.02,
    num_leaves=40,
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8
)
clf_full.fit(train_stacked[features], train_stacked['target_buy'])

reg_full = lgb.LGBMRegressor(
    objective='tweedie',
    tweedie_variance_power=1.5,
    n_estimators=2500,
    learning_rate=0.02,
    num_leaves=40,
    random_state=42,
    subsample=0.8,
    colsample_bytree=0.8
)
reg_full.fit(train_stacked[features], train_stacked['target_qty'])

print("Generating Final Predictions...")

# Prepare Test sets for both horizons
test_h1 = test_df_source.copy()
test_h1['horizon'] = 1

test_h2 = test_df_source.copy()
test_h2['horizon'] = 2

submission = test[['ID']].copy()

# Horizon 1 Predictions
submission['Target_purchase_next_1w'] = clf_full.predict_proba(test_h1[features])[:, 1]
submission['Target_qty_next_1w'] = reg_full.predict(test_h1[features])

# Horizon 2 Predictions
submission['Target_purchase_next_2w'] = clf_full.predict_proba(test_h2[features])[:, 1]
submission['Target_qty_next_2w'] = reg_full.predict(test_h2[features])

# Final Cleanup (No negative quantities)
submission['Target_qty_next_1w'] = submission['Target_qty_next_1w'].clip(lower=0)
submission['Target_qty_next_2w'] = submission['Target_qty_next_2w'].clip(lower=0)

submission.to_csv('submission_horizon_promax_scored.csv', index=False)
print("Done! Saved as submission_horizon_promax_scored.csv")

In [None]:
# --- ADD THIS TO YOUR FEATURE ENGINEERING STEP ---

# 1. Extract Date Features
# These help the model learn seasonality (e.g., mango season)
train['month'] = train['week_start'].dt.month
train['week_of_year'] = train['week_start'].dt.isocalendar().week.astype(int)
train['quarter'] = train['week_start'].dt.quarter

test['month'] = test['week_start'].dt.month
test['week_of_year'] = test['week_start'].dt.isocalendar().week.astype(int)
test['quarter'] = test['week_start'].dt.quarter

# 2. Calculate Customer Tenure (Days on Platform)
# Older customers behave differently than new ones
train['tenure_days'] = (train['week_start'] - train['customer_created_at']).dt.days
test['tenure_days'] = (test['week_start'] - test['customer_created_at']).dt.days

# 3. Add these new columns to your 'features' list
# (Update the list in the Modeling Step)
# features = ['month', 'week_of_year', 'tenure_days', 'customer_category', ...] + feature_cols

ValueError: cannot convert NA to integer

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, mean_absolute_error

# --- 1. Load Data ---
print("Loading data...")
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')

# --- 2. Downcast (Keep Memory Low) ---
def downcast(df):
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        if df[col].max() < 2**32:
            df[col] = df[col].astype('int32')
    return df

train = downcast(train)
test = downcast(test)
customer = downcast(customer)
sku = downcast(sku)

# Date Conversion
train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- 3. Feature Engineering (Unified Process) ---
print("Building Full History and Features...")

# A. Create a base full_df with essential columns for timeline and qty
full_df = pd.concat([
    train[['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week',
           'num_orders_week', 'spend_this_week', 'purchased_this_week']], # Train-specific cols
    test[['customer_id', 'product_unit_variant_id', 'week_start']].assign(qty_this_week=0.0,
           num_orders_week=0.0, spend_this_week=0.0, purchased_this_week=0.0) # Test with placeholder zeros
], ignore_index=True)
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

# B. Merge Customer and SKU data into full_df
full_df = full_df.merge(customer[['customer_id', 'customer_category', 'customer_status', 'customer_created_at']],
                        on='customer_id', how='left')
full_df = full_df.merge(sku[['product_unit_variant_id', 'grade_name', 'unit_name']],
                        on='product_unit_variant_id', how='left')

# C. Add Seasonality (The Missing Piece)
full_df['month'] = full_df['week_start'].dt.month.fillna(0).astype(int)
full_df['week_of_year'] = full_df['week_start'].dt.isocalendar().week.fillna(0).astype(int)

# D. Calculate Lags
print("Generating Lags...")
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']
full_df['lag1'] = grp.shift(1)
full_df['lag2'] = grp.shift(2)
full_df['lag3'] = grp.shift(3)
full_df['roll_mean_4'] = grp.rolling(4).mean().reset_index(level=[0,1], drop=True)

# E. Global Trend (Calculated STRICTLY on Train to prevent Leakage)
print("Generating Global Trends...")
# Calculate average sales per product per week in TRAIN only (where qty_this_week > 0)
train_only_for_global = full_df[full_df['qty_this_week'] > 0.0].copy()
global_stats = train_only_for_global.groupby(['product_unit_variant_id', 'week_start'])['qty_this_week'].mean().reset_index()
global_stats.rename(columns={'qty_this_week': 'global_avg_qty'}, inplace=True)

# Merge this back to full_df, but SHIFTED by 1 week to represent prior week's trend
global_stats['week_start'] = global_stats['week_start'] + pd.Timedelta(weeks=1)
full_df = full_df.merge(global_stats, on=['product_unit_variant_id', 'week_start'], how='left')
full_df['global_avg_qty'] = full_df['global_avg_qty'].fillna(0)

# F. Customer Tenure
full_df['tenure_days'] = (full_df['week_start'] - full_df['customer_created_at']).dt.days

# G. Fill NaNs for numerical features
cols_to_fill = ['lag1', 'lag2', 'lag3', 'roll_mean_4', 'global_avg_qty', 'tenure_days']
full_df[cols_to_fill] = full_df[cols_to_fill].fillna(0)

# H. Encode Categoricals (after all merges and before splitting)
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    full_df[col] = full_df[col].astype(str).fillna('UNKNOWN')
    full_df[col] = le.fit_transform(full_df[col])

gc.collect()

# --- 4. Prepare Train/Test for Modeling (Split Enriched full_df) ---
print("Preparing Train/Test sets...")

# Get original train IDs and week_starts
original_train_ids = train[['customer_id', 'product_unit_variant_id', 'week_start']].drop_duplicates()

# Filter full_df to get the enriched train and test sets
train_enriched = pd.merge(original_train_ids, full_df, on=['customer_id', 'product_unit_variant_id', 'week_start'], how='left')
test_enriched = full_df[~full_df.set_index(['customer_id', 'product_unit_variant_id', 'week_start']).index.isin(
    original_train_ids.set_index(['customer_id', 'product_unit_variant_id', 'week_start']).index
)].copy()

# Ensure 'ID' column is restored for test_enriched from the original test dataframe
test_enriched = pd.merge(test[['ID', 'customer_id', 'product_unit_variant_id', 'week_start']],
                         test_enriched,
                         on=['customer_id', 'product_unit_variant_id', 'week_start'],
                         how='left')

# Overwrite original train/test with enriched versions
train = train_enriched
test = test_enriched

# --- 5. Target Generation (The "Wide" Way) ---
print("Generating Targets...")
train = train.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])
grp = train.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']
train['target_qty_1w'] = grp.shift(-1).fillna(0)
train['target_qty_2w'] = grp.shift(-2).fillna(0)
train['target_buy_1w'] = (train['target_qty_1w'] > 0).astype(int)
train['target_buy_2w'] = (train['target_qty_2w'] > 0).astype(int)

# Drop rows where targets are NaN (end of history for target calculation)
train = train.dropna(subset=['target_qty_1w', 'target_qty_2w'])

# --- 6. Modeling (4 Separate Models - The Stable Way) ---
print("Training Models...")

# Features list
features = cols_to_fill + ['month', 'week_of_year'] + cat_cols + \
           ['num_orders_week', 'spend_this_week', 'purchased_this_week'] # Add original train specific columns if desired

# Filter features to only include those present in the DataFrame (e.g. if num_orders_week etc were not merged properly)
features = [f for f in features if f in train.columns and f in test.columns]

# Validation Split (Last 4 weeks)
weeks = sorted(train['week_start'].unique())
val_weeks = weeks[-4:]
val_mask = train['week_start'].isin(val_weeks)

X_train = train.loc[~val_mask, features]
X_val = train.loc[val_mask, features]

models = {}

# 1. Week 1 Classifier
print("Training Week 1 Classifier...")
clf1 = lgb.LGBMClassifier(n_estimators=1500, learning_rate=0.03, random_state=42, verbose=-1)
clf1.fit(X_train, train.loc[~val_mask, 'target_buy_1w'],
         eval_set=[(X_val, train.loc[val_mask, 'target_buy_1w'])], eval_metric='auc',
         callbacks=[lgb.early_stopping(50)])
models['p1'] = clf1

# 2. Week 1 Regressor (Tweedie)
print("Training Week 1 Regressor...")
reg1 = lgb.LGBMRegressor(objective='tweedie', tweedie_variance_power=1.5, n_estimators=1500, learning_rate=0.03, random_state=42, verbose=-1)
reg1.fit(X_train, train.loc[~val_mask, 'target_qty_1w'],
         eval_set=[(X_val, train.loc[val_mask, 'target_qty_1w'])], eval_metric='mae',
         callbacks=[lgb.early_stopping(50)])
models['q1'] = reg1

# 3. Week 2 Classifier
print("Training Week 2 Classifier...")
clf2 = lgb.LGBMClassifier(n_estimators=1500, learning_rate=0.03, random_state=42, verbose=-1)
clf2.fit(X_train, train.loc[~val_mask, 'target_buy_2w'],
         eval_set=[(X_val, train.loc[val_mask, 'target_buy_2w'])], eval_metric='auc',
         callbacks=[lgb.early_stopping(50)])
models['p2'] = clf2

# 4. Week 2 Regressor (Tweedie)
print("Training Week 2 Regressor...")
reg2 = lgb.LGBMRegressor(objective='tweedie', tweedie_variance_power=1.5, n_estimators=1500, learning_rate=0.03, random_state=42, verbose=-1)
reg2.fit(X_train, train.loc[~val_mask, 'target_qty_2w'],
         eval_set=[(X_val, train.loc[val_mask, 'target_qty_2w'])], eval_metric='mae',
         callbacks=[lgb.early_stopping(50)])
models['q2'] = reg2

# --- 7. Submission ---
print("Generating Predictions...")
submission = test[['ID']].copy() # Use the original test ID column

submission['Target_purchase_next_1w'] = models['p1'].predict_proba(test[features])[:, 1]
submission['Target_qty_next_1w'] = models['q1'].predict(test[features])
submission['Target_purchase_next_2w'] = models['p2'].predict_proba(test[features])[:, 1]
submission['Target_qty_next_2w'] = models['q2'].predict(test[features])

# Clip negatives
submission['Target_qty_next_1w'] = submission['Target_qty_next_1w'].clip(lower=0)
submission['Target_qty_next_2w'] = submission['Target_qty_next_2w'].clip(lower=0)

submission.to_csv('submission_safe_upgrade.csv', index=False)
print("Done! Safe upgrade saved.")

Loading data...
Building Full History and Features...
Generating Lags...
Generating Global Trends...
Preparing Train/Test sets...
Generating Targets...
Training Models...
Training Week 1 Classifier...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[61]	valid_0's auc: 0.951647	valid_0's binary_logloss: 0.0534513
Training Week 1 Regressor...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	valid_0's l1: 1.49447	valid_0's tweedie: 3.89564
Training Week 2 Classifier...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[31]	valid_0's auc: 0.936496	valid_0's binary_logloss: 0.0504207
Training Week 2 Regressor...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[75]	valid_0's l1: 1.14246	valid_0's tweedie: 1.55625
Generating Predictions...
Done! Safe upgrade saved.


In [None]:
# Load your latest submission
sub = pd.read_csv('submission_safe_upgrade.csv')

# Multiply Quantity * Probability
# If Prob is 0.1 and Qty is 10, the "Expected" Qty is 1.
sub['Target_qty_next_1w'] = sub['Target_qty_next_1w'] * sub['Target_purchase_next_1w']
sub['Target_qty_next_2w'] = sub['Target_qty_next_2w'] * sub['Target_purchase_next_2w']

# Save as a new version
sub.to_csv('submission_expected_value.csv', index=False)
print("Applied Expected Value correction.")

Applied Expected Value correction.


In [None]:
Limport pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# --- 1. Load Data ---
print("Loading data...")
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')

# --- 2. Downcast ---
def downcast(df):
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        if df[col].max() < 2**32:
            df[col] = df[col].astype('int32')
    return df

train = downcast(train)
test = downcast(test)
customer = downcast(customer)
sku = downcast(sku)

train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- 3. LEAK-FREE Feature Engineering ---
print("Generating Leak-Free Features...")

# A. Create Full Timeline (Train + Test)
# This is crucial so rolling windows flow naturally from Train into Test
full_df = pd.concat([
    train[['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week']],
    test[['customer_id', 'product_unit_variant_id', 'week_start']].assign(qty_this_week=0.0) # FIXED: Use 0.0 instead of np.nan
], ignore_index=True)

full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

# B. Individual Lag Features
# "What did THIS customer buy recently?"
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']
full_df['lag1'] = grp.shift(1) # Last week
full_df['lag2'] = grp.shift(2) # 2 weeks ago
full_df['roll_mean_4'] = grp.shift(1).rolling(4).mean() # Avg of last 4 weeks (excluding current)

# C. Global Product Trend (The "Viral" Factor)
# "How much is this product selling generally?"
# We calculate this on the WHOLE dataset using transform, but we SHIFT first.
# This ensures Week 10 only knows about the global volume of Weeks 1-9.
# Note: We fillna(0) temporarily for the calculation but keep original logic
temp_df = full_df.copy()
temp_df['qty_this_week'] = temp_df['qty_this_week'].fillna(0)

# Global trend per product per week
global_trend = temp_df.groupby(['product_unit_variant_id', 'week_start'])['qty_this_week'].mean().reset_index().rename(columns={'qty_this_week': 'daily_global_vol'})
full_df = full_df.merge(global_trend, on=['product_unit_variant_id', 'week_start'], how='left')

# Now calculate the LAGGED global trend (Leak-Free)
grp_global = full_df.groupby('product_unit_variant_id')['daily_global_vol']
full_df['global_lag1'] = grp_global.shift(1)
full_df['global_roll_4'] = grp_global.shift(1).rolling(4).mean()

# D. Seasonality
full_df['month'] = full_df['week_start'].dt.month.fillna(0).astype(int) # Fixed: fillna(0) added
full_df['week_of_year'] = full_df['week_start'].dt.isocalendar().week.fillna(0).astype(int) # Fixed: fillna(0) added

# --- 4. Merge Metadata ---
full_df = full_df.merge(customer, on='customer_id', how='left')
full_df = full_df.merge(sku, on='product_unit_variant_id', how='left')

# Encode
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    full_df[col] = full_df[col].astype(str).fillna('UNKNOWN')
    full_df[col] = le.fit_transform(full_df[col])

# Fill Numerical NaNs
num_cols = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4']
full_df[num_cols] = full_df[num_cols].fillna(0)

# --- 5. Target Generation ---
print("Generating Targets...")
# We use the full_df to shift, then split
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']

full_df['target_qty_1w'] = grp.shift(-1)
full_df['target_qty_2w'] = grp.shift(-2)
full_df['target_buy_1w'] = (full_df['target_qty_1w'] > 0).astype(float) # Float for probability
full_df['target_buy_2w'] = (full_df['target_qty_2w'] > 0).astype(float)

# --- 6. Split Train/Test ---
# Filter back to original Train set rows (where we have actuals) and Test set rows
train_df = full_df[full_df['week_start'].isin(train['week_start'])].copy()
test_df = full_df[full_df['week_start'].isin(test['week_start'])].copy()

# Remove rows in Train that don't have valid targets (the very last weeks of history)
train_df = train_df.dropna(subset=['target_qty_1w']) # For 1w model
train_df_2w = train_df.dropna(subset=['target_qty_2w']) # For 2w model

# --- 7. Modeling ---
print("Training Models...")
feature_cols = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4',
                'month', 'week_of_year'] + cat_cols

models = {}

# Week 1 Classifier
print("  Week 1 Classifier...")
clf1 = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.03, num_leaves=31, random_state=42)
clf1.fit(train_df[feature_cols], train_df['target_buy_1w'])

# Filter for non-zero quantities for regression tasks
train_df_qty1 = train_df[train_df['target_qty_1w'] > 0].copy()
train_df_qty2 = train_df_2w[train_df_2w['target_qty_2w'] > 0].copy()

# Week 1 Regressor (Tweedie)
print("  Week 1 Regressor...")
reg1 = lgb.LGBMRegressor(objective='tweedie', tweedie_variance_power=1.5, n_estimators=2000, learning_rate=0.03, random_state=42)
# Only train on rows where target_qty_1w is greater than 0
if not train_df_qty1.empty:
    reg1.fit(train_df_qty1[feature_cols], train_df_qty1['target_qty_1w'])
else:
    print("Warning: train_df_qty1 is empty. Regressor 1 will not be trained.")

# Week 2 Classifier
print("  Week 2 Classifier...")
clf2 = lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.03, num_leaves=31, random_state=42)
clf2.fit(train_df_2w[feature_cols], train_df_2w['target_buy_2w'])

# Week 2 Regressor (Tweedie)
print("  Week 2 Regressor...")
reg2 = lgb.LGBMRegressor(objective='tweedie', tweedie_variance_power=1.5, n_estimators=2000, learning_rate=0.03, random_state=42)
# Only train on rows where target_qty_2w is greater than 0
if not train_df_qty2.empty:
    reg2.fit(train_df_qty2[feature_cols], train_df_qty2['target_qty_2w'])
else:
    print("Warning: train_df_qty2 is empty. Regressor 2 will not be trained.")

# --- 8. Submission with EV Optimization ---
print("Generating Final Optimized Predictions...")
submission = test[['ID']].copy()

# A. Raw Predictions
p1 = clf1.predict_proba(test_df[feature_cols])[:, 1]
q1 = reg1.predict(test_df[feature_cols]) if not train_df_qty1.empty else np.zeros(len(test_df))
p2 = clf2.predict_proba(test_df[feature_cols])[:, 1]
q2 = reg2.predict(test_df[feature_cols]) if not train_df_qty2.empty else np.zeros(len(test_df))

# B. Store Raw Probs (For AUC)
submission['Target_purchase_next_1w'] = p1
submission['Target_purchase_next_2w'] = p2

# C. Apply Expected Value Optimization (For MAE)
# Qty = Predicted_Qty * Probability_of_Purchase
submission['Target_qty_next_1w'] = (q1 * p1).clip(min=0)
submission['Target_qty_next_2w'] = (q2 * p2).clip(min=0)

submission.to_csv('submission_leak_free_optimized.csv', index=False)
print("Done! Upload 'submission_leak_free_optimized.csv'")

Loading data...
Generating Leak-Free Features...
Generating Targets...
Training Models...
  Week 1 Classifier...
  Week 1 Regressor...
  Week 2 Classifier...
  Week 2 Regressor...
Generating Final Optimized Predictions...
Done! Upload 'submission_leak_free_optimized.csv'


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# --- 1. Load Data (NO DOWNCASTING) ---
print("Loading data...")
# We let Pandas choose the safe types (int64/float64) automatically.
# This uses more RAM but guarantees 100% data integrity.
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')

# Explicitly convert Dates
train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- 2. Feature Engineering ---
print("Generating Features...")

# Create Flag
train['is_train'] = 1
test['is_train'] = 0

# Combine (Safely)
# We only take necessary columns to keep RAM manageable
cols = ['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week', 'is_train']
full_df = pd.concat([
    train[cols],
    test[['customer_id', 'product_unit_variant_id', 'week_start', 'is_train']].assign(qty_this_week=0.0) # FIXED: Use 0.0 for test qty_this_week
], ignore_index=True)

# Sort (Crucial: IDs must be correct for this to work)
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

# --- Lags & Trends ---
print("  Calculating Rolling Stats...")
grp_obj = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']

full_df['lag1'] = grp_obj.shift(1)
full_df['lag2'] = grp_obj.shift(2)
# transform keeps the index alignment safe
full_df['roll_mean_4'] = grp_obj.transform(lambda x: x.shift(1).rolling(4).mean())

print("  Calculating Global Trends...")
# Global Trend logic
temp_df = full_df.copy()
temp_df['qty_this_week'] = temp_df['qty_this_week'].fillna(0) # Fill NaN for global calculation only

global_trend = temp_df.groupby(['product_unit_variant_id', 'week_start'])['qty_this_week'].mean().reset_index().rename(columns={'qty_this_week': 'daily_global_vol'})
full_df = full_df.merge(global_trend, on=['product_unit_variant_id', 'week_start'], how='left')

grp_global = full_df.groupby('product_unit_variant_id')['daily_global_vol']
full_df['global_lag1'] = grp_global.shift(1)
full_df['global_roll_4'] = grp_global.transform(lambda x: x.shift(1).rolling(4).mean())

# Seasonality
full_df['month'] = full_df['week_start'].dt.month.fillna(0).astype(int) # Fixed: fillna(0) added
full_df['week_of_year'] = full_df['week_start'].dt.isocalendar().week.fillna(0).astype(int) # Fixed: fillna(0) added

# Cleanup
del temp_df, grp_obj, grp_global, global_trend
gc.collect()

# --- 3. Target Generation ---
print("Generating Targets...")
# Ensure sort is still perfect
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']

full_df['target_qty_1w'] = grp.shift(-1)
full_df['target_qty_2w'] = grp.shift(-2)

# --- 4. Merge Metadata ---
print("Merging Metadata...")
full_df = full_df.merge(customer, on='customer_id', how='left')
full_df = full_df.merge(sku, on='product_unit_variant_id', how='left')

# Encode Categoricals
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    full_df[col] = full_df[col].astype(str).fillna('UNKNOWN')
    full_df[col] = le.fit_transform(full_df[col])

# Fill NaNs in Features (Lags/Trends) with 0
num_cols = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4']
full_df[num_cols] = full_df[num_cols].fillna(0)

# --- 5. Split & Prepare ---
print("Splitting Data...")
train_df = full_df[full_df['is_train'] == 1].copy()
test_df = full_df[full_df['is_train'] == 0].copy()

# Fix Targets for Training
# If the history ends, we assume 0 purchase. This prevents dropping rows.
train_df['target_qty_1w'] = train_df['target_qty_1w'].fillna(0)
train_df['target_qty_2w'] = train_df['target_qty_2w'].fillna(0)

# Generate Binary Targets
train_df['target_buy_1w'] = (train_df['target_qty_1w'] > 0).astype(int)
train_df['target_buy_2w'] = (train_df['target_qty_2w'] > 0).astype(int)

print(f"✅ Training Rows: {len(train_df)} (Should be > 700,000)")

# --- 6. Modeling ---
print("Training Models...")
features = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4',
            'month', 'week_of_year'] + cat_cols

# Define LightGBM Params (Tweedie is Critical)
clf_params = {'n_estimators': 1500, 'learning_rate': 0.03, 'random_state': 42, 'verbose': -1}
reg_params = {'objective': 'tweedie', 'tweedie_variance_power': 1.5,
              'n_estimators': 1500, 'learning_rate': 0.03, 'random_state': 42, 'verbose': -1}

models = {}

# Week 1
print("  Week 1 Classifier...")
clf1 = lgb.LGBMClassifier(**clf_params)
clf1.fit(train_df[features], train_df['target_buy_1w'])

# Filter for non-zero quantities for regression tasks
train_df_qty1 = train_df[train_df['target_qty_1w'] > 0].copy()

print("  Week 1 Regressor...")
reg1 = lgb.LGBMRegressor(**reg_params)
# Only train on rows where target_qty_1w is greater than 0
if not train_df_qty1.empty:
    reg1.fit(train_df_qty1[features], train_df_qty1['target_qty_1w'])
else:
    print("Warning: train_df_qty1 (positive quantities for Regressor 1) is empty. Regressor 1 will not be trained.")

# Week 2
print("  Week 2 Classifier...")
clf2 = lgb.LGBMClassifier(**clf_params)
clf2.fit(train_df[features], train_df['target_buy_2w'])

# Filter for non-zero quantities for regression tasks (for 2w targets)
train_df_qty2 = train_df[train_df['target_qty_2w'] > 0].copy()

print("  Week 2 Regressor...")
reg2 = lgb.LGBMRegressor(**reg_params)
# Only train on rows where target_qty_2w is greater than 0
if not train_df_qty2.empty:
    reg2.fit(train_df_qty2[features], train_df_qty2['target_qty_2w'])
else:
    print("Warning: train_df_qty2 (positive quantities for Regressor 2) is empty. Regressor 2 will not be trained.")

# --- 7. Submission ---
print("Generating Submission...")
submission = test[['ID']].copy()

# Raw Predictions
p1 = clf1.predict_proba(test_df[features])[:, 1]
p2 = clf2.predict_proba(test_df[features])[:, 1]

# Predict Quantities
q1 = reg1.predict(test_df[features]) if not train_df_qty1.empty else np.zeros(len(test_df))
q2 = reg2.predict(test_df[features]) if not train_df_qty2.empty else np.zeros(len(test_df))

# Save Raw Probabilities (For AUC)
submission['Target_purchase_next_1w'] = p1
submission['Target_purchase_next_2w'] = p2

# Save Optimized Quantities (For MAE) -> Qty * Prob
submission['Target_qty_next_1w'] = np.maximum(0, q1 * p1) # FIXED: Using np.maximum for robustness
submission['Target_qty_next_2w'] = np.maximum(0, q2 * p2) # FIXED: Using np.maximum for robustness

submission.to_csv('submission_safe_v4.csv', index=False)
print("Done! Upload 'submission_safe_v4.csv'")

Loading data...
Generating Features...
  Calculating Rolling Stats...
  Calculating Global Trends...
Generating Targets...
Merging Metadata...
Splitting Data...
✅ Training Rows: 40262 (Should be > 700,000)
Training Models...
  Week 1 Classifier...
  Week 1 Regressor...
  Week 2 Classifier...
  Week 2 Regressor...
Generating Submission...
Done! Upload 'submission_safe_v4.csv'


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# --- 1. Load Data (FRESH & SAFE) ---
print("Loading data...")
# Read fresh to ensure no previous state corruption
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')

# Verify initial shape
print(f"Original Test Rows: {len(test)}")

# Convert Dates
train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- 2. Feature Engineering ---
print("Generating Features...")

# Create Flag
train['is_train'] = 1
test['is_train'] = 0

# Combine (No downcasting, keep IDs safe)
cols = ['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week', 'is_train']
full_df = pd.concat([
    train[cols],
    test[['customer_id', 'product_unit_variant_id', 'week_start', 'is_train']].assign(qty_this_week=np.nan)
], ignore_index=True)

# Sort strictly for Lag generation
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

# --- Lags & Trends ---
print("  Calculating Rolling Stats...")
grp_obj = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']

full_df['lag1'] = grp_obj.shift(1)
full_df['lag2'] = grp_obj.shift(2)
# transform keeps the index alignment safe
full_df['roll_mean_4'] = grp_obj.transform(lambda x: x.shift(1).rolling(4).mean())

print("  Calculating Global Trends...")
# Global Trend logic
temp_df = full_df.copy()
temp_df['qty_this_week'] = temp_df['qty_this_week'].fillna(0) # Fill NaN for global calc

global_trend = temp_df.groupby(['product_unit_variant_id', 'week_start'])['qty_this_week'].mean().reset_index().rename(columns={'qty_this_week': 'daily_global_vol'})
full_df = full_df.merge(global_trend, on=['product_unit_variant_id', 'week_start'], how='left')

grp_global = full_df.groupby('product_unit_variant_id')['daily_global_vol']
full_df['global_lag1'] = grp_global.shift(1)
full_df['global_roll_4'] = grp_global.transform(lambda x: x.shift(1).rolling(4).mean())

# Seasonality
full_df['month'] = full_df['week_start'].dt.month.fillna(0).astype(int)
full_df['week_of_year'] = full_df['week_start'].dt.isocalendar().week.fillna(0).astype(int)

# Cleanup
del temp_df, grp_obj, grp_global, global_trend
gc.collect()

# --- 3. Target Generation ---
print("Generating Targets...")
# Ensure sort is still perfect
full_df = full_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])
grp = full_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']

full_df['target_qty_1w'] = grp.shift(-1)
full_df['target_qty_2w'] = grp.shift(-2)

# --- 4. Merge Metadata ---
print("Merging Metadata...")
full_df = full_df.merge(customer, on='customer_id', how='left')
full_df = full_df.merge(sku, on='product_unit_variant_id', how='left')

# Encode Categoricals
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    full_df[col] = full_df[col].astype(str).fillna('UNKNOWN')
    full_df[col] = le.fit_transform(full_df[col])

# Fill NaNs in Features with 0
num_cols = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4']
full_df[num_cols] = full_df[num_cols].fillna(0)

# --- 5. Split & Prepare ---
print("Splitting Data...")
train_df = full_df[full_df['is_train'] == 1].copy()
test_df = full_df[full_df['is_train'] == 0].copy()

# Ensure Test is aligned
test_df = test_df.sort_values(['ID'] if 'ID' in test_df.columns else ['customer_id', 'product_unit_variant_id'])

# Fix Targets for Training
train_df['target_qty_1w'] = train_df['target_qty_1w'].fillna(0)
train_df['target_qty_2w'] = train_df['target_qty_2w'].fillna(0)

# Generate Binary Targets
train_df['target_buy_1w'] = (train_df['target_qty_1w'] > 0).astype(int)
train_df['target_buy_2w'] = (train_df['target_qty_2w'] > 0).astype(int)

print(f"✅ Training Rows: {len(train_df)}")
print(f"✅ Test Rows: {len(test_df)} (Should match Original: {len(test)})")

# --- 6. Modeling ---
print("Training Models...")
features = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4',
            'month', 'week_of_year'] + cat_cols

# Params
clf_params = {'n_estimators': 1500, 'learning_rate': 0.03, 'random_state': 42, 'verbose': -1}
# Tweedie is designed for data with many zeros. Do NOT filter zeros out.
reg_params = {'objective': 'tweedie', 'tweedie_variance_power': 1.5,
              'n_estimators': 1500, 'learning_rate': 0.03, 'random_state': 42, 'verbose': -1}

models = {}

# Week 1
print("  Week 1 Classifier...")
clf1 = lgb.LGBMClassifier(**clf_params)
clf1.fit(train_df[features], train_df['target_buy_1w'])

print("  Week 1 Regressor...")
reg1 = lgb.LGBMRegressor(**reg_params)
reg1.fit(train_df[features], train_df['target_qty_1w'])

# Week 2
print("  Week 2 Classifier...")
clf2 = lgb.LGBMClassifier(**clf_params)
clf2.fit(train_df[features], train_df['target_buy_2w'])

print("  Week 2 Regressor...")
reg2 = lgb.LGBMRegressor(**reg_params)
reg2.fit(train_df[features], train_df['target_qty_2w'])

# --- 7. Submission ---
print("Generating Submission...")

# Ensure we use the exact Test IDs from the source file to prevent "Missing Entries"
submission = pd.read_csv('Test.csv')[['ID']]

# We must align predictions to this submission ID order
# We merge the predictions onto the submission DataFrame to guarantee alignment
preds_df = test_df[['customer_id', 'product_unit_variant_id', 'week_start']].copy()

# Raw Predictions
preds_df['p1'] = clf1.predict_proba(test_df[features])[:, 1]
preds_df['q1'] = reg1.predict(test_df[features])
preds_df['p2'] = clf2.predict_proba(test_df[features])[:, 1]
preds_df['q2'] = reg2.predict(test_df[features])

# Re-construct ID for merging to be 100% safe
# Format: customer_id_product_id_date (Assuming standard format, but safer to merge on columns)
# Actually, Test.csv has the ID. Let's merge on keys.
preds_df['ID'] = test['ID'] # Since test_df comes from test and we haven't dropped rows, this aligns.

# Merge predictions onto the master submission template
submission = submission.merge(preds_df[['ID', 'p1', 'q1', 'p2', 'q2']], on='ID', how='left')

# Assign to Target Columns
submission['Target_purchase_next_1w'] = submission['p1']
submission['Target_purchase_next_2w'] = submission['p2']

# Optimization: Qty * Prob
submission['Target_qty_next_1w'] = (submission['q1'] * submission['p1']).clip(lower=0)
submission['Target_qty_next_2w'] = (submission['q2'] * submission['p2']).clip(lower=0)

# Drop temp columns
submission = submission[['ID', 'Target_purchase_next_1w', 'Target_qty_next_1w', 'Target_purchase_next_2w', 'Target_qty_next_2w']]

# Final Check
print(f"Submission Rows: {len(submission)}")
if submission.isnull().sum().sum() > 0:
    print("⚠️ WARNING: NaNs found in submission. Filling with 0.")
    submission = submission.fillna(0)

submission.to_csv('submission_final_clean.csv', index=False)
print("Done! Upload 'submission_final_clean.csv'")

Loading data...
Original Test Rows: 275796
Generating Features...
  Calculating Rolling Stats...
  Calculating Global Trends...
Generating Targets...
Merging Metadata...
Splitting Data...
✅ Training Rows: 1690870
✅ Test Rows: 275796 (Should match Original: 275796)
Training Models...
  Week 1 Classifier...
  Week 1 Regressor...
  Week 2 Classifier...
  Week 2 Regressor...
Generating Submission...
Submission Rows: 275796
Done! Upload 'submission_final_clean.csv'


In [None]:
# --- SUBMISSION FIX: Align IDs 100% Correctly ---
print("FIXING SUBMISSION ALIGNMENT...")

# 1. Reconstruct the ID in test_df from the data columns
# Format appears to be: CustomerID_ProductUnitID_YYYYMMDD
# We use the columns that are definitely aligned with the predictions
test_df['reconstructed_ID'] = (
    test_df['customer_id'].astype(str) + '_' +
    test_df['product_unit_variant_id'].astype(str) + '_' +
    test_df['week_start'].dt.strftime('%Y%m%d')
)

# 2. Assign the predictions to test_df (which is sorted/shuffled)
test_df['p1_pred'] = clf1.predict_proba(test_df[features])[:, 1]
test_df['q1_pred'] = reg1.predict(test_df[features])
test_df['p2_pred'] = clf2.predict_proba(test_df[features])[:, 1]
test_df['q2_pred'] = reg2.predict(test_df[features])

# 3. Load the Template to ensure we have the exact Target IDs order
template = pd.read_csv('Test.csv')
target_ids = template[['ID']].copy()

# 4. Merge predictions onto the Template using the Reconstructed ID
# This ignores row order and matches strictly by the Unique ID string
final_sub = target_ids.merge(
    test_df[['reconstructed_ID', 'p1_pred', 'q1_pred', 'p2_pred', 'q2_pred']],
    left_on='ID',
    right_on='reconstructed_ID',
    how='left'
)

# 5. Optimization (Expected Value)
final_sub['Target_purchase_next_1w'] = final_sub['p1_pred']
final_sub['Target_qty_next_1w'] = (final_sub['q1_pred'] * final_sub['p1_pred']).clip(lower=0)
final_sub['Target_purchase_next_2w'] = final_sub['p2_pred']
final_sub['Target_qty_next_2w'] = (final_sub['q2_pred'] * final_sub['p2_pred']).clip(lower=0)

# 6. Final Cleanup
submission_file = final_sub[['ID', 'Target_purchase_next_1w', 'Target_qty_next_1w', 'Target_purchase_next_2w', 'Target_qty_next_2w']]

# Fill any missing matches with 0 (Safe fallback)
submission_file = submission_file.fillna(0)

print(f"Final Submission Shape: {submission_file.shape}")
submission_file.to_csv('submission_aligned_final.csv', index=False)
print("✅ Done! Upload 'submission_aligned_final.csv'")

FIXING SUBMISSION ALIGNMENT...
Final Submission Shape: (275796, 5)
✅ Done! Upload 'submission_aligned_final.csv'


In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# --- 1. Load Data (SAFE MODE) ---
print("Loading data...")
# We use standard loading. No downcasting. No ID corruption.
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')

# Safe Date Conversion
train['week_start'] = pd.to_datetime(train['week_start'])
test['week_start'] = pd.to_datetime(test['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

# --- 2. Feature Engineering (The Hybrid Approach) ---
print("Generating Features...")

# We create a temporary 'Universal' dataframe just to calculate the smart features.
# We will NOT use this for the final submission structure.
cols_needed = ['customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week']
temp_df = pd.concat([
    train[cols_needed],
    test[['customer_id', 'product_unit_variant_id', 'week_start']].assign(qty_this_week=0.0)
], ignore_index=True)

# Sort strictly for Lag calculation
temp_df = temp_df.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])

# --- CALCULATIONS (The 0.97 AUC Logic) ---

print("  Calculating Rolling Stats...")
grp_obj = temp_df.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']
temp_df['lag1'] = grp_obj.shift(1)
temp_df['lag2'] = grp_obj.shift(2)
# Use .transform to avoid index errors
temp_df['roll_mean_4'] = grp_obj.transform(lambda x: x.shift(1).rolling(4).mean())

print("  Calculating Global Trends...")
# 1. Calculate Global Volume per week
global_calc = temp_df.copy()
global_trend = global_calc.groupby(['product_unit_variant_id', 'week_start'])['qty_this_week'].mean().reset_index().rename(columns={'qty_this_week': 'daily_global_vol'})

# 2. Merge back to temp_df
temp_df = temp_df.merge(global_trend, on=['product_unit_variant_id', 'week_start'], how='left')

# 3. Calculate Lagged Global Trend (Leak-Free)
grp_global = temp_df.groupby('product_unit_variant_id')['daily_global_vol']
temp_df['global_lag1'] = grp_global.shift(1)
temp_df['global_roll_4'] = grp_global.transform(lambda x: x.shift(1).rolling(4).mean())

# Seasonality
temp_df['month'] = temp_df['week_start'].dt.month.fillna(0).astype(int)
temp_df['week_of_year'] = temp_df['week_start'].dt.isocalendar().week.fillna(0).astype(int)

# --- 3. Merge Features Back to SAFE Dataframes ---
# This is the critical step. We take the smart numbers and put them into the original, correct row order.
print("Merging Features back to Train/Test...")
feature_cols = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4', 'month', 'week_of_year']

train = train.merge(temp_df[['customer_id', 'product_unit_variant_id', 'week_start'] + feature_cols],
                    on=['customer_id', 'product_unit_variant_id', 'week_start'], how='left')

test = test.merge(temp_df[['customer_id', 'product_unit_variant_id', 'week_start'] + feature_cols],
                  on=['customer_id', 'product_unit_variant_id', 'week_start'], how='left')

# Cleanup RAM
del temp_df, grp_obj, global_trend, grp_global, global_calc
gc.collect()

# --- 4. Merge Metadata & Encode ---
print("Merging Metadata...")
train = train.merge(customer, on='customer_id', how='left')
train = train.merge(sku, on='product_unit_variant_id', how='left')
test = test.merge(customer, on='customer_id', how='left')
test = test.merge(sku, on='product_unit_variant_id', how='left')

# Print columns to debug
print("Train columns after metadata merges:", train.columns.tolist())
print("Test columns after metadata merges:", test.columns.tolist())

# Encode
# Dynamically determine categorical column names, accounting for potential suffixes from merges
cat_cols = []
original_cat_names = ['customer_category', 'customer_status', 'grade_name', 'unit_name']

for name in original_cat_names:
    if f'{name}_x' in train.columns: # Check for _x suffix first
        cat_cols.append(f'{name}_x')
    elif name in train.columns: # Fallback to original name if no suffix
        cat_cols.append(name)
    else:
        print(f"Warning: Categorical column '{name}' (or '{name}_x') not found in train. Skipping.")

print("Categorical columns for encoding:", cat_cols)

le = LabelEncoder()
for col in cat_cols:
    train[col] = train[col].astype(str).fillna('UNKNOWN')
    test[col] = test[col].astype(str).fillna('UNKNOWN')

    combined = pd.concat([train[col], test[col]])
    le.fit(combined)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Fill NaNs for numerical features that might have been created or merged
# Ensure these are only numerical and exist
num_cols = ['lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4']
for col in num_cols:
    if col in train.columns:
        train[col] = train[col].fillna(0)
    if col in test.columns:
        test[col] = test[col].fillna(0)

# --- 5. Target Generation (Safe Mode) ---
print("Generating Targets...")
# We generate targets on Train ONLY, using the sort order
train = train.sort_values(['customer_id', 'product_unit_variant_id', 'week_start'])
grp_train = train.groupby(['customer_id', 'product_unit_variant_id'])['qty_this_week']

train['target_qty_1w'] = grp_train.shift(-1).fillna(0)
train['target_qty_2w'] = grp_train.shift(-2).fillna(0)
train['target_buy_1w'] = (train['target_qty_1w'] > 0).astype(int)
train['target_buy_2w'] = (train['target_qty_2w'] > 0).astype(int)

# --- 6. Modeling ---
print("Training Models...")
features = feature_cols + cat_cols

# Define LightGBM Params (Tweedie + Optimization)
clf_params = {'n_estimators': 2000, 'learning_rate': 0.02, 'random_state': 42, 'verbose': -1}
reg_params = {'objective': 'tweedie', 'tweedie_variance_power': 1.5, 'n_estimators': 2000, 'learning_rate': 0.02, 'random_state': 42, 'verbose': -1}

models = {}

# Week 1
print("  Week 1 Models...")
clf1 = lgb.LGBMClassifier(**clf_params)
clf1.fit(train[features], train['target_buy_1w'])

reg1 = lgb.LGBMRegressor(**reg_params)
reg1.fit(train[features], train['target_qty_1w'])

# Week 2
print("  Week 2 Models...")
clf2 = lgb.LGBMClassifier(**clf_params)
clf2.fit(train[features], train['target_buy_2w'])

reg2 = lgb.LGBMRegressor(**reg_params)
reg2.fit(train[features], train['target_qty_2w'])

# --- 7. Submission (Guaranteed Alignment) ---
print("Generating Submission...")
# We predict directly on 'test' dataframe which is still in the original order from Test.csv
submission = test[['ID']].copy()

p1 = clf1.predict_proba(test[features])[:, 1]
q1 = reg1.predict(test[features])
p2 = clf2.predict_proba(test[features])[:, 1]
q2 = reg2.predict(test[features])

submission['Target_purchase_next_1w'] = p1
submission['Target_purchase_next_2w'] = p2

# EXPECTED VALUE OPTIMIZATION (Qty * Prob)
submission['Target_qty_next_1w'] = (q1 * p1).clip(min=0)
submission['Target_qty_next_2w'] = (q2 * p2).clip(min=0)

submission.to_csv('submission_logic_fixed.csv', index=False)
print("Done! Upload 'submission_logic_fixed.csv'")

Loading data...
Generating Features...
  Calculating Rolling Stats...
  Calculating Global Trends...
Merging Features back to Train/Test...
Merging Metadata...
Train columns after metadata merges: ['ID', 'customer_id', 'product_unit_variant_id', 'week_start', 'qty_this_week', 'num_orders_week', 'spend_this_week', 'purchased_this_week', 'product_id', 'grade_name_x', 'unit_name_x', 'product_grade_variant_id', 'selling_price', 'customer_category_x', 'customer_status_x', 'customer_created_at_x', 'Target_qty_next_1w', 'Target_purchase_next_1w', 'Target_qty_next_2w', 'Target_purchase_next_2w', 'lag1', 'lag2', 'roll_mean_4', 'global_lag1', 'global_roll_4', 'month', 'week_of_year', 'customer_category_y', 'customer_status_y', 'customer_created_at_y', 'product_name', 'product_grade_variant_sku', 'unit_name_y', 'grade_name_y', 'grade_active_status']
Test columns after metadata merges: ['ID', 'customer_id', 'product_unit_variant_id', 'week_start', 'product_id', 'grade_name_x', 'unit_name_x', 'prod

In [5]:
# ============================================================
# Farm To Feed Competition - Grandmaster Final Solution
# Author: Kaggle GM Time-Series Retail Pipeline
# ============================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder

# ============================================================
# 1. LOAD DATA (SAFE MODE - NO DOWNCASTING)
# ============================================================

print("Loading data...")

train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
customer = pd.read_csv("customer_data.csv")
sku = pd.read_csv("sku_data.csv")

train["week_start"] = pd.to_datetime(train["week_start"])
test["week_start"] = pd.to_datetime(test["week_start"])
customer["customer_created_at"] = pd.to_datetime(customer["customer_created_at"])

# ============================================================
# 2. UNIVERSAL DATAFRAME FOR FEATURE ENGINEERING
# ============================================================

print("Building universal dataframe...")

base_cols = ["customer_id", "product_unit_variant_id", "week_start", "qty_this_week"]

temp_df = pd.concat(
    [
        train[base_cols],
        test[["customer_id", "product_unit_variant_id", "week_start"]]
        .assign(qty_this_week=0.0),
    ],
    ignore_index=True,
)

temp_df = temp_df.sort_values(
    ["customer_id", "product_unit_variant_id", "week_start"]
).reset_index(drop=True)

# ============================================================
# 3. CUSTOMER-PRODUCT LAGS & ROLLING STATS
# ============================================================

print("Generating customer-product features...")

pair_grp = temp_df.groupby(
    ["customer_id", "product_unit_variant_id"]
)["qty_this_week"]

temp_df["lag1"] = pair_grp.shift(1)
temp_df["lag2"] = pair_grp.shift(2)

temp_df["roll_mean_4"] = pair_grp.transform(
    lambda x: x.shift(1).rolling(4).mean()
)

# Cold-start indicator
temp_df["is_new_pair"] = pair_grp.shift(1).isna().astype(int)

# ============================================================
# 4. PRODUCT-CUSTOMER RECENCY (WEEKS SINCE LAST PURCHASE)
# ============================================================

print("Calculating recency...")

def recency_weeks(x):
    last = x.where(x > 0).shift(1)
    return last.groupby(level=0).cumcount()

temp_df["pair_recency"] = pair_grp.transform(
    lambda x: x.shift(1).notna().cumsum()
)

# ============================================================
# 5. CUSTOMER MOMENTUM (GLOBAL CUSTOMER BEHAVIOR)
# ============================================================

print("Generating customer momentum features...")

cust_grp = temp_df.groupby("customer_id")["qty_this_week"]

temp_df["cust_lag1"] = cust_grp.shift(1)
temp_df["cust_roll_4"] = cust_grp.transform(
    lambda x: x.shift(1).rolling(4).mean()
)

# ============================================================
# 6. GLOBAL PRODUCT TRENDS (CRITICAL FIX: USE SUM)
# ============================================================

print("Generating global product trends...")

global_weekly = (
    temp_df.groupby(
        ["product_unit_variant_id", "week_start"]
    )["qty_this_week"]
    .sum()
    .reset_index()
    .rename(columns={"qty_this_week": "global_weekly_vol"})
)

temp_df = temp_df.merge(
    global_weekly,
    on=["product_unit_variant_id", "week_start"],
    how="left",
)

prod_grp = temp_df.groupby("product_unit_variant_id")["global_weekly_vol"]

temp_df["global_lag1"] = prod_grp.shift(1)
temp_df["global_roll_4"] = prod_grp.transform(
    lambda x: x.shift(1).rolling(4).mean()
)

# ============================================================
# 7. SEASONALITY
# ============================================================

temp_df["month"] = temp_df["week_start"].dt.month.astype(int)
temp_df["week_of_year"] = (
    temp_df["week_start"].dt.isocalendar().week.astype(int)
)

# ============================================================
# 8. MERGE FEATURES BACK (ID-SAFE)
# ============================================================

print("Merging engineered features...")

feature_cols = [
    "lag1", "lag2", "roll_mean_4",
    "cust_lag1", "cust_roll_4",
    "global_lag1", "global_roll_4",
    "is_new_pair", "pair_recency",
    "month", "week_of_year",
]

merge_cols = ["customer_id", "product_unit_variant_id", "week_start"]

train = train.merge(
    temp_df[merge_cols + feature_cols],
    on=merge_cols,
    how="left",
)

test = test.merge(
    temp_df[merge_cols + feature_cols],
    on=merge_cols,
    how="left",
)

del temp_df, pair_grp, cust_grp, prod_grp, global_weekly
gc.collect()

# ============================================================
# 9. MERGE METADATA & ENCODE CATEGORICALS
# ============================================================

print("Merging metadata...")

train = train.merge(customer, on="customer_id", how="left")
train = train.merge(sku, on="product_unit_variant_id", how="left")

test = test.merge(customer, on="customer_id", how="left")
test = test.merge(sku, on="product_unit_variant_id", how="left")

cat_candidates = [
    "customer_category",
    "customer_status",
    "grade_name",
    "unit_name",
]

cat_cols = []
for c in cat_candidates:
    if f"{c}_x" in train.columns:
        cat_cols.append(f"{c}_x")
    elif c in train.columns:
        cat_cols.append(c)

le = LabelEncoder()
for col in cat_cols:
    train[col] = train[col].astype(str).fillna("UNKNOWN")
    test[col] = test[col].astype(str).fillna("UNKNOWN")
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Fill numerical NaNs
for col in feature_cols:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)

# ============================================================
# 10. TARGET GENERATION (NO ROW DROPPING)
# ============================================================

print("Generating targets...")

train = train.sort_values(
    ["customer_id", "product_unit_variant_id", "week_start"]
)

grp = train.groupby(
    ["customer_id", "product_unit_variant_id"]
)["qty_this_week"]

train["target_qty_1w"] = grp.shift(-1).fillna(0)
train["target_qty_2w"] = grp.shift(-2).fillna(0)

train["target_buy_1w"] = (train["target_qty_1w"] > 0).astype(int)
train["target_buy_2w"] = (train["target_qty_2w"] > 0).astype(int)

# ============================================================
# 11. MODEL TRAINING (4 MODELS - TUNED)
# ============================================================

print("Training models...")

features = feature_cols + cat_cols

clf_params = dict(
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    random_state=42,
    verbose=-1,
)

reg1_params = dict(
    objective="tweedie",
    tweedie_variance_power=1.3,
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=64,
    feature_fraction=0.8,
    random_state=42,
    verbose=-1,
)

reg2_params = dict(
    objective="tweedie",
    tweedie_variance_power=1.6,
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=64,
    feature_fraction=0.8,
    random_state=42,
    verbose=-1,
)

clf1 = lgb.LGBMClassifier(**clf_params)
clf2 = lgb.LGBMClassifier(**clf_params)

reg1 = lgb.LGBMRegressor(**reg1_params)
reg2 = lgb.LGBMRegressor(**reg2_params)

clf1.fit(train[features], train["target_buy_1w"])
clf2.fit(train[features], train["target_buy_2w"])

reg1.fit(train[features], train["target_qty_1w"])
reg2.fit(train[features], train["target_qty_2w"])

# ============================================================
# 12. SUBMISSION (EXPECTED VALUE OPTIMIZATION)
# ============================================================

print("Generating submission...")

submission = test[["ID"]].copy()

p1 = clf1.predict_proba(test[features])[:, 1]
p2 = clf2.predict_proba(test[features])[:, 1]

q1 = reg1.predict(test[features])
q2 = reg2.predict(test[features])

submission["Target_purchase_next_1w"] = p1
submission["Target_purchase_next_2w"] = p2

submission["Target_qty_next_1w"] = (p1 * q1).clip(min=0)
submission["Target_qty_next_2w"] = (p2 * q2).clip(min=0)

submission.to_csv("submission_gm_final.csv", index=False)

print("Done. File saved as submission_gm_final.csv")

Loading data...
Building universal dataframe...
Generating customer-product features...
Calculating recency...
Generating customer momentum features...
Generating global product trends...
Merging engineered features...
Merging metadata...
Generating targets...
Training models...
Generating submission...
Done. File saved as submission_gm_final.csv
