<a href="https://colab.research.google.com/github/dimna21/ML_Final_Project/blob/main/model_experiment_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install darts

# Preprocessing 1

In [None]:
# Load your data
import pandas as pd

features = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/features.csv')
stores = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/stores.csv')
train = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/train.csv')
test = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/test.csv')

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseMerger(BaseEstimator, TransformerMixin):
    def __init__(self, features, stores):
        self.feature_store = features.merge(stores, how='inner', on='Store')
        self.feature_store['Date'] = pd.to_datetime(self.feature_store['Date'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])
        merged = X.merge(self.feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
        merged = merged.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)
        return merged

In [None]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.superbowl = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
        self.labor_day = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
        self.thanksgiving = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
        self.christmas = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Convert temperature to Celsius
        if 'Temperature' in X.columns:
            X['Temperature'] = (X['Temperature'] - 32) * (5.0 / 9.0)

        # Basic date parts
        X['Day'] = X['Date'].dt.day
        X['Month'] = X['Date'].dt.month
        X['Year'] = X['Date'].dt.year

        # Extract ISO week and year for holiday matching
        X['Week'] = X['Date'].dt.isocalendar().week
        X['YearNum'] = X['Date'].dt.year

        # Helper to flag if a date is in same ISO week/year as a known holiday
        def is_holiday_week(date_series, holidays):
            holiday_weeks = set((d.isocalendar().week, d.year) for d in holidays)
            return date_series.apply(lambda d: (d.isocalendar().week, d.year) in holiday_weeks if pd.notnull(d) else False).astype(int)

        X['SuperbowlWeek'] = is_holiday_week(X['Date'], self.superbowl)
        X['LaborDayWeek'] = is_holiday_week(X['Date'], self.labor_day)
        X['ThanksgivingWeek'] = is_holiday_week(X['Date'], self.thanksgiving)
        X['ChristmasWeek'] = is_holiday_week(X['Date'], self.christmas)

        # Calculate days to Thanksgiving and Christmas (using Nov 24 and Dec 24 as anchor dates)
        thanksgiving_dates = pd.to_datetime(X['Year'].astype(str) + "-11-24")
        christmas_dates = pd.to_datetime(X['Year'].astype(str) + "-12-24")

        X['Days_to_Thanksgiving'] = (thanksgiving_dates - X['Date']).dt.days
        X['Days_to_Christmas'] = (christmas_dates - X['Date']).dt.days

        # Clean up helper cols
        X = X.drop(columns=['Week', 'YearNum'])

        return X

In [None]:
class MissingValueFiller(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        self.mean_cols = ['CPI', 'Unemployment']
        self.mean_values = {}

    def fit(self, X, y=None):
        for col in self.mean_cols:
            if col in X.columns:
                self.mean_values[col] = X[col].mean()
        return self

    def transform(self, X):
        X = X.copy()

        # Fill markdowns with 0
        for col in self.markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0.0)

        # Fill CPI and Unemployment with learned mean
        for col in self.mean_cols:
            if col in X.columns and col in self.mean_values:
                X[col] = X[col].fillna(self.mean_values[col])

        return X

In [None]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.type_mapping = {'A': 3, 'B': 2, 'C': 1}
        self.holiday_mapping = {False: 0, True: 1}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if 'Type' in X.columns:
            X['Type'] = X['Type'].map(self.type_mapping)

        if 'IsHoliday' in X.columns:
            X['IsHoliday'] = X['IsHoliday'].map(self.holiday_mapping)

        return X

# Preprocessing 2 - Deal with date features

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class LagFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 lags=[1, 2, 3, 4],
                 rolling_windows=[4, 8],
                 drop_na=True):
        self.lags = lags
        self.rolling_windows = rolling_windows
        self.drop_na = drop_na
        self.history_ = None
        self.lag_values_ = {}
        self.rolling_values_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])

        # Sort by Store, Dept, Date
        df = df.sort_values(['Store', 'Dept', 'Date'])

        # Store the last few values for each Store-Dept combination
        # This will be used to compute lags for test data
        max_lag = max(self.lags)
        max_window = max(self.rolling_windows) if self.rolling_windows else 0
        history_length = max(max_lag, max_window)

        self.history_ = (
            df[['Store', 'Dept', 'Date', 'Weekly_Sales']]
            .groupby(['Store', 'Dept'], as_index=False)
            .tail(history_length)
        )

        # Pre-compute lag and rolling features for the last rows
        # This will help with test data transformation
        self.lag_values_ = {}
        self.rolling_values_ = {}

        for (store, dept), group in df.groupby(['Store', 'Dept']):
            group = group.sort_values('Date')

            # Store last lag values
            self.lag_values_[(store, dept)] = {}
            for lag in self.lags:
                if len(group) >= lag:
                    self.lag_values_[(store, dept)][lag] = group['Weekly_Sales'].iloc[-lag]
                else:
                    self.lag_values_[(store, dept)][lag] = np.nan

            # Store last rolling values
            self.rolling_values_[(store, dept)] = {}
            for window in self.rolling_windows:
                if len(group) >= window:
                    self.rolling_values_[(store, dept)][window] = group['Weekly_Sales'].iloc[-window:].mean()
                else:
                    self.rolling_values_[(store, dept)][window] = np.nan

        return self

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        has_sales = 'Weekly_Sales' in df.columns

        # Create DateOrdinal
        df['DateOrdinal'] = df['Date'].map(pd.Timestamp.toordinal)
        df = df.sort_values(['Store', 'Dept', 'Date'])

        if has_sales:
            # Training data - compute lags normally
            for lag in self.lags:
                df[f'lag_{lag}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(lag)

            # Compute rolling means
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = (
                    df.groupby(['Store', 'Dept'])['Weekly_Sales']
                    .transform(lambda s: s.rolling(window).mean())
                )
        else:
            # Test data - use pre-computed values from training
            # Initialize lag columns
            for lag in self.lags:
                df[f'lag_{lag}'] = np.nan
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = np.nan

            # Fill with pre-computed values
            for idx, row in df.iterrows():
                store_dept = (row['Store'], row['Dept'])

                if store_dept in self.lag_values_:
                    for lag in self.lags:
                        if lag in self.lag_values_[store_dept]:
                            df.loc[idx, f'lag_{lag}'] = self.lag_values_[store_dept][lag]

                if store_dept in self.rolling_values_:
                    for window in self.rolling_windows:
                        if window in self.rolling_values_[store_dept]:
                            df.loc[idx, f'rolling_mean_{window}'] = self.rolling_values_[store_dept][window]

        # Drop helper columns
        drop_cols = [c for c in ['Day', 'Year', 'Date'] if c in df.columns]
        df = df.drop(columns=drop_cols)

        # Handle NaN values
        if self.drop_na and has_sales:
            # Only drop NaN for training data
            required = [f'lag_{l}' for l in self.lags] + [f'rolling_mean_{w}' for w in self.rolling_windows]
            df = df.dropna(subset=required).reset_index(drop=True)
        elif not has_sales:
            # For test data, fill remaining NaN values with appropriate defaults
            # You might want to adjust these defaults based on your domain knowledge
            for lag in self.lags:
                df[f'lag_{lag}'] = df[f'lag_{lag}'].fillna(0)  # or use median/mean from training
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = df[f'rolling_mean_{window}'].fillna(0)  # or use median/mean from training

        return df

# Preprocessing 3 - Correlation filter

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

class SmartCorrelationDropper(BaseEstimator, TransformerMixin):
    """
    More conservative correlation filter that protects important feature groups
    """
    def __init__(self, threshold=0.95, protect_groups=None, verbose=False):
        self.threshold = threshold  # Higher threshold
        self.protect_groups = protect_groups or [
            ['lag_1', 'lag_2', 'lag_3', 'lag_4'],  # Protect lag features
            ['rolling_mean_4', 'rolling_mean_8'],   # Protect rolling features
            ['SuperbowlWeek', 'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek'],  # Protect holiday features
            ['Days_to_Thanksgiving', 'Days_to_Christmas'],  # Protect seasonal features
        ]
        self.verbose = verbose
        self.features_to_drop_ = []

    def fit(self, X, y=None):
        if self.verbose:
            print("----- Smart Correlation Filter Fitting -----")

        # Get target if it's in X
        if y is None and 'Weekly_Sales' in X.columns:
            y = X['Weekly_Sales']
            X_numeric = X.select_dtypes(include=[np.number]).drop(columns='Weekly_Sales')
        else:
            X_numeric = X.select_dtypes(include=[np.number]).copy()

        # Create protected features set
        protected_features = set()
        for group in self.protect_groups:
            for feature in group:
                if feature in X_numeric.columns:
                    protected_features.add(feature)

        # Calculate correlation matrix
        corr_matrix = X_numeric.corr().abs()
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        corr_matrix = corr_matrix.where(~mask)

        # Find highly correlated pairs
        high_corr_pairs = []
        cols = corr_matrix.columns
        for i in range(len(cols)):
            for j in range(i):
                val = corr_matrix.iloc[i, j]
                if pd.notnull(val) and val > self.threshold:
                    high_corr_pairs.append((cols[i], cols[j], val))

        # Decide which features to drop
        features_to_drop = set()
        for feat1, feat2, corr_val in high_corr_pairs:
            # Don't drop if both features are protected
            if feat1 in protected_features and feat2 in protected_features:
                continue

            # If one is protected, drop the other
            if feat1 in protected_features:
                features_to_drop.add(feat2)
            elif feat2 in protected_features:
                features_to_drop.add(feat1)
            else:
                # Neither is protected, use target correlation if available
                if y is not None:
                    corr1 = abs(X_numeric[feat1].corr(y)) if feat1 in X_numeric.columns else 0
                    corr2 = abs(X_numeric[feat2].corr(y)) if feat2 in X_numeric.columns else 0
                    to_drop = feat1 if corr1 < corr2 else feat2
                else:
                    to_drop = feat2
                features_to_drop.add(to_drop)

        self.features_to_drop_ = list(features_to_drop)

        if self.verbose:
            print(f"Highly correlated pairs (>{self.threshold}): {len(high_corr_pairs)}")
            print(f"Protected features: {protected_features}")
            print(f"Features to drop: {self.features_to_drop_}")

        return self

    def transform(self, X):
        return X.drop(columns=self.features_to_drop_, errors='ignore')


class ImportanceBasedSelector(BaseEstimator, TransformerMixin):
    """
    Feature selection based on feature importance, keeping top K features
    """
    def __init__(self, k_features=20, estimator=None, verbose=False):
        self.k_features = k_features
        self.estimator = estimator or RandomForestRegressor(
            n_estimators=100,
            max_depth=10,
            random_state=42,
            n_jobs=-1
        )
        self.verbose = verbose
        self.selected_features_ = []

    def fit(self, X, y=None):
        # Get target if it's in X
        if y is None and 'Weekly_Sales' in X.columns:
            y = X['Weekly_Sales']
            X_features = X.drop(columns=['Weekly_Sales'])
        else:
            X_features = X.copy()

        # Fit estimator and get feature importances
        self.estimator.fit(X_features, y)

        # Get feature importances
        if hasattr(self.estimator, 'feature_importances_'):
            importances = self.estimator.feature_importances_
        else:
            # Fallback for estimators without feature_importances_
            importances = np.abs(self.estimator.coef_) if hasattr(self.estimator, 'coef_') else np.ones(len(X_features.columns))

        # Create feature importance dataframe
        feature_importance = pd.DataFrame({
            'feature': X_features.columns,
            'importance': importances
        }).sort_values('importance', ascending=False)

        # Select top k features
        self.selected_features_ = feature_importance.head(self.k_features)['feature'].tolist()

        if self.verbose:
            print(f"Top {self.k_features} features by importance:")
            print(feature_importance.head(self.k_features))

        return self

    def transform(self, X):
        # Preserve target if it exists
        result = X[self.selected_features_].copy()
        if 'Weekly_Sales' in X.columns:
            result['Weekly_Sales'] = X['Weekly_Sales']
        return result


class MinimalFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Very conservative feature selection that only removes clearly redundant features
    """
    def __init__(self, variance_threshold=0.01, verbose=False):
        self.variance_threshold = variance_threshold
        self.verbose = verbose
        self.features_to_drop_ = []

    def fit(self, X, y=None):
        # Get numeric features
        if 'Weekly_Sales' in X.columns:
            X_numeric = X.select_dtypes(include=[np.number]).drop(columns='Weekly_Sales')
        else:
            X_numeric = X.select_dtypes(include=[np.number])

        # Find features with very low variance (almost constant)
        low_variance_features = []
        for col in X_numeric.columns:
            if X_numeric[col].var() < self.variance_threshold:
                low_variance_features.append(col)

        self.features_to_drop_ = low_variance_features

        if self.verbose:
            print(f"Features with low variance (< {self.variance_threshold}): {self.features_to_drop_}")

        return self

    def transform(self, X):
        return X.drop(columns=self.features_to_drop_, errors='ignore')


# Example usage in your pipeline:
def create_feature_selection_pipeline(approach='minimal'):
    """
    Create feature selection pipeline based on approach

    Args:
        approach: 'none', 'minimal', 'conservative', or 'importance'
    """
    if approach == 'none':
        return []
    elif approach == 'minimal':
        return [('feature_select', MinimalFeatureSelector(verbose=True))]
    elif approach == 'conservative':
        return [
            ('correlation_filter', SmartCorrelationDropper(threshold=0.95, verbose=True)),
            ('minimal_select', MinimalFeatureSelector(verbose=True))
        ]
    elif approach == 'importance':
        return [
            ('correlation_filter', SmartCorrelationDropper(threshold=0.95, verbose=True)),
            ('importance_select', ImportanceBasedSelector(k_features=20, verbose=True))
        ]
    else:
        raise ValueError("approach must be 'none', 'minimal', 'conservative', or 'importance'")


# Modified pipeline example:
"""
from sklearn.pipeline import Pipeline

# Option 1: No feature selection (recommended)
pipeline = Pipeline([
    ('merge', BaseMerger(features, stores)),
    ('feature_add', FeatureAdder()),
    ('fillna', MissingValueFiller()),
    ('label_encode', CategoricalEncoder()),
    ('lags', LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8], drop_na=True)),
])

# Option 2: Conservative feature selection
pipeline = Pipeline([
    ('merge', BaseMerger(features, stores)),
    ('feature_add', FeatureAdder()),
    ('fillna', MissingValueFiller()),
    ('label_encode', CategoricalEncoder()),
    ('lags', LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8], drop_na=True)),
    ('feature_select', SmartCorrelationDropper(threshold=0.95, verbose=True)),
])

# Option 3: Importance-based selection
pipeline = Pipeline([
    ('merge', BaseMerger(features, stores)),
    ('feature_add', FeatureAdder()),
    ('fillna', MissingValueFiller()),
    ('label_encode', CategoricalEncoder()),
    ('lags', LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8], drop_na=True)),
    ('importance_select', ImportanceBasedSelector(k_features=20, verbose=True)),
])
"""

"\nfrom sklearn.pipeline import Pipeline\n\n# Option 1: No feature selection (recommended)\npipeline = Pipeline([\n    ('merge', BaseMerger(features, stores)),\n    ('feature_add', FeatureAdder()),\n    ('fillna', MissingValueFiller()),\n    ('label_encode', CategoricalEncoder()),\n    ('lags', LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8], drop_na=True)),\n])\n\n# Option 2: Conservative feature selection\npipeline = Pipeline([\n    ('merge', BaseMerger(features, stores)),\n    ('feature_add', FeatureAdder()),\n    ('fillna', MissingValueFiller()),\n    ('label_encode', CategoricalEncoder()),\n    ('lags', LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8], drop_na=True)),\n    ('feature_select', SmartCorrelationDropper(threshold=0.95, verbose=True)),\n])\n\n# Option 3: Importance-based selection\npipeline = Pipeline([\n    ('merge', BaseMerger(features, stores)),\n    ('feature_add', FeatureAdder()),\n    ('fillna', MissingValueFiller()),\n    ('label_encode', C

In [None]:
y = train_df['Weekly_Sales']
X = train_df.drop(columns=['Weekly_Sales'])
X.shape, y.shape

((398796, 28), (398796,))

# Preprocessing 4 - Seasonal Feature Engineering

In [None]:
class AdvancedSeasonalFeatures(BaseEstimator, TransformerMixin):
    """
    Creates sophisticated seasonal and cyclical features
    """
    def __init__(self):
        # Define holiday periods more precisely
        self.holiday_periods = {
            'thanksgiving_period': [
                ('2010-11-19', '2010-11-26'),
                ('2011-11-18', '2011-11-25'),
                ('2012-11-16', '2012-11-23'),
                ('2013-11-22', '2013-11-29')
            ],
            'christmas_period': [
                ('2010-12-17', '2010-12-31'),
                ('2011-12-16', '2011-12-30'),
                ('2012-12-21', '2012-12-28'),
                ('2013-12-20', '2013-12-27')
            ],
            'superbowl_period': [
                ('2010-02-05', '2010-02-12'),
                ('2011-02-04', '2011-02-11'),
                ('2012-02-03', '2012-02-10'),
                ('2013-02-01', '2013-02-08')
            ],
            'labor_day_period': [
                ('2010-09-03', '2010-09-10'),
                ('2011-09-02', '2011-09-09'),
                ('2012-08-31', '2012-09-07'),
                ('2013-08-30', '2013-09-06')
            ]
        }

        # Back-to-school period (typically July-August)
        self.back_to_school_period = [
            ('2010-07-15', '2010-08-31'),
            ('2011-07-15', '2011-08-31'),
            ('2012-07-15', '2012-08-31'),
            ('2013-07-15', '2013-08-31')
        ]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])

        # Cyclical encoding of time features
        df['month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

        # Week of year for seasonal patterns
        df['week_of_year'] = df['Date'].dt.isocalendar().week
        df['week_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52)
        df['week_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52)

        # Quarter features
        df['quarter'] = df['Date'].dt.quarter
        df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
        df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

        # Advanced holiday features
        for holiday_name, periods in self.holiday_periods.items():
            df[f'{holiday_name}_flag'] = 0
            for start, end in periods:
                mask = (df['Date'] >= start) & (df['Date'] <= end)
                df.loc[mask, f'{holiday_name}_flag'] = 1

        # Back-to-school period
        df['back_to_school_flag'] = 0
        for start, end in self.back_to_school_period:
            mask = (df['Date'] >= start) & (df['Date'] <= end)
            df.loc[mask, 'back_to_school_flag'] = 1

        # Days since major holidays (continuous features)
        for year in [2010, 2011, 2012, 2013]:
            thanksgiving = pd.to_datetime(f'{year}-11-24')  # Approximate
            christmas = pd.to_datetime(f'{year}-12-25')

            year_mask = df['Date'].dt.year == year
            if year_mask.any():
                df.loc[year_mask, 'days_since_thanksgiving'] = (df.loc[year_mask, 'Date'] - thanksgiving).dt.days
                df.loc[year_mask, 'days_since_christmas'] = (df.loc[year_mask, 'Date'] - christmas).dt.days

        # Seasonal shopping intensity (pre-holiday buildup)
        df['pre_thanksgiving_intensity'] = np.where(
            (df['Days_to_Thanksgiving'] <= 14) & (df['Days_to_Thanksgiving'] > 0),
            15 - df['Days_to_Thanksgiving'], 0
        )

        df['pre_christmas_intensity'] = np.where(
            (df['Days_to_Christmas'] <= 21) & (df['Days_to_Christmas'] > 0),
            22 - df['Days_to_Christmas'], 0
        )

        # Post-holiday effect (returns, clearance)
        df['post_holiday_effect'] = np.where(
            ((df['days_since_thanksgiving'] > 0) & (df['days_since_thanksgiving'] <= 7)) |
            ((df['days_since_christmas'] > 0) & (df['days_since_christmas'] <= 14)),
            1, 0
        )

        # Clean up intermediate columns
        df = df.drop(columns=['week_of_year'], errors='ignore')

        return df

# Preprocessing 5 - add lag features

In [None]:
class ImprovedLagFeatureTransformer(BaseEstimator, TransformerMixin):
    """
    Enhanced lag features with more sophisticated patterns
    """
    def __init__(self,
                 lags=[1, 2, 3, 4, 8, 12, 52],  # Include yearly lag
                 rolling_windows=[2, 4, 8, 12, 26],  # More diverse windows
                 ewm_spans=[4, 8, 12],  # Exponential weighted moving averages
                 drop_na=True):
        self.lags = lags
        self.rolling_windows = rolling_windows
        self.ewm_spans = ewm_spans
        self.drop_na = drop_na
        self.history_ = None
        self.lag_stats_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values(['Store', 'Dept', 'Date'])

        # Store more comprehensive history
        max_lag = max(self.lags) if self.lags else 0
        max_window = max(self.rolling_windows) if self.rolling_windows else 0
        max_ewm = max(self.ewm_spans) if self.ewm_spans else 0
        history_length = max(max_lag, max_window, max_ewm, 60)  # At least 60 weeks

        self.history_ = (
            df[['Store', 'Dept', 'Date', 'Weekly_Sales']]
            .groupby(['Store', 'Dept'], as_index=False)
            .tail(history_length)
        )

        # Calculate statistics for each store-dept combination
        for (store, dept), group in df.groupby(['Store', 'Dept']):
            group = group.sort_values('Date')
            sales = group['Weekly_Sales']

            self.lag_stats_[(store, dept)] = {
                'mean': sales.mean(),
                'std': sales.std(),
                'median': sales.median(),
                'q25': sales.quantile(0.25),
                'q75': sales.quantile(0.75),
                'trend': self._calculate_trend(sales),
                'seasonality': self._calculate_seasonality(sales)
            }

        return self

    def _calculate_trend(self, series):
        """Calculate simple trend"""
        if len(series) < 4:
            return 0
        x = np.arange(len(series))
        try:
            trend = np.polyfit(x, series, 1)[0]
            return trend
        except:
            return 0

    def _calculate_seasonality(self, series):
        """Calculate seasonal strength"""
        if len(series) < 52:
            return 0
        try:
            # Simple seasonal strength measure
            return series.std() / series.mean() if series.mean() > 0 else 0
        except:
            return 0

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        has_sales = 'Weekly_Sales' in df.columns

        df = df.sort_values(['Store', 'Dept', 'Date'])

        if has_sales:
            # Training data
            for lag in self.lags:
                df[f'lag_{lag}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(lag)

            # Rolling statistics
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
                    lambda x: x.rolling(window, min_periods=1).mean()
                )
                df[f'rolling_std_{window}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
                    lambda x: x.rolling(window, min_periods=1).std()
                )
                df[f'rolling_median_{window}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
                    lambda x: x.rolling(window, min_periods=1).median()
                )

            # Exponential weighted moving averages
            for span in self.ewm_spans:
                df[f'ewm_{span}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
                    lambda x: x.ewm(span=span, min_periods=1).mean()
                )

            # Sales momentum and acceleration
            df['sales_momentum'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
                lambda x: x.diff()
            )
            df['sales_acceleration'] = df.groupby(['Store', 'Dept'])['sales_momentum'].transform(
                lambda x: x.diff()
            )

        else:
            # Test data - use historical statistics
            for lag in self.lags:
                df[f'lag_{lag}'] = np.nan

            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = np.nan
                df[f'rolling_std_{window}'] = np.nan
                df[f'rolling_median_{window}'] = np.nan

            for span in self.ewm_spans:
                df[f'ewm_{span}'] = np.nan

            df['sales_momentum'] = np.nan
            df['sales_acceleration'] = np.nan

            # Fill with historical statistics
            for idx, row in df.iterrows():
                store_dept = (row['Store'], row['Dept'])
                if store_dept in self.lag_stats_:
                    stats = self.lag_stats_[store_dept]
                    # Use mean for missing lags
                    for lag in self.lags:
                        df.loc[idx, f'lag_{lag}'] = stats['mean']

                    # Use historical statistics for rolling features
                    for window in self.rolling_windows:
                        df.loc[idx, f'rolling_mean_{window}'] = stats['mean']
                        df.loc[idx, f'rolling_std_{window}'] = stats['std']
                        df.loc[idx, f'rolling_median_{window}'] = stats['median']

                    for span in self.ewm_spans:
                        df.loc[idx, f'ewm_{span}'] = stats['mean']

                    df.loc[idx, 'sales_momentum'] = stats['trend']
                    df.loc[idx, 'sales_acceleration'] = 0

        # Clean up
        if self.drop_na and has_sales:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            df = df.dropna(subset=numeric_cols).reset_index(drop=True)

        return df


# Preprocess 6 - add specific features

In [None]:
class StoreSpecificFeatures(BaseEstimator, TransformerMixin):
    """
    Store and department specific features
    """
    def __init__(self):
        self.store_stats_ = {}
        self.dept_stats_ = {}
        self.store_dept_stats_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        has_sales = 'Weekly_Sales' in df.columns

        if has_sales:
            # Store-level statistics
            store_groups = df.groupby('Store')['Weekly_Sales']
            self.store_stats_ = {
                'mean': store_groups.mean().to_dict(),
                'std': store_groups.std().to_dict(),
                'median': store_groups.median().to_dict(),
                'volume': store_groups.count().to_dict()
            }

            # Department-level statistics
            dept_groups = df.groupby('Dept')['Weekly_Sales']
            self.dept_stats_ = {
                'mean': dept_groups.mean().to_dict(),
                'std': dept_groups.std().to_dict(),
                'median': dept_groups.median().to_dict(),
                'volume': dept_groups.count().to_dict()
            }

            # Store-Department level statistics
            store_dept_groups = df.groupby(['Store', 'Dept'])['Weekly_Sales']
            self.store_dept_stats_ = {
                'mean': store_dept_groups.mean().to_dict(),
                'std': store_dept_groups.std().to_dict(),
                'volume': store_dept_groups.count().to_dict()
            }

        return self

    def transform(self, X):
        df = X.copy()

        # Store performance indicators
        df['store_avg_sales'] = df['Store'].map(self.store_stats_.get('mean', {})).fillna(0)
        df['store_sales_volatility'] = df['Store'].map(self.store_stats_.get('std', {})).fillna(0)
        df['store_volume'] = df['Store'].map(self.store_stats_.get('volume', {})).fillna(0)

        # Department performance indicators
        df['dept_avg_sales'] = df['Dept'].map(self.dept_stats_.get('mean', {})).fillna(0)
        df['dept_sales_volatility'] = df['Dept'].map(self.dept_stats_.get('std', {})).fillna(0)
        df['dept_volume'] = df['Dept'].map(self.dept_stats_.get('volume', {})).fillna(0)

        # Store-Department specific features
        df['store_dept_key'] = list(zip(df['Store'], df['Dept']))
        df['store_dept_avg_sales'] = df['store_dept_key'].map(self.store_dept_stats_.get('mean', {})).fillna(0)
        df['store_dept_volatility'] = df['store_dept_key'].map(self.store_dept_stats_.get('std', {})).fillna(0)

        # Performance ratios
        df['store_vs_overall_ratio'] = df['store_avg_sales'] / (df['store_avg_sales'].mean() + 1e-8)
        df['dept_vs_overall_ratio'] = df['dept_avg_sales'] / (df['dept_avg_sales'].mean() + 1e-8)

        # Size-normalized features
        df['sales_per_sqft'] = df['store_avg_sales'] / (df['Size'] + 1e-8)
        df['dept_penetration'] = df['dept_volume'] / (df['store_volume'] + 1e-8)

        # Store type interactions
        df['type_size_interaction'] = df['Type'] * np.log1p(df['Size'])

        # Clean up
        df = df.drop(columns=['store_dept_key'], errors='ignore')

        return df


# Preprocess 6 - add economic features

In [None]:
class EconomicInteractionFeatures(BaseEstimator, TransformerMixin):
    """
    Economic indicators and their interactions
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Temperature-based features
        if 'Temperature' in df.columns:
            # Comfort zone indicators
            df['temp_comfort_zone'] = ((df['Temperature'] >= 15) & (df['Temperature'] <= 25)).astype(int)
            df['temp_too_hot'] = (df['Temperature'] > 30).astype(int)
            df['temp_too_cold'] = (df['Temperature'] < 5).astype(int)

            # Seasonal temperature anomalies
            df['temp_month_interaction'] = df['Temperature'] * df['Month']

            # Weather-driven shopping patterns
            df['weather_shopping_boost'] = np.where(
                (df['Temperature'] < 0) | (df['Temperature'] > 35), 1, 0
            )

        # Fuel price interactions
        if 'Fuel_Price' in df.columns:
            df['fuel_price_high'] = (df['Fuel_Price'] > df['Fuel_Price'].quantile(0.75)).astype(int)
            df['fuel_price_low'] = (df['Fuel_Price'] < df['Fuel_Price'].quantile(0.25)).astype(int)

            # Fuel price vs store size (larger stores might be more affected)
            df['fuel_size_interaction'] = df['Fuel_Price'] * np.log1p(df['Size'])

        # Economic pressure indicators
        if 'CPI' in df.columns and 'Unemployment' in df.columns:
            df['economic_pressure'] = df['CPI'] * df['Unemployment']
            df['economic_stability'] = 1 / (1 + df['economic_pressure'])

            # Purchasing power proxy
            df['purchasing_power'] = df['CPI'] / (df['Unemployment'] + 1e-8)

        # Markdown effectiveness
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        existing_markdowns = [col for col in markdown_cols if col in df.columns]

        if existing_markdowns:
            df['total_markdown'] = df[existing_markdowns].sum(axis=1)
            df['markdown_count'] = (df[existing_markdowns] > 0).sum(axis=1)
            df['avg_markdown'] = df['total_markdown'] / (df['markdown_count'] + 1e-8)

            # Markdown intensity by store type
            df['markdown_type_interaction'] = df['total_markdown'] * df['Type']

            # Holiday markdown boost
            if 'IsHoliday' in df.columns:
                df['holiday_markdown_boost'] = df['IsHoliday'] * df['total_markdown']

        return df


# Preproces 7 - add date features

In [None]:
class AdvancedDateFeatures(BaseEstimator, TransformerMixin):
    """
    Advanced date-based features
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])

        # Paycheck cycles (bi-weekly and monthly)
        df['is_paycheck_week'] = ((df['Date'].dt.day <= 7) |
                                  ((df['Date'].dt.day >= 14) & (df['Date'].dt.day <= 21))).astype(int)
        df['is_month_end'] = (df['Date'].dt.day >= 25).astype(int)
        df['is_month_start'] = (df['Date'].dt.day <= 7).astype(int)

        # School calendar effects
        df['is_school_week'] = ((df['Date'].dt.month >= 9) |
                                (df['Date'].dt.month <= 5)).astype(int)
        df['is_summer_break'] = ((df['Date'].dt.month >= 6) &
                                 (df['Date'].dt.month <= 8)).astype(int)

        # Tax season
        df['is_tax_season'] = ((df['Date'].dt.month >= 1) &
                               (df['Date'].dt.month <= 4)).astype(int)

        # Weekend proximity
        df['days_to_weekend'] = 6 - df['Date'].dt.dayofweek
        df['days_from_weekend'] = df['Date'].dt.dayofweek

        # Seasonal shopping patterns
        df['is_spring_shopping'] = ((df['Date'].dt.month >= 3) &
                                    (df['Date'].dt.month <= 5)).astype(int)
        df['is_summer_shopping'] = ((df['Date'].dt.month >= 6) &
                                    (df['Date'].dt.month <= 8)).astype(int)
        df['is_fall_shopping'] = ((df['Date'].dt.month >= 9) &
                                  (df['Date'].dt.month <= 11)).astype(int)
        df['is_winter_shopping'] = ((df['Date'].dt.month == 12) |
                                    (df['Date'].dt.month <= 2)).astype(int)

        return df

# End preprocess

In [None]:
enhanced_pipeline = Pipeline([
    ('merge', BaseMerger(features, stores)),
    ('feature_add', FeatureAdder()),  # Your existing feature adder
    ('advanced_seasonal', AdvancedSeasonalFeatures()),
    ('advanced_date', AdvancedDateFeatures()),
    ('store_features', StoreSpecificFeatures()),
    ('economic_features', EconomicInteractionFeatures()),
    ('fillna', MissingValueFiller()),
    ('label_encode', CategoricalEncoder()),
    ('improved_lags', ImprovedLagFeatureTransformer(
        lags=[1, 2, 3, 4, 8, 12, 52],
        rolling_windows=[2, 4, 8, 12, 26],
        ewm_spans=[4, 8, 12],
        drop_na=True
    )),
])

In [None]:
train_df = enhanced_pipeline.fit_transform(train)
test_df = enhanced_pipeline.transform(test)
train_df.shape, test_df.shape

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
# from sklearn.pipeline import Pipeline
# pipeline = Pipeline([
#     ('merge',        BaseMerger(features, stores)),
#     ('feature_add',  FeatureAdder()),
#     ('fillna',       MissingValueFiller()),
#     ('label_encode', CategoricalEncoder()),
#     ('lags',         LagFeatureTransformer(
#                         lags=[1,2,3,4],           # e.g. use 4 lags
#                         rolling_windows=[4,8],    # e.g. 4‑wk & 8‑wk rolling avg
#                     )),
# ])

# train_df = pipeline.fit_transform(train)
# test_df = pipeline.transform(test)

# y = train_df['Weekly_Sales']
# X = train_df.drop(columns=['Weekly_Sales'])

# train_df.columns, train_df.shape, test_df.columns, test_df.shape, X.shape, y.shape

from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('merge',        BaseMerger(features, stores)),
    ('fillna',       MissingValueFiller()),
    ('label_encode', CategoricalEncoder()),
    ('feature_add',  FeatureAdder()),
    ('lags',         LagFeatureTransformer(
                        lags=[1,2,3,4],           # e.g. use 4 lags
                        rolling_windows=[4,8],    # e.g. 4‑wk & 8‑wk rolling avg
                    )),
])

train_df = pipeline.fit_transform(train)
test_df = pipeline.transform(test)

y = train_df['Weekly_Sales']
X = train_df.drop(columns=['Weekly_Sales'])

train_df.columns, train_df.shape, test_df.columns, test_df.shape, X.shape, y.shape

(Index(['Store', 'Dept', 'Weekly_Sales', 'IsHoliday', 'Temperature',
        'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
        'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Month',
        'SuperbowlWeek', 'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (398796, 29),
 Index(['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1',
        'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
        'Unemployment', 'Type', 'Size', 'Month', 'SuperbowlWeek',
        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (115064, 28),
 (398796, 28),
 (398796,))

# ML flow tracking for cleaning and feature engineering

In [None]:
%pip install -q dagshub mlflow
import mlflow
import dagshub
import mlflow.xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import xgboost as xgb


dagshub.init(repo_owner='nkhar21', repo_name='ML_Final_Project', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/nkhar21/ML_Final_Project.mlflow")

experiment_name = "XGBoost_Training"
mlflow.set_experiment(experiment_name)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=5efc9838-c6bb-4c44-b531-55941041d2ba&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=73c27564289ba72ee211b6d4b42263cc313d5e6755ade72147245c83389255ce




<Experiment: artifact_location='mlflow-artifacts:/b6bb21150f0c4042863bc345b7edb3cd', creation_time=1751801003477, experiment_id='0', last_update_time=1751801003477, lifecycle_stage='active', name='XGBoost_Training', tags={}>

# Log Cleaning

In [None]:
cleaning_pipeline = Pipeline([
    ("merge",       BaseMerger(features, stores)),     # joins store + feature tables
    ("fillna",      MissingValueFiller()),             # impute mark‑downs, CPI, Unemployment
    ("label_encode", CategoricalEncoder()),            # encode Type, IsHoliday
])

with mlflow.start_run(run_name="XGBoost_Cleaning"):
    mlflow.log_params({
        "merge_on": ["Store","Date","IsHoliday"],
        "fill_markdowns_with": 0,
        "impute_CPI_with": "mean",
        "impute_Unemp_with": "mean",
        "encode_Type_map": str({'A':3,'B':2,'C':1}),
        "encode_Holiday_map": str({False:0,True:1})
    })

    cleaned = cleaning_pipeline.fit_transform(train)
    cleaned_test = cleaning_pipeline.transform(test)

    mlflow.log_metric("n_clean_rows", cleaned.shape[0])
    mlflow.log_metric("n_clean_cols", cleaned.shape[1])

    # Optionally pickle & log the pipeline
    import pickle
    with open("cleaning_pipeline.pkl", "wb") as f:
        pickle.dump(cleaning_pipeline, f)
    mlflow.log_artifact("cleaning_pipeline.pkl")




🏃 View run XGBoost_Cleaning at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0/runs/56a1e343070144ab9a89628a8ce4bb46
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0


# Log Feature Engineering

In [None]:
feature_pipeline = Pipeline([
    ("feature_add",  FeatureAdder()),
    ("lags",         LagFeatureTransformer(
                        lags=[1,2,3,4],
                        rolling_windows=[4,8],
                    )),
])

with mlflow.start_run(run_name="XGBoost_Feature_Engineering"):
    mlflow.log_params({
        "use_superbowl_holiday": True,
        "lags": [1,2,3,4],
        "rolling_windows": [4,8],
    })

    fe_train = feature_pipeline.fit_transform(cleaned)
    fe_test  = feature_pipeline.transform(cleaned_test)

    mlflow.log_metric("n_fe_rows", fe_train.shape[0])
    mlflow.log_metric("n_fe_cols", fe_train.shape[1])

    with open("feature_pipeline.pkl", "wb") as f:
        pickle.dump(feature_pipeline, f)
    mlflow.log_artifact("feature_pipeline.pkl")

🏃 View run XGBoost_Feature_Engineering at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0/runs/f51bf64a6a6a45a28c7885ba6a56099e
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0


# Training Helper functions


In [None]:
def log_xgb_params(model):
    # Pull hyper‑params straight off the fitted model
    p = model.get_params()
    mlflow.log_params({
        "n_estimators":    p["n_estimators"],
        "max_depth":       p["max_depth"],
        "learning_rate":   p["learning_rate"],
        "subsample":       p.get("subsample"),
        "colsample_bytree":p.get("colsample_bytree"),
        "gamma":           p.get("gamma"),
        "reg_alpha":       p.get("reg_alpha"),
        "reg_lambda":      p.get("reg_lambda"),
        "objective":       p.get("objective"),
        "random_state":    p.get("random_state"),
    })

def evaluate(model, X, y, weights, split):
    preds = model.predict(X)
    wmae = (weights * np.abs(y - preds)).sum() / weights.sum()
    mae  = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2   = r2_score(y, preds)
    # MAPE (skip zero‑actual rows)
    mask = y != 0
    mape = np.mean(np.abs((y[mask] - preds[mask]) / y[mask])) * 100

    mlflow.log_metrics({
        f"{split}_WMAE": wmae,
        f"{split}_MAE":  mae,
        f"{split}_RMSE": rmse,
        f"{split}_R2":   r2,
        f"{split}_MAPE": mape
    })

    print(f"[{split.upper()}] WMAE={wmae:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}, MAPE={mape:.2f}%")
    return {"WMAE": wmae, "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE": mape}


In [None]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('merge',        BaseMerger(features, stores)),
    ('fillna',       MissingValueFiller()),
    ('label_encode', CategoricalEncoder()),
    ('feature_add',  FeatureAdder()),
    ('lags',         LagFeatureTransformer(
                        lags=[1,2,3,4],           # e.g. use 4 lags
                        rolling_windows=[4,8],    # e.g. 4‑wk & 8‑wk rolling avg
                    )),
])

train_df = pipeline.fit_transform(train)

# Training 1

In [None]:
import mlflow
import mlflow.xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import numpy as np

y = train_df["Weekly_Sales"]
X = train_df.drop(columns=["Weekly_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

w_train = X_train["IsHoliday"].astype(bool).map({True:5, False:1}).values
w_val   = X_val  ["IsHoliday"].astype(bool).map({True:5, False:1}).values

with mlflow.start_run(run_name="XGBoost_Regressor_1"):
    mlflow.xgboost.autolog()

    model = xgb.XGBRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        reg_alpha=0,
        reg_lambda=1,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1,
    )

    log_xgb_params(model)

    model.fit(X_train, y_train)

    train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
    val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")

    delta_metrics = {
        f"delta_{m}": train_metrics[m] - val_metrics[m]
        for m in train_metrics
    }
    mlflow.log_metrics(delta_metrics)
    print("Overfitting deltas:", delta_metrics)




[TRAIN] WMAE=930.2843, MAE=875.2516, RMSE=1675.6686, R²=0.9946, MAPE=356.97%
[VAL] WMAE=1012.2616, MAE=928.8814, RMSE=2184.3777, R²=0.9905, MAPE=297.24%
Overfitting deltas: {'delta_WMAE': np.float64(-81.97734535682025), 'delta_MAE': -53.62976558877472, 'delta_RMSE': np.float64(-508.7091047403974), 'delta_R2': 0.0041069145288110676, 'delta_MAPE': np.float64(59.72380464369479)}
🏃 View run XGBoost_Regressor_1 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0/runs/675aace5d0fc418fb5c3673136d470fb
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0


In [None]:
import mlflow
import mlflow.xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import numpy as np

# --- 1) Prepare X, y and filter out bad labels ---
y_raw = train_df["Weekly_Sales"]
X = train_df.drop(columns=["Weekly_Sales"])

# log transform
y_log = np.log1p(y_raw)

# filter non‑finite
mask = np.isfinite(y_log)
if not mask.all():
    bad = (~mask).sum()
    print(f"Dropping {bad} rows due to NaN/inf after log1p.")
    X = X.loc[mask].reset_index(drop=True)
    y_log = y_log.loc[mask].reset_index(drop=True)

# weighted MAE weights
w_all = X["IsHoliday"].astype(bool).map({True:5, False:1}).values

# train/val split
X_train, X_val, y_train_log, y_val_log, w_train, w_val = train_test_split(
    X, y_log, w_all, test_size=0.2, random_state=42
)

# --- 2) Helpers (unchanged) ---
def log_xgb_params(model):
    p = model.get_params()
    mlflow.log_params({
        "n_estimators":    p["n_estimators"],
        "max_depth":       p["max_depth"],
        "learning_rate":   p["learning_rate"],
        "subsample":       p.get("subsample"),
        "colsample_bytree":p.get("colsample_bytree"),
        "gamma":           p.get("gamma"),
        "reg_alpha":       p.get("reg_alpha"),
        "reg_lambda":      p.get("reg_lambda"),
        "objective":       p.get("objective"),
        "random_state":    p.get("random_state"),
    })

def evaluate(model, X, y_log, weights, split):
    preds_log = model.predict(X)
    preds = np.expm1(preds_log)
    y = np.expm1(y_log)

    wmae = (weights * np.abs(y - preds)).sum() / weights.sum()
    mae  = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2   = r2_score(y, preds)
    mask = y != 0
    mape = np.mean(np.abs((y[mask] - preds[mask]) / y[mask])) * 100

    mlflow.log_metrics({
        f"{split}_WMAE": wmae,
        f"{split}_MAE":  mae,
        f"{split}_RMSE": rmse,
        f"{split}_R2":   r2,
        f"{split}_MAPE": mape
    })

    print(f"[{split.upper()}] WMAE={wmae:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}, MAPE={mape:.2f}%")
    return {"WMAE": wmae, "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE": mape}

# --- 3) Training Run ---
with mlflow.start_run(run_name="XGBoost_Regressor_LogTarget"):
    mlflow.xgboost.autolog()

    model = xgb.XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        reg_alpha=0,
        reg_lambda=1,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1,
    )

    log_xgb_params(model)
    model.fit(X_train, y_train_log)

    train_metrics = evaluate(model, X_train, y_train_log, w_train, split="train")
    val_metrics   = evaluate(model, X_val,   y_val_log,   w_val,   split="val")

    delta = {f"delta_{k}": train_metrics[k] - val_metrics[k] for k in train_metrics}
    mlflow.log_metrics(delta)
    print("Overfitting deltas:", delta)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Dropping 956 rows due to NaN/inf after log1p.




[TRAIN] WMAE=1239.1977, MAE=1099.2810, RMSE=3212.7891, R²=0.9801, MAPE=12.85%
[VAL] WMAE=1273.9650, MAE=1130.9875, RMSE=3086.1107, R²=0.9817, MAPE=24.80%
Overfitting deltas: {'delta_WMAE': np.float64(-34.76728909052076), 'delta_MAE': -31.706501328909553, 'delta_RMSE': np.float64(126.67836117897286), 'delta_R2': -0.0016264944645584256, 'delta_MAPE': np.float64(-11.950057416129955)}
🏃 View run XGBoost_Regressor_LogTarget at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0/runs/58f590a7633b46fd9cc4a6970ac7ae95
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0


# Training 3 - Tune Hyperparams and use Scaler

In [None]:
import mlflow
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import xgboost as xgb
import numpy as np

# 0) Prepare data & WMAE helper
y = train_df["Weekly_Sales"]
X = train_df.drop(columns=["Weekly_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
w_val = X_val["IsHoliday"].map({True:5, False:1}).values

def wmae(y_true, y_pred, w):
    return (w * np.abs(y_true - y_pred)).sum() / w.sum()

# 1) Build pipeline: scale → clamp inf → XGB
final_pipeline = Pipeline([
    ("scaler",     StandardScaler()),
    ("clamp_inf",  FunctionTransformer(lambda X: np.nan_to_num(X, posinf=0, neginf=0))),
    ("regressor",  xgb.XGBRegressor(
                       objective="reg:squarederror",
                       subsample=0.8,
                       colsample_bytree=0.8,
                       gamma=0,
                       reg_alpha=0,
                       reg_lambda=1,
                       random_state=42,
                       n_jobs=-1
                   )),
])

# 2) Parameter grid
param_grid = {
    "regressor__n_estimators":   [100, 200, 300],
    "regressor__max_depth":      [4, 6, 8],
    "regressor__learning_rate":  [0.05, 0.1, 0.2],
}

# 3) GridSearchCV on neg MAE
grid = GridSearchCV(
    final_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=1,
    return_train_score=False
)

mlflow.set_experiment("XGBoost_GridSearch")
with mlflow.start_run(run_name="GridSearch_Defaults"):
    # Fit
    grid.fit(X_train, y_train)

    # Extract best
    best_pipe = grid.best_estimator_
    best_params = grid.best_params_
    mlflow.log_params({
        "n_estimators":  best_params["regressor__n_estimators"],
        "max_depth":     best_params["regressor__max_depth"],
        "learning_rate": best_params["regressor__learning_rate"],
    })

    # Evaluate on hold‑out
    y_val_pred = best_pipe.predict(X_val)
    val_wmae   = wmae(y_val, y_val_pred, w_val)
    val_mae    = mean_absolute_error(y_val, y_val_pred)
    val_rmse   = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_r2     = r2_score(y_val, y_val_pred)

    mlflow.log_metrics({
        "val_WMAE": val_wmae,
        "val_MAE":  val_mae,
        "val_RMSE": val_rmse,
        "val_R2":   val_r2
    })

    print(f"Best params: {best_params}")
    print(f"[VAL] WMAE={val_wmae:.4f}, MAE={val_mae:.4f}, RMSE={val_rmse:.4f}, R²={val_r2:.4f}")

    # 4) Serialize & log the best pipeline
    with open("best_pipeline.pkl", "wb") as f:
        pickle.dump(best_pipe, f)
    mlflow.log_artifact("best_pipeline.pkl", artifact_path="model_pipeline")


2025/07/06 15:22:23 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_GridSearch' does not exist. Creating a new experiment.


Fitting 3 folds for each of 27 candidates, totalling 81 fits




Best params: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 8, 'regressor__n_estimators': 300}
[VAL] WMAE=nan, MAE=600.6671, RMSE=1790.1897, R²=0.9936
🏃 View run GridSearch_Defaults at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/1/runs/46c267c4baa04449ae29653d63a0d478
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/1


PicklingError: Can't pickle <function <lambda> at 0x7dcdd0123e20>: attribute lookup <lambda> on __main__ failed

# Training 4 - Tuning

In [None]:
import mlflow
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

# --- Top‑level helper to clamp infinities ---
def clamp_inf_array(X):
    # convert ±inf to 0 and leave finite values unchanged
    return np.nan_to_num(X, posinf=0, neginf=0)

# --- 0) Prepare data & WMAE helper ---
y = train_df["Weekly_Sales"]
X = train_df.drop(columns=["Weekly_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
w_val = X_val["IsHoliday"].map({True:5, False:1}).values

def wmae(y_true, y_pred, w):
    return (w * np.abs(y_true - y_pred)).sum() / w.sum()

# --- 1) Final pipeline: scale → clamp_inf_array → XGB ---
final_pipeline = Pipeline([
    ("scaler",    StandardScaler()),
    ("clamp_inf", FunctionTransformer(clamp_inf_array)),
    ("regressor", xgb.XGBRegressor(
                      objective="reg:squarederror",
                      subsample=0.8,
                      colsample_bytree=0.8,
                      gamma=0,
                      reg_alpha=0,
                      reg_lambda=1,
                      random_state=42,
                      n_jobs=-1
                  )),
])

# --- 2) Parameter grid ---
param_grid = {
    "regressor__n_estimators":   [200, 300, 400],
    "regressor__max_depth":      [6, 7, 8],
    "regressor__learning_rate":  [0.15, 0.2, 0.25],
}

# --- 3) GridSearchCV setup ---
grid = GridSearchCV(
    final_pipeline,
    param_grid=param_grid,
    cv=2,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=1,
    return_train_score=False
)

mlflow.set_experiment("XGBoost_GridSearch")
with mlflow.start_run(run_name="GridSearch_Defaults"):
    # 4) Fit the grid
    grid.fit(X_train, y_train)

    # 5) Log best params
    best_params = {
        k.replace("regressor__", ""): v
        for k, v in grid.best_params_.items()
    }
    mlflow.log_params(best_params)

    # 6) Evaluate on validation
    best_pipe = grid.best_estimator_
    y_val_pred = best_pipe.predict(X_val)
    val_wmae   = wmae(y_val, y_val_pred, w_val)
    val_mae    = mean_absolute_error(y_val, y_val_pred)
    val_rmse   = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_r2     = r2_score(y_val, y_val_pred)

    mlflow.log_metrics({
        "val_WMAE": val_wmae,
        "val_MAE":  val_mae,
        "val_RMSE": val_rmse,
        "val_R2":   val_r2
    })

    print(f"Best params: {best_params}")
    print(f"[VAL] WMAE={val_wmae:.4f}, MAE={val_mae:.4f}, RMSE={val_rmse:.4f}, R²={val_r2:.4f}")

    # 7) Serialize & log the best pipeline
    with open("best_pipeline.pkl", "wb") as f:
        pickle.dump(best_pipe, f)
    mlflow.log_artifact("best_pipeline.pkl", artifact_path="model_pipeline")


Fitting 2 folds for each of 27 candidates, totalling 54 fits




Best params: {'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 400}
[VAL] WMAE=nan, MAE=573.6104, RMSE=1772.5646, R²=0.9937
🏃 View run GridSearch_Defaults at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/1/runs/8c7f8e34a23c4a0d8c54ec902962f2bd
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/1


# Training 5 - Tuning

In [None]:
import mlflow
import pickle
import numpy as np
import itertools
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

# --- Helper to clamp infinities (named function for pickle) ---
def clamp_inf_array(X):
    return np.nan_to_num(X, posinf=0, neginf=0)

# --- Safe WMAE to guard against NaNs and zero‐weight sums ---
def safe_wmae(y_true, y_pred, weights):
    mask = (
        np.isfinite(y_true) &
        np.isfinite(y_pred) &
        np.isfinite(weights) &
        (weights > 0)
    )
    if not mask.any():
        return np.nan
    return (weights[mask] * np.abs(y_true[mask] - y_pred[mask])).sum() / weights[mask].sum()

# --- Data prep ---
y = train_df["Weekly_Sales"]
X = train_df.drop(columns=["Weekly_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Compute weights as plain numeric arrays (no dtype=object)
# after your CategoricalEncoder, IsHoliday is 0/1
w_train = np.where(X_train["IsHoliday"] == 1, 5, 1)
w_val   = np.where(X_val  ["IsHoliday"] == 1, 5, 1)

# --- Parameter grid ---
param_grid = {
    "n_estimators":   [400, 800, 1600],
    "max_depth":      [8, 10, 15],
    "learning_rate":  [0.15, 0.2, 0.25],
}
param_combinations = list(itertools.product(
    param_grid["n_estimators"],
    param_grid["max_depth"],
    param_grid["learning_rate"]
))

mlflow.set_experiment("XGBoost_ManualGridSearch_2")

best_score = float("inf")
best_model = None
best_params = None

# --- Manual grid search with progress logging ---
for idx, (ne, md, lr) in enumerate(param_combinations, start=1):
    total = len(param_combinations)
    run_name = f"ne{ne}_md{md}_lr{lr}"
    print(f"[{idx}/{total}] Training {run_name}...")

    with mlflow.start_run(run_name=run_name):
        # Build pipeline
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("clamp",  FunctionTransformer(clamp_inf_array)),
            ("regressor", xgb.XGBRegressor(
                n_estimators=ne,
                max_depth=md,
                learning_rate=lr,
                objective="reg:squarederror",
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0,
                reg_alpha=0,
                reg_lambda=1,
                random_state=42,
                n_jobs=-1
            )),
        ])

        # Fit
        pipe.fit(X_train, y_train)

        # Predict & evaluate
        y_pred = pipe.predict(X_val)
        score  = safe_wmae(y_val.values, y_pred, w_val)
        mae    = mean_absolute_error(y_val, y_pred)
        rmse   = np.sqrt(mean_squared_error(y_val, y_pred))
        r2     = r2_score(y_val, y_pred)

        # If WMAE is NaN, skip this run
        if np.isnan(score):
            print(f"    ⚠️  WMAE is NaN; skipping logging for this run.")
            continue

        # Log parameters and metrics
        mlflow.log_params({
            "n_estimators":   ne,
            "max_depth":      md,
            "learning_rate":  lr
        })
        mlflow.log_metrics({
            "val_WMAE": score,
            "val_MAE":  mae,
            "val_RMSE": rmse,
            "val_R2":   r2
        })

        print(f"    WMAE={score:.2f}, MAE={mae:.2f}, RMSE={rmse:.2f}, R²={r2:.4f}")

        # Track best model
        if score < best_score:
            best_score = score
            best_model = pipe
            best_params = {"n_estimators": ne, "max_depth": md, "learning_rate": lr}

# --- Serialize & log the best pipeline artifact ---
if best_model is not None:
    with open("best_pipeline.pkl", "wb") as f:
        pickle.dump(best_model, f)
    mlflow.log_artifact("best_pipeline.pkl", artifact_path="model_pipeline")

    print("\n✅ Best Model:")
    print(f"   Params: {best_params}")
    print(f"   WMAE:   {best_score:.2f}")
else:
    print("⚠️ No valid model found (all runs had NaN WMAE).")


[1/27] Training ne400_md8_lr0.15...




    WMAE=642.24, MAE=587.51, RMSE=1754.89, R²=0.9939
🏃 View run ne400_md8_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/65e5ef1ef901435bb03265a92682f854
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[2/27] Training ne400_md8_lr0.2...




    WMAE=632.35, MAE=573.61, RMSE=1772.56, R²=0.9937
🏃 View run ne400_md8_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/4c771b0aaf0b4815b2d79d3df56f3550
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[3/27] Training ne400_md8_lr0.25...




    WMAE=649.16, MAE=581.13, RMSE=1721.84, R²=0.9941
🏃 View run ne400_md8_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/affdd267abd54f7d9acae41ff8dde6c2
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[4/27] Training ne400_md10_lr0.15...




    WMAE=605.82, MAE=546.29, RMSE=1756.26, R²=0.9939
🏃 View run ne400_md10_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/56cb14746f444de485477d67a6c7aaab
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[5/27] Training ne400_md10_lr0.2...




    WMAE=623.74, MAE=563.26, RMSE=1798.30, R²=0.9936
🏃 View run ne400_md10_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/a746e2b1c3f943cbafcc42578f6bed89
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[6/27] Training ne400_md10_lr0.25...




    WMAE=651.73, MAE=577.92, RMSE=1815.51, R²=0.9934
🏃 View run ne400_md10_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/de1d0fd1117e4ca88ea1556feb7b2040
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[7/27] Training ne400_md15_lr0.15...




    WMAE=672.69, MAE=597.14, RMSE=1913.28, R²=0.9927
🏃 View run ne400_md15_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/a1f122713fde4790b5b995103b09607b
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[8/27] Training ne400_md15_lr0.2...




    WMAE=700.16, MAE=622.97, RMSE=1946.77, R²=0.9925
🏃 View run ne400_md15_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/2468fe32eb074b12ae1ceff8e1006ab6
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[9/27] Training ne400_md15_lr0.25...




    WMAE=736.83, MAE=658.28, RMSE=1984.10, R²=0.9922
🏃 View run ne400_md15_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/fba724e5f5074e97a8706f393685b54f
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[10/27] Training ne800_md8_lr0.15...




    WMAE=591.07, MAE=536.83, RMSE=1727.39, R²=0.9941
🏃 View run ne800_md8_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/81f8a50b3cb6457e8d2f502957092cfe
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[11/27] Training ne800_md8_lr0.2...




    WMAE=601.09, MAE=542.23, RMSE=1756.69, R²=0.9939
🏃 View run ne800_md8_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/1b484a7234f9418f8b524f98944a0318
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[12/27] Training ne800_md8_lr0.25...




    WMAE=627.41, MAE=558.84, RMSE=1715.87, R²=0.9941
🏃 View run ne800_md8_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/e5ec99c50da54974aa1ad20b1793c534
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[13/27] Training ne800_md10_lr0.15...




    WMAE=590.92, MAE=531.60, RMSE=1752.51, R²=0.9939
🏃 View run ne800_md10_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/1cd82ce76e7249e79538bd57fb4adbeb
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[14/27] Training ne800_md10_lr0.2...




    WMAE=616.56, MAE=556.01, RMSE=1797.25, R²=0.9936
🏃 View run ne800_md10_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/3b9a882ff46d40a387bce4a231a8f42d
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[15/27] Training ne800_md10_lr0.25...




    WMAE=649.11, MAE=575.34, RMSE=1815.10, R²=0.9934
🏃 View run ne800_md10_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/281c31df55d74c87a453307f69e10005
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[16/27] Training ne800_md15_lr0.15...




    WMAE=672.77, MAE=597.25, RMSE=1913.25, R²=0.9927
🏃 View run ne800_md15_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/d5b9871110924a7b9afad276517946e9
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[17/27] Training ne800_md15_lr0.2...




    WMAE=700.28, MAE=623.10, RMSE=1946.80, R²=0.9925
🏃 View run ne800_md15_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/b0c05a69f9504632b7a7194284ee946a
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[18/27] Training ne800_md15_lr0.25...




    WMAE=736.88, MAE=658.33, RMSE=1984.10, R²=0.9922
🏃 View run ne800_md15_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/3ba42d35acbd4f758ba38a0094a4fed5
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[19/27] Training ne1600_md8_lr0.15...




    WMAE=571.62, MAE=518.15, RMSE=1721.68, R²=0.9941
🏃 View run ne1600_md8_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/fd5d7fd923b840aab066280b3c655b92
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[20/27] Training ne1600_md8_lr0.2...




    WMAE=591.38, MAE=532.85, RMSE=1757.21, R²=0.9939
🏃 View run ne1600_md8_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/c0f9262f042945fc9d92cd5cf540f5dd
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[21/27] Training ne1600_md8_lr0.25...




    WMAE=620.89, MAE=553.02, RMSE=1716.07, R²=0.9941
🏃 View run ne1600_md8_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/a6a34e9c5ae54d50976254a86e3d8f9e
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[22/27] Training ne1600_md10_lr0.15...




    WMAE=588.68, MAE=529.62, RMSE=1752.61, R²=0.9939
🏃 View run ne1600_md10_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/044dfdd305ed4f92a842df2d94bb89bd
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[23/27] Training ne1600_md10_lr0.2...




    WMAE=616.64, MAE=556.31, RMSE=1797.35, R²=0.9936
🏃 View run ne1600_md10_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/a941b8a29ff3481a8073fcf26829defd
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[24/27] Training ne1600_md10_lr0.25...




    WMAE=650.35, MAE=576.86, RMSE=1815.54, R²=0.9934
🏃 View run ne1600_md10_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/aa2a44def2014e6b8625835d661c808b
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[25/27] Training ne1600_md15_lr0.15...




    WMAE=672.79, MAE=597.26, RMSE=1913.26, R²=0.9927
🏃 View run ne1600_md15_lr0.15 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/d0407fafc3174a46ada7510bac4f4294
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[26/27] Training ne1600_md15_lr0.2...




    WMAE=700.28, MAE=623.10, RMSE=1946.79, R²=0.9925
🏃 View run ne1600_md15_lr0.2 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/4823d8a342834ba7ab2e7ad0f527bdee
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4
[27/27] Training ne1600_md15_lr0.25...




    WMAE=736.88, MAE=658.33, RMSE=1984.10, R²=0.9922
🏃 View run ne1600_md15_lr0.25 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4/runs/58735e550d36430cb5a0d681b93127c4
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/4

✅ Best Model:
   Params: {'n_estimators': 1600, 'max_depth': 8, 'learning_rate': 0.15}
   WMAE:   571.62


# Best model

In [None]:
import mlflow
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import xgboost as xgb
import numpy as np

preprocessor_pipeline = Pipeline([
    ("merge",       BaseMerger(features, stores)),
    ("fillna",      MissingValueFiller()),
    ("label_encode", CategoricalEncoder()),
    ("feature_add",  FeatureAdder()),
    ("lags",         LagFeatureTransformer(
                        lags=[1,2,3,4],
                        rolling_windows=[4,8],
                    )),
    ])

def clamp_inf_array(X):
    return np.nan_to_num(X, posinf=0, neginf=0)


regressor_pipeline =  pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("clamp",  FunctionTransformer(clamp_inf_array)),
            ("regressor", xgb.XGBRegressor(
                n_estimators=1600,
                max_depth=8,
                learning_rate=0.15,
                objective="reg:squarederror",
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0,
                reg_alpha=0,
                reg_lambda=1,
                random_state=42,
                n_jobs=-1,
                eval_metric="mae"
            )),
        ])

train_df = preprocessor_pipeline.fit_transform(train)
test_df = preprocessor_pipeline.transform(test)

y_train = train_df["Weekly_Sales"]
X_train = train_df.drop(columns=["Weekly_Sales"])
X_test = test_df

X_train.columns, X_train.shape, y_train.shape, X_test.columns, X_test.shape

(Index(['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1',
        'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
        'Unemployment', 'Type', 'Size', 'Month', 'SuperbowlWeek',
        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (398796, 28),
 (398796,),
 Index(['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1',
        'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
        'Unemployment', 'Type', 'Size', 'Month', 'SuperbowlWeek',
        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (115064, 28))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.xgboost
import mlflow.sklearn
import pandas as pd

# 1) Hold out a validation set for early stopping
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# 2) Extract the scaler, clamp, and regressor from your pipeline
scaler    = regressor_pipeline.named_steps["scaler"]
clamp     = regressor_pipeline.named_steps["clamp"]
xgb_model = regressor_pipeline.named_steps["regressor"]

# 3) Fit scaler & clamp on X_tr, transform X_val
X_tr_s   = clamp.transform(scaler.fit_transform   (X_tr))
X_val_s  = clamp.transform(scaler.transform       (X_val))

# 4) Start MLflow run and turn on XGBoost autologging
mlflow.set_experiment("XGBoost_Training")
with mlflow.start_run(run_name="xgb_final") as run:
    print("🧪 MLflow Run ID:", run.info.run_id)
    mlflow.xgboost.autolog()

    # 5) Fit the raw XGB model with early stopping
    xgb_model.fit(
        X_tr_s, y_tr,
        eval_set=[(X_val_s, y_val)]
    )

    # 6) Evaluate on hold‑out using the full pipeline (which re‐applies scaler+clamp)
    val_preds = regressor_pipeline.predict(X_val)
    val_mae   = mean_absolute_error(y_val, val_preds)
    val_rmse  = (mean_squared_error(y_val, val_preds) ** 0.5)
    val_r2    = r2_score(y_val, val_preds)
    # Weighted MAE
    weights   = np.where(X_val["IsHoliday"]==1, 5, 1)
    val_wmae  = (weights * np.abs(y_val - val_preds)).sum() / weights.sum()

    mlflow.log_metrics({
        "val_WMAE": val_wmae,
        "val_MAE":  val_mae,
        "val_RMSE": val_rmse,
        "val_R2":   val_r2
    })
    print(f"✅ Validation – WMAE:{val_wmae:.2f}, MAE:{val_mae:.2f}, RMSE:{val_rmse:.2f}, R²:{val_r2:.4f}")

    # 7) Predict on your test set and write submission
    test_preds = regressor_pipeline.predict(X_test)
    submission = pd.DataFrame({
        "Id": test["Store"].astype(str) + "_" +
              test["Dept"].astype(str)  + "_" +
              test["Date"].astype(str),
        "Weekly_Sales": test_preds
    })
    submission.to_csv("xgb_submission.csv", index=False)
    print("✅ Written xgb_submission.csv")

    # 8) Save the full fitted pipeline with pickle
    with open("xgb_model_pipeline.pkl", "wb") as f:
        pickle.dump(regressor_pipeline, f)
    print("✅ Pipeline saved to 'xgb_model_pipeline.pkl'")



🧪 MLflow Run ID: 63ec8410d6944556bdc70f1c55d68513
[0]	validation_0-mae:12913.25087
[1]	validation_0-mae:11057.68721
[2]	validation_0-mae:9444.76149
[3]	validation_0-mae:8072.98007
[4]	validation_0-mae:6921.14358
[5]	validation_0-mae:5936.29914
[6]	validation_0-mae:5101.92087
[7]	validation_0-mae:4398.00891
[8]	validation_0-mae:3803.31619
[9]	validation_0-mae:3308.38510
[10]	validation_0-mae:2890.85259
[11]	validation_0-mae:2544.55060
[12]	validation_0-mae:2257.88078
[13]	validation_0-mae:2025.29728
[14]	validation_0-mae:1834.83731
[15]	validation_0-mae:1688.09211
[16]	validation_0-mae:1556.97154
[17]	validation_0-mae:1451.31388
[18]	validation_0-mae:1376.07417
[19]	validation_0-mae:1304.98715
[20]	validation_0-mae:1248.33737
[21]	validation_0-mae:1209.21994
[22]	validation_0-mae:1177.88972
[23]	validation_0-mae:1141.63069
[24]	validation_0-mae:1110.98688
[25]	validation_0-mae:1088.74740
[26]	validation_0-mae:1066.18912
[27]	validation_0-mae:1046.88320
[28]	validation_0-mae:1033.42116
[



✅ Validation – WMAE:571.62, MAE:518.15, RMSE:1721.68, R²:0.9941
✅ Written xgb_submission.csv
✅ Pipeline saved to 'xgb_model_pipeline.pkl'
🏃 View run xgb_final at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0/runs/63ec8410d6944556bdc70f1c55d68513
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0


# Predict and generate sub

In [None]:
X_test  = clamp.transform(scaler.transform       (test_df))
test_preds = xgb_model.predict(X_test)

# 5) Create submission DataFrame with Id format: Store_Dept_Date
submission = pd.DataFrame({
    "Id": test["Store"].astype(str) + "_" +
          test["Dept"].astype(str) + "_" +
          test["Date"].astype(str),
    "Weekly_Sales": test_preds
})

# Save to CSV
submission.to_csv("submission_XGB.csv", index=False)

# Preview
print(submission.head())


               Id  Weekly_Sales
0  1_1_2012-11-02  21761.939453
1  1_1_2012-11-09  22235.800781
2  1_1_2012-11-16  21867.765625
3  1_1_2012-11-23  21721.035156
4  1_1_2012-11-30  21982.865234


In [None]:
import pickle
import mlflow

mlflow.set_experiment("XGBoost_Training")
with mlflow.start_run(run_name="XGBoost_preprocessor_run") as run:
    run_id = run.info.run_id
    print("MLflow Run ID:", run_id)

    # 1) Log your custom params as before
    bm = preprocessor_pipeline.named_steps["merge"]
    mlflow.log_param("BaseMerger.feature_store_rows", len(bm.feature_store))
    mvf = preprocessor_pipeline.named_steps["fillna"]
    mlflow.log_param("MissingValueFiller.markdown_cols", len(mvf.markdown_cols))
    mlflow.log_param("MissingValueFiller.mean_cols",     len(mvf.mean_cols))
    ce = preprocessor_pipeline.named_steps["label_encode"]
    mlflow.log_param("CategoricalEncoder.type_mapping",    str(ce.type_mapping))
    mlflow.log_param("CategoricalEncoder.holiday_mapping", str(ce.holiday_mapping))
    fa = preprocessor_pipeline.named_steps["feature_add"]
    mlflow.log_param("FeatureAdder.superbowl_dates",     len(fa.superbowl))
    mlflow.log_param("FeatureAdder.thanksgiving_dates",  len(fa.thanksgiving))
    lag = preprocessor_pipeline.named_steps["lags"]
    mlflow.log_param("LagFeatureTransformer.lags",            ",".join(map(str, lag.lags)))
    mlflow.log_param("LagFeatureTransformer.rolling_windows", ",".join(map(str, lag.rolling_windows)))
    mlflow.log_param("LagFeatureTransformer.drop_na",         lag.drop_na)

    # 2) Serialize to disk
    with open("preprocess_pipeline.pkl", "wb") as f:
        pickle.dump(preprocessor_pipeline, f)

    # 3) Log it as a generic artifact
    mlflow.log_artifact("preprocess_pipeline.pkl", artifact_path="pipelines")

    print("✅ Pickled & logged preprocessor as an artifact")


MLflow Run ID: 3c94a96859224c08988b1ff0df319dd5
✅ Pickled & logged preprocessor as an artifact
🏃 View run XGBoost_preprocessor_run at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0/runs/3c94a96859224c08988b1ff0df319dd5
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0


# Predict and generate submission

In [None]:
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.xgboost
import mlflow.sklearn
import pandas as pd
from sklearn.metrics import mean_absolute_error

# 1) Split off a small hold‐out for early stopping
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 2) Configure MLflow
mlflow.set_experiment("XGBoost_Training")
with mlflow.start_run(run_name="xgboost_regressor") as run:
    run_id = run.info.run_id
    print("🧪 MLflow Run ID:", run_id)

    # 3) Fit with early stopping
    regressor_pipeline.fit(
        X_tr, y_tr,
        regressor__eval_set=[(X_val, y_val)],
        regressor__early_stopping_rounds=50,
        regressor__eval_metric="mae",
        regressor__verbose=False
    )


    # 4) Log validation MAE
    val_preds = regressor_pipeline.predict(X_val)
    val_mae = mean_absolute_error(y_val, val_preds)
    mlflow.log_metric("val_mae", val_mae)
    print("✅ Validation MAE:", val_mae)

    # 5) Create submission
    test_preds = regressor_pipeline.predict(X_test)
    submission = pd.DataFrame({
        "Id": test["Store"].astype(str) + "_" +
              test["Dept"].astype(str) + "_" +
              test["Date"].astype(str),
        "Weekly_Sales": test_preds
    })
    submission.to_csv("xgb_submission.csv", index=False)
    print("✅ Wrote xgb_submission.csv")

    # 6) Log the entire fitted pipeline
    mlflow.sklearn.log_model(regressor_pipeline, "xgb_model_pipeline")
    print("✅ Pipeline logged as `xgb_model_pipeline`")




🧪 MLflow Run ID: d3cf3d2059ff4ddda8960cbe174873a1
🏃 View run xgboost_regressor at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0/runs/d3cf3d2059ff4ddda8960cbe174873a1
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/0


TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'