<a href="https://colab.research.google.com/github/dimna21/ML_Final_Project/blob/main/model_experiment_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Load your data
import pandas as pd

features = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/features.csv')
stores = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/stores.csv')
train = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/train.csv')
test = pd.read_csv('/content/drive/MyDrive/ML_Final_Project/test.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Cleaning

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseMerger(BaseEstimator, TransformerMixin):
    def __init__(self, features, stores):
        self.feature_store = features.merge(stores, how='inner', on='Store')
        self.feature_store['Date'] = pd.to_datetime(self.feature_store['Date'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])
        merged = X.merge(self.feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
        merged = merged.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)
        return merged


class MissingValueFiller(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        self.mean_cols = ['CPI', 'Unemployment']
        self.mean_values = {}

    def fit(self, X, y=None):
        for col in self.mean_cols:
            if col in X.columns:
                self.mean_values[col] = X[col].mean()
        return self

    def transform(self, X):
        X = X.copy()

        # Fill markdowns with 0
        for col in self.markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0.0)

        # Fill CPI and Unemployment with learned mean
        for col in self.mean_cols:
            if col in X.columns and col in self.mean_values:
                X[col] = X[col].fillna(self.mean_values[col])

        return X

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.type_mapping = {'A': 3, 'B': 2, 'C': 1}
        self.holiday_mapping = {False: 0, True: 1}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if 'Type' in X.columns:
            X['Type'] = X['Type'].map(self.type_mapping)

        if 'IsHoliday' in X.columns:
            X['IsHoliday'] = X['IsHoliday'].map(self.holiday_mapping)

        return X

# Feature Engineering

In [None]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.superbowl = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
        self.labor_day = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
        self.thanksgiving = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
        self.christmas = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Convert temperature to Celsius
        if 'Temperature' in X.columns:
            X['Temperature'] = (X['Temperature'] - 32) * (5.0 / 9.0)

        # Basic date parts
        X['Day'] = X['Date'].dt.day
        X['Month'] = X['Date'].dt.month
        X['Year'] = X['Date'].dt.year

        # Extract ISO week and year for holiday matching
        X['Week'] = X['Date'].dt.isocalendar().week
        X['YearNum'] = X['Date'].dt.year

        # Helper to flag if a date is in same ISO week/year as a known holiday
        def is_holiday_week(date_series, holidays):
            holiday_weeks = set((d.isocalendar().week, d.year) for d in holidays)
            return date_series.apply(lambda d: (d.isocalendar().week, d.year) in holiday_weeks if pd.notnull(d) else False).astype(int)

        X['SuperbowlWeek'] = is_holiday_week(X['Date'], self.superbowl)
        X['LaborDayWeek'] = is_holiday_week(X['Date'], self.labor_day)
        X['ThanksgivingWeek'] = is_holiday_week(X['Date'], self.thanksgiving)
        X['ChristmasWeek'] = is_holiday_week(X['Date'], self.christmas)

        # Calculate days to Thanksgiving and Christmas (using Nov 24 and Dec 24 as anchor dates)
        thanksgiving_dates = pd.to_datetime(X['Year'].astype(str) + "-11-24")
        christmas_dates = pd.to_datetime(X['Year'].astype(str) + "-12-24")

        X['Days_to_Thanksgiving'] = (thanksgiving_dates - X['Date']).dt.days
        X['Days_to_Christmas'] = (christmas_dates - X['Date']).dt.days

        # Clean up helper cols
        X = X.drop(columns=['Week', 'YearNum'])

        return X

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class LagFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 lags=[1, 2, 3, 4],
                 rolling_windows=[4, 8],
                 drop_na=True):
        self.lags = lags
        self.rolling_windows = rolling_windows
        self.drop_na = drop_na
        self.history_ = None
        self.lag_values_ = {}
        self.rolling_values_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])

        # Sort by Store, Dept, Date
        df = df.sort_values(['Store', 'Dept', 'Date'])

        # Store the last few values for each Store-Dept combination
        # This will be used to compute lags for test data
        max_lag = max(self.lags)
        max_window = max(self.rolling_windows) if self.rolling_windows else 0
        history_length = max(max_lag, max_window)

        self.history_ = (
            df[['Store', 'Dept', 'Date', 'Weekly_Sales']]
            .groupby(['Store', 'Dept'], as_index=False)
            .tail(history_length)
        )

        # Pre-compute lag and rolling features for the last rows
        # This will help with test data transformation
        self.lag_values_ = {}
        self.rolling_values_ = {}

        for (store, dept), group in df.groupby(['Store', 'Dept']):
            group = group.sort_values('Date')

            # Store last lag values
            self.lag_values_[(store, dept)] = {}
            for lag in self.lags:
                if len(group) >= lag:
                    self.lag_values_[(store, dept)][lag] = group['Weekly_Sales'].iloc[-lag]
                else:
                    self.lag_values_[(store, dept)][lag] = np.nan

            # Store last rolling values
            self.rolling_values_[(store, dept)] = {}
            for window in self.rolling_windows:
                if len(group) >= window:
                    self.rolling_values_[(store, dept)][window] = group['Weekly_Sales'].iloc[-window:].mean()
                else:
                    self.rolling_values_[(store, dept)][window] = np.nan

        return self

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        has_sales = 'Weekly_Sales' in df.columns

        # Create DateOrdinal
        df['DateOrdinal'] = df['Date'].map(pd.Timestamp.toordinal)
        df = df.sort_values(['Store', 'Dept', 'Date'])

        if has_sales:
            # Training data - compute lags normally
            for lag in self.lags:
                df[f'lag_{lag}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(lag)

            # Compute rolling means
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = (
                    df.groupby(['Store', 'Dept'])['Weekly_Sales']
                    .transform(lambda s: s.rolling(window).mean())
                )
        else:
            # Test data - use pre-computed values from training
            # Initialize lag columns
            for lag in self.lags:
                df[f'lag_{lag}'] = np.nan
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = np.nan

            # Fill with pre-computed values
            for idx, row in df.iterrows():
                store_dept = (row['Store'], row['Dept'])

                if store_dept in self.lag_values_:
                    for lag in self.lags:
                        if lag in self.lag_values_[store_dept]:
                            df.loc[idx, f'lag_{lag}'] = self.lag_values_[store_dept][lag]

                if store_dept in self.rolling_values_:
                    for window in self.rolling_windows:
                        if window in self.rolling_values_[store_dept]:
                            df.loc[idx, f'rolling_mean_{window}'] = self.rolling_values_[store_dept][window]

        # Drop helper columns
        drop_cols = [c for c in ['Day', 'Year', 'Date'] if c in df.columns]
        df = df.drop(columns=drop_cols)

        # Handle NaN values
        if self.drop_na and has_sales:
            # Only drop NaN for training data
            required = [f'lag_{l}' for l in self.lags] + [f'rolling_mean_{w}' for w in self.rolling_windows]
            df = df.dropna(subset=required).reset_index(drop=True)
        elif not has_sales:
            # For test data, fill remaining NaN values with appropriate defaults
            # You might want to adjust these defaults based on your domain knowledge
            for lag in self.lags:
                df[f'lag_{lag}'] = df[f'lag_{lag}'].fillna(0)  # or use median/mean from training
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = df[f'rolling_mean_{window}'].fillna(0)  # or use median/mean from training

        return df

# Advanced Feature Engineering

In [None]:

class AdvancedSeasonalFeatures(BaseEstimator, TransformerMixin):
    """
    Creates sophisticated seasonal and cyclical features
    """
    def __init__(self):
        # Define holiday periods more precisely
        self.holiday_periods = {
            'thanksgiving_period': [
                ('2010-11-19', '2010-11-26'),
                ('2011-11-18', '2011-11-25'),
                ('2012-11-16', '2012-11-23'),
                ('2013-11-22', '2013-11-29')
            ],
            'christmas_period': [
                ('2010-12-17', '2010-12-31'),
                ('2011-12-16', '2011-12-30'),
                ('2012-12-21', '2012-12-28'),
                ('2013-12-20', '2013-12-27')
            ],
            'superbowl_period': [
                ('2010-02-05', '2010-02-12'),
                ('2011-02-04', '2011-02-11'),
                ('2012-02-03', '2012-02-10'),
                ('2013-02-01', '2013-02-08')
            ],
            'labor_day_period': [
                ('2010-09-03', '2010-09-10'),
                ('2011-09-02', '2011-09-09'),
                ('2012-08-31', '2012-09-07'),
                ('2013-08-30', '2013-09-06')
            ]
        }

        # Back-to-school period (typically July-August)
        self.back_to_school_period = [
            ('2010-07-15', '2010-08-31'),
            ('2011-07-15', '2011-08-31'),
            ('2012-07-15', '2012-08-31'),
            ('2013-07-15', '2013-08-31')
        ]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])

        # Cyclical encoding of time features
        df['month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

        # Week of year for seasonal patterns
        df['week_of_year'] = df['Date'].dt.isocalendar().week
        df['week_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52)
        df['week_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52)

        # Quarter features
        df['quarter'] = df['Date'].dt.quarter
        df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
        df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

        # Advanced holiday features
        for holiday_name, periods in self.holiday_periods.items():
            df[f'{holiday_name}_flag'] = 0
            for start, end in periods:
                mask = (df['Date'] >= start) & (df['Date'] <= end)
                df.loc[mask, f'{holiday_name}_flag'] = 1

        # Back-to-school period
        df['back_to_school_flag'] = 0
        for start, end in self.back_to_school_period:
            mask = (df['Date'] >= start) & (df['Date'] <= end)
            df.loc[mask, 'back_to_school_flag'] = 1

        # Days since major holidays (continuous features)
        for year in [2010, 2011, 2012, 2013]:
            thanksgiving = pd.to_datetime(f'{year}-11-24')  # Approximate
            christmas = pd.to_datetime(f'{year}-12-25')

            year_mask = df['Date'].dt.year == year
            if year_mask.any():
                df.loc[year_mask, 'days_since_thanksgiving'] = (df.loc[year_mask, 'Date'] - thanksgiving).dt.days
                df.loc[year_mask, 'days_since_christmas'] = (df.loc[year_mask, 'Date'] - christmas).dt.days

        # Seasonal shopping intensity (pre-holiday buildup)
        df['pre_thanksgiving_intensity'] = np.where(
            (df['Days_to_Thanksgiving'] <= 14) & (df['Days_to_Thanksgiving'] > 0),
            15 - df['Days_to_Thanksgiving'], 0
        )

        df['pre_christmas_intensity'] = np.where(
            (df['Days_to_Christmas'] <= 21) & (df['Days_to_Christmas'] > 0),
            22 - df['Days_to_Christmas'], 0
        )

        # Post-holiday effect (returns, clearance)
        df['post_holiday_effect'] = np.where(
            ((df['days_since_thanksgiving'] > 0) & (df['days_since_thanksgiving'] <= 7)) |
            ((df['days_since_christmas'] > 0) & (df['days_since_christmas'] <= 14)),
            1, 0
        )

        # Clean up intermediate columns
        df = df.drop(columns=['week_of_year'], errors='ignore')

        return df


class ImprovedLagFeatureTransformer(BaseEstimator, TransformerMixin):
    """
    Enhanced lag features with more sophisticated patterns
    """
    def __init__(self,
                 lags=[1, 2, 3, 4, 8, 12, 52],  # Include yearly lag
                 rolling_windows=[2, 4, 8, 12, 26],  # More diverse windows
                 ewm_spans=[4, 8, 12],  # Exponential weighted moving averages
                 drop_na=True):
        self.lags = lags
        self.rolling_windows = rolling_windows
        self.ewm_spans = ewm_spans
        self.drop_na = drop_na
        self.history_ = None
        self.lag_stats_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values(['Store', 'Dept', 'Date'])

        # Store more comprehensive history
        max_lag = max(self.lags) if self.lags else 0
        max_window = max(self.rolling_windows) if self.rolling_windows else 0
        max_ewm = max(self.ewm_spans) if self.ewm_spans else 0
        history_length = max(max_lag, max_window, max_ewm, 60)  # At least 60 weeks

        self.history_ = (
            df[['Store', 'Dept', 'Date', 'Weekly_Sales']]
            .groupby(['Store', 'Dept'], as_index=False)
            .tail(history_length)
        )

        # Calculate statistics for each store-dept combination
        for (store, dept), group in df.groupby(['Store', 'Dept']):
            group = group.sort_values('Date')
            sales = group['Weekly_Sales']

            self.lag_stats_[(store, dept)] = {
                'mean': sales.mean(),
                'std': sales.std(),
                'median': sales.median(),
                'q25': sales.quantile(0.25),
                'q75': sales.quantile(0.75),
                'trend': self._calculate_trend(sales),
                'seasonality': self._calculate_seasonality(sales)
            }

        return self

    def _calculate_trend(self, series):
        """Calculate simple trend"""
        if len(series) < 4:
            return 0
        x = np.arange(len(series))
        try:
            trend = np.polyfit(x, series, 1)[0]
            return trend
        except:
            return 0

    def _calculate_seasonality(self, series):
        """Calculate seasonal strength"""
        if len(series) < 52:
            return 0
        try:
            # Simple seasonal strength measure
            return series.std() / series.mean() if series.mean() > 0 else 0
        except:
            return 0

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        has_sales = 'Weekly_Sales' in df.columns
        df = df.sort_values(['Store','Dept','Date'])

        if has_sales:
            # 1) Create lags
            for lag in self.lags:
                df[f'lag_{lag}'] = df.groupby(['Store','Dept'])['Weekly_Sales'].shift(lag)

            # 2) Rolling statistics on lag_1 only (purely past)
            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = (
                    df.groupby(['Store','Dept'])['lag_1']
                      .transform(lambda s: s.rolling(window, min_periods=1).mean())
                )
                df[f'rolling_std_{window}'] = (
                    df.groupby(['Store','Dept'])['lag_1']
                      .transform(lambda s: s.rolling(window, min_periods=1).std())
                )
                df[f'rolling_median_{window}'] = (
                    df.groupby(['Store','Dept'])['lag_1']
                      .transform(lambda s: s.rolling(window, min_periods=1).median())
                )

            # 3) EWMA on lag_1
            for span in self.ewm_spans:
                df[f'ewm_{span}'] = (
                    df.groupby(['Store','Dept'])['lag_1']
                      .transform(lambda s: s.ewm(span=span, min_periods=1).mean())
                )

            # 4) Momentum & acceleration remain OK (based on lag_1 diff)
            df['sales_momentum']     = df.groupby(['Store','Dept'])['lag_1'].transform(lambda s: s.diff())
            df['sales_acceleration'] = df.groupby(['Store','Dept'])['sales_momentum'].transform(lambda s: s.diff())

        else:
            # Test data - use historical statistics
            for lag in self.lags:
                df[f'lag_{lag}'] = np.nan

            for window in self.rolling_windows:
                df[f'rolling_mean_{window}'] = np.nan
                df[f'rolling_std_{window}'] = np.nan
                df[f'rolling_median_{window}'] = np.nan

            for span in self.ewm_spans:
                df[f'ewm_{span}'] = np.nan

            df['sales_momentum'] = np.nan
            df['sales_acceleration'] = np.nan

            # Fill with historical statistics
            for idx, row in df.iterrows():
                store_dept = (row['Store'], row['Dept'])
                if store_dept in self.lag_stats_:
                    stats = self.lag_stats_[store_dept]
                    # Use mean for missing lags
                    for lag in self.lags:
                        df.loc[idx, f'lag_{lag}'] = stats['mean']

                    # Use historical statistics for rolling features
                    for window in self.rolling_windows:
                        df.loc[idx, f'rolling_mean_{window}'] = stats['mean']
                        df.loc[idx, f'rolling_std_{window}'] = stats['std']
                        df.loc[idx, f'rolling_median_{window}'] = stats['median']

                    for span in self.ewm_spans:
                        df.loc[idx, f'ewm_{span}'] = stats['mean']

                    df.loc[idx, 'sales_momentum'] = stats['trend']
                    df.loc[idx, 'sales_acceleration'] = 0

        # Clean up
        if self.drop_na and has_sales:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            df = df.dropna(subset=numeric_cols).reset_index(drop=True)

        return df

class StoreSpecificFeatures(BaseEstimator, TransformerMixin):
    """
    Store and department specific features
    """
    def __init__(self):
        self.store_stats_ = {}
        self.dept_stats_ = {}
        self.store_dept_stats_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        has_sales = 'Weekly_Sales' in df.columns

        if has_sales:
            # Store-level statistics
            store_groups = df.groupby('Store')['Weekly_Sales']
            self.store_stats_ = {
                'mean': store_groups.mean().to_dict(),
                'std': store_groups.std().to_dict(),
                'median': store_groups.median().to_dict(),
                'volume': store_groups.count().to_dict()
            }

            # Department-level statistics
            dept_groups = df.groupby('Dept')['Weekly_Sales']
            self.dept_stats_ = {
                'mean': dept_groups.mean().to_dict(),
                'std': dept_groups.std().to_dict(),
                'median': dept_groups.median().to_dict(),
                'volume': dept_groups.count().to_dict()
            }

            # Store-Department level statistics
            store_dept_groups = df.groupby(['Store', 'Dept'])['Weekly_Sales']
            self.store_dept_stats_ = {
                'mean': store_dept_groups.mean().to_dict(),
                'std': store_dept_groups.std().to_dict(),
                'volume': store_dept_groups.count().to_dict()
            }

        return self

    def transform(self, X):
        df = X.copy()

        # Store performance indicators
        df['store_avg_sales'] = df['Store'].map(self.store_stats_.get('mean', {})).fillna(0)
        df['store_sales_volatility'] = df['Store'].map(self.store_stats_.get('std', {})).fillna(0)
        df['store_volume'] = df['Store'].map(self.store_stats_.get('volume', {})).fillna(0)

        # Department performance indicators
        df['dept_avg_sales'] = df['Dept'].map(self.dept_stats_.get('mean', {})).fillna(0)
        df['dept_sales_volatility'] = df['Dept'].map(self.dept_stats_.get('std', {})).fillna(0)
        df['dept_volume'] = df['Dept'].map(self.dept_stats_.get('volume', {})).fillna(0)

        # Store-Department specific features
        df['store_dept_key'] = list(zip(df['Store'], df['Dept']))
        df['store_dept_avg_sales'] = df['store_dept_key'].map(self.store_dept_stats_.get('mean', {})).fillna(0)
        df['store_dept_volatility'] = df['store_dept_key'].map(self.store_dept_stats_.get('std', {})).fillna(0)

        # Performance ratios
        df['store_vs_overall_ratio'] = df['store_avg_sales'] / (df['store_avg_sales'].mean() + 1e-8)
        df['dept_vs_overall_ratio'] = df['dept_avg_sales'] / (df['dept_avg_sales'].mean() + 1e-8)

        # Size-normalized features
        df['sales_per_sqft'] = df['store_avg_sales'] / (df['Size'] + 1e-8)
        df['dept_penetration'] = df['dept_volume'] / (df['store_volume'] + 1e-8)

        # Store type interactions
        df['type_size_interaction'] = df['Type'] * np.log1p(df['Size'])

        # Clean up
        df = df.drop(columns=['store_dept_key'], errors='ignore')

        return df

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

class EconomicInteractionFeatures(BaseEstimator, TransformerMixin):
    """
    Economic indicators and their interactions.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Temperature-based features
        if 'Temperature' in df.columns:
            df['temp_comfort_zone'] = ((df['Temperature'] >= 15) & (df['Temperature'] <= 25)).astype(int)
            df['temp_too_hot'] = (df['Temperature'] > 30).astype(int)
            df['temp_too_cold'] = (df['Temperature'] < 5).astype(int)
            if 'Month' in df.columns:
                df['temp_month_interaction'] = df['Temperature'] * df['Month']
            df['weather_shopping_boost'] = np.where(
                (df['Temperature'] < 0) | (df['Temperature'] > 35), 1, 0
            )

        # Fuel price interactions
        if 'Fuel_Price' in df.columns:
            df['fuel_price_high'] = (df['Fuel_Price'] > df['Fuel_Price'].quantile(0.75)).astype(int)
            df['fuel_price_low'] = (df['Fuel_Price'] < df['Fuel_Price'].quantile(0.25)).astype(int)
            if 'Size' in df.columns:
                df['fuel_size_interaction'] = df['Fuel_Price'] * np.log1p(df['Size'])

        # Economic pressure indicators
        if 'CPI' in df.columns and 'Unemployment' in df.columns:
            df['economic_pressure'] = df['CPI'] * df['Unemployment']
            df['economic_stability'] = 1 / (1 + df['economic_pressure'])
            df['purchasing_power'] = df['CPI'] / (df['Unemployment'] + 1e-8)

        # Markdown effectiveness
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        existing_markdowns = [col for col in markdown_cols if col in df.columns]
        if existing_markdowns:
            df['total_markdown'] = df[existing_markdowns].sum(axis=1)
            df['markdown_count'] = (df[existing_markdowns] > 0).sum(axis=1)
            df['avg_markdown'] = df['total_markdown'] / (df['markdown_count'] + 1e-8)
            if 'Type' in df.columns:
                df['markdown_type_interaction'] = df['total_markdown'] * df['Type']
            if 'IsHoliday' in df.columns:
                df['holiday_markdown_boost'] = df['IsHoliday'] * df['total_markdown']

        return df


class AdvancedDateFeatures(BaseEstimator, TransformerMixin):
    """
    Advanced date-based features
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['Date'] = pd.to_datetime(df['Date'])

        # Paycheck cycles (bi-weekly and monthly)
        df['is_paycheck_week'] = ((df['Date'].dt.day <= 7) |
                                  ((df['Date'].dt.day >= 14) & (df['Date'].dt.day <= 21))).astype(int)
        df['is_month_end'] = (df['Date'].dt.day >= 25).astype(int)
        df['is_month_start'] = (df['Date'].dt.day <= 7).astype(int)

        # School calendar effects
        df['is_school_week'] = ((df['Date'].dt.month >= 9) |
                                (df['Date'].dt.month <= 5)).astype(int)
        df['is_summer_break'] = ((df['Date'].dt.month >= 6) &
                                 (df['Date'].dt.month <= 8)).astype(int)

        # Tax season
        df['is_tax_season'] = ((df['Date'].dt.month >= 1) &
                               (df['Date'].dt.month <= 4)).astype(int)

        # Weekend proximity
        df['days_to_weekend'] = 6 - df['Date'].dt.dayofweek
        df['days_from_weekend'] = df['Date'].dt.dayofweek

        # Seasonal shopping patterns
        df['is_spring_shopping'] = ((df['Date'].dt.month >= 3) &
                                    (df['Date'].dt.month <= 5)).astype(int)
        df['is_summer_shopping'] = ((df['Date'].dt.month >= 6) &
                                    (df['Date'].dt.month <= 8)).astype(int)
        df['is_fall_shopping'] = ((df['Date'].dt.month >= 9) &
                                  (df['Date'].dt.month <= 11)).astype(int)
        df['is_winter_shopping'] = ((df['Date'].dt.month == 12) |
                                    (df['Date'].dt.month <= 2)).astype(int)

        return df

# Clean and feature engineer

# ML Flow setup


In [None]:
%pip install -q dagshub mlflow
import mlflow
import dagshub

dagshub.init(repo_owner='nkhar21', repo_name='ML_Final_Project', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/nkhar21/ML_Final_Project.mlflow")

experiment_name = "LightGBM_Training"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/e2916fe29db043179a31a72c4aa3e0dd', creation_time=1751883875960, experiment_id='5', last_update_time=1751883875960, lifecycle_stage='active', name='LightGBM_Training', tags={}>

# Logging fe and cleaning

In [None]:
from sklearn.pipeline import Pipeline
clean_pipeline = Pipeline([
    ('merge', BaseMerger(features, stores)),
    ('fillna', MissingValueFiller()),
    ('label_encode', CategoricalEncoder()),
])

fe_pipeline = Pipeline([
    ('feature_add', FeatureAdder()),
    ('store_features', StoreSpecificFeatures()),
    ('improved_lags', ImprovedLagFeatureTransformer(
        lags=[1, 2, 3, 4, 8, 12, 52],
        rolling_windows=[2, 4, 8, 12, 26],
        ewm_spans=[4, 8, 12],
        drop_na=True
    )),
])

train_df_clean = clean_pipeline.fit_transform(train)
test_df_clean = clean_pipeline.transform(test)


train_df = fe_pipeline.fit_transform(train_df_clean)
test_df = fe_pipeline.transform(test_df_clean)

train_df.columns, train_df.shape, test_df.columns, test_df.shape

(Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Temperature',
        'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
        'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Day', 'Month',
        'Year', 'SuperbowlWeek', 'LaborDayWeek', 'ThanksgivingWeek',
        'ChristmasWeek', 'Days_to_Thanksgiving', 'Days_to_Christmas',
        'store_avg_sales', 'store_sales_volatility', 'store_volume',
        'dept_avg_sales', 'dept_sales_volatility', 'dept_volume',
        'store_dept_avg_sales', 'store_dept_volatility',
        'store_vs_overall_ratio', 'dept_vs_overall_ratio', 'sales_per_sqft',
        'dept_penetration', 'type_size_interaction', 'lag_1', 'lag_2', 'lag_3',
        'lag_4', 'lag_8', 'lag_12', 'lag_52', 'rolling_mean_2', 'rolling_std_2',
        'rolling_median_2', 'rolling_mean_4', 'rolling_std_4',
        'rolling_median_4', 'rolling_mean_8', 'rolling_std_8',
        'rolling_median_8', 'rolling_mean_12', 'rolling_std_12',
        'roll

# Log cleaning and fe

In [None]:
import pickle
import mlflow

# Ensure you’ve initialized DagsHub & MLflow tracking URI already:
# dagshub.init(...); mlflow.set_tracking_uri(...)
mlflow.set_experiment("LightGBM_Training")

# 1️⃣ Log the cleaning pipeline
with mlflow.start_run(run_name="clean_pipeline_registration"):
    mlflow.set_tag("phase", "cleaning")
    mlflow.log_params({
        "clean__steps": [name for name, _ in clean_pipeline.steps],
    })
    # Serialize & log artifact
    with open("clean_pipeline.pkl", "wb") as f:
        pickle.dump(clean_pipeline, f)
    mlflow.log_artifact("clean_pipeline.pkl", artifact_path="pipelines")
    print(f"Logged clean_pipeline in run {mlflow.active_run().info.run_id}")

# 2️⃣ Log the feature‑engineering pipeline
with mlflow.start_run(run_name="fe_pipeline_registration"):
    mlflow.set_tag("phase", "feature_engineering")
    # Log the ImprovedLagFeatureTransformer parameters:
    lag_params = clean_pipeline.named_steps  # reuse for demonstration, adjust below
    il = fe_pipeline.named_steps["improved_lags"]
    mlflow.log_params({
        "fe__steps": [name for name, _ in fe_pipeline.steps],
        "lags": il.lags,
        "rolling_windows": il.rolling_windows,
        "ewm_spans": il.ewm_spans,
        "drop_na": il.drop_na
    })
    # Serialize & log artifact
    with open("fe_pipeline.pkl", "wb") as f:
        pickle.dump(fe_pipeline, f)
    mlflow.log_artifact("fe_pipeline.pkl", artifact_path="pipelines")
    print(f"Logged fe_pipeline in run {mlflow.active_run().info.run_id}")


Logged clean_pipeline in run 247aea9eb24e4c5fb6363e116060c450
🏃 View run clean_pipeline_registration at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/247aea9eb24e4c5fb6363e116060c450
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
Logged fe_pipeline in run 887a7e79d45a4b228e22277945b0c84c
🏃 View run fe_pipeline_registration at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/887a7e79d45a4b228e22277945b0c84c
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Helper functions

In [None]:
%pip install lightgbm
import mlflow
import mlflow.lightgbm
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Helpers ---

def log_lgb_params(model):
    """Pull key hyperparameters off the fitted LightGBM model."""
    p = model.get_params()
    mlflow.log_params({
        "n_estimators":    p.get("n_estimators"),
        "max_depth":       p.get("max_depth"),
        "learning_rate":   p.get("learning_rate"),
        "num_leaves":      p.get("num_leaves"),
        "subsample":       p.get("subsample"),
        "colsample_bytree":p.get("colsample_bytree") or p.get("feature_fraction"),
        "reg_alpha":       p.get("reg_alpha") or p.get("lambda_l1"),
        "reg_lambda":      p.get("reg_lambda") or p.get("lambda_l2"),
        "objective":       p.get("objective"),
        "random_state":    p.get("random_state"),
    })

def evaluate(model, X, y, weights, split):
    preds = model.predict(X)
    wmae = (weights * np.abs(y - preds)).sum() / weights.sum()
    mae  = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2   = r2_score(y, preds)
    mask = y != 0
    mape = np.mean(np.abs((y[mask] - preds[mask]) / y[mask])) * 100

    mlflow.log_metrics({
        f"{split}_WMAE": wmae,
        f"{split}_MAE":  mae,
        f"{split}_RMSE": rmse,
        f"{split}_R2":   r2,
        f"{split}_MAPE": mape
    })
    print(f"[{split.upper()}] WMAE={wmae:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}, MAPE={mape:.2f}%")
    return {"WMAE": wmae, "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE": mape}


# --- Data Prep ---

train_df = train_df.drop(columns=['Date'])
test_df  = test_df.drop(columns=['Date'])

y = train_df["Weekly_Sales"]
X = train_df.drop(columns=["Weekly_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build numeric holiday weights
w_train = np.where(X_train["IsHoliday"] == 1, 5, 1)
w_val   = np.where(X_val  ["IsHoliday"] == 1, 5, 1)






KeyError: "['Date'] not found in axis"

# Training 1 - basic

In [None]:
mlflow.set_experiment("LightGBM_Training")
with mlflow.start_run(run_name="LightGBM_Regressor_1"):
    # Enable autologging for LightGBM
    mlflow.lightgbm.autolog()

    # Instantiate your model
    model = lgb.LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        lambda_l1=0.0,
        lambda_l2=0.0,
        objective="regression",
        random_state=42,
        n_jobs=-1
    )

    # Log key hyperparameters manually
    log_lgb_params(model)

    # Fit
    model.fit(X_train, y_train)

    # Evaluate
    train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
    val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")

    # Log overfit deltas
    delta_metrics = {
        f"delta_{m}": train_metrics[m] - val_metrics[m]
        for m in train_metrics
    }
    mlflow.log_metrics(delta_metrics)
    print("Overfitting deltas:", delta_metrics)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.129873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10892
[LightGBM] [Info] Number of data points in the train set: 208866, number of used features: 63
[LightGBM] [Info] Start training from score 16377.758634




[TRAIN] WMAE=1235.7402, MAE=1178.5009, RMSE=2269.9513, R²=0.9900, MAPE=352.98%
[VAL] WMAE=1352.9665, MAE=1272.5256, RMSE=2668.9117, R²=0.9865, MAPE=368.91%
Overfitting deltas: {'delta_WMAE': np.float64(-117.22625357623588), 'delta_MAE': -94.02469010453592, 'delta_RMSE': np.float64(-398.96036791683673), 'delta_R2': 0.0035747274809675433, 'delta_MAPE': np.float64(-15.931670676815884)}
🏃 View run LightGBM_Regressor_1 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/7159faf8dc4a4356bd14b85fa7060e23
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Training 2 - using xgb model's best params

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb

mlflow.set_experiment("LightGBM_Training")

with mlflow.start_run(run_name="LGBM_best_xgb_translated"):
    mlflow.lightgbm.autolog()

    model = lgb.LGBMRegressor(
        n_estimators=1600,
        max_depth=8,
        num_leaves=255,             # 2^8 - 1 (good default for max_depth=8)
        learning_rate=0.15,
        subsample=0.8,
        colsample_bytree=0.8,
        lambda_l1=0.0,
        lambda_l2=0.0,
        objective="regression",
        random_state=42,
        n_jobs=-1
    )

    # Log all hyperparameters from the model itself
    mlflow.log_params(model.get_params())

    model.fit(X_train, y_train)

    train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
    val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")

    delta_metrics = {
        f"delta_{m}": train_metrics[m] - val_metrics[m]
        for m in train_metrics
    }
    mlflow.log_metrics(delta_metrics)
    print("Overfitting deltas:", delta_metrics)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.129592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10892
[LightGBM] [Info] Number of data points in the train set: 208866, number of used features: 63
[LightGBM] [Info] Start training from score 16377.758634








[TRAIN] WMAE=366.6381, MAE=370.7858, RMSE=540.4227, R²=0.9994, MAPE=97.08%
[VAL] WMAE=1179.4867, MAE=1100.1449, RMSE=2476.6000, R²=0.9883, MAPE=236.12%
Overfitting deltas: {'delta_WMAE': np.float64(-812.8486737392232), 'delta_MAE': -729.3590198980672, 'delta_RMSE': np.float64(-1936.1772636459066), 'delta_R2': 0.011095629088020531, 'delta_MAPE': np.float64(-139.04178919342294)}
🏃 View run LGBM_best_xgb_translated at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/e213d193d85c44dfa053a7e7290ddaa6
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Training 3 - Tune prev params, add L1/L2. +early stoppping

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb

mlflow.set_experiment("LightGBM_Training")
with mlflow.start_run(run_name="LGBM_regularized_earlystop"):
    mlflow.lightgbm.autolog()

    model = lgb.LGBMRegressor(
        n_estimators=2000,
        max_depth=8,
        num_leaves=63,
        learning_rate=0.15,
        subsample=0.8,
        colsample_bytree=0.8,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=5,
        lambda_l1=0.1,
        lambda_l2=1.0,
        min_child_samples=20,
        min_child_weight=0.001,
        objective="regression",
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50   # ← moved here
    )

    # Log exactly those params from the model
    mlflow.log_params(model.get_params())

    # Fit without the unsupported argument
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mae',
    )

    train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
    val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")

    delta_metrics = {
        f"delta_{k}": train_metrics[k] - val_metrics[k]
        for k in train_metrics
    }
    mlflow.log_metrics(delta_metrics)
    print("Overfitting deltas:", delta_metrics)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144755 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10892
[LightGBM] [Info] Number of data points in the train set: 208866, number of used features: 63
[LightGBM] [Info] Start training from score 16377.758634
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1256]	valid_0's l1: 1140.36	valid_0's l2: 6.01708e+06








[TRAIN] WMAE=645.6070, MAE=643.7332, RMSE=969.9724, R²=0.9982, MAPE=172.77%
[VAL] WMAE=1208.9881, MAE=1140.3640, RMSE=2452.9737, R²=0.9886, MAPE=314.49%
Overfitting deltas: {'delta_WMAE': np.float64(-563.3810491387666), 'delta_MAE': -496.630802839219, 'delta_RMSE': np.float64(-1483.0012383065807), 'delta_R2': 0.009619225854654134, 'delta_MAPE': np.float64(-141.72124837986982)}
🏃 View run LGBM_regularized_earlystop at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/287aef3994fd4516b0f2e40e4e1769e9
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Training 4 - Using simpler feature selection

In [None]:
cleaning_pipeline = Pipeline([
    ("merge",       BaseMerger(features, stores)),     # joins store + feature tables
    ("fillna",      MissingValueFiller()),             # impute mark‑downs, CPI, Unemployment
    ("label_encode", CategoricalEncoder()),            # encode Type, IsHoliday
])
feature_pipeline = Pipeline([
    ("feature_add",  FeatureAdder()),
    ("lags",         LagFeatureTransformer(
                        lags=[1,2,3,4],
                        rolling_windows=[4,8],
                    )),
])


cleaned = cleaning_pipeline.fit_transform(train)
train_df = feature_pipeline.fit_transform(cleaned)

# train_df = train_df.drop(columns=['Date'])
# test_df  = test_df.drop(columns=['Date'])

y = train_df["Weekly_Sales"]
X = train_df.drop(columns=["Weekly_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build numeric holiday weights
w_train = np.where(X_train["IsHoliday"] == 1, 5, 1)
w_val   = np.where(X_val  ["IsHoliday"] == 1, 5, 1)

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb

mlflow.set_experiment("LightGBM_Training")
with mlflow.start_run(run_name="LGBM_regularized_earlystop"):
    mlflow.lightgbm.autolog()

    model = lgb.LGBMRegressor(
        n_estimators=2000,
        max_depth=8,
        num_leaves=63,
        learning_rate=0.15,
        subsample=0.8,
        colsample_bytree=0.8,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=5,
        lambda_l1=0.1,
        lambda_l2=1.0,
        min_child_samples=20,
        min_child_weight=0.001,
        objective="regression",
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50   # ← moved here
    )

    # Log exactly those params from the model
    mlflow.log_params(model.get_params())

    # Fit without the unsupported argument
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mae',
    )

    train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
    val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")

    delta_metrics = {
        f"delta_{k}": train_metrics[k] - val_metrics[k]
        for k in train_metrics
    }
    mlflow.log_metrics(delta_metrics)
    print("Overfitting deltas:", delta_metrics)




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.123563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1076]	valid_0's l1: 575.435	valid_0's l2: 3.06544e+06








[TRAIN] WMAE=410.8493, MAE=397.2446, RMSE=710.1886, R²=0.9990, MAPE=177.04%
[VAL] WMAE=644.5178, MAE=575.4347, RMSE=1750.8390, R²=0.9939, MAPE=139.17%
Overfitting deltas: {'delta_WMAE': np.float64(-233.66852836006592), 'delta_MAE': -178.190142034536, 'delta_RMSE': np.float64(-1040.6503893098463), 'delta_R2': 0.005130852900313676, 'delta_MAPE': np.float64(37.87213649197412)}
🏃 View run LGBM_regularized_earlystop at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/cceee835c7224e01ae3a69745965ce55
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Training 5 - keep simple FE, Tune hyperparams

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb
import itertools
import numpy as np

mlflow.set_experiment("LightGBM_Training")

param_grid = {
    "learning_rate":   [0.05, 0.1, 0.15],
    "num_leaves":      [31, 63, 127],
    "subsample":       [0.8],
    "feature_fraction":[0.7],
    "lambda_l1":       [0.1],
    "lambda_l2":       [1.0],
}

# Generate all combinations
combinations = list(itertools.product(
    param_grid["learning_rate"],
    param_grid["num_leaves"],
    param_grid["subsample"],
    param_grid["feature_fraction"],
    param_grid["lambda_l1"],
    param_grid["lambda_l2"]
))

for lr, nl, ss, ff, l1, l2 in combinations:
    run_name = f"LR{lr}_NL{nl}"
    with mlflow.start_run(run_name=run_name):
        mlflow.lightgbm.autolog()

        model = lgb.LGBMRegressor(
            objective="regression",
            n_estimators=2000,
            max_depth=8,
            num_leaves=nl,
            learning_rate=lr,
            subsample=ss,
            colsample_bytree=0.8,
            feature_fraction=ff,
            bagging_fraction=0.7,
            bagging_freq=5,
            lambda_l1=l1,
            lambda_l2=l2,
            min_child_samples=20,
            min_child_weight=0.001,
            random_state=42,
            n_jobs=-1,
            early_stopping_rounds=50
        )

        # Log tuned parameters
        mlflow.log_params({
            "learning_rate":    lr,
            "num_leaves":       nl,
            "subsample":        ss,
            "feature_fraction": ff,
            "lambda_l1":        l1,
            "lambda_l2":        l2,
        })

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="mae",
        )

        # Evaluate model
        train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
        val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")
        delta_metrics = {
            f"delta_{k}": train_metrics[k] - val_metrics[k]
            for k in train_metrics
        }

        # Log all metrics
        mlflow.log_metrics({
            **{f"train_{k}": v for k, v in train_metrics.items()},
            **{f"val_{k}":   v for k, v in val_metrics.items()},
            **delta_metrics
        })

        print(
            f"{run_name} → "
            f"train_WMAE={train_metrics['WMAE']:.1f}, "
            f"val_WMAE={val_metrics['WMAE']:.1f}, "
            f"ΔWMAE={delta_metrics['delta_WMAE']:.1f}"
        )


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.222429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 673.05	valid_0's l2: 3.01052e+06




[TRAIN] WMAE=615.0505, MAE=585.6022, RMSE=1085.1327, R²=0.9977, MAPE=325.37%
[VAL] WMAE=733.6107, MAE=673.0505, RMSE=1735.0841, R²=0.9940, MAPE=250.51%
LR0.05_NL31 → train_WMAE=615.1, val_WMAE=733.6, ΔWMAE=-118.6
🏃 View run LR0.05_NL31 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/6f98475ab1ed45f9848e47542781d0de
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.105015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 602.092	valid_0's l2: 2.88512e+06




[TRAIN] WMAE=483.6607, MAE=466.9330, RMSE=848.3610, R²=0.9986, MAPE=230.83%
[VAL] WMAE=660.7741, MAE=602.0921, RMSE=1698.5649, R²=0.9943, MAPE=191.52%
LR0.05_NL63 → train_WMAE=483.7, val_WMAE=660.8, ΔWMAE=-177.1
🏃 View run LR0.05_NL63 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/e220602928054a02836ef92f7a1726b1
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1875]	valid_0's l1: 594.798	valid_0's l2: 2.88887e+06




[TRAIN] WMAE=450.3083, MAE=437.1740, RMSE=793.7051, R²=0.9988, MAPE=203.57%
[VAL] WMAE=653.3463, MAE=594.7980, RMSE=1699.6679, R²=0.9943, MAPE=162.70%
LR0.05_NL127 → train_WMAE=450.3, val_WMAE=653.3, ΔWMAE=-203.0
🏃 View run LR0.05_NL127 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/dcd7281ce10d45ddb9ed66c9db6a2064
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1181]	valid_0's l1: 657.73	valid_0's l2: 3.17048e+06








[TRAIN] WMAE=585.7332, MAE=558.5920, RMSE=1036.1621, R²=0.9979, MAPE=269.92%
[VAL] WMAE=722.3211, MAE=657.7296, RMSE=1780.5843, R²=0.9937, MAPE=200.97%
LR0.1_NL31 → train_WMAE=585.7, val_WMAE=722.3, ΔWMAE=-136.6
🏃 View run LR0.1_NL31 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/0c601c78e06f44459c5665a58b2c4a55
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102858 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[819]	valid_0's l1: 641.86	valid_0's l2: 3.04777e+06








[TRAIN] WMAE=543.5161, MAE=519.4933, RMSE=963.5376, R²=0.9982, MAPE=253.99%
[VAL] WMAE=710.0083, MAE=641.8600, RMSE=1745.7857, R²=0.9939, MAPE=202.20%
LR0.1_NL63 → train_WMAE=543.5, val_WMAE=710.0, ΔWMAE=-166.5
🏃 View run LR0.1_NL63 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/08e69b3393814eeca1de8f7bd2881b54
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.103364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[656]	valid_0's l1: 655.874	valid_0's l2: 3.11812e+06








[TRAIN] WMAE=551.0775, MAE=524.8353, RMSE=999.1824, R²=0.9981, MAPE=271.29%
[VAL] WMAE=726.7732, MAE=655.8740, RMSE=1765.8206, R²=0.9938, MAPE=205.01%
LR0.1_NL127 → train_WMAE=551.1, val_WMAE=726.8, ΔWMAE=-175.7
🏃 View run LR0.1_NL127 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/2395a0b22dbc46adbcc1bf7d146cee7c
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.132345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[974]	valid_0's l1: 631.312	valid_0's l2: 3.05421e+06




[TRAIN] WMAE=544.4035, MAE=520.3121, RMSE=966.9475, R²=0.9982, MAPE=286.20%
[VAL] WMAE=699.1601, MAE=631.3118, RMSE=1747.6291, R²=0.9939, MAPE=211.07%
LR0.15_NL31 → train_WMAE=544.4, val_WMAE=699.2, ΔWMAE=-154.8
🏃 View run LR0.15_NL31 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/54c626faeedc464e9a77752c54dad37a
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1076]	valid_0's l1: 575.435	valid_0's l2: 3.06544e+06








[TRAIN] WMAE=410.8493, MAE=397.2446, RMSE=710.1886, R²=0.9990, MAPE=177.04%
[VAL] WMAE=644.5178, MAE=575.4347, RMSE=1750.8390, R²=0.9939, MAPE=139.17%
LR0.15_NL63 → train_WMAE=410.8, val_WMAE=644.5, ΔWMAE=-233.7
🏃 View run LR0.15_NL63 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/ec24376cbc944bc39c354c0f747990af
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[475]	valid_0's l1: 665.874	valid_0's l2: 3.42367e+06








[TRAIN] WMAE=546.1583, MAE=520.5180, RMSE=995.2850, R²=0.9981, MAPE=229.95%
[VAL] WMAE=739.5956, MAE=665.8743, RMSE=1850.3149, R²=0.9932, MAPE=180.62%
LR0.15_NL127 → train_WMAE=546.2, val_WMAE=739.6, ΔWMAE=-193.4
🏃 View run LR0.15_NL127 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/8570aa3b8d2f485493568a6a12cf8872
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Training 6-  more tuning

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb
import itertools
import numpy as np

mlflow.set_experiment("LightGBM_Training")

param_grid = {
    "learning_rate":   [0.05, 0.1, 0.15],
    "num_leaves":      [63, 100],
    "subsample":       [0.8, 0.7],
    "feature_fraction":[0.7, 0.5],
    "lambda_l1":       [0.1],
    "lambda_l2":       [1.0],
}

# Generate all combinations
combinations = list(itertools.product(
    param_grid["learning_rate"],
    param_grid["num_leaves"],
    param_grid["subsample"],
    param_grid["feature_fraction"],
    param_grid["lambda_l1"],
    param_grid["lambda_l2"]
))

for lr, nl, ss, ff, l1, l2 in combinations:
    run_name = f"LR{lr}_NL{nl}_SS{ss}_ff{ff}"
    with mlflow.start_run(run_name=run_name):
        mlflow.lightgbm.autolog()

        model = lgb.LGBMRegressor(
            objective="regression",
            n_estimators=2000,
            max_depth=8,
            num_leaves=nl,
            learning_rate=lr,
            subsample=ss,
            colsample_bytree=0.8,
            feature_fraction=ff,
            bagging_fraction=0.7,
            bagging_freq=5,
            lambda_l1=l1,
            lambda_l2=l2,
            min_child_samples=20,
            min_child_weight=0.001,
            random_state=42,
            n_jobs=-1,
            early_stopping_rounds=50
        )

        # Log tuned parameters
        mlflow.log_params({
            "learning_rate":    lr,
            "num_leaves":       nl,
            "subsample":        ss,
            "feature_fraction": ff,
            "lambda_l1":        l1,
            "lambda_l2":        l2,
        })

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="mae",
        )

        # Evaluate model
        train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
        val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")
        delta_metrics = {
            f"delta_{k}": train_metrics[k] - val_metrics[k]
            for k in train_metrics
        }

        # Log all metrics
        mlflow.log_metrics({
            **{f"train_{k}": v for k, v in train_metrics.items()},
            **{f"val_{k}":   v for k, v in val_metrics.items()},
            **delta_metrics
        })

        print(
            f"{run_name} → "
            f"train_WMAE={train_metrics['WMAE']:.1f}, "
            f"val_WMAE={val_metrics['WMAE']:.1f}, "
            f"ΔWMAE={delta_metrics['delta_WMAE']:.1f}"
        )


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 602.092	valid_0's l2: 2.88512e+06








[TRAIN] WMAE=483.6607, MAE=466.9330, RMSE=848.3610, R²=0.9986, MAPE=230.83%
[VAL] WMAE=660.7741, MAE=602.0921, RMSE=1698.5649, R²=0.9943, MAPE=191.52%
LR0.05_NL63_SS0.8_ff0.7 → train_WMAE=483.7, val_WMAE=660.8, ΔWMAE=-177.1
🏃 View run LR0.05_NL63_SS0.8_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/cb71cae91f8847d287843398397ae197
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.166124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 651.664	valid_0's l2: 2.88892e+06








[TRAIN] WMAE=537.3519, MAE=517.1081, RMSE=923.5327, R²=0.9984, MAPE=305.32%
[VAL] WMAE=715.4228, MAE=651.6638, RMSE=1699.6818, R²=0.9943, MAPE=216.74%
LR0.05_NL63_SS0.8_ff0.5 → train_WMAE=537.4, val_WMAE=715.4, ΔWMAE=-178.1
🏃 View run LR0.05_NL63_SS0.8_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/c69ad6c25cd9400c8f298939e837f764
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 602.092	valid_0's l2: 2.88512e+06








[TRAIN] WMAE=483.6607, MAE=466.9330, RMSE=848.3610, R²=0.9986, MAPE=230.83%
[VAL] WMAE=660.7741, MAE=602.0921, RMSE=1698.5649, R²=0.9943, MAPE=191.52%
LR0.05_NL63_SS0.7_ff0.7 → train_WMAE=483.7, val_WMAE=660.8, ΔWMAE=-177.1
🏃 View run LR0.05_NL63_SS0.7_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/0ebe7ce0956e4884a85b2fe83ff653bd
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 651.664	valid_0's l2: 2.88892e+06








[TRAIN] WMAE=537.3519, MAE=517.1081, RMSE=923.5327, R²=0.9984, MAPE=305.32%
[VAL] WMAE=715.4228, MAE=651.6638, RMSE=1699.6818, R²=0.9943, MAPE=216.74%
LR0.05_NL63_SS0.7_ff0.5 → train_WMAE=537.4, val_WMAE=715.4, ΔWMAE=-178.1
🏃 View run LR0.05_NL63_SS0.7_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/0d1911ec1f7c46d7b12ddd9d44c4ebfa
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1860]	valid_0's l1: 597.885	valid_0's l2: 2.91292e+06








[TRAIN] WMAE=459.3241, MAE=445.2028, RMSE=811.0737, R²=0.9987, MAPE=213.34%
[VAL] WMAE=659.3048, MAE=597.8850, RMSE=1706.7265, R²=0.9942, MAPE=164.27%
LR0.05_NL100_SS0.8_ff0.7 → train_WMAE=459.3, val_WMAE=659.3, ΔWMAE=-200.0
🏃 View run LR0.05_NL100_SS0.8_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/5986f053bb3748b7a695ef4e7aac8426
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 635.679	valid_0's l2: 2.88103e+06








[TRAIN] WMAE=488.7790, MAE=473.3007, RMSE=844.3078, R²=0.9986, MAPE=249.87%
[VAL] WMAE=701.9771, MAE=635.6787, RMSE=1697.3598, R²=0.9943, MAPE=197.49%
LR0.05_NL100_SS0.8_ff0.5 → train_WMAE=488.8, val_WMAE=702.0, ΔWMAE=-213.2
🏃 View run LR0.05_NL100_SS0.8_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/043d74f250a94849bc31ed0427738cdc
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1860]	valid_0's l1: 597.885	valid_0's l2: 2.91292e+06




[TRAIN] WMAE=459.3241, MAE=445.2028, RMSE=811.0737, R²=0.9987, MAPE=213.34%
[VAL] WMAE=659.3048, MAE=597.8850, RMSE=1706.7265, R²=0.9942, MAPE=164.27%
LR0.05_NL100_SS0.7_ff0.7 → train_WMAE=459.3, val_WMAE=659.3, ΔWMAE=-200.0
🏃 View run LR0.05_NL100_SS0.7_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/e6be3ac10446402eb427c6d26ab2f1c2
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.111277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1







[TRAIN] WMAE=488.7790, MAE=473.3007, RMSE=844.3078, R²=0.9986, MAPE=249.87%
[VAL] WMAE=701.9771, MAE=635.6787, RMSE=1697.3598, R²=0.9943, MAPE=197.49%
LR0.05_NL100_SS0.7_ff0.5 → train_WMAE=488.8, val_WMAE=702.0, ΔWMAE=-213.2
🏃 View run LR0.05_NL100_SS0.7_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/58e2c8bfa6f4452ab836defeabdd9c07
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[819]	valid_0's l1: 641.86	valid_0's l2: 3.04777e+06








[TRAIN] WMAE=543.5161, MAE=519.4933, RMSE=963.5376, R²=0.9982, MAPE=253.99%
[VAL] WMAE=710.0083, MAE=641.8600, RMSE=1745.7857, R²=0.9939, MAPE=202.20%
LR0.1_NL63_SS0.8_ff0.7 → train_WMAE=543.5, val_WMAE=710.0, ΔWMAE=-166.5
🏃 View run LR0.1_NL63_SS0.8_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/758eb3f4a0eb44bc8c16f36a5e72baea
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1525]	valid_0's l1: 596.986	valid_0's l2: 2.93023e+06




[TRAIN] WMAE=445.1669, MAE=429.6009, RMSE=764.7748, R²=0.9989, MAPE=216.39%
[VAL] WMAE=664.8790, MAE=596.9857, RMSE=1711.7907, R²=0.9942, MAPE=153.77%
LR0.1_NL63_SS0.8_ff0.5 → train_WMAE=445.2, val_WMAE=664.9, ΔWMAE=-219.7
🏃 View run LR0.1_NL63_SS0.8_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/b4622f42f575408ba9b5edb2ce381160
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.158978 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[819]	valid_0's l1: 641.86	valid_0's l2: 3.04777e+06








[TRAIN] WMAE=543.5161, MAE=519.4933, RMSE=963.5376, R²=0.9982, MAPE=253.99%
[VAL] WMAE=710.0083, MAE=641.8600, RMSE=1745.7857, R²=0.9939, MAPE=202.20%
LR0.1_NL63_SS0.7_ff0.7 → train_WMAE=543.5, val_WMAE=710.0, ΔWMAE=-166.5
🏃 View run LR0.1_NL63_SS0.7_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/48849ab9b972425fbcfbf5328dff175f
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.153105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1525]	valid_0's l1: 596.986	valid_0







[TRAIN] WMAE=445.1669, MAE=429.6009, RMSE=764.7748, R²=0.9989, MAPE=216.39%
[VAL] WMAE=664.8790, MAE=596.9857, RMSE=1711.7907, R²=0.9942, MAPE=153.77%
LR0.1_NL63_SS0.7_ff0.5 → train_WMAE=445.2, val_WMAE=664.9, ΔWMAE=-219.7
🏃 View run LR0.1_NL63_SS0.7_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/cfb3c8180a2b4ca397eb18717ebdd7f1
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.319419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1998]	valid_0's l1: 540.423	valid_0's l2: 2.8474e+06








[TRAIN] WMAE=322.6010, MAE=316.4017, RMSE=541.2832, R²=0.9994, MAPE=149.04%
[VAL] WMAE=606.7714, MAE=540.4234, RMSE=1687.4238, R²=0.9943, MAPE=143.92%
LR0.1_NL100_SS0.8_ff0.7 → train_WMAE=322.6, val_WMAE=606.8, ΔWMAE=-284.2
🏃 View run LR0.1_NL100_SS0.8_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/344dfdacf49b49f9a6ce5c84d774ebb9
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 



[TRAIN] WMAE=345.6621, MAE=337.0215, RMSE=580.5178, R²=0.9994, MAPE=187.23%
[VAL] WMAE=643.8496, MAE=567.3194, RMSE=1713.6254, R²=0.9942, MAPE=144.25%
LR0.1_NL100_SS0.8_ff0.5 → train_WMAE=345.7, val_WMAE=643.8, ΔWMAE=-298.2
🏃 View run LR0.1_NL100_SS0.8_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/0505e4a1ae134ee4a77e80e6f06f90f0
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1998]	valid_0's l1: 540.423	valid_0's l2: 2.8474e+06








[TRAIN] WMAE=322.6010, MAE=316.4017, RMSE=541.2832, R²=0.9994, MAPE=149.04%
[VAL] WMAE=606.7714, MAE=540.4234, RMSE=1687.4238, R²=0.9943, MAPE=143.92%
LR0.1_NL100_SS0.7_ff0.7 → train_WMAE=322.6, val_WMAE=606.8, ΔWMAE=-284.2
🏃 View run LR0.1_NL100_SS0.7_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/e2b79493168e4223bb070d7a426a6982
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 







[TRAIN] WMAE=345.6621, MAE=337.0215, RMSE=580.5178, R²=0.9994, MAPE=187.23%
[VAL] WMAE=643.8496, MAE=567.3194, RMSE=1713.6254, R²=0.9942, MAPE=144.25%
LR0.1_NL100_SS0.7_ff0.5 → train_WMAE=345.7, val_WMAE=643.8, ΔWMAE=-298.2
🏃 View run LR0.1_NL100_SS0.7_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/5c4b7613410c4ea79d88d760199b9ec8
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1076]	valid_0's l1: 575.435	valid_0's l2: 3.06544e+06








[TRAIN] WMAE=410.8493, MAE=397.2446, RMSE=710.1886, R²=0.9990, MAPE=177.04%
[VAL] WMAE=644.5178, MAE=575.4347, RMSE=1750.8390, R²=0.9939, MAPE=139.17%
LR0.15_NL63_SS0.8_ff0.7 → train_WMAE=410.8, val_WMAE=644.5, ΔWMAE=-233.7
🏃 View run LR0.15_NL63_SS0.8_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/3b61fb13c7c24840b9693ed08b26a4c2
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[975]	valid_0's l1: 616.248	valid_0's l2: 2.9877e+06




[TRAIN] WMAE=471.6354, MAE=452.9557, RMSE=815.7512, R²=0.9987, MAPE=224.93%
[VAL] WMAE=682.5604, MAE=616.2482, RMSE=1728.4960, R²=0.9941, MAPE=175.87%
LR0.15_NL63_SS0.8_ff0.5 → train_WMAE=471.6, val_WMAE=682.6, ΔWMAE=-210.9
🏃 View run LR0.15_NL63_SS0.8_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/56967c29ec0d43db8c1a9b09b5904686
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.109924 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1076]	valid_0's l1: 575.435	valid_0's l2: 3.06544e+06








[TRAIN] WMAE=410.8493, MAE=397.2446, RMSE=710.1886, R²=0.9990, MAPE=177.04%
[VAL] WMAE=644.5178, MAE=575.4347, RMSE=1750.8390, R²=0.9939, MAPE=139.17%
LR0.15_NL63_SS0.7_ff0.7 → train_WMAE=410.8, val_WMAE=644.5, ΔWMAE=-233.7
🏃 View run LR0.15_NL63_SS0.7_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/f3e2f5855dc845d29906d7fc976acc57
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[975]	valid_0's l1: 616.248	valid_0's l2: 2.9877e+06








[TRAIN] WMAE=471.6354, MAE=452.9557, RMSE=815.7512, R²=0.9987, MAPE=224.93%
[VAL] WMAE=682.5604, MAE=616.2482, RMSE=1728.4960, R²=0.9941, MAPE=175.87%
LR0.15_NL63_SS0.7_ff0.5 → train_WMAE=471.6, val_WMAE=682.6, ΔWMAE=-210.9
🏃 View run LR0.15_NL63_SS0.7_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/ecce0d8f6449491ca26becca31dcdfcd
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[816]	valid_0's l1: 595.925	valid_0's l2: 3.09427e+06




[TRAIN] WMAE=431.9703, MAE=415.6381, RMSE=753.3496, R²=0.9989, MAPE=209.27%
[VAL] WMAE=666.5023, MAE=595.9252, RMSE=1759.0545, R²=0.9938, MAPE=154.89%
LR0.15_NL100_SS0.8_ff0.7 → train_WMAE=432.0, val_WMAE=666.5, ΔWMAE=-234.5
🏃 View run LR0.15_NL100_SS0.8_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/87bb770fef024ba8897ac919b869145d
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[611]	valid_0's l1: 671.136	vali







[TRAIN] WMAE=537.5407, MAE=511.2697, RMSE=937.3590, R²=0.9983, MAPE=274.96%
[VAL] WMAE=748.4455, MAE=671.1357, RMSE=1754.5182, R²=0.9939, MAPE=186.38%
LR0.15_NL100_SS0.8_ff0.5 → train_WMAE=537.5, val_WMAE=748.4, ΔWMAE=-210.9
🏃 View run LR0.15_NL100_SS0.8_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/cbb62fe3957c43d0841963c36059cc35
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120704 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[816]	valid_0's l1: 595.925	vali







[TRAIN] WMAE=431.9703, MAE=415.6381, RMSE=753.3496, R²=0.9989, MAPE=209.27%
[VAL] WMAE=666.5023, MAE=595.9252, RMSE=1759.0545, R²=0.9938, MAPE=154.89%
LR0.15_NL100_SS0.7_ff0.7 → train_WMAE=432.0, val_WMAE=666.5, ΔWMAE=-234.5
🏃 View run LR0.15_NL100_SS0.7_ff0.7 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/a648c8ffa08e4a77a0c4b8d293e77a89
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.189232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[611]	valid_0's l1: 671.136	vali







[TRAIN] WMAE=537.5407, MAE=511.2697, RMSE=937.3590, R²=0.9983, MAPE=274.96%
[VAL] WMAE=748.4455, MAE=671.1357, RMSE=1754.5182, R²=0.9939, MAPE=186.38%
LR0.15_NL100_SS0.7_ff0.5 → train_WMAE=537.5, val_WMAE=748.4, ΔWMAE=-210.9
🏃 View run LR0.15_NL100_SS0.7_ff0.5 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/9427fcf6cf124decab218b6b22c35d9d
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Training 7 - micro tuning best model

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb
import itertools
import numpy as np

mlflow.set_experiment("LightGBM_Training")

param_grid = {
    "lambda_l1":       [0.1, 1, 10],
    "lambda_l2":       [1.0, 10, 40],
}

combinations = list(itertools.product(
    param_grid["lambda_l1"],
    param_grid["lambda_l2"]
))

for l1, l2 in combinations:
    run_name = f"best_l1{l1}_l2{l2}"
    with mlflow.start_run(run_name=run_name):
        mlflow.lightgbm.autolog()

        model = lgb.LGBMRegressor(
            objective="regression",
            n_estimators=2000,
            max_depth=8,
            num_leaves=100,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            feature_fraction=0.7,
            bagging_fraction=0.7,
            bagging_freq=5,
            lambda_l1=l1,
            lambda_l2=l2,
            min_child_samples=20,
            min_child_weight=0.001,
            random_state=42,
            n_jobs=-1,
            early_stopping_rounds=50
        )

        # Log tuned parameters
        mlflow.log_params({
            "learning_rate":    lr,
            "num_leaves":       nl,
            "subsample":        ss,
            "feature_fraction": ff,
            "lambda_l1":        l1,
            "lambda_l2":        l2,
        })

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="mae",
        )

        # Evaluate model
        train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
        val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")
        delta_metrics = {
            f"delta_{k}": train_metrics[k] - val_metrics[k]
            for k in train_metrics
        }

        # Log all metrics
        mlflow.log_metrics({
            **{f"train_{k}": v for k, v in train_metrics.items()},
            **{f"val_{k}":   v for k, v in val_metrics.items()},
            **delta_metrics
        })

        print(
            f"{run_name} → "
            f"train_WMAE={train_metrics['WMAE']:.1f}, "
            f"val_WMAE={val_metrics['WMAE']:.1f}, "
            f"ΔWMAE={delta_metrics['delta_WMAE']:.1f}"
        )




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1998]	valid_0's l1: 540.423	valid_0's l2: 2.8474e+06








[TRAIN] WMAE=322.6010, MAE=316.4017, RMSE=541.2832, R²=0.9994, MAPE=149.04%
[VAL] WMAE=606.7714, MAE=540.4234, RMSE=1687.4238, R²=0.9943, MAPE=143.92%
best_l10.1_l21.0 → train_WMAE=322.6, val_WMAE=606.8, ΔWMAE=-284.2
🏃 View run best_l10.1_l21.0 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/fe6173e8af1949b6bf7f8410ac56378c
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[975]	valid_0's l1: 607.464	valid_0's l2: 2.8313e+06








[TRAIN] WMAE=482.1395, MAE=462.6094, RMSE=864.2011, R²=0.9986, MAPE=245.88%
[VAL] WMAE=671.0289, MAE=607.4641, RMSE=1682.6476, R²=0.9944, MAPE=207.57%
best_l10.1_l210 → train_WMAE=482.1, val_WMAE=671.0, ΔWMAE=-188.9
🏃 View run best_l10.1_l210 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/6f2fb6c3fc3646d4bf659c9b457f65c2
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.110650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[975]	valid_0's l1: 626.173	valid_0's l2: 2.99712e+06








[TRAIN] WMAE=532.6962, MAE=504.9620, RMSE=991.0660, R²=0.9981, MAPE=237.30%
[VAL] WMAE=682.7548, MAE=626.1728, RMSE=1731.2188, R²=0.9940, MAPE=185.86%
best_l10.1_l240 → train_WMAE=532.7, val_WMAE=682.8, ΔWMAE=-150.1
🏃 View run best_l10.1_l240 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/9f4a8e3be6de436d9b0fb994a21a5879
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1174]	valid_0's l1: 582.419	valid_0's l2: 2.9734e+06








[TRAIN] WMAE=422.4970, MAE=408.6441, RMSE=740.4566, R²=0.9989, MAPE=202.41%
[VAL] WMAE=650.6471, MAE=582.4191, RMSE=1724.3553, R²=0.9941, MAPE=176.54%
best_l11_l21.0 → train_WMAE=422.5, val_WMAE=650.6, ΔWMAE=-228.2
🏃 View run best_l11_l21.0 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/7f952d665acd407e976361c2f5995767
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.209964 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[975]	valid_0's l1: 607.465	valid_0's l2: 2.8313e+06








[TRAIN] WMAE=482.1415, MAE=462.6110, RMSE=864.2050, R²=0.9986, MAPE=245.88%
[VAL] WMAE=671.0297, MAE=607.4648, RMSE=1682.6482, R²=0.9944, MAPE=207.57%
best_l11_l210 → train_WMAE=482.1, val_WMAE=671.0, ΔWMAE=-188.9
🏃 View run best_l11_l210 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/8a83c47f48fc4e4793eb6ca0d48a0c0c
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[840]	valid_0's l1: 647.769	valid_0's l2: 3.04265e+06








[TRAIN] WMAE=566.2075, MAE=534.8810, RMSE=1069.9810, R²=0.9978, MAPE=263.48%
[VAL] WMAE=704.8400, MAE=647.7691, RMSE=1744.3197, R²=0.9939, MAPE=200.77%
best_l11_l240 → train_WMAE=566.2, val_WMAE=704.8, ΔWMAE=-138.6
🏃 View run best_l11_l240 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/7becad4c7f904b8c94c699b63cb1e66f
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[817]	valid_0's l1: 625.967	valid_0's l2: 2.96925e+06








[TRAIN] WMAE=501.4066, MAE=481.3727, RMSE=898.8430, R²=0.9985, MAPE=237.70%
[VAL] WMAE=694.2518, MAE=625.9673, RMSE=1723.1519, R²=0.9941, MAPE=199.57%
best_l110_l21.0 → train_WMAE=501.4, val_WMAE=694.3, ΔWMAE=-192.8
🏃 View run best_l110_l21.0 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/3aa7c4543ce446338a35dd2807e2190c
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1090]	valid_0's l1: 596.301	valid_0's l2: 2.88761e+06








[TRAIN] WMAE=460.9424, MAE=443.0362, RMSE=818.5105, R²=0.9987, MAPE=229.78%
[VAL] WMAE=663.1399, MAE=596.3007, RMSE=1699.2977, R²=0.9943, MAPE=182.30%
best_l110_l210 → train_WMAE=460.9, val_WMAE=663.1, ΔWMAE=-202.2
🏃 View run best_l110_l210 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/8418b8f28d5e47629c4bee197c510938
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1080]	valid_0's l1: 615.896	valid_0's l2: 2.97177e+06








[TRAIN] WMAE=513.6040, MAE=488.3010, RMSE=946.3233, R²=0.9983, MAPE=257.88%
[VAL] WMAE=672.1476, MAE=615.8956, RMSE=1723.8812, R²=0.9941, MAPE=211.29%
best_l110_l240 → train_WMAE=513.6, val_WMAE=672.1, ΔWMAE=-158.5
🏃 View run best_l110_l240 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/0c4c2c1398a8409cb47aaac86d3adfac
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Log best model

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer
# Drops Weekly_Sales if present, else leaves df unchanged
drop_target = FunctionTransformer(lambda df: df.drop(columns=["Weekly_Sales"], errors="ignore"),
                                  validate=False)


# 1) Define the two pipelines
preprocess_pipeline = Pipeline([
    ('merge',       BaseMerger(features, stores)),
    ('fillna',      MissingValueFiller()),
    ('label_encode',CategoricalEncoder()),
    ('feature_add', FeatureAdder()),
    ('lags',        LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8])),
    ('drop_target', drop_target),
])

model = lgb.LGBMRegressor(
    n_estimators=2000,
    max_depth=8,
    num_leaves=100,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=5,
    lambda_l1=0.1,
    lambda_l2=1.0,
    min_child_samples=20,
    min_child_weight=0.001,
    objective="regression",
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
)
model_pipeline = Pipeline([('regressor', model)])

# 2) Fit & align
X_full = preprocess_pipeline.fit_transform(train)
y_full = train["Weekly_Sales"].iloc[X_full.index]

# 3) Split & train
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.2, random_state=42)
w_train = np.where(X_train["IsHoliday"]==1,5,1)
w_val   = np.where(X_val  ["IsHoliday"]==1,5,1)

#model_pipeline.fit(X_train, y_train)
model_pipeline.fit(
    X_train, y_train,
    # prefix with the step name “regressor”
    regressor__eval_set=[(X_val, y_val)],
    regressor__eval_metric="mae"
)

# 4) Evaluate & log metrics
train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")

mlflow.set_experiment("LightGBM_Training")
with mlflow.start_run(run_name="final_lightGBM_pipeline_run"):
    mlflow.log_params(model.get_params())
    mlflow.log_metrics({
        **{f"train_{k}":v for k,v in train_metrics.items()},
        **{f"val_{k}":v   for k,v in val_metrics.items()},
        **{f"delta_{k}":train_metrics[k]-val_metrics[k] for k in train_metrics}
    })

    # 5) Log the composed pipeline as a single artifact
    full_pipeline = Pipeline([
        ('preprocess', preprocess_pipeline),
        ('model',      model)
    ])
    mlflow.sklearn.log_model(full_pipeline, artifact_path="final_lightGBM_pipeline")


2025/07/07 17:32:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ed4c7dd3ed054a26a78eca95e1e74160', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16308.174874
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 5415.89	valid_0's l2: 8.75389e+07








🏃 View run placid-loon-673 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/ed4c7dd3ed054a26a78eca95e1e74160
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
[TRAIN] WMAE=4019.8025, MAE=4049.9599, RMSE=6701.1610, R²=0.9154, MAPE=16203.41%
[VAL] WMAE=5508.2920, MAE=5415.8908, RMSE=9356.2246, R²=0.8324, MAPE=18834.28%


Exception: Run with UUID 37e77f6c18514542909e28c8cb99d769 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [None]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import FunctionTransformer
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

def final_lightGBM_pipeline_experiment(train_df, features, stores, evaluate_func, test_size=0.2, random_state=42):
    """
    Complete pipeline experiment with MLflow logging

    Args:
        train_df: Training dataframe WITH target column 'Weekly_Sales'
        features: Features dataframe for merging
        stores: Stores dataframe for merging
        evaluate_func: Your evaluate function
        test_size: Validation split ratio
        random_state: Random state for reproducibility
    """

    # End any existing MLflow run
    mlflow.end_run()

    # Set experiment
    mlflow.set_experiment("LightGBM_Training")

    with mlflow.start_run(run_name="final_lightGBM_pipeline_run"):

        # Log input data info
        mlflow.log_param("train_data_shape", train_df.shape)
        mlflow.log_param("train_date_range", f"{train_df['Date'].min()} to {train_df['Date'].max()}")
        mlflow.log_param("num_stores", train_df['Store'].nunique())
        mlflow.log_param("num_departments", train_df['Dept'].nunique())
        mlflow.log_param("test_size", test_size)
        mlflow.log_param("random_state", random_state)

        # Drops Weekly_Sales if present, else leaves df unchanged
        drop_target = FunctionTransformer(lambda df: df.drop(columns=["Weekly_Sales"], errors="ignore"),
                                          validate=False)

        # Define preprocessing pipeline
        preprocess_pipeline = Pipeline([
            ('merge',       BaseMerger(features, stores)),
            ('fillna',      MissingValueFiller()),
            ('label_encode', CategoricalEncoder()),
            ('feature_add', FeatureAdder()),
            ('lags',        LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8])),
            ('drop_target', drop_target),
        ])

        # Define model with corrected parameters (avoid duplicate parameter warnings)
        model = lgb.LGBMRegressor(
            n_estimators=2000,
            max_depth=8,
            num_leaves=100,
            learning_rate=0.1,
            # Use LightGBM parameter names to avoid warnings
            feature_fraction=0.7,  # Instead of colsample_bytree
            bagging_fraction=0.7,  # Instead of subsample
            bagging_freq=5,
            reg_alpha=0.1,         # Instead of lambda_l1
            reg_lambda=1.0,        # Instead of lambda_l2
            min_child_samples=20,
            min_child_weight=0.001,
            objective="regression",
            random_state=42,
            n_jobs=-1,
        )

        # Log model parameters
        model_params = model.get_params()
        mlflow.log_params({f"model_{k}": v for k, v in model_params.items()})

        # Log pipeline configuration
        mlflow.log_param("pipeline_steps", [step[0] for step in preprocess_pipeline.steps])
        mlflow.log_param("lag_features", [1, 2, 3, 4])
        mlflow.log_param("rolling_windows", [4, 8])

        print("Fitting preprocessing pipeline...")
        # Fit preprocessing and get processed data
        X_full = preprocess_pipeline.fit_transform(train_df)

        # IMPORTANT: Get target values that correspond to the processed data
        # After LagFeatureTransformer with drop_na=True, some rows are dropped
        y_full = train_df["Weekly_Sales"].iloc[X_full.index]

        print(f"Original data shape: {train_df.shape}")
        print(f"Processed data shape: {X_full.shape}")
        print(f"Target shape: {y_full.shape}")

        # Log processed data info
        mlflow.log_param("processed_data_shape", X_full.shape)
        mlflow.log_param("num_features", X_full.shape[1])
        mlflow.log_param("rows_dropped_in_preprocessing", train_df.shape[0] - X_full.shape[0])

        # Split data
        X_train, X_val, y_train, y_val = train_test_split(
            X_full, y_full, test_size=test_size, random_state=random_state, shuffle=False
        )

        # Calculate weights
        w_train = np.where(X_train["IsHoliday"] == 1, 5, 1)
        w_val = np.where(X_val["IsHoliday"] == 1, 5, 1)

        print(f"Training set shape: {X_train.shape}")
        print(f"Validation set shape: {X_val.shape}")

        # Train model with early stopping
        print("Training model...")
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="mae",
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
        )

        # Evaluate model
        print("Evaluating model...")
        train_metrics = evaluate_func(model, X_train, y_train, w_train, split="train")
        val_metrics = evaluate_func(model, X_val, y_val, w_val, split="val")

        # Log all metrics
        mlflow.log_metrics({
            **{f"train_{k}": v for k, v in train_metrics.items()},
            **{f"val_{k}": v for k, v in val_metrics.items()},
            **{f"delta_{k}": train_metrics[k] - val_metrics[k] for k in train_metrics}
        })

        # Log additional metrics
        mlflow.log_metric("best_iteration", model.best_iteration_)
        mlflow.log_metric("overfitting_wmae", train_metrics.get('WMAE', 0) / val_metrics.get('WMAE', 1))

        # Log feature importance
        if hasattr(model, 'feature_importances_'):
            feature_names = X_full.columns.tolist()
            feature_importance = model.feature_importances_

            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': feature_importance
            }).sort_values('importance', ascending=False)

            # Log top features
            mlflow.log_param("top_10_features", importance_df.head(10)['feature'].tolist())

            # Log individual feature importances
            for feature, importance in zip(feature_names, feature_importance):
                mlflow.log_metric(f"feature_importance_{feature}", importance)

        # Create full pipeline for prediction
        full_pipeline = Pipeline([
            ('preprocess', preprocess_pipeline),
            ('model', model)
        ])

        # Log the complete pipeline
        mlflow.sklearn.log_model(full_pipeline, artifact_path="final_lightGBM_pipeline")

        print(f"\nExperiment Results:")
        for split, metrics in [("TRAIN", train_metrics), ("VAL", val_metrics)]:
            print(f"[{split}] ", end="")
            print(", ".join([f"{k}={v:.4f}" for k, v in metrics.items()]))

        return full_pipeline, {
            'train_metrics': train_metrics,
            'val_metrics': val_metrics,
            'X_train': X_train,
            'X_val': X_val,
            'y_train': y_train,
            'y_val': y_val,
            'feature_importance': importance_df if hasattr(model, 'feature_importances_') else None
        }

def predict_test_data(pipeline, test_df):
    """
    Use the trained pipeline to predict on test data

    Args:
        pipeline: Trained pipeline
        test_df: Test dataframe WITHOUT target column

    Returns:
        predictions: Array of predictions
    """
    # Make predictions using the full pipeline
    predictions = pipeline.predict(test_df)
    return predictions

# Example usage:
"""
# Run the experiment
pipeline, results = final_lightGBM_pipeline_experiment(
    train_df=train,  # Your training data WITH target
    features=features,  # Your features dataframe
    stores=stores,  # Your stores dataframe
    evaluate_func=evaluate,  # Your evaluate function
    test_size=0.2,
    random_state=42
)

# Later, for test predictions:
test_predictions = predict_test_data(pipeline, test_df)
"""

'\n# Run the experiment\npipeline, results = final_lightGBM_pipeline_experiment(\n    train_df=train,  # Your training data WITH target\n    features=features,  # Your features dataframe\n    stores=stores,  # Your stores dataframe\n    evaluate_func=evaluate,  # Your evaluate function\n    test_size=0.2,\n    random_state=42\n)\n\n# Later, for test predictions:\ntest_predictions = predict_test_data(pipeline, test_df)\n'

In [None]:
pipeline, results = final_lightGBM_pipeline_experiment(
    train_df=train,  # Your training data WITH target
    features=features,  # Your features dataframe
    stores=stores,  # Your stores dataframe
    evaluate_func=evaluate,  # Your evaluate function
    test_size=0.2,
    random_state=42
)


🏃 View run worried-fox-568 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/37e77f6c18514542909e28c8cb99d769
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
Fitting preprocessing pipeline...
Original data shape: (421570, 5)
Processed data shape: (398796, 28)
Target shape: (398796,)
Training set shape: (319036, 28)
Validation set shape: (79760, 28)
Training model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4395
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16948.948683
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's l1: 14410.8	valid_0's l2: 3.7084e+08




Evaluating model...
[TRAIN] WMAE=15750.2401, MAE=15734.1598, RMSE=23616.0285, R²=0.0201, MAPE=45692.29%
[VAL] WMAE=14410.0797, MAE=14410.7552, RMSE=19257.2165, R²=-0.0315, MAPE=53798.55%




🏃 View run final_lightGBM_pipeline_run at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/8f390a05de874c8cb7fadd16cf30149f
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


RestException: INTERNAL_ERROR: Response: {'error': 'unsupported endpoint, please contact support@dagshub.com'}

In [None]:
test_predictions = predict_test_data(pipeline, test)

NotFittedError: Pipeline is not fitted yet.

# Log best model 1

In [None]:
preprocess_pipeline = Pipeline([
    ('merge',       BaseMerger(features, stores)),
    ('fillna',      MissingValueFiller()),
    ('label_encode',CategoricalEncoder()),
    ('feature_add', FeatureAdder()),
    ('lags',        LagFeatureTransformer(lags=[1,2,3,4], rolling_windows=[4,8])),
])

model = lgb.LGBMRegressor(
    n_estimators=2000,
    max_depth=8,
    num_leaves=100,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=5,
    lambda_l1=0.1,
    lambda_l2=1.0,
    min_child_samples=20,
    min_child_weight=0.001,
    objective="regression",
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
)
model_pipeline = Pipeline([('regressor', model)])

train_df = preprocess_pipeline.fit_transform(train)
test_df = preprocess_pipeline.transform(test)

train_df.columns, train_df.shape, test_df.columns, test_df.shape

(Index(['Store', 'Dept', 'Weekly_Sales', 'IsHoliday', 'Temperature',
        'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
        'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Month',
        'SuperbowlWeek', 'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (398796, 29),
 Index(['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1',
        'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
        'Unemployment', 'Type', 'Size', 'Month', 'SuperbowlWeek',
        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (115064, 28))

In [None]:
y_train = train_df["Weekly_Sales"]
X_train = train_df.drop(columns=["Weekly_Sales"])
X_test = test_df

X_train.columns, X_train.shape, y_train.shape, X_test.columns, X_test.shape

(Index(['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1',
        'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
        'Unemployment', 'Type', 'Size', 'Month', 'SuperbowlWeek',
        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (398796, 28),
 (398796,),
 Index(['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1',
        'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
        'Unemployment', 'Type', 'Size', 'Month', 'SuperbowlWeek',
        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
        'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
        'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
       dtype='object'),
 (115064, 28))

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb

mlflow.set_experiment("LightGBM_Training")
with mlflow.start_run(run_name="LGBM_best_run"):
    mlflow.lightgbm.autolog()

    mlflow.log_params(model.get_params())

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mae',
    )

    train_metrics = evaluate(model, X_train, y_train, w_train, split="train")
    val_metrics   = evaluate(model, X_val,   y_val,   w_val,   split="val")

    delta_metrics = {
        f"delta_{k}": train_metrics[k] - val_metrics[k]
        for k in train_metrics
    }
    mlflow.log_metrics(delta_metrics)
    print("Overfitting deltas:", delta_metrics)


Index(['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1',
       'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
       'Unemployment', 'Type', 'Size', 'Month', 'SuperbowlWeek',
       'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
       'Days_to_Thanksgiving', 'Days_to_Christmas', 'DateOrdinal', 'lag_1',
       'lag_2', 'lag_3', 'lag_4', 'rolling_mean_4', 'rolling_mean_8'],
      dtype='object')

In [None]:
import pickle
import mlflow

mlflow.set_experiment("LightGBM_Training")
with mlflow.start_run(run_name="lightGBM_preprocessor_run") as run:
    run_id = run.info.run_id
    print("MLflow Run ID:", run_id)

    # 1) Log your custom params as before
    bm = preprocess_pipeline.named_steps["merge"]
    mlflow.log_param("BaseMerger.feature_store_rows", len(bm.feature_store))
    mvf = preprocess_pipeline.named_steps["fillna"]
    mlflow.log_param("MissingValueFiller.markdown_cols", len(mvf.markdown_cols))
    mlflow.log_param("MissingValueFiller.mean_cols",     len(mvf.mean_cols))
    ce = preprocess_pipeline.named_steps["label_encode"]
    mlflow.log_param("CategoricalEncoder.type_mapping",    str(ce.type_mapping))
    mlflow.log_param("CategoricalEncoder.holiday_mapping", str(ce.holiday_mapping))
    fa = preprocess_pipeline.named_steps["feature_add"]
    mlflow.log_param("FeatureAdder.superbowl_dates",     len(fa.superbowl))
    mlflow.log_param("FeatureAdder.thanksgiving_dates",  len(fa.thanksgiving))
    lag = preprocess_pipeline.named_steps["lags"]
    mlflow.log_param("LagFeatureTransformer.lags",            ",".join(map(str, lag.lags)))
    mlflow.log_param("LagFeatureTransformer.rolling_windows", ",".join(map(str, lag.rolling_windows)))
    mlflow.log_param("LagFeatureTransformer.drop_na",         lag.drop_na)

    # 2) Serialize to disk
    with open("preprocess_pipeline.pkl", "wb") as f:
        pickle.dump(preprocess_pipeline, f)

    # 3) Log it as a generic artifact
    mlflow.log_artifact("preprocess_pipeline.pkl", artifact_path="pipelines")

    print("✅ Pickled & logged preprocessor as an artifact")




MLflow Run ID: f91ce30091c74b5cb4a016ac01c432df
✅ Pickled & logged preprocessor as an artifact
🏃 View run lightGBM_preprocessor_run at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/f91ce30091c74b5cb4a016ac01c432df
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5


# Generate submission

In [None]:
from sklearn.model_selection import train_test_split

# 1) Hold out a small validation fold for early stopping
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,    # 10% for validation
    random_state=42
)

# 2) Fit the model pipeline with early stopping
model_pipeline.fit(
    X_tr, y_tr,
    # these kwargs get passed to the `regressor` step
    regressor__eval_set=[(X_val, y_val)],
    regressor__eval_metric="mae"
)

# 3) Optional: check performance on the hold‑out
from sklearn.metrics import mean_absolute_error
val_preds = model_pipeline.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, val_preds))

# 4) Finally, predict on your test set
test_preds = model_pipeline.predict(X_test)

# 5) If you want it as a DataFrame matching the test index:
import pandas as pd
submission = pd.DataFrame({
    "Store":      test["Store"],
    "Dept":       test["Dept"],
    "Date":       test["Date"],
    "Weekly_Sales": test_preds
})

# preview
submission.head()


2025/07/07 19:30:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '02cc59aa2e594e948e7ca38eebf89e5f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 319036, number of used features: 28
[LightGBM] [Info] Start training from score 16110.571336
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1998]	valid_0's l1: 540.423	valid_0's l2: 2.8474e+06








🏃 View run hilarious-turtle-212 at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5/runs/02cc59aa2e594e948e7ca38eebf89e5f
🧪 View experiment at: https://dagshub.com/nkhar21/ML_Final_Project.mlflow/#/experiments/5
Validation MAE: 540.4233769209027


Unnamed: 0,Store,Dept,Date,Weekly_Sales
0,1,1,2012-11-02,22559.225787
1,1,1,2012-11-09,22013.948438
2,1,1,2012-11-16,22275.173898
3,1,1,2012-11-23,21375.72544
4,1,1,2012-11-30,22217.054861


In [None]:
# 5) Create submission DataFrame with Id format: Store_Dept_Date
submission = pd.DataFrame({
    "Id": test["Store"].astype(str) + "_" +
          test["Dept"].astype(str) + "_" +
          test["Date"].astype(str),
    "Weekly_Sales": test_preds
})

# Optional: round or clip if needed
# submission["Weekly_Sales"] = submission["Weekly_Sales"].clip(lower=0)

# Save to CSV
submission.to_csv("submission.csv", index=False)

# Preview
print(submission.head())


               Id  Weekly_Sales
0  1_1_2012-11-02  22559.225787
1  1_1_2012-11-09  22013.948438
2  1_1_2012-11-16  22275.173898
3  1_1_2012-11-23  21375.725440
4  1_1_2012-11-30  22217.054861
