## Load and Inspect Data

In [1]:
# Import necessary libraries
import pandas as pd

# Load datasets
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
df_stores = pd.read_csv('../data/stores.csv')
df_oil = pd.read_csv('../data/oil.csv')
df_holidays = pd.read_csv('../data/holidays_events.csv')
df_transactions = pd.read_csv('../data/transactions.csv')

# Check first few rows of each dataset
print("Train DataFrame:")
print(df_train.head(), "\n")

print("Test DataFrame:")
print(df_test.head(), "\n")

print("Stores DataFrame:")
print(df_stores.head(), "\n")

print("Oil DataFrame:")
print(df_oil.head(), "\n")

print("Holidays DataFrame:")
print(df_holidays.head(), "\n")

print("Transactions DataFrame:")
print(df_transactions.head(), "\n")

# Check for missing values in each dataset
print("Missing values in Train Data:")
print(df_train.isnull().sum(), "\n")

print("Missing values in Test Data:")
print(df_test.isnull().sum(), "\n")

print("Missing values in Stores Data:")
print(df_stores.isnull().sum(), "\n")

print("Missing values in Oil Data:")
print(df_oil.isnull().sum(), "\n")

print("Missing values in Holidays Data:")
print(df_holidays.isnull().sum(), "\n")

print("Missing values in Transactions Data:")
print(df_transactions.isnull().sum(), "\n")

# Convert date columns to datetime format
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])
df_oil['date'] = pd.to_datetime(df_oil['date'])
df_holidays['date'] = pd.to_datetime(df_holidays['date'])
df_transactions['date'] = pd.to_datetime(df_transactions['date'])

# Confirm the conversion of date columns
print("Date columns converted to datetime format.")

Train DataFrame:
   id        date  store_nbr      family  sales  onpromotion
0   0  2013-01-01          1  AUTOMOTIVE    0.0            0
1   1  2013-01-01          1   BABY CARE    0.0            0
2   2  2013-01-01          1      BEAUTY    0.0            0
3   3  2013-01-01          1   BEVERAGES    0.0            0
4   4  2013-01-01          1       BOOKS    0.0            0 

Test DataFrame:
        id        date  store_nbr      family  onpromotion
0  3000888  2017-08-16          1  AUTOMOTIVE            0
1  3000889  2017-08-16          1   BABY CARE            0
2  3000890  2017-08-16          1      BEAUTY            2
3  3000891  2017-08-16          1   BEVERAGES           20
4  3000892  2017-08-16          1       BOOKS            0 

Stores DataFrame:
   store_nbr           city                           state type  cluster
0          1          Quito                       Pichincha    D       13
1          2          Quito                       Pichincha    D       13
2  

## Create Date-Based Features

In [2]:
# Feature Engineering: Date-based features
def create_date_features(df):
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['week_of_year'] = df['date'].dt.isocalendar().week
    df['day_of_year'] = df['date'].dt.dayofyear
    return df

# Apply the function to both train and test datasets
df_train = create_date_features(df_train)
df_test = create_date_features(df_test)

# Confirm the newly created features
print("Train DataFrame with Date-Based Features:")
print(df_train.head())

print("Test DataFrame with Date-Based Features:")
print(df_test.head())

Train DataFrame with Date-Based Features:
   id       date  store_nbr      family  sales  onpromotion  year  month  day   
0   0 2013-01-01          1  AUTOMOTIVE    0.0            0  2013      1    1  \
1   1 2013-01-01          1   BABY CARE    0.0            0  2013      1    1   
2   2 2013-01-01          1      BEAUTY    0.0            0  2013      1    1   
3   3 2013-01-01          1   BEVERAGES    0.0            0  2013      1    1   
4   4 2013-01-01          1       BOOKS    0.0            0  2013      1    1   

   day_of_week  week_of_year  day_of_year  
0            1             1            1  
1            1             1            1  
2            1             1            1  
3            1             1            1  
4            1             1            1  
Test DataFrame with Date-Based Features:
        id       date  store_nbr      family  onpromotion  year  month  day   
0  3000888 2017-08-16          1  AUTOMOTIVE            0  2017      8   16  \
1  30008

##  Create Lag and Rolling Features

In [3]:
# Feature Engineering: Lag and rolling features for sales
def create_lag_features(df, lags=[1, 7, 14]):
    for lag in lags:
        df[f'sales_lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)
    return df

def create_rolling_features(df, windows=[7, 14]):
    for window in windows:
        df[f'sales_roll_mean_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.rolling(window).mean())
        df[f'sales_roll_std_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.rolling(window).std())
    return df

# Apply the lag features to the train dataset
df_train = create_lag_features(df_train)
df_train = create_rolling_features(df_train)

# Since the test set lacks sales history, we can't create lag features for it directly
# Fill missing lag feature NaNs with 0
df_train.fillna(0, inplace=True)

# Confirm the newly created lag and rolling features
print("Train DataFrame with Lag and Rolling Features:")
print(df_train.head())

Train DataFrame with Lag and Rolling Features:
   id       date  store_nbr      family  sales  onpromotion  year  month  day   
0   0 2013-01-01          1  AUTOMOTIVE    0.0            0  2013      1    1  \
1   1 2013-01-01          1   BABY CARE    0.0            0  2013      1    1   
2   2 2013-01-01          1      BEAUTY    0.0            0  2013      1    1   
3   3 2013-01-01          1   BEVERAGES    0.0            0  2013      1    1   
4   4 2013-01-01          1       BOOKS    0.0            0  2013      1    1   

   day_of_week  week_of_year  day_of_year  sales_lag_1  sales_lag_7   
0            1             1            1          0.0          0.0  \
1            1             1            1          0.0          0.0   
2            1             1            1          0.0          0.0   
3            1             1            1          0.0          0.0   
4            1             1            1          0.0          0.0   

   sales_lag_14  sales_roll_mean_7  sal

## Handle missing data

In [5]:
# Handle missing oil prices using forward fill
df_oil['dcoilwtico'].fillna(method='ffill', inplace=True)

# Check if there are any remaining missing values in oil
print("Remaining missing values in oil data:")
print(df_oil.isnull().sum())

# Ensure lag and rolling feature NaNs in train data are filled (we already did this in the previous step)
df_train.fillna(0, inplace=True)

# Confirm no missing values in train data
print("Remaining missing values in train data after filling:")
print(df_train.isnull().sum())

# Since the test set doesn't have historical sales, we won't create lag/rolling features for it. 
# However, ensure there are no other missing values in the test dataset.
df_test.fillna(0, inplace=True)
print("Remaining missing values in test data after filling:")
print(df_test.isnull().sum())

# Handle the last remaining missing oil price using backward fill
df_oil['dcoilwtico'].fillna(method='bfill', inplace=True)

# Verify there are no more missing values in the oil data
print("Remaining missing values in oil data after backfill:")
print(df_oil.isnull().sum())

Remaining missing values in oil data:
date          0
dcoilwtico    1
dtype: int64
Remaining missing values in train data after filling:
id                    0
date                  0
store_nbr             0
family                0
sales                 0
onpromotion           0
year                  0
month                 0
day                   0
day_of_week           0
week_of_year          0
day_of_year           0
sales_lag_1           0
sales_lag_7           0
sales_lag_14          0
sales_roll_mean_7     0
sales_roll_std_7      0
sales_roll_mean_14    0
sales_roll_std_14     0
dtype: int64
Remaining missing values in test data after filling:
id              0
date            0
store_nbr       0
family          0
onpromotion     0
year            0
month           0
day             0
day_of_week     0
week_of_year    0
day_of_year     0
dtype: int64
Remaining missing values in oil data after backfill:
date          0
dcoilwtico    0
dtype: int64


## Add Holiday Feature

In [6]:
# Handle holiday data: create a flag for holidays and calculate days to next holiday
def add_holiday_features(df, holidays_df):
    # Filter national holidays only
    holidays_df = holidays_df[holidays_df['locale'] == 'National'].copy()
    
    # Merge holidays with the data based on the date
    df = df.merge(holidays_df[['date', 'type']], on='date', how='left')
    
    # Flag if the date is a holiday
    df['is_holiday'] = df['type'].notnull().astype(int)
    
    # Drop the 'type' column after creating the flag
    df.drop(columns=['type'], inplace=True)
    
    return df

# Apply holiday features to train and test datasets
df_train = add_holiday_features(df_train, df_holidays)
df_test = add_holiday_features(df_test, df_holidays)

# Confirm that the holiday feature was added
print("Train DataFrame with Holiday Features:")
print(df_train[['date', 'is_holiday']].head())

print("Test DataFrame with Holiday Features:")
print(df_test[['date', 'is_holiday']].head())

Train DataFrame with Holiday Features:
        date  is_holiday
0 2013-01-01           1
1 2013-01-01           1
2 2013-01-01           1
3 2013-01-01           1
4 2013-01-01           1
Test DataFrame with Holiday Features:
        date  is_holiday
0 2017-08-16           0
1 2017-08-16           0
2 2017-08-16           0
3 2017-08-16           0
4 2017-08-16           0


## Add Promotion Features

In [7]:
# Create time since the last promotion for each store and family
def create_time_since_last_promotion(df):
    df['promo_not_active'] = (df['onpromotion'] == 0).astype(int)
    
    # Calculate the cumulative sum of days since the last promotion within each store and product family
    df['time_since_last_promo'] = df.groupby(['store_nbr', 'family'])['promo_not_active'].cumsum()
    
    # Drop the helper column 'promo_not_active'
    df.drop(columns=['promo_not_active'], inplace=True)
    
    return df

# Apply to train dataset (test set doesn't have sales history, so we skip promotions there)
df_train = create_time_since_last_promotion(df_train)

# Confirm the new promotion-related feature in the train dataset
print("Train DataFrame with Promotion Features:")
print(df_train[['date', 'store_nbr', 'family', 'onpromotion', 'time_since_last_promo']].head())

Train DataFrame with Promotion Features:
        date  store_nbr      family  onpromotion  time_since_last_promo
0 2013-01-01          1  AUTOMOTIVE            0                      1
1 2013-01-01          1   BABY CARE            0                      1
2 2013-01-01          1      BEAUTY            0                      1
3 2013-01-01          1   BEVERAGES            0                      1
4 2013-01-01          1       BOOKS            0                      1


## Model Training

In [12]:
# Merge oil data into the train dataset based on date
df_train = df_train.merge(df_oil[['date', 'dcoilwtico']], on='date', how='left')

# Merge transactions data into the train dataset based on date and store number
df_train = df_train.merge(df_transactions[['date', 'store_nbr', 'transactions']], on=['date', 'store_nbr'], how='left')

# Fill any remaining NaN values in dcoilwtico and transactions
df_train['dcoilwtico'].fillna(0, inplace=True)
df_train['transactions'].fillna(0, inplace=True)

# Verify that the columns are present after merging
print(df_train[['date', 'store_nbr', 'dcoilwtico', 'transactions']].head())

        date  store_nbr  dcoilwtico  transactions
0 2013-01-01          1       93.14           0.0
1 2013-01-01          1       93.14           0.0
2 2013-01-01          1       93.14           0.0
3 2013-01-01          1       93.14           0.0
4 2013-01-01          1       93.14           0.0


In [13]:
# Import necessary libraries
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import numpy as np

# Define features to be used in the model
features = ['onpromotion', 'dcoilwtico', 'transactions', 'year', 'month', 'day', 
            'day_of_week', 'week_of_year', 'day_of_year', 'time_since_last_promo', 
            'is_holiday']

# Prepare the feature matrix and target variable
X = df_train[features]
y = df_train['sales']

# Use TimeSeriesSplit for cross-validation (to ensure future data isn't used in the past)
tscv = TimeSeriesSplit(n_splits=5)

# Initialize lists to store evaluation results
rmsle_list = []

# Loop through each split in the cross-validation
for train_index, val_index in tscv.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Create LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    # Set LightGBM parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',  # LightGBM uses RMSE by default, but we calculate RMSLE manually
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 50,
        'max_depth': -1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'random_state': 42
    }
    
    # Use callbacks for early stopping and logging
    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
    
    # Train the model with early stopping callbacks
    model = lgb.train(params, train_data, valid_sets=[val_data], num_boost_round=1000, callbacks=callbacks)
    
    # Make predictions on the validation set
    y_val_pred = model.predict(X_val)
    y_val_pred = np.clip(y_val_pred, a_min=0, a_max=None)  # Ensure no negative predictions
    
    # Calculate RMSLE on validation set
    rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
    rmsle_list.append(rmsle)
    
    print(f"Validation RMSLE: {rmsle}")

# Average RMSLE across all splits
avg_rmsle = np.mean(rmsle_list)
print(f"Average RMSLE across all splits: {avg_rmsle}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 501336, number of used features: 9
[LightGBM] [Info] Start training from score 207.328590
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 873.135
[200]	valid_0's rmse: 868.169
[300]	valid_0's rmse: 866.702
[400]	valid_0's rmse: 866.205
[500]	valid_0's rmse: 866.082
[600]	valid_0's rmse: 865.975
Early stopping, best iteration is:
[577]	valid_0's rmse: 865.968
Validation RMSLE: 3.5074939804486225
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total

## Enhancing feature engineering

In [14]:
import numpy as np

# 1.1 Create additional lag features for 30, 60, and 90 days
def create_additional_lag_features(df, lags=[30, 60, 90]):
    for lag in lags:
        df[f'sales_lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)
    return df

# 1.2 Create cyclic features for day_of_week and day_of_year
def create_cyclic_features(df):
    # Cyclic transformation of day_of_week
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # Cyclic transformation of day_of_year
    df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    
    return df

# Apply the additional lag features to the train dataset
df_train = create_additional_lag_features(df_train)

# Apply the cyclic transformations to the train dataset
df_train = create_cyclic_features(df_train)

# Fill missing values resulting from lag feature creation
df_train.fillna(0, inplace=True)

# Verify that the new features are created
print("Train DataFrame with additional lag and cyclic features:")
print(df_train[['sales_lag_30', 'sales_lag_60', 'sales_lag_90', 'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos']].head())

Train DataFrame with additional lag and cyclic features:
   sales_lag_30  sales_lag_60  sales_lag_90  day_of_week_sin  day_of_week_cos   
0           0.0           0.0           0.0         0.781831          0.62349  \
1           0.0           0.0           0.0         0.781831          0.62349   
2           0.0           0.0           0.0         0.781831          0.62349   
3           0.0           0.0           0.0         0.781831          0.62349   
4           0.0           0.0           0.0         0.781831          0.62349   

   day_of_year_sin  day_of_year_cos  
0         0.017213         0.999852  
1         0.017213         0.999852  
2         0.017213         0.999852  
3         0.017213         0.999852  
4         0.017213         0.999852  


## Update model training with new features

In [28]:
# Updated feature list to include additional lag features and cyclic features
features = ['onpromotion', 'dcoilwtico', 'transactions', 'year', 'month', 'day', 
            'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos',
            'week_of_year', 'time_since_last_promo', 'is_holiday', 
            'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30', 
            'sales_lag_60', 'sales_lag_90']

# Prepare the feature matrix and target variable
X = df_train[features]
y = df_train['sales']

# Use TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Initialize lists to store evaluation results
rmsle_list = []

# Loop through each split in the cross-validation
for train_index, val_index in tscv.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Create LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    # Set LightGBM parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',  # LightGBM uses RMSE by default, but we calculate RMSLE manually
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 50,
        'max_depth': -1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'random_state': 42
    }
    
    # Use callbacks for early stopping and logging
    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
    
    # Train the model with early stopping callbacks
    model = lgb.train(params, train_data, valid_sets=[val_data], num_boost_round=1000, callbacks=callbacks)
    
    # Make predictions on the validation set
    y_val_pred = model.predict(X_val)
    y_val_pred = np.clip(y_val_pred, a_min=0, a_max=None)  # Ensure no negative predictions
    
    # Calculate RMSLE on validation set
    rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
    rmsle_list.append(rmsle)
    
    print(f"Validation RMSLE: {rmsle}")

# Average RMSLE across all splits
avg_rmsle = np.mean(rmsle_list)
print(f"Average RMSLE across all splits: {avg_rmsle}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007938 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2800
[LightGBM] [Info] Number of data points in the train set: 501336, number of used features: 17
[LightGBM] [Info] Start training from score 207.328590
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 413.668
[200]	valid_0's rmse: 278.017
[300]	valid_0's rmse: 248.544
[400]	valid_0's rmse: 242.444
[500]	valid_0's rmse: 240.872
[600]	valid_0's rmse: 240.395
[700]	valid_0's rmse: 240.186
[800]	valid_0's rmse: 240.045
Early stopping, best iteration is:
[845]	valid_0's rmse: 239.93
Validation RMSLE: 0.963898306407128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

## Hyperparameter Tuning using RandomizedSearchCV

In [27]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_log_error
import warnings

# Suppress warnings from LightGBM
warnings.filterwarnings("ignore", category=UserWarning)

# Step 1: Check for zero or negative sales values
def check_for_zero_negative_sales(df, target_column='sales'):
    num_invalid_sales = df[df[target_column] <= 0].shape[0]
    if num_invalid_sales > 0:
        print(f"Number of zero or negative sales: {num_invalid_sales}")
    else:
        print("No zero or negative sales found.")
    return num_invalid_sales

# Step 2: Handle zero or negative sales (either filter them or adjust by adding a constant)
def handle_zero_negative_sales(df, method='filter'):
    if method == 'filter':
        # Filter out zero or negative sales
        df_filtered = df[df['sales'] > 0]
        print(f"Filtered out rows with zero or negative sales. Remaining rows: {df_filtered.shape[0]}")
        return df_filtered
    elif method == 'add_constant':
        # Add a small constant to avoid zero or negative sales
        df['sales_adjusted'] = df['sales'] + 1
        print("Added constant of 1 to avoid zero or negative sales.")
        return df

# Step 3: Custom RMSLE scorer that clips negative predictions
def rmsle_scorer(y_true, y_pred):
    # Clip any negative predictions to 0 to avoid errors
    y_pred = np.clip(y_pred, a_min=0, a_max=None)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Step 4: Perform Hyperparameter Tuning with RandomizedSearchCV
def perform_hyperparameter_tuning(X, y):
    # Ensure no zero or negative values in y before proceeding
    if (y <= 0).any():
        raise ValueError("Detected zero or negative values in target variable, can't proceed with RMSLE.")

    # Define the cleaned-up parameter grid for LightGBM
    param_grid = {
        'num_leaves': [31, 50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'min_data_in_leaf': [20, 30, 50],
        'feature_fraction': [0.7, 0.8, 0.9],
        'lambda_l1': [0, 0.1, 1],
        'lambda_l2': [0, 0.1, 1]
    }

    # Initialize LightGBM regressor
    lgbm = lgb.LGBMRegressor(objective='regression', boosting_type='gbdt', random_state=42, n_estimators=1000)

    # Custom RMSLE scorer with clipping
    rmsle = make_scorer(rmsle_scorer, greater_is_better=False)

    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=lgbm,
        param_distributions=param_grid,
        n_iter=20,
        scoring=rmsle,  # Use custom RMSLE scorer
        cv=3,  # 3-fold cross-validation
        verbose=1,  # Set verbosity to 1 for progress updates
        random_state=42,
        n_jobs=-1
    )

    # Perform the random search on the filtered training data
    random_search.fit(X, y)

    # Output the best parameters and the corresponding RMSLE score
    print("Best hyperparameters:", random_search.best_params_)
    print("Best score (RMSLE):", np.sqrt(-random_search.best_score_))

# Step 5: Clip negative model predictions
def clip_predictions(predictions):
    return np.clip(predictions, a_min=0, a_max=None)

# Step 6: Prepare and run everything
# Check for zero or negative sales in the training data
check_for_zero_negative_sales(df_train)

# Handle zero or negative sales based on the chosen method (either filter or add_constant)
df_train_cleaned = handle_zero_negative_sales(df_train, method='filter')

# Prepare feature matrix and target variable for training
X_filtered = df_train_cleaned[features]  # Assuming features list is defined earlier
y_filtered = df_train_cleaned['sales']

# Step 7: Ensure all target values are positive before proceeding
try:
    if (y_filtered <= 0).any():
        raise ValueError("Zero or negative values found in the target data!")

    # Print statistics on the target variable
    print(f"Min sales value: {y_filtered.min()}, Max sales value: {y_filtered.max()}")
    
    # Perform hyperparameter tuning
    perform_hyperparameter_tuning(X_filtered, y_filtered)
    
except ValueError as e:
    print(f"Error: {e}")

Number of zero or negative sales: 940483
Filtered out rows with zero or negative sales. Remaining rows: 2067533
Min sales value: 0.122, Max sales value: 124717.0
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.162826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3090
[LightGBM] [Info] Number of data points in the train set: 1378356, number of used features: 19
[LightGBM] [Info] Start training from score 504.579952
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3154
[LightGBM] [Info] Number of data points in the train set: 1378355, number of used features: 19
[LightGBM] [Info] Start training from score 513.053109
[LightGBM] [Info] Aut

## FURTHER IMPROVEMENTS

In [36]:
# Drop existing 'cluster' column if it exists
if 'cluster' in df_train.columns:
    df_train.drop(columns=['cluster'], inplace=True)

if 'cluster' in df_test.columns:
    df_test.drop(columns=['cluster'], inplace=True)

# Now merge the 'cluster' information from df_stores
df_train = df_train.merge(df_stores[['store_nbr', 'cluster']], on='store_nbr', how='left')
df_test = df_test.merge(df_stores[['store_nbr', 'cluster']], on='store_nbr', how='left')

# Check if the cluster is merged correctly
print("Cluster column in train data:")
print(df_train[['store_nbr', 'cluster']].head())

print("Cluster column in test data:")
print(df_test[['store_nbr', 'cluster']].head())

# Add interaction between promotion and holiday, and promotion and store cluster
df_train['promo_holiday'] = df_train['onpromotion'] * df_train['is_holiday']
df_train['store_cluster_promo'] = df_train['onpromotion'] * df_train['cluster']

df_test['promo_holiday'] = df_test['onpromotion'] * df_test['is_holiday']
df_test['store_cluster_promo'] = df_test['onpromotion'] * df_test['cluster']

# Check if the new features are added
print("New interaction features in train data:")
print(df_train[['promo_holiday', 'store_cluster_promo']].head())

print("New interaction features in test data:")
print(df_test[['promo_holiday', 'store_cluster_promo']].head())

Cluster column in train data:
   store_nbr  cluster
0          1       13
1          1       13
2          1       13
3          1       13
4          1       13
Cluster column in test data:
   store_nbr  cluster
0          1       13
1          1       13
2          1       13
3          1       13
4          1       13
New interaction features in train data:
   promo_holiday  store_cluster_promo
0              0                    0
1              0                    0
2              0                    0
3              0                    0
4              0                    0
New interaction features in test data:
   promo_holiday  store_cluster_promo
0              0                    0
1              0                    0
2              0                   26
3              0                  260
4              0                    0


In [38]:
def create_lag_features(df, lags=[1, 7, 14, 30, 60, 90, 180]):
    for lag in lags:
        df[f'sales_lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)
    return df

# Apply lag features to the train dataset only
df_train = create_lag_features(df_train)

# In the test dataset, we won't have 'sales' data for lag features,
# so fill missing lag values with 0 or any default value you want.
df_test.fillna(0, inplace=True)

# Check if the lag features are correctly applied in df_train
print("Train data with lag features:")
print(df_train[['store_nbr', 'family', 'sales', 'sales_lag_1', 'sales_lag_7', 'sales_lag_14']].head())

# If you still want to include lag features in the test set,
# you can use forward-fill based on the last available training data
# or initialize them with 0 for predictions (depending on your approach).

Train data with lag features:
   store_nbr      family  sales  sales_lag_1  sales_lag_7  sales_lag_14
0          1  AUTOMOTIVE    0.0          NaN          NaN           NaN
1          1   BABY CARE    0.0          NaN          NaN           NaN
2          1      BEAUTY    0.0          NaN          NaN           NaN
3          1   BEVERAGES    0.0          NaN          NaN           NaN
4          1       BOOKS    0.0          NaN          NaN           NaN


In [39]:
# Define a function to calculate days to next holiday and days since last holiday
def add_holiday_distance_features(df, holidays_df):
    holidays_df['date'] = pd.to_datetime(holidays_df['date'])
    holidays_df = holidays_df[holidays_df['transferred'] == False]

    # Create a list of unique holiday dates
    holiday_dates = holidays_df['date'].unique()

    df['days_until_next_holiday'] = df['date'].apply(lambda x: min((holiday_dates - x).days) if x < max(holiday_dates) else 0)
    df['days_since_last_holiday'] = df['date'].apply(lambda x: min((x - holiday_dates).days) if x > min(holiday_dates) else 0)
    
    return df

# Apply the holiday distance features to the train and test datasets
df_train = add_holiday_distance_features(df_train, df_holidays)
df_test = add_holiday_distance_features(df_test, df_holidays)

In [41]:
# Create cyclic features for day of year and day of week
df_train['day_of_year_sin'] = np.sin(2 * np.pi * df_train['day_of_year'] / 365)
df_train['day_of_year_cos'] = np.cos(2 * np.pi * df_train['day_of_year'] / 365)
df_train['day_of_week_sin'] = np.sin(2 * np.pi * df_train['day_of_week'] / 7)
df_train['day_of_week_cos'] = np.cos(2 * np.pi * df_train['day_of_week'] / 7)

df_test['day_of_year_sin'] = np.sin(2 * np.pi * df_test['day_of_year'] / 365)
df_test['day_of_year_cos'] = np.cos(2 * np.pi * df_test['day_of_year'] / 365)
df_test['day_of_week_sin'] = np.sin(2 * np.pi * df_test['day_of_week'] / 7)
df_test['day_of_week_cos'] = np.cos(2 * np.pi * df_test['day_of_week'] / 7)

In [45]:
# Create rolling mean and standard deviation features for different window sizes
def create_rolling_features(df, windows=[7, 14, 30, 60, 90, 180]):
    for window in windows:
        df[f'sales_roll_mean_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.rolling(window).mean())
        df[f'sales_roll_std_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.rolling(window).std())
    return df

# Apply rolling features to the training data
df_train = create_rolling_features(df_train)

# Since there are no sales data in the test set, fill missing rolling feature values with 0
df_test.fillna(0, inplace=True)

# Now define your features, ensuring the new rolling features are included
features = ['onpromotion', 'dcoilwtico', 'transactions', 'year', 'month', 'day',
            'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos',
            'week_of_year', 'time_since_last_promo', 'is_holiday',
            'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30',
            'sales_lag_60', 'sales_lag_90', 'sales_roll_mean_30', 'sales_roll_std_30',
            'sales_roll_mean_90', 'sales_roll_std_90', 'sales_roll_mean_180', 'sales_roll_std_180']

# Prepare the feature matrix and target variable
X = df_train[features]
y = df_train['sales']

In [46]:


# Use TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Initialize lists to store evaluation results
rmsle_list = []

# Loop through each split in the cross-validation
for train_index, val_index in tscv.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Create LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    # Set LightGBM parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',  # We will calculate RMSLE manually
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 50,
        'max_depth': -1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'random_state': 42
    }
    
    # Use callbacks for early stopping and logging
    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
    
    # Train the model with early stopping
    model = lgb.train(params, train_data, valid_sets=[val_data], num_boost_round=1000, callbacks=callbacks)
    
    # Make predictions on the validation set
    y_val_pred = model.predict(X_val)
    y_val_pred = np.clip(y_val_pred, a_min=0, a_max=None)  # Ensure no negative predictions
    
    # Calculate RMSLE on validation set
    rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
    rmsle_list.append(rmsle)
    
    print(f"Validation RMSLE: {rmsle}")

# Average RMSLE across all splits
avg_rmsle = np.mean(rmsle_list)
print(f"Average RMSLE across all splits: {avg_rmsle}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010712 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4330
[LightGBM] [Info] Number of data points in the train set: 501336, number of used features: 23
[LightGBM] [Info] Start training from score 207.328590
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 422.713
[200]	valid_0's rmse: 290.567
[300]	valid_0's rmse: 259.988
[400]	valid_0's rmse: 252.886
[500]	valid_0's rmse: 250.955
[600]	valid_0's rmse: 250.403
Early stopping, best iteration is:
[639]	valid_0's rmse: 250.032
Validation RMSLE: 1.0184353903444463
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

## Feature Engineering: Seasonality and Interaction Features

In [47]:
import numpy as np

# Cyclic features for day of week and day of year
def add_cyclic_features(df):
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    
    df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    return df

# Apply to both train and test datasets
df_train = add_cyclic_features(df_train)
df_test = add_cyclic_features(df_test)

# Add interaction between promotion and holiday, and promotion and store cluster
df_train['promo_holiday'] = df_train['onpromotion'] * df_train['is_holiday']
df_train['store_cluster_promo'] = df_train['onpromotion'] * df_train['cluster']

df_test['promo_holiday'] = df_test['onpromotion'] * df_test['is_holiday']
df_test['store_cluster_promo'] = df_test['onpromotion'] * df_test['cluster']

# Updated feature list to include cyclic and interaction features
features = ['onpromotion', 'dcoilwtico', 'transactions', 'year', 'month', 'day', 
            'day_of_week_sin', 'day_of_week_cos', 'day_of_year_sin', 'day_of_year_cos',
            'week_of_year', 'time_since_last_promo', 'is_holiday', 
            'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30', 
            'sales_lag_60', 'sales_lag_90',
            'promo_holiday', 'store_cluster_promo']

# Prepare the feature matrix and target variable
X = df_train[features]
y = df_train['sales']

## Regularization Tuning: Fine-tune L1 and L2 Regularization

In [48]:
# Set LightGBM parameters with fine-tuned regularization
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 200,  # Adjusted based on previous tuning
    'min_data_in_leaf': 50,  # Regularization tuning
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 1,    # L2 regularization
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'random_state': 42
}

# Train the model using TimeSeriesSplit and LightGBM
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import lightgbm as lgb
import numpy as np

tscv = TimeSeriesSplit(n_splits=5)
rmsle_list = []

for train_index, val_index in tscv.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]

    model = lgb.train(params, train_data, valid_sets=[val_data], num_boost_round=1000, callbacks=callbacks)

    # Predictions and RMSLE Calculation
    y_val_pred = model.predict(X_val)
    y_val_pred = np.clip(y_val_pred, a_min=0, a_max=None)  # Clip negative predictions

    rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
    rmsle_list.append(rmsle)
    print(f"Validation RMSLE: {rmsle}")

# Average RMSLE across all splits
avg_rmsle = np.mean(rmsle_list)
print(f"Average RMSLE across all splits: {avg_rmsle}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2800
[LightGBM] [Info] Number of data points in the train set: 501336, number of used features: 17
[LightGBM] [Info] Start training from score 207.328590
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 426.396
[200]	valid_0's rmse: 296.707
[300]	valid_0's rmse: 266.716
[400]	valid_0's rmse: 257.897
[500]	valid_0's rmse: 254.907
[600]	valid_0's rmse: 253.214
[700]	valid_0's rmse: 252.076
[800]	valid_0's rmse: 251.307
[900]	valid_0's rmse: 250.75
[1000]	valid_0's rmse: 250.368
Did not meet early stopping. Best iteration is:
[981]	valid_0's rmse: 250.346
Validation RMSLE: 0.8912228132695358
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018473 seconds.
You can set