In [28]:
import numpy as np
import pandas as pd
import joblib
from lightgbm import LGBMRegressor
from sklearn.metrics import root_mean_squared_error
import lightgbm as lgb
import gc
import os
import warnings 

# Suppress every warning raised through the `warnings` module for the rest of
# the session so cell output stays quiet.
# NOTE(review): no category or module filter is given, so deprecation and
# data-quality warnings from any library are hidden as well.
warnings.filterwarnings('ignore')

In [29]:
# Make the project root the working directory so the relative paths used by the
# following cells (data/, lgbmodels/, submission/) resolve; presumably this is
# also what lets `from src.pipeline import ...` find the package.
# The root can be overridden per machine through the TCS_TASK_ROOT environment
# variable; the original hard-coded location remains the default.
PROJECT_ROOT = os.environ.get("TCS_TASK_ROOT", "c://Users//divya//desktop//TCS_TASK-1")
os.chdir(PROJECT_ROOT)

In [30]:
from src.pipeline import merge_and_melt, fill_sell, fill_event, cat_to_num, add_lags, add_mean_encoding, add_sold_mean, fill_df

In [31]:
def load_eval_data(data_dir="data"):
    """Read the three raw CSV inputs used to build the evaluation dataset.

    Parameters
    ----------
    data_dir : str or os.PathLike, default "data"
        Directory containing ``calendar.csv``, ``sell_prices.csv`` and
        ``sales_train_evaluation.csv``. The default keeps the original
        behaviour of reading from ``data/`` relative to the working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sales_df, calendar_df, prices_df)`` -- sales comes first in the
        returned tuple even though the calendar file is read first.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when one of the files is missing.
    """
    calendar_df = pd.read_csv(os.path.join(data_dir, "calendar.csv"))
    prices_df = pd.read_csv(os.path.join(data_dir, "sell_prices.csv"))
    sales_df = pd.read_csv(os.path.join(data_dir, "sales_train_evaluation.csv"))
    return sales_df, calendar_df, prices_df

In [32]:
def prepare_dataframe(sales_df, calendar_df, prices_df):
    """Chain every pipeline helper over the raw tables and return the result.

    The three input frames are combined by ``merge_and_melt``; the resulting
    frame is then passed, in order, through ``fill_sell``, ``fill_event``,
    ``cat_to_num``, ``add_lags`` (lags 1, 2, 3, 7, 15 and 30),
    ``add_mean_encoding``, ``add_sold_mean`` and finally ``fill_df``.
    A progress banner is printed before each stage.

    Parameters
    ----------
    sales_df, calendar_df, prices_df
        The frames handed to ``merge_and_melt`` (in this order).

    Returns
    -------
    The frame returned by ``fill_df``.
    """
    lag_days = [1, 2, 3, 7, 15, 30]

    print("--- Merging and Melting Data ---")
    frame = merge_and_melt(sales_df, calendar_df, prices_df)

    print("--- Filling Nulls and Converting Categories ---")
    for clean_step in (fill_sell, fill_event, cat_to_num):
        frame = clean_step(frame)

    print("--- Creating Features ---")
    frame = add_lags(frame, lag_days)
    for feature_step in (add_mean_encoding, add_sold_mean):
        frame = feature_step(frame)

    print("--- Finalizing Dataset ---")
    return fill_df(frame)

In [33]:
# Read the raw sales, calendar and price tables from disk.
# sales_df stays in scope on purpose: the submission cell later reads its
# `id` and `store_id` columns to label each store's forecast rows.
sales_df, calendar_df, prices_df = load_eval_data()

In [34]:
# Build the feature frame that the per-store training loop below slices by
# `store_id` and `d`.
data = prepare_dataframe(sales_df, calendar_df, prices_df)

--- Merging and Melting Data ---
--- Filling Nulls and Converting Categories ---
--- Creating Features ---
--- Finalizing Dataset ---


In [36]:
# Trains one final LightGBM model per store on the rows before the forecast
# window, predicts the forecast window, and writes the combined predictions
# as a single submission CSV.

# Maps the integer store codes found in data['store_id'] back to store names.
d_store_id = {0: 'CA_1', 1: 'CA_2', 2: 'CA_3', 3: 'CA_4', 4: 'TX_1', 5: 'TX_2', 6: 'TX_3', 7: 'WI_1', 8: 'WI_2', 9: 'WI_3'}

forecast_days = 28
# Rows with d < first_forecast_day are used for training; rows with
# d >= first_forecast_day are predicted.
first_forecast_day = 1914
final_path = 'submission/lgb_submission.csv'

print("--- Generating Final Submission ---")

# Create the output directory up front so a missing folder cannot discard the
# results after every store has already been trained.
os.makedirs(os.path.dirname(final_path), exist_ok=True)

stores = data.store_id.unique().tolist()
store_dfs = []
for store in stores:
    store_name = d_store_id[store]
    df = data[data['store_id'] == store]

    # Load the best parameters from the optimization phase.
    # NOTE(review): joblib.load unpickles the file and can therefore execute
    # arbitrary code -- only load parameter files produced by a trusted run.
    best_params = joblib.load(f'lgbmodels/best_params_{store_name}.pkl')

    # Compute each split mask once and reuse it for features and target.
    is_train = df['d'] < first_forecast_day
    is_test = df['d'] >= first_forecast_day

    # Train on every row before the forecast window (up to d_1913).
    X_train_full = df[is_train].drop('sold', axis=1)
    y_train_full = df[is_train]['sold']

    # Create a final model instance with the best parameters
    final_model = LGBMRegressor(
        **best_params,
        n_estimators=1000,
        verbose=-1,
        n_jobs=-1,
        random_state=42
    )
    print(f'----- Training Final Model for Store: {store_name} -----')
    final_model.fit(X_train_full, y_train_full)

    X_test = df[is_test].drop('sold', axis=1)
    y_test = df[is_test]['sold']

    test_predictions = final_model.predict(X_test)
    print('prediction is complete')

    # Post-process predictions: clip negatives to zero, then round to whole units.
    test_predictions = np.maximum(0, test_predictions)
    test_predictions = np.round(test_predictions).astype(int)

    # The score is computed on the clipped and rounded predictions.
    # NOTE(review): this is only a fair hold-out score if `sold` holds real
    # sales for the forecast window AND the lag / mean features for those days
    # were not derived from them -- confirm against src.pipeline.
    rmse = root_mean_squared_error(y_test, test_predictions)
    print(f'the root mean squared error is {rmse}')

    # Prepare store submission dataframe
    num_products = df['id'].nunique()
    expected_rows = num_products * forecast_days
    if test_predictions.size != expected_rows:
        raise ValueError(
            f'Store {store_name}: got {test_predictions.size} predictions, '
            f'expected {num_products} products x {forecast_days} days = {expected_rows}'
        )
    # Column-major reshape: assumes the test rows are ordered day by day with
    # the products in the same order inside each day -- TODO confirm the
    # pipeline preserves that ordering.
    test_predictions = test_predictions.reshape(num_products, forecast_days, order = 'F')

    # Create store submission dataframe.
    # Assumes the ids in sales_df appear in the same product order as the
    # prediction rows -- TODO confirm.
    store_ids = sales_df[sales_df['store_id'] == store_name]['id'].values
    if len(store_ids) != num_products:
        raise ValueError(
            f'Store {store_name}: {len(store_ids)} ids in sales_df but '
            f'{num_products} products in the feature frame'
        )
    store_df = pd.DataFrame({'id': store_ids})

    for i in range(forecast_days):
        store_df[f'F{i+1}'] = test_predictions[:, i]

    store_dfs.append(store_df)

# Combine all stores vertically
lgb_submission_df = pd.concat(store_dfs, ignore_index=True)

# Save final submission file
lgb_submission_df.to_csv(final_path, index=False)
print(f"Final submission file saved as: {final_path}")

--- Generating Final Submission ---
----- Training Final Model for Store: CA_1 -----
prediction is complete
the root mean squared error is 0.49626114630689466
----- Training Final Model for Store: CA_2 -----
prediction is complete
the root mean squared error is 0.45254593798320686
----- Training Final Model for Store: CA_3 -----
prediction is complete
the root mean squared error is 0.5655504015662993
----- Training Final Model for Store: CA_4 -----
prediction is complete
the root mean squared error is 0.27724286947966414
----- Training Final Model for Store: TX_1 -----
prediction is complete
the root mean squared error is 0.4562579285676845
----- Training Final Model for Store: TX_2 -----
prediction is complete
the root mean squared error is 0.48447286584276317
----- Training Final Model for Store: TX_3 -----
prediction is complete
the root mean squared error is 0.5354967619725806
----- Training Final Model for Store: WI_1 -----
prediction is complete
the root mean squared error is 0.3

In [39]:
# Drop the reference to the feature frame and run a garbage-collection pass so
# its memory can be reclaimed.
# NOTE(review): this cell is not re-runnable -- a second run raises NameError
# because `data` no longer exists.
del data
gc.collect()
print("Cleaned up memory.")

Cleaned up memory.
