In [1]:
import pandas as pd
features = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/features.csv")
train = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/train.csv")
stores = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/stores.csv")
test = pd.read_csv("/content/drive/MyDrive/ML_Final_Project/test.csv")

In [None]:
!pip install dagshub
!pip install mlflow
import dagshub
import mlflow

In [3]:
dagshub.init(repo_owner='dimna21', repo_name='ML_Final_Project', mlflow=True)

In [4]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
import mlflow
import dagshub
import warnings
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
warnings.filterwarnings('ignore')

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseMerger(BaseEstimator, TransformerMixin):

    def __init__(self, features, stores):
        self.feature_store = features.merge(stores, how='inner', on='Store')
        self.feature_store['Date'] = pd.to_datetime(self.feature_store['Date'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])
        merged = X.merge(self.feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
        merged = merged.sort_values(by=['Date', 'Store', 'Dept']).reset_index(drop=True)
        return merged

class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.superbowl = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
        self.labor_day = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
        self.thanksgiving = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
        self.christmas = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Convert temperature to Celsius
        if 'Temperature' in X.columns:
            X['Temperature'] = (X['Temperature'] - 32) * (5.0 / 9.0)

        # Basic date parts
        X['Day'] = X['Date'].dt.day
        X['Month'] = X['Date'].dt.month
        X['Year'] = X['Date'].dt.year

        # Extract ISO week and year for holiday matching
        X['Week'] = X['Date'].dt.isocalendar().week
        X['YearNum'] = X['Date'].dt.year

        # Helper to flag if a date is in same ISO week/year as a known holiday
        def is_holiday_week(date_series, holidays):
            holiday_weeks = set((d.isocalendar().week, d.year) for d in holidays)
            return date_series.apply(lambda d: (d.isocalendar().week, d.year) in holiday_weeks if pd.notnull(d) else False).astype(int)

        X['SuperbowlWeek'] = is_holiday_week(X['Date'], self.superbowl)
        X['LaborDayWeek'] = is_holiday_week(X['Date'], self.labor_day)
        X['ThanksgivingWeek'] = is_holiday_week(X['Date'], self.thanksgiving)
        X['ChristmasWeek'] = is_holiday_week(X['Date'], self.christmas)

        # Calculate days to Thanksgiving and Christmas (using Nov 24 and Dec 24 as anchor dates)
        thanksgiving_dates = pd.to_datetime(X['Year'].astype(str) + "-11-24")
        christmas_dates = pd.to_datetime(X['Year'].astype(str) + "-12-24")

        X['Days_to_Thanksgiving'] = (thanksgiving_dates - X['Date']).dt.days
        X['Days_to_Christmas'] = (christmas_dates - X['Date']).dt.days

        # Clean up helper cols
        X = X.drop(columns=['Week', 'YearNum'])

        return X

class MissingValueFiller(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        self.mean_cols = ['CPI', 'Unemployment']
        self.mean_values = {}

    def fit(self, X, y=None):
        for col in self.mean_cols:
            if col in X.columns:
                self.mean_values[col] = X[col].mean()
        return self

    def transform(self, X):
        X = X.copy()

        # Fill markdowns with 0
        for col in self.markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0.0)

        # Fill CPI and Unemployment with learned mean
        for col in self.mean_cols:
            if col in X.columns and col in self.mean_values:
                X[col] = X[col].fillna(self.mean_values[col])

        return X

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.type_mapping = {'A': 3, 'B': 2, 'C': 1}
        self.holiday_mapping = {False: 0, True: 1}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if 'Type' in X.columns:
            X['Type'] = X['Type'].map(self.type_mapping)

        if 'IsHoliday' in X.columns:
            X['IsHoliday'] = X['IsHoliday'].map(self.holiday_mapping)

        return X

class StoreAggregator(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.timeseries = {}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.timeseries = {}
        for store in X['Store'].unique():
            self.aggregate_store_info(store, X)
        return self.timeseries

    def aggregate_store_info(self, store_id, X):
        store_data = X[X['Store'] == store_id].copy()

        # Check if Weekly_Sales exists (train data) or not (test data)
        has_weekly_sales = 'Weekly_Sales' in store_data.columns

        if has_weekly_sales:
            sum_columns = ['Weekly_Sales', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        else:
            sum_columns = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']

        first_columns = ['IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                        'Type', 'Size', 'Day', 'Month', 'Year', 'SuperbowlWeek',
                        'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
                        'Days_to_Thanksgiving', 'Days_to_Christmas']

        agg_dict = {}

        # Add sum columns that exist in the data
        for col in sum_columns:
            if col in store_data.columns:
                agg_dict[col] = 'sum'

        # Add first columns that exist in the data
        for col in first_columns:
            if col in store_data.columns:
                agg_dict[col] = 'first'

        aggregated = store_data.groupby(['Date', 'Store']).agg(agg_dict).reset_index()
        aggregated = aggregated.sort_values('Date').reset_index(drop=True)

        # Calculate department proportions only if Weekly_Sales exists
        if has_weekly_sales:
            dept_proportions = self.calculate_dept_proportions(store_data)
        else:
            dept_proportions = None

        self.timeseries[store_id] = (aggregated, dept_proportions)
        return aggregated

    def calculate_dept_proportions(self, store_data):
        dept_totals = store_data.groupby('Dept')['Weekly_Sales'].sum()
        store_total = store_data['Weekly_Sales'].sum()

        if store_total == 0:
            num_depts = len(dept_totals)
            return {dept: 1.0/num_depts for dept in dept_totals.index}

        dept_proportions = (dept_totals / store_total).to_dict()
        return dept_proportions

In [6]:
from sklearn.pipeline import Pipeline

# Pipeline with store aggregation
pipeline = Pipeline([
   ('merge', BaseMerger(features, stores)),
   ('feature_add', FeatureAdder()),
   ('value_fill', MissingValueFiller()),
   ('cat_encoder', CategoricalEncoder()),
   ('store_agg', StoreAggregator())
])

# Transform training data
train_aggregated = pipeline.fit_transform(train)
test_aggregated = pipeline.transform(test)


In [None]:
import warnings
warnings.filterwarnings('ignore')
from prophet import Prophet
from tqdm import tqdm
import pandas as pd
import numpy as np

# Features to use as regressors
'''
ls = ['IsHoliday', 'Type', 'Size', 'SuperbowlWeek',
     'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek',
     'Days_to_Thanksgiving', 'Days_to_Christmas']
'''
ls = ['IsHoliday', 'SuperbowlWeek', 'LaborDayWeek', 'ThanksgivingWeek', 'ChristmasWeek']

store_models = {}

for store_id in tqdm(train_aggregated.keys(), desc="Training Prophet models"):
   store_ts, dept_props = train_aggregated[store_id]
   try:
       # Prepare data for Prophet
       df_prophet = store_ts[['Date', 'Weekly_Sales']].copy()
       df_prophet.columns = ['ds', 'y']
       df_prophet['ds'] = pd.to_datetime(df_prophet['ds'])

       # Add regressors that exist in the data
       available_regressors = []
       for regressor in ls:
           if regressor in store_ts.columns:
               df_prophet[regressor] = store_ts[regressor].values
               available_regressors.append(regressor)

       # Sort by date
       df_prophet = df_prophet.sort_values('ds').reset_index(drop=True)

       # Initialize Prophet model
       model = Prophet(
           yearly_seasonality=True,
           weekly_seasonality=False,
           daily_seasonality=False,
           changepoint_prior_scale=0.01,
           seasonality_prior_scale=10.0,
           holidays_prior_scale=15.0,
           mcmc_samples=0,
           interval_width=0.8,
           growth='flat'
       )

       # Add US holidays
       model.add_country_holidays(country_name='US')

       # Add available regressors
       for regressor in available_regressors:
           model.add_regressor(regressor)

       # Fit the model
       model.fit(df_prophet)

       # Store the fitted model with department proportions and available regressors
       store_models[store_id] = (model, dept_props, available_regressors)

   except Exception as e:
       print(f"Failed to train Prophet model for Store {store_id}: {e}")
       continue

print(f"Successfully trained {len(store_models)} Prophet store models")

In [13]:
import mlflow
import dagshub
import pickle
import os
from datetime import datetime

# Set up MLflow experiment
mlflow.set_experiment("Prophet_Store_Models_Minimal_Features")

# Start MLflow run
with mlflow.start_run(run_name=f"Prophet_Store_Training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):

    # Log experiment parameters
    mlflow.log_param("model_type", "Prophet")
    mlflow.log_param("approach", "store_level_aggregated")
    mlflow.log_param("regressors", ", ".join(ls))
    mlflow.log_param("num_regressors", len(ls))
    mlflow.log_param("total_stores", len(train_aggregated))
    mlflow.log_param("successful_models", len(store_models))
    mlflow.log_param("failed_models", len(train_aggregated) - len(store_models))

    # Log Prophet model parameters
    mlflow.log_param("yearly_seasonality", True)
    mlflow.log_param("weekly_seasonality", False)
    mlflow.log_param("daily_seasonality", False)
    mlflow.log_param("changepoint_prior_scale", 0.05)
    mlflow.log_param("seasonality_prior_scale", 10.0)
    mlflow.log_param("holidays_prior_scale", 10.0)
    mlflow.log_param("mcmc_samples", 0)
    mlflow.log_param("interval_width", 0.8)
    mlflow.log_param("growth", "linear")
    mlflow.log_param("country_holidays", "US")

    # Log metrics
    mlflow.log_metric("model_success_rate", len(store_models) / len(train_aggregated))

    # Log individual store model info
    store_info = []
    total_training_periods = 0
    total_departments = 0


    # Log aggregate metrics
    mlflow.log_metric("avg_training_periods_per_store", total_training_periods / len(store_models))
    mlflow.log_metric("avg_departments_per_store", total_departments / len(store_models))
    mlflow.log_metric("total_departments", total_departments)

    # Save store information as artifact
    store_info_df = pd.DataFrame(store_info)
    store_info_path = "/content/drive/MyDrive/ML_Final_Project/prophet_store_info.csv"
    store_info_df.to_csv(store_info_path, index=False)
    mlflow.log_artifact(store_info_path)

    # Save the complete store_models dictionary as pickle
    models_path = "/content/drive/MyDrive/ML_Final_Project/prophet_store_models.pkl"
    with open(models_path, 'wb') as f:
        pickle.dump(store_models, f)
    mlflow.log_artifact(models_path)

    # Log feature list as artifact
    features_path = "/content/drive/MyDrive/ML_Final_Project/prophet_features_used.txt"
    with open(features_path, 'w') as f:
        f.write("Features used in Prophet models:\n")
        for feature in ls:
            f.write(f"- {feature}\n")
    mlflow.log_artifact(features_path)



📊 LOGGED TO MLFLOW:
Successful models: 45/45
Success rate: 100.00%
Average training periods per store: 0.0
Average departments per store: 0.0
Features used: IsHoliday, SuperbowlWeek, LaborDayWeek, ThanksgivingWeek, ChristmasWeek

📁 ARTIFACTS LOGGED:
- Complete models: prophet_store_models.pkl (1.5 MB)
- Store information: prophet_store_info.csv
- Features used: prophet_features_used.txt
🏃 View run Prophet_Store_Training_20250729_134714 at: https://dagshub.com/dimna21/ML_Final_Project.mlflow/#/experiments/11/runs/d7f9664319674bdfad7e120cc7582565
🧪 View experiment at: https://dagshub.com/dimna21/ML_Final_Project.mlflow/#/experiments/11
✅ MLflow logging completed!


In [8]:
max_train_date = pd.to_datetime(train['Date'].max())

def get_forecast(idx):
   sample = test.iloc[idx]
   store = sample['Store']
   dept = sample['Dept']
   pred_date = pd.to_datetime(sample['Date'])


   try:
       store_model, dept_props, available_regressors = store_models[store]

       # Get test data for this store
       if store not in test_aggregated:
           return 1000.0

       df, _ = test_aggregated[store]
       df = df.copy()
       df['Date'] = pd.to_datetime(df['Date'])
       df = df.sort_values('Date')

       # Get all test dates up to and including the prediction date
       relevant_data = df[df['Date'] <= pred_date].copy()

       if len(relevant_data) == 0:
           return 1000.0

       # Prepare future dataframe for Prophet
       future_data = {
           'ds': relevant_data['Date'].tolist()
       }

       # Add available regressors
       for regressor in available_regressors:
           if regressor in relevant_data.columns:
               future_data[regressor] = relevant_data[regressor].tolist()

       future_df = pd.DataFrame(future_data)

       # Make Prophet forecast
       forecast = store_model.predict(future_df)
       forecast['ds'] = pd.to_datetime(forecast['ds'])

       # Find the prediction for our target date
       target_forecast = forecast[forecast['ds'] == pred_date]

       if len(target_forecast) > 0:
           store_prediction = max(0, target_forecast['yhat'].iloc[0])

           # Apply department proportion
           if dept_props and dept in dept_props.keys():
               return store_prediction * dept_props[dept]
           elif dept_props:
               return store_prediction / len(dept_props)
           else:
               return store_prediction
       else:
           return 1000.0  # Fallback if date not found

   except Exception as e:
       print(f"Forecast failed for Store {store}, Dept {dept}: {e}")
       return 1000.0  # Fallback for any errors

In [12]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

def process_store_batch(store_data_tuple):
    """Process all predictions for a single store at once"""
    store_id, store_test_data = store_data_tuple

    # Check if store model exists
    if store_id not in store_models:
        # Return fallback predictions for all rows
        results = []
        for _, row in store_test_data.iterrows():
            results.append({
                'Id': f"{row['Store']}_{row['Dept']}_{pd.to_datetime(row['Date']).strftime('%Y-%m-%d')}",
                'Weekly_Sales': 1000.0
            })
        return results

    try:
        store_model, dept_props, available_regressors = store_models[store_id]

        # Get test aggregated data for this store
        if store_id not in test_aggregated:
            # Fallback for missing test data
            results = []
            for _, row in store_test_data.iterrows():
                results.append({
                    'Id': f"{row['Store']}_{row['Dept']}_{pd.to_datetime(row['Date']).strftime('%Y-%m-%d')}",
                    'Weekly_Sales': 1000.0
                })
            return results

        df, _ = test_aggregated[store_id]
        df = df.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values('Date')

        # Get all unique dates needed for this store
        store_test_data['Date'] = pd.to_datetime(store_test_data['Date'])
        unique_dates = sorted(store_test_data['Date'].unique())

        # Create future dataframe for ALL dates at once
        future_data = {'ds': unique_dates}

        # Add regressors for all dates
        for regressor in available_regressors:
            if regressor in df.columns:
                # Map regressor values to each unique date
                regressor_values = []
                for date in unique_dates:
                    # Find the regressor value for this date
                    date_data = df[df['Date'] <= date]
                    if len(date_data) > 0:
                        regressor_values.append(date_data[regressor].iloc[-1])
                    else:
                        regressor_values.append(0)  # Fallback

                future_data[regressor] = regressor_values

        future_df = pd.DataFrame(future_data)

        # SINGLE Prophet prediction call for ALL dates
        forecast = store_model.predict(future_df)
        forecast['ds'] = pd.to_datetime(forecast['ds'])

        # Create a mapping of date -> prediction
        date_to_prediction = dict(zip(forecast['ds'], forecast['yhat']))

        # Process all test rows for this store
        results = []
        for _, row in store_test_data.iterrows():
            pred_date = pd.to_datetime(row['Date'])
            dept = row['Dept']

            # Get store-level prediction
            if pred_date in date_to_prediction:
                store_prediction = max(0, date_to_prediction[pred_date])

                # Apply department proportion
                if dept_props and dept in dept_props:
                    final_prediction = store_prediction * dept_props[dept]
                elif dept_props:
                    final_prediction = store_prediction / len(dept_props)
                else:
                    final_prediction = store_prediction
            else:
                final_prediction = 1000.0

            results.append({
                'Id': f"{row['Store']}_{row['Dept']}_{pred_date.strftime('%Y-%m-%d')}",
                'Weekly_Sales': max(0, final_prediction)
            })

        return results

    except Exception as e:
        print(f"Batch processing failed for Store {store_id}: {e}")
        results = []
        for _, row in store_test_data.iterrows():
            results.append({
                'Id': f"{row['Store']}_{row['Dept']}_{pd.to_datetime(row['Date']).strftime('%Y-%m-%d')}",
                'Weekly_Sales': 1000.0
            })
        return results

test_by_store = list(test.groupby('Store'))


# Process stores in parallel
submission_data = []
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(process_store_batch, store_data) for store_data in test_by_store]

    for future in tqdm(as_completed(futures), total=len(futures)):
        batch_results = future.result()
        submission_data.extend(batch_results)

# Save the results
submission_df = pd.DataFrame(submission_data)
submission_df = submission_df.sort_values('Id')
submission_df.to_csv("/content/drive/MyDrive/ML_Final_Project/Prophet_Store_Submission_2.csv", index=False)


100%|██████████| 45/45 [00:21<00:00,  2.09it/s]
