<a href="https://colab.research.google.com/github/dimna21/ML_Final_Project/blob/main/model_experiment_TFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# Mount Google Drive (Colab only) so files under /content/drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install darts

In [38]:
import pandas as pd
# Single place to change where the project CSVs live (a mounted Google Drive folder here).
DATA_DIR = '/content/drive/MyDrive/ML_Final_Project'

features = pd.read_csv(f'{DATA_DIR}/features.csv')
stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseMerger(BaseEstimator, TransformerMixin):
    """Join sales rows with the per-store and per-date feature tables.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table. Must contain ``Store`` and ``Date``; ``transform``
        additionally joins on ``IsHoliday``.
    stores : pandas.DataFrame
        Store table. Must contain ``Store``.

    Attributes
    ----------
    feature_store : pandas.DataFrame
        ``features`` inner-joined with ``stores`` on ``Store``, with ``Date``
        parsed to datetime. Built once in the constructor.
    """

    def __init__(self, features, stores):
        # BaseEstimator.get_params() (used by repr() and sklearn.base.clone)
        # reads every constructor argument back via getattr(self, name), so the
        # arguments must be kept as same-named attributes or those calls raise
        # AttributeError.
        self.features = features
        self.stores = stores

        self.feature_store = features.merge(stores, how='inner', on='Store')
        self.feature_store['Date'] = pd.to_datetime(self.feature_store['Date'])

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``. Returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` joined with ``feature_store``.

        The join is an inner join on ``Store``, ``Date`` and ``IsHoliday``, so
        rows of ``X`` without a matching feature row are dropped. The result is
        sorted by ``Date``, ``Store``, ``Dept`` and given a fresh integer index.
        ``X`` itself is not modified.
        """
        X = X.copy()
        # Parse Date so its dtype matches feature_store['Date'] for the join.
        X['Date'] = pd.to_datetime(X['Date'])
        merged = X.merge(self.feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
        merged = merged.sort_values(by=['Date', 'Store', 'Dept']).reset_index(drop=True)
        return merged

In [40]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    """Add calendar and holiday-related columns derived from ``Date``.

    Columns added by ``transform``: ``Day``, ``Month``, ``Year``,
    ``SuperbowlWeek``, ``LaborDayWeek``, ``ThanksgivingWeek``,
    ``ChristmasWeek`` (0/1 flags), ``Days_to_Thanksgiving`` and
    ``Days_to_Christmas``. If a ``Temperature`` column is present it is
    converted from Fahrenheit to Celsius.
    """

    def __init__(self):
        # Reference dates, one per year 2010-2013, for each flagged holiday.
        # NOTE(review): these look like the week-ending dates of the holiday
        # weeks rather than the holidays themselves - confirm against the data.
        self.superbowl = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
        self.labor_day = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
        self.thanksgiving = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
        self.christmas = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``. Returns ``self``."""
        return self

    @staticmethod
    def _iso_week_start(dates):
        """Return, per timestamp, midnight of the Monday starting its ISO week.

        Two dates share an ISO (year, week) pair exactly when they share this
        Monday, so comparing week starts avoids mixing the ISO week number
        with the calendar year (the two disagree around 1 January).
        Missing dates stay missing (NaT).
        """
        stamps = pd.to_datetime(pd.Series(dates))
        # dayofweek is 0 for Monday, so subtracting it lands on the week's Monday.
        return stamps.dt.normalize() - pd.to_timedelta(stamps.dt.dayofweek, unit='D')

    def _is_holiday_week(self, date_series, holidays):
        """Return a 0/1 integer Series flagging dates in the same ISO week as any holiday.

        Missing dates are flagged 0.
        """
        holiday_week_starts = self._iso_week_start(pd.DatetimeIndex(holidays)).dropna().unique()
        return self._iso_week_start(date_series).isin(holiday_week_starts).astype(int)

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns added.

        ``X`` must contain a ``Date`` column; it is parsed to datetime if it
        is not already. ``X`` itself is not modified.
        """
        X = X.copy()
        X['Date'] = pd.to_datetime(X['Date'])

        # Convert temperature to Celsius
        if 'Temperature' in X.columns:
            X['Temperature'] = (X['Temperature'] - 32) * (5.0 / 9.0)

        # Basic date parts
        X['Day'] = X['Date'].dt.day
        X['Month'] = X['Date'].dt.month
        X['Year'] = X['Date'].dt.year

        # Flag rows that fall in the same ISO week as a known holiday date
        X['SuperbowlWeek'] = self._is_holiday_week(X['Date'], self.superbowl)
        X['LaborDayWeek'] = self._is_holiday_week(X['Date'], self.labor_day)
        X['ThanksgivingWeek'] = self._is_holiday_week(X['Date'], self.thanksgiving)
        X['ChristmasWeek'] = self._is_holiday_week(X['Date'], self.christmas)

        # Calculate days to Thanksgiving and Christmas (using Nov 24 and Dec 24 of the
        # row's own year as anchor dates; values are negative after the anchor)
        thanksgiving_dates = pd.to_datetime(X['Year'].astype(str) + "-11-24")
        christmas_dates = pd.to_datetime(X['Year'].astype(str) + "-12-24")

        X['Days_to_Thanksgiving'] = (thanksgiving_dates - X['Date']).dt.days
        X['Days_to_Christmas'] = (christmas_dates - X['Date']).dt.days

        return X


In [41]:
class MissingValueFiller(BaseEstimator, TransformerMixin):
    """Fill missing values in the markdown and macro-economic columns.

    ``MarkDown1``-``MarkDown5`` are filled with 0.0. ``CPI`` and
    ``Unemployment`` are filled with the column mean observed during ``fit``.

    Attributes
    ----------
    mean_values : dict
        Column name -> mean learned by the most recent ``fit`` call.
    """

    def __init__(self):
        self.markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        self.mean_cols = ['CPI', 'Unemployment']
        self.mean_values = {}

    def fit(self, X, y=None):
        """Learn the mean of every ``mean_cols`` column present in ``X``.

        The mapping is rebuilt from scratch on each call so that a refit never
        keeps a mean learned from earlier data for a column absent from ``X``.
        """
        self.mean_values = {
            col: X[col].mean()
            for col in self.mean_cols
            if col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the missing values filled.

        Columns absent from ``X`` are skipped, as are mean-filled columns for
        which no mean was learned.
        """
        X = X.copy()

        # Fill markdowns with 0
        for col in self.markdown_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0.0)

        # Fill CPI and Unemployment with learned mean
        for col in self.mean_cols:
            if col in X.columns and col in self.mean_values:
                X[col] = X[col].fillna(self.mean_values[col])

        return X


In [42]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Replace the ``Type`` letter and the ``IsHoliday`` flag with integer codes.

    ``Type`` uses A -> 3, B -> 2, C -> 1; ``IsHoliday`` uses False -> 0,
    True -> 1. Each column is only touched when present.
    """

    def __init__(self):
        self.type_mapping = dict(A=3, B=2, C=1)
        self.holiday_mapping = {False: 0, True: 1}

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``. Returns ``self``."""
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; the input frame is left untouched.

        Values that are not keys of the relevant mapping come out as NaN,
        which is how ``Series.map`` treats unmapped values.
        """
        encoded = X.copy()

        column_codes = (
            ('Type', self.type_mapping),
            ('IsHoliday', self.holiday_mapping),
        )
        for column, codes in column_codes:
            if column in encoded.columns:
                encoded[column] = encoded[column].map(codes)

        return encoded


In [44]:
from sklearn.pipeline import Pipeline

# Preprocessing chain: join the feature/store tables, derive calendar and
# holiday columns, then fill missing values.
# NOTE(review): CategoricalEncoder is defined in this notebook but is not one
# of the steps here - confirm that leaving Type/IsHoliday unencoded is intended.
pipeline = Pipeline(steps=[
    ('merge', BaseMerger(features, stores)),
    ('feature_add', FeatureAdder()),
    ('fillna', MissingValueFiller()),
])

In [45]:
# Fit the preprocessing pipeline on the full labelled table and transform it.
# NOTE(review): MissingValueFiller learns its CPI/Unemployment means here, i.e.
# before the train/validation split performed in a later cell, so those means
# also reflect rows that end up in the validation set.
train_df = pipeline.fit_transform(train)

In [46]:
import pandas as pd

# Chronological 80/20 split, made independently for every (Store, Dept) series:
# the earliest 80% of a series' rows go to training, the rest to validation.
# NOTE: this cell rebinds `train_df`, so re-running it without first re-running
# the preprocessing cell splits the already-reduced training frame again.
train_list, val_list = [], []

for _, series_rows in train_df.groupby(['Store', 'Dept']):
    ordered = series_rows.sort_values('Date')
    cutoff = int(0.8 * len(ordered))

    train_list.append(ordered.iloc[:cutoff])
    val_list.append(ordered.iloc[cutoff:])

train_df = pd.concat(train_list).reset_index(drop=True)
val_df = pd.concat(val_list).reset_index(drop=True)


In [47]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Size,Day,Month,Year,SuperbowlWeek,LaborDayWeek,ThanksgivingWeek,ChristmasWeek,Days_to_Thanksgiving,Days_to_Christmas
0,1,1,2010-02-05,24924.5,False,5.727778,2.572,0.0,0.0,0.0,...,151315,5,2,2010,0,0,0,0,292,322
1,1,1,2010-02-12,46039.49,True,3.616667,2.548,0.0,0.0,0.0,...,151315,12,2,2010,1,0,0,0,285,315
2,1,1,2010-02-19,41595.55,False,4.405556,2.514,0.0,0.0,0.0,...,151315,19,2,2010,0,0,0,0,278,308
3,1,1,2010-02-26,19403.54,False,8.127778,2.561,0.0,0.0,0.0,...,151315,26,2,2010,0,0,0,0,271,301
4,1,1,2010-03-05,21827.9,False,8.055556,2.625,0.0,0.0,0.0,...,151315,5,3,2010,0,0,0,0,264,294


In [48]:
# Preview the first rows of the validation split.
val_df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Size,Day,Month,Year,SuperbowlWeek,LaborDayWeek,ThanksgivingWeek,ChristmasWeek,Days_to_Thanksgiving,Days_to_Christmas
0,1,1,2012-04-13,34684.21,False,20.594444,3.891,6186.19,3288.69,17.07,...,151315,13,4,2012,0,0,0,0,225,255
1,1,1,2012-04-20,16976.19,False,19.311111,3.877,2230.8,612.02,19.75,...,151315,20,4,2012,0,0,0,0,218,248
2,1,1,2012-04-27,16347.6,False,19.572222,3.814,3221.25,0.0,35.49,...,151315,27,4,2012,0,0,0,0,211,241
3,1,1,2012-05-04,17147.44,False,24.194444,3.749,21290.13,0.0,69.89,...,151315,4,5,2012,0,0,0,0,204,234
4,1,1,2012-05-11,18164.2,False,23.205556,3.688,8351.4,0.0,10.52,...,151315,11,5,2012,0,0,0,0,197,227


In [54]:
# Make sure Date is datetime-typed in both splits (assigned in place on each frame).
# NOTE(review): BaseMerger.transform already parses Date, so this is presumably
# just a safeguard - confirm whether it is still needed.
for split_frame in (train_df, val_df):
    split_frame['Date'] = pd.to_datetime(split_frame['Date'])