In [30]:
import pandas as pd
from datetime import timedelta


In [29]:
train_df = pd.read_csv('../data/raw/train.csv', parse_dates=['date'])
holiday_df = pd.read_csv('../data/raw/holidays_events.csv', parse_dates=['date'])
oil_df = pd.read_csv('../data/raw/oil.csv', parse_dates=['date'])
stores_df = pd.read_csv('../data/raw/stores.csv')
transactions_df = pd.read_csv('../data/raw/transactions.csv', parse_dates=['date'])
train_df.shape

(3000888, 6)

In [31]:
# print null rows of any dataframe
def print_null_rows(df, name):
    null_rows = df[df.isnull().any(axis=1)]
    if not null_rows.empty:
        print(f"shape:{df.shape}, Null rows in {name}:\n{null_rows}\n")
    else:
        print(f"shape:{df.shape}, No null rows in {name}.\n")
    
print_null_rows(train_df, 'train_df')
print_null_rows(holiday_df, 'holiday_df')
print_null_rows(oil_df, 'oil_df')
print_null_rows(stores_df, 'stores_df')
print_null_rows(transactions_df, 'transactions_df')

shape:(3000888, 6), No null rows in train_df.

shape:(350, 6), No null rows in holiday_df.

shape:(1218, 2), Null rows in oil_df:
           date  dcoilwtico
0    2013-01-01         NaN
14   2013-01-21         NaN
34   2013-02-18         NaN
63   2013-03-29         NaN
104  2013-05-27         NaN
132  2013-07-04         NaN
174  2013-09-02         NaN
237  2013-11-28         NaN
256  2013-12-25         NaN
261  2014-01-01         NaN
274  2014-01-20         NaN
294  2014-02-17         NaN
338  2014-04-18         NaN
364  2014-05-26         NaN
393  2014-07-04         NaN
434  2014-09-01         NaN
497  2014-11-27         NaN
517  2014-12-25         NaN
522  2015-01-01         NaN
534  2015-01-19         NaN
554  2015-02-16         NaN
588  2015-04-03         NaN
624  2015-05-25         NaN
653  2015-07-03         NaN
699  2015-09-07         NaN
757  2015-11-26         NaN
778  2015-12-25         NaN
783  2016-01-01         NaN
794  2016-01-18         NaN
814  2016-02-15         NaN
84

## 1. Traditional Final Train

In [32]:
# interpolate missing values in oil prices
all_dates = pd.date_range(start=oil_df['date'].min(), end=oil_df['date'].max())
oil_df = oil_df.set_index('date').reindex(all_dates).rename_axis('date').reset_index()
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].interpolate(method='polynomial', order=2)
# fill backward and forward fill for oil prices
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].bfill()

In [33]:
print_null_rows(train_df, 'train_df')
print_null_rows(holiday_df, 'holiday_df')
print_null_rows(oil_df, 'oil_df')
print_null_rows(stores_df, 'stores_df')
print_null_rows(transactions_df, 'transactions_df')

shape:(3000888, 6), No null rows in train_df.

shape:(350, 6), No null rows in holiday_df.

shape:(1704, 2), No null rows in oil_df.

shape:(54, 5), No null rows in stores_df.

shape:(83488, 3), No null rows in transactions_df.



In [34]:
# create faetures for train and test sets
def create_features(df):
    def days_since_payday(date):
        day = date.day
        if day <= 15:
            # Days since last month's end
            last_month_end = date.replace(day=1) - timedelta(days=1)
            return (date - last_month_end).days
        else:
            # Days since 15th of current month
            current_month_15th = date.replace(day=15)
            return (date - current_month_15th).days
        
    def days_until_payday(date):
        day = date.day
        if day < 15:
            # Days until 15th
            return 15 - day
        else:
            # Days until month end
            next_month = date.replace(day=28) + timedelta(days=4)
            month_end = next_month - timedelta(days=next_month.day)
            return (month_end - date).days
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekofyear'] = df['date'].dt.isocalendar().week
    df['day_of_year'] = df['date'].dt.dayofyear
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['is_quarter_start'] = df['date'].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df['date'].dt.is_quarter_end.astype(int)
    df['is_payday'] = ((df['day'] == 15) | df['date'].dt.is_month_end).astype(int)
    df['days_since_payday'] = df['date'].apply(days_since_payday)
    df['days_until_payday'] = df['date'].apply(days_until_payday)
    return df

def create_lag_features(df, target_col='sales', lags=[1, 7, 14, 30]):
    """Create lag features for time series"""
    df_sorted = df.sort_values(['store_nbr', 'family', 'date'])
    
    for lag in lags:
        df_sorted[f'{target_col}_lag_{lag}'] = df_sorted.groupby(['store_nbr', 'family'])[target_col].shift(lag)
    
    return df_sorted

def create_rolling_features(df, target_col='sales', windows=[7, 14, 30]):
    """Create rolling window statistics"""
    df_sorted = df.sort_values(['store_nbr', 'family', 'date'])
    
    for window in windows:
        # Rolling mean
        df_sorted[f'{target_col}_rolling_mean_{window}'] = df_sorted.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        
        # Rolling std
        df_sorted[f'{target_col}_rolling_std_{window}'] = df_sorted.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).std()
        )
        
        # Rolling max
        df_sorted[f'{target_col}_rolling_max_{window}'] = df_sorted.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).max()
        )
        
        # Rolling min
        df_sorted[f'{target_col}_rolling_min_{window}'] = df_sorted.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).min()
        )
    
    return df_sorted


In [15]:
train_df = create_features(train_df)
print_null_rows(train_df, 'train_df_temporal')

shape:(3000888, 20), No null rows in train_df_temporal.



In [16]:
train_df = create_rolling_features(train_df)
# print number of nulls in train_df_rolling
train_df.isnull().sum()

id                          0
date                        0
store_nbr                   0
family                      0
sales                       0
onpromotion                 0
year                        0
month                       0
day                         0
dayofweek                   0
weekofyear                  0
day_of_year                 0
is_weekend                  0
is_month_start              0
is_month_end                0
is_quarter_start            0
is_quarter_end              0
is_payday                   0
days_since_payday           0
days_until_payday           0
sales_rolling_mean_7        0
sales_rolling_std_7      1782
sales_rolling_max_7         0
sales_rolling_min_7         0
sales_rolling_mean_14       0
sales_rolling_std_14     1782
sales_rolling_max_14        0
sales_rolling_min_14        0
sales_rolling_mean_30       0
sales_rolling_std_30     1782
sales_rolling_max_30        0
sales_rolling_min_30        0
dtype: int64

In [17]:
train_df = create_lag_features(train_df)
train_df.isnull().sum()

id                           0
date                         0
store_nbr                    0
family                       0
sales                        0
onpromotion                  0
year                         0
month                        0
day                          0
dayofweek                    0
weekofyear                   0
day_of_year                  0
is_weekend                   0
is_month_start               0
is_month_end                 0
is_quarter_start             0
is_quarter_end               0
is_payday                    0
days_since_payday            0
days_until_payday            0
sales_rolling_mean_7         0
sales_rolling_std_7       1782
sales_rolling_max_7          0
sales_rolling_min_7          0
sales_rolling_mean_14        0
sales_rolling_std_14      1782
sales_rolling_max_14         0
sales_rolling_min_14         0
sales_rolling_mean_30        0
sales_rolling_std_30      1782
sales_rolling_max_30         0
sales_rolling_min_30         0
sales_la

In [19]:
train_df.dropna(inplace=True)
print_null_rows(train_df, 'train_df_final')

shape:(2947428, 36), No null rows in train_df_final.



In [35]:
national_holidays = holiday_df[holiday_df['locale'] == 'National']['date'].unique()
regional_holidays = holiday_df[holiday_df['locale'] == 'Regional']['date'].unique()
local_holidays = holiday_df[holiday_df['locale'] == 'Local']['date'].unique()
additional_holidays = holiday_df[holiday_df['type'] == 'Additional']['date'].unique()
working_days = holiday_df[holiday_df['type'] == 'Work Day']['date'].unique()
events = holiday_df[holiday_df['type'] == 'Event']['date'].unique()
bridge_days = holiday_df[holiday_df['type'] == 'Bridge']['date'].unique()
transsferred_days = holiday_df[holiday_df['transferred'] == True]['date'].unique()

In [36]:
# add holiday features to train
def add_holiday_features(df):
    df['is_national_holiday'] = df['date'].isin(national_holidays).astype(int)
    df['is_regional_holiday'] = df['date'].isin(regional_holidays).astype(int)
    df['is_local_holiday'] = df['date'].isin(local_holidays).astype(int)
    df['is_additional_holiday'] = df['date'].isin(additional_holidays).astype(int)
    df['is_working_day'] = df['date'].isin(working_days).astype(int)
    df['is_event'] = df['date'].isin(events).astype(int)
    df['is_bridge_day'] = df['date'].isin(bridge_days).astype(int)
    df['is_transferred_day'] = df['date'].isin(transsferred_days).astype(int)
    return df

In [None]:
# create features for train and test sets
train_df = add_holiday_features(train_df)
print_null_rows(train_df, 'train_df')

In [None]:
train_df = train_df.merge(oil_df, on='date', how='left')
print_null_rows(train_df, 'train_df_final')

In [None]:
train_df = train_df.merge(stores_df, on='store_nbr', how='left')
print_null_rows(train_df, 'train_df_final')

In [None]:
train_df = train_df.merge(transactions_df, on=['date', 'store_nbr'], how='left')
print_null_rows(train_df, 'train_df_final')

shape:(2947428, 50), Null rows in train_df_final:
              id       date  store_nbr      family  sales  onpromotion  year  \
334       648648 2014-01-01          1  AUTOMOTIVE    0.0            0  2014   
698      1297296 2015-01-01          1  AUTOMOTIVE    0.0            0  2015   
885      1630530 2015-07-07          1  AUTOMOTIVE    0.0            0  2015   
1062     1945944 2016-01-01          1  AUTOMOTIVE    0.0            0  2016   
1063     1947726 2016-01-02          1  AUTOMOTIVE    7.0            0  2016   
...          ...        ...        ...         ...    ...          ...   ...   
2946472  1298945 2015-01-01         54     SEAFOOD    0.0            0  2015   
2946836  1947593 2016-01-01         54     SEAFOOD    0.0            0  2016   
2946838  1951157 2016-01-03         54     SEAFOOD    2.0            0  2016   
2946839  1952939 2016-01-04         54     SEAFOOD    3.0            0  2016   
2947201  2598023 2017-01-01         54     SEAFOOD    0.0            0

In [27]:
# fill missing values in transactions with 0
train_df['transactions'] = train_df['transactions'].fillna(0)
print_null_rows(train_df, 'train_df_final')

shape:(2947428, 50), No null rows in train_df_final.



In [None]:
train_df.to_csv('../data/interim/traditional_final_train.csv', index=False)

## 2. Test Data Feature Engineering (Avoiding Data Leakage)

In [37]:
test_df = pd.read_csv('../data/raw/test.csv', parse_dates=['date'])

In [38]:
print_null_rows(test_df, 'test_df')
test_df.isnull().sum()

shape:(28512, 5), No null rows in test_df.



id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64

In [39]:
test_df = create_features(test_df)
print_null_rows(test_df, 'test_df_temporal')
test_df.isnull().sum()

shape:(28512, 19), No null rows in test_df_temporal.



id                   0
date                 0
store_nbr            0
family               0
onpromotion          0
year                 0
month                0
day                  0
dayofweek            0
weekofyear           0
day_of_year          0
is_weekend           0
is_month_start       0
is_month_end         0
is_quarter_start     0
is_quarter_end       0
is_payday            0
days_since_payday    0
days_until_payday    0
dtype: int64

In [40]:
# For test data, we need to be careful about lag and rolling features
# We'll use the last available training data to create these features

# Get the last date from training data
last_train_date = train_df['date'].max()
print(f"Last training date: {last_train_date}")

# Get the first date from test data
first_test_date = test_df['date'].min()
print(f"First test date: {first_test_date}")

Last training date: 2017-08-15 00:00:00
First test date: 2017-08-16 00:00:00


In [41]:
# Create a function to safely create lag features for test data
def create_safe_lag_features_for_test(train_df, test_df, target_col='sales', lags=[1, 7, 14, 30]):
    """Create lag features for test data using only training data to avoid leakage"""
    
    # Combine train and test for consistent processing
    # But we'll only use train data for lag calculations
    combined_df = pd.concat([train_df, test_df], ignore_index=True)
    combined_df = combined_df.sort_values(['store_nbr', 'family', 'date'])
    
    # Create lag features
    for lag in lags:
        combined_df[f'{target_col}_lag_{lag}'] = combined_df.groupby(['store_nbr', 'family'])[target_col].shift(lag)
    
    # Return only test data portion
    test_with_lags = combined_df[combined_df['date'] >= first_test_date].copy()
    return test_with_lags

# Apply lag features to test data
test_df_with_lags = create_safe_lag_features_for_test(train_df, test_df)
print(f"Test data shape with lag features: {test_df_with_lags.shape}")

Test data shape with lag features: (28512, 24)


In [42]:
test_df_with_lags.isnull().sum()

id                       0
date                     0
store_nbr                0
family                   0
sales                28512
onpromotion              0
year                     0
month                    0
day                      0
dayofweek                0
weekofyear               0
day_of_year              0
is_weekend               0
is_month_start           0
is_month_end             0
is_quarter_start         0
is_quarter_end           0
is_payday                0
days_since_payday        0
days_until_payday        0
sales_lag_1          26730
sales_lag_7          16038
sales_lag_14          3564
sales_lag_30             0
dtype: int64

In [43]:
# Create rolling features for test data using training data history
def create_safe_rolling_features_for_test(train_df, test_df, target_col='sales', windows=[7, 14, 30]):
    """Create rolling features for test data using training data history"""
    
    # Combine train and test data
    combined_df = pd.concat([train_df, test_df], ignore_index=True)
    combined_df = combined_df.sort_values(['store_nbr', 'family', 'date'])
    
    for window in windows:
        # Rolling mean
        combined_df[f'{target_col}_rolling_mean_{window}'] = combined_df.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        
        # Rolling std
        combined_df[f'{target_col}_rolling_std_{window}'] = combined_df.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).std()
        )
        
        # Rolling max
        combined_df[f'{target_col}_rolling_max_{window}'] = combined_df.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).max()
        )
        
        # Rolling min
        combined_df[f'{target_col}_rolling_min_{window}'] = combined_df.groupby(['store_nbr', 'family'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).min()
        )
    
    # Return only test data portion
    test_with_rolling = combined_df[combined_df['date'] >= first_test_date].copy()
    return test_with_rolling

# Apply rolling features to test data
test_df_processed = create_safe_rolling_features_for_test(train_df, test_df_with_lags)
print(f"Test data shape with rolling features: {test_df_processed.shape}")

Test data shape with rolling features: (28512, 36)


In [45]:
test_df_processed.isnull().sum()

id                           0
date                         0
store_nbr                    0
family                       0
sales                    28512
onpromotion                  0
year                         0
month                        0
day                          0
dayofweek                    0
weekofyear                   0
day_of_year                  0
is_weekend                   0
is_month_start               0
is_month_end                 0
is_quarter_start             0
is_quarter_end               0
is_payday                    0
days_since_payday            0
days_until_payday            0
sales_lag_1              26730
sales_lag_7              16038
sales_lag_14              3564
sales_lag_30                 0
sales_rolling_mean_7     17820
sales_rolling_std_7      19602
sales_rolling_max_7      17820
sales_rolling_min_7      17820
sales_rolling_mean_14     5346
sales_rolling_std_14      7128
sales_rolling_max_14      5346
sales_rolling_min_14      5346
sales_ro

In [46]:
# Add holiday features to test data
test_df_processed = add_holiday_features(test_df_processed)
print_null_rows(test_df_processed, 'test_df_holidays')

shape:(28512, 44), Null rows in test_df_holidays:
              id       date  store_nbr      family  sales  onpromotion  \
3000888  3000888 2017-08-16          1  AUTOMOTIVE    NaN            0   
3000889  3002670 2017-08-17          1  AUTOMOTIVE    NaN            0   
3000890  3004452 2017-08-18          1  AUTOMOTIVE    NaN            0   
3000891  3006234 2017-08-19          1  AUTOMOTIVE    NaN            0   
3000892  3008016 2017-08-20          1  AUTOMOTIVE    NaN            0   
...          ...        ...        ...         ...    ...          ...   
3029395  3022139 2017-08-27         54     SEAFOOD    NaN            0   
3029396  3023921 2017-08-28         54     SEAFOOD    NaN            0   
3029397  3025703 2017-08-29         54     SEAFOOD    NaN            0   
3029398  3027485 2017-08-30         54     SEAFOOD    NaN            0   
3029399  3029267 2017-08-31         54     SEAFOOD    NaN            0   

           year  month   day  dayofweek  ...  sales_rolling_m

In [47]:
# Merge oil prices with test data
test_df_processed = test_df_processed.merge(oil_df, on='date', how='left')
print_null_rows(test_df_processed, 'test_df_oil')

shape:(28512, 45), Null rows in test_df_oil:
            id       date  store_nbr      family  sales  onpromotion    year  \
0      3000888 2017-08-16          1  AUTOMOTIVE    NaN            0  2017.0   
1      3002670 2017-08-17          1  AUTOMOTIVE    NaN            0  2017.0   
2      3004452 2017-08-18          1  AUTOMOTIVE    NaN            0  2017.0   
3      3006234 2017-08-19          1  AUTOMOTIVE    NaN            0  2017.0   
4      3008016 2017-08-20          1  AUTOMOTIVE    NaN            0  2017.0   
...        ...        ...        ...         ...    ...          ...     ...   
28507  3022139 2017-08-27         54     SEAFOOD    NaN            0  2017.0   
28508  3023921 2017-08-28         54     SEAFOOD    NaN            0  2017.0   
28509  3025703 2017-08-29         54     SEAFOOD    NaN            0  2017.0   
28510  3027485 2017-08-30         54     SEAFOOD    NaN            0  2017.0   
28511  3029267 2017-08-31         54     SEAFOOD    NaN            0  2017.

In [48]:
# Merge store information
test_df_processed = test_df_processed.merge(stores_df, on='store_nbr', how='left')
print_null_rows(test_df_processed, 'test_df_stores')

shape:(28512, 49), Null rows in test_df_stores:
            id       date  store_nbr      family  sales  onpromotion    year  \
0      3000888 2017-08-16          1  AUTOMOTIVE    NaN            0  2017.0   
1      3002670 2017-08-17          1  AUTOMOTIVE    NaN            0  2017.0   
2      3004452 2017-08-18          1  AUTOMOTIVE    NaN            0  2017.0   
3      3006234 2017-08-19          1  AUTOMOTIVE    NaN            0  2017.0   
4      3008016 2017-08-20          1  AUTOMOTIVE    NaN            0  2017.0   
...        ...        ...        ...         ...    ...          ...     ...   
28507  3022139 2017-08-27         54     SEAFOOD    NaN            0  2017.0   
28508  3023921 2017-08-28         54     SEAFOOD    NaN            0  2017.0   
28509  3025703 2017-08-29         54     SEAFOOD    NaN            0  2017.0   
28510  3027485 2017-08-30         54     SEAFOOD    NaN            0  2017.0   
28511  3029267 2017-08-31         54     SEAFOOD    NaN            0  20

In [50]:
# For transactions in test period, we need to be careful
# We can either:
# 1. Use 0 for all test transactions (conservative)
# 2. Use historical averages for each store
# 3. Predict transactions separately

# Option 2: Use historical averages
store_avg_transactions = transactions_df.groupby('store_nbr')['transactions'].mean().reset_index()
store_avg_transactions.columns = ['store_nbr', 'avg_transactions']

# Merge with test data
test_df_processed = test_df_processed.merge(store_avg_transactions, on='store_nbr', how='left')

# Use average transactions for test period
test_df_processed['transactions'] = test_df_processed['avg_transactions']
test_df_processed.drop('avg_transactions', axis=1, inplace=True)

print_null_rows(test_df_processed, 'test_df_transactions')

shape:(28512, 50), Null rows in test_df_transactions:
            id       date  store_nbr      family  sales  onpromotion    year  \
0      3000888 2017-08-16          1  AUTOMOTIVE    NaN            0  2017.0   
1      3002670 2017-08-17          1  AUTOMOTIVE    NaN            0  2017.0   
2      3004452 2017-08-18          1  AUTOMOTIVE    NaN            0  2017.0   
3      3006234 2017-08-19          1  AUTOMOTIVE    NaN            0  2017.0   
4      3008016 2017-08-20          1  AUTOMOTIVE    NaN            0  2017.0   
...        ...        ...        ...         ...    ...          ...     ...   
28507  3022139 2017-08-27         54     SEAFOOD    NaN            0  2017.0   
28508  3023921 2017-08-28         54     SEAFOOD    NaN            0  2017.0   
28509  3025703 2017-08-29         54     SEAFOOD    NaN            0  2017.0   
28510  3027485 2017-08-30         54     SEAFOOD    NaN            0  2017.0   
28511  3029267 2017-08-31         54     SEAFOOD    NaN           

In [51]:
# Check for any remaining null values
print("Null values in test data:")
print(test_df_processed.isnull().sum())

# Fill any remaining nulls with appropriate values
# For rolling features, use forward fill
rolling_cols = [col for col in test_df_processed.columns if 'rolling' in col]
for col in rolling_cols:
    test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')

# For lag features, use forward fill
lag_cols = [col for col in test_df_processed.columns if 'lag' in col]
for col in lag_cols:
    test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')

print("\nNull values after filling:")
print(test_df_processed.isnull().sum())

Null values in test data:
id                           0
date                         0
store_nbr                    0
family                       0
sales                    28512
onpromotion                  0
year                         0
month                        0
day                          0
dayofweek                    0
weekofyear                   0
day_of_year                  0
is_weekend                   0
is_month_start               0
is_month_end                 0
is_quarter_start             0
is_quarter_end               0
is_payday                    0
days_since_payday            0
days_until_payday            0
sales_lag_1              26730
sales_lag_7              16038
sales_lag_14              3564
sales_lag_30                 0
sales_rolling_mean_7     17820
sales_rolling_std_7      19602
sales_rolling_max_7      17820
sales_rolling_min_7      17820
sales_rolling_mean_14     5346
sales_rolling_std_14      7128
sales_rolling_max_14      5346
sales_rolling

  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
  test_df_processed[col] = test_df_processed.g


Null values after filling:
id                           0
date                         0
store_nbr                    0
family                       0
sales                    28512
onpromotion                  0
year                         0
month                        0
day                          0
dayofweek                    0
weekofyear                   0
day_of_year                  0
is_weekend                   0
is_month_start               0
is_month_end                 0
is_quarter_start             0
is_quarter_end               0
is_payday                    0
days_since_payday            0
days_until_payday            0
sales_lag_1                  0
sales_lag_7                  0
sales_lag_14                 0
sales_lag_30                 0
sales_rolling_mean_7         0
sales_rolling_std_7          0
sales_rolling_max_7          0
sales_rolling_min_7          0
sales_rolling_mean_14        0
sales_rolling_std_14         0
sales_rolling_max_14         0
sales_rolli

In [52]:
# drop sales column from test data
test_df_processed.drop('sales', axis=1, inplace=True, errors='ignore')
# Save the processed test data
test_df_processed.to_csv('../data/interim/traditional_final_test.csv', index=False)

## 3. Handling Null Values for Prediction

In [None]:
def handle_nulls_for_prediction(test_df, strategy='comprehensive'):
    """
    Handle null values in test data for prediction using multiple strategies
    
    Parameters:
    test_df: DataFrame with null values
    strategy: 'forward_fill', 'group_mean', 'comprehensive'
    """
    test_df_filled = test_df.copy()
    
    if strategy == 'forward_fill':
        # Strategy 1: Forward fill within each store-family group
        lag_cols = [col for col in test_df.columns if 'lag' in col]
        rolling_cols = [col for col in test_df.columns if 'rolling' in col]
        
        for col in lag_cols + rolling_cols:
            test_df_filled[col] = test_df_filled.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
        
        # Fill any remaining nulls with 0
        test_df_filled[lag_cols + rolling_cols] = test_df_filled[lag_cols + rolling_cols].fillna(0)
    
    elif strategy == 'group_mean':
        # Strategy 2: Use historical averages from training data
        # This requires passing training data statistics
        pass  # Implement if training stats are available
    
    elif strategy == 'comprehensive':
        # Strategy 3: Comprehensive approach (recommended)
        
        # 1. Forward fill lag features within groups
        lag_cols = [col for col in test_df.columns if 'lag' in col]
        for col in lag_cols:
            test_df_filled[col] = test_df_filled.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
        
        # 2. Forward fill rolling features within groups
        rolling_cols = [col for col in test_df.columns if 'rolling' in col]
        for col in rolling_cols:
            test_df_filled[col] = test_df_filled.groupby(['store_nbr', 'family'])[col].fillna(method='ffill')
        
        # 3. For remaining nulls in lag features, use the last available value or 0
        for col in lag_cols:
            # Use median of available values for each store-family, then 0
            group_medians = test_df_filled.groupby(['store_nbr', 'family'])[col].transform('median')
            test_df_filled[col] = test_df_filled[col].fillna(group_medians).fillna(0)
        
        # 4. For remaining nulls in rolling features, use available values or 0
        for col in rolling_cols:
            if 'std' in col:
                # For standard deviation, use 0 for missing values
                test_df_filled[col] = test_df_filled[col].fillna(0)
            else:
                # For mean, max, min use group median then 0
                group_medians = test_df_filled.groupby(['store_nbr', 'family'])[col].transform('median')
                test_df_filled[col] = test_df_filled[col].fillna(group_medians).fillna(0)
    
    return test_df_filled

In [None]:
# Apply the comprehensive null handling strategy
test_df_final = handle_nulls_for_prediction(test_df_processed, strategy='comprehensive')

print("Null values before handling:")
print(test_df_processed.isnull().sum().sum())

print("\nNull values after handling:")
print(test_df_final.isnull().sum().sum())

print("\nDetailed null counts after handling:")
null_counts = test_df_final.isnull().sum()
print(null_counts[null_counts > 0])

In [None]:
# Alternative: Create indicator features for missing values
def create_missing_indicators(df):
    """Create binary indicators for originally missing values"""
    df_with_indicators = df.copy()
    
    # Create indicators for lag features
    lag_cols = [col for col in df.columns if 'lag' in col]
    for col in lag_cols:
        df_with_indicators[f'{col}_missing'] = df[col].isnull().astype(int)
    
    # Create indicators for rolling features
    rolling_cols = [col for col in df.columns if 'rolling' in col and 'std' not in col]
    for col in rolling_cols:
        df_with_indicators[f'{col}_missing'] = df[col].isnull().astype(int)
    
    return df_with_indicators

# Create missing indicators before filling nulls
test_df_with_indicators = create_missing_indicators(test_df_processed)
test_df_with_indicators = handle_nulls_for_prediction(test_df_with_indicators, strategy='comprehensive')

print(f"Test data shape with missing indicators: {test_df_with_indicators.shape}")
print(f"Original shape: {test_df_processed.shape}")
print(f"Added {test_df_with_indicators.shape[1] - test_df_processed.shape[1]} indicator features")

In [None]:
# Model-specific strategies
def prepare_for_different_models(df):
    """Prepare data differently based on model type"""
    
    # For Tree-based models (Random Forest, XGBoost, etc.)
    df_tree = df.copy()
    # Tree models can handle some nulls, but let's fill them
    df_tree = handle_nulls_for_prediction(df_tree, strategy='comprehensive')
    
    # For Linear models (Linear Regression, Lasso, etc.)
    df_linear = df.copy()
    # Linear models need all nulls filled and may benefit from indicators
    df_linear = create_missing_indicators(df_linear)
    df_linear = handle_nulls_for_prediction(df_linear, strategy='comprehensive')
    
    # For Neural Networks
    df_nn = df_linear.copy()  # Same as linear but may need scaling
    
    return {
        'tree_models': df_tree,
        'linear_models': df_linear,
        'neural_networks': df_nn
    }

# Prepare data for different model types
model_ready_data = prepare_for_different_models(test_df_processed)

for model_type, data in model_ready_data.items():
    print(f"{model_type}: {data.shape}, nulls: {data.isnull().sum().sum()}")

In [None]:
# Save the processed test data for prediction
test_df_final.to_csv('../data/interim/traditional_final_test_filled.csv', index=False)
test_df_with_indicators.to_csv('../data/interim/traditional_final_test_with_indicators.csv', index=False)

print("Saved processed test data files:")
print("1. traditional_final_test_filled.csv - Basic null handling")
print("2. traditional_final_test_with_indicators.csv - With missing value indicators")

# Final verification
print(f"\nFinal test data shape: {test_df_final.shape}")
print(f"Final null count: {test_df_final.isnull().sum().sum()}")
print(f"Ready for prediction: {test_df_final.isnull().sum().sum() == 0}")