In [1]:
#This is a notebook using the darts library for time series forecasting. 
#It uses the data from a kaggle competition
#https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data

In [2]:
#Version 4 - LGBMRegressor
#Version 5 - XGBoost
#Version 6 - LGBM Regressor removing January and December

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


2024-04-02 14:35:25.222044: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 14:35:25.222215: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 14:35:25.409872: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:

from numpy import std
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

In [6]:
#Load notebooks from Kaggle.  To find the notebook in Kaggle
#Click File
#Select Add input
#Search for Store Sales - Time Series Forecasting
#Hit the plus button to turn thhe circle with a white bkgrd and black cross
    #to a button with a black bkgrd and white cross


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [7]:
# Imported transactions.csv, train.csv, stores.csv, test.csv, oil.csv, holidays_events.csv
transactions_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')
train_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
stores_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
test_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
oil_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays_events_df = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')


In [8]:
#Create List of dataframes so that we can ensure all date columns are in the correct format
df_list=[transactions_df, train_df, test_df, oil_df, holidays_events_df]

for df in df_list:
    df['date'] = pd.to_datetime(df['date'])

In [9]:
# Create a new DataFrame with all dates from Jan 1st, 2013, to Aug 31st, 2017
all_dates = pd.date_range(start='2013-01-01', end='2017-08-31', freq='D')
all_dates_df = pd.DataFrame({'date': all_dates})

# Merge 'all_dates_df' with 'df_oil' to fill missing dates
oil_df = pd.merge(all_dates_df, oil_df, on='date', how='left')

oil_df.ffill(axis=0, inplace=True)
oil_df.bfill(axis=0, inplace=True)

In [10]:
#We are going to ignore the future warning for fillna

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [11]:
#Merge dataframes by date for training dataframe

# Merge oil prices into the DataFrame
df = train_df.merge(oil_df, on='date', how='left')

# Merge national holidays information into the DataFrame
df = df.merge(holidays_events_df[['date', 'transferred']], on='date', how='left')
df['national_holiday'] = df['transferred'].fillna(False)
df.drop(columns=['transferred'], inplace=True)

# Add columns for day of the week, day of the month, day of the year, month, and year
df['day_of_week'] = df['date'].dt.dayofweek  # Monday=0, Sunday=6
df['day_of_month'] = df['date'].dt.day
df['day_of_year'] = df['date'].dt.dayofyear
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

train_df=df

In [12]:
#Merge dataframes by date for test dataframe

# Merge oil prices into the DataFrame
df = test_df.merge(oil_df, on='date', how='left')

# Merge national holidays information into the DataFrame
df = df.merge(holidays_events_df[['date', 'transferred']], on='date', how='left')
df['national_holiday'] = df['transferred'].fillna(False)
df.drop(columns=['transferred'], inplace=True)

# Add columns for day of the week, day of the month, day of the year, month, and year
df['day_of_week'] = df['date'].dt.dayofweek  # Monday=0, Sunday=6
df['day_of_month'] = df['date'].dt.day
df['day_of_year'] = df['date'].dt.dayofyear
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

test_df=df

In [13]:
# Filter out rows with December and January
train_df = train_df[~train_df['date'].dt.month.isin([12, 1])]

In [14]:
#PreProcess training df
#Creating a dataframe for each store_nbr/family combination and storing it in the 
    #dfs_by_combination dictionary

# Get unique combinations of store_nbr and family
store_family_combinations = train_df[['store_nbr', 'family']].drop_duplicates()

# Create a dictionary to store dataframes for each combination
dfs_by_combination = {}

# Iterate over each combination
for index, row in store_family_combinations.iterrows():
    store_nbr = row['store_nbr']
    family = row['family']
    
    # Filter data for the current combination
    df = train_df[(train_df['store_nbr'] == store_nbr) & (train_df['family'] == family)]
    
    # Store the dataframe in the dictionary
    dfs_by_combination[(store_nbr, family)] = df

In [15]:
#Preprocessing train_df continued
# Iterate over each key-value pair in dfs_by_combination
for key, df in dfs_by_combination.items():

    # Update the DataFrame in the dictionary
    dfs_by_combination[key] = df

In [16]:
#PreProcess testing df
#Creating a dataframe for each store_nbr/family combination and storing it in the 
    #dfs_by_combination_test dictionary

#We will use this to essentially run the model on each store number/family combination 
    #to predict

# Get unique combinations of store_nbr and family
store_family_combinations = test_df[['store_nbr', 'family']].drop_duplicates()

# Create a dictionary to store dataframes for each combination
dfs_by_combination_test = {}

# Iterate over each combination
for index, row in store_family_combinations.iterrows():
    store_nbr = row['store_nbr']
    family = row['family']
    
    # Filter data for the current combination
    df = test_df[(test_df['store_nbr'] == store_nbr) & (test_df['family'] == family)]
    
    # Store the dataframe in the dictionary
    dfs_by_combination_test[(store_nbr, family)] = df

In [17]:
#Preprocessing test_df continued
# Iterate over each key-value pair in dfs_by_combination_test
for key, df in dfs_by_combination_test.items():

    # Update the DataFrame in the dictionary
    dfs_by_combination_test[key] = df

In [18]:
#Calls a single dataframe within our dictionary of dataframes and breaks it into feature variables (X)
#and target variable y which is sales.  It drops unimportant features like the id, store number and family.

#We are doing this to determine the best model on a single dataframe to reduce processing time.
#The assumption is that the model that predicts the best one or two dataframes will do a good job
#Of predicting the other dataframes

#This df was picked at random
X = dfs_by_combination[(21, 'AUTOMOTIVE')].drop(columns=['sales','id', 'store_nbr','family'])
y = dfs_by_combination[(27, 'AUTOMOTIVE')]['sales']

In [19]:
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
#To use MAPE, we can have no zero values for the true value.  
#We will need to replace with a value close to zero to avoid division by zero error

y_val=y_val.replace(0, .01)

In [21]:
#Dropping the date as it messes up the model and we have captured it with other features
X_train_LGBM=X_train.drop(columns='date')
#y_train_LGBM=y_train.drop(columns='date')

X_val_LGBM=X_val.drop(columns='date')
#y_val_LGBM=y_val.drop(columns='date')

In [22]:
#Calls the model LGBMRegressor
model = LGBMRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [23]:
#Training the model on the training data from our one dataframe
model.fit(X_train_LGBM, y_train)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 560
[LightGBM] [Info] Number of data points in the train set: 1147, number of used features: 7
[LightGBM] [Info] Start training from score 4.808195


In [24]:
#Using the trained model to predict and scoring with MAPE
y_pred = model.predict(X_val_LGBM)

print(mean_absolute_percentage_error(y_val, y_pred))



13.224416165297145


In [25]:
#We will do the same steps for XGBRegressor.  We can use the same data we used as LGBM
model2=xgb.XGBRegressor()
model2.fit(X_train_LGBM, y_train)

y_pred = model2.predict(X_val_LGBM)

print(mean_absolute_percentage_error(y_val, y_pred))


14.208904855696282


In [26]:
#The inputs need to be slightly manipulated to work with the model

X_nn = X_train.drop(columns = ['date'])
X_nn['national_holiday'] = np.multiply(X_nn['national_holiday'], 1)

X_val_nn = X_val.drop(columns = ['date'])
X_val_nn['national_holiday'] = np.multiply(X_val_nn['national_holiday'], 1)

y_val_nn=y_val.drop(columns='date')



In [27]:
#Now constructing a neural network to do predictions

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the model architecture
model3 = Sequential([
    Dense(64, activation='relu', input_shape=(8,)),  # Input layer with 8 features
    Dense(64, activation='relu'),  # Hidden layer with 64 neurons and ReLU activation
    Dense(1)  # Output layer with 1 neuron (for regression task)
])

# Compile the model
model3.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model3.fit(X_nn, y_train, epochs=15, batch_size=4)  # Adjust epochs and batch_size as needed


Epoch 1/15


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 1217.3639
Epoch 2/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 19.6008
Epoch 3/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 22.8760
Epoch 4/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 20.5996
Epoch 5/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 19.2302
Epoch 6/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 19.9301
Epoch 7/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 25.0521
Epoch 8/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 17.5046
Epoch 9/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 26.0598
Epoch 10/15
[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - lo

<keras.src.callbacks.history.History at 0x7ed5b8647b50>

In [28]:
#Predicting based on neural network model

y_pred = model3.predict(X_val_nn)

y_pred = y_pred.reshape(-1) # reshape array from (343, 1) to (343, ) to use the mape function

print(mean_absolute_percentage_error(y_val_nn, y_pred))


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
26.771276524367188


In [29]:
#Now that we have picked a model and features based on one store_nbr, family combination
#We will fit that model to every store_nbr, family combination and make a prediction

#This is the model we are using.  Verbose set to -1 removes a number of warnings and other unneeded info
model_LGBM = LGBMRegressor(verbose=-1)
#model_XGB = xgb.XGBRegressor()


#Construct an empty dataframe. We will fill this with predictions from each store_nbr, family combination
y_combined_full=pd.DataFrame()

#Loop through all the store_nbr, family combinations in the test dataset
for key, family in dfs_by_combination_test:
    
    #Split the training dataset into X and Y on the store_nbr, family combination
    X_train = dfs_by_combination[(key, family)].drop(columns=['date', 'id', 'store_nbr','family', 'sales'])
    y_train = dfs_by_combination[(key, family)]['sales']
    
    #Train the model on the store_nbr, family combination
    model_LGBM.fit(X_train, y_train)
    
    #We need the features from the Test dataset
    X_test = dfs_by_combination_test[(key, family)].drop(columns=['date', 'id', 'store_nbr','family'])

    #This generates a list of ids for the final dataframe
    ids = dfs_by_combination_test[(key, family)]['id']
    
    #Predictions using the model for the store_nbr, family combination
    y_pred = pd.DataFrame(model_LGBM.predict(X_test), columns=['sales'])
    y_pred = y_pred.values.reshape(-1)

    #This is the id to predicted sales for this store_nbr, family combination
    y_combined= pd.DataFrame({'id': ids, 'sales': list(y_pred)}, columns=['id', 'sales'])

    #This adds the dataframe we just created to the store_nbr, family combination
    #At the end of the loop we will have a dataframe for id and sales for all
    #store_nbr, family combinations
    y_combined_full = pd.concat([y_combined_full, y_combined], axis=0)

    

In [30]:
#This removes the index created by the loop and sets it to the id from the test dataset
y_combined_full.set_index('id', inplace=True)

In [31]:
#Sorts the ids 
y_combined_full.sort_index()

Unnamed: 0_level_0,sales
id,Unnamed: 1_level_1
3000888,4.671359
3000889,0.000000
3000890,5.400940
3000891,2486.062196
3000892,0.406353
...,...
3029395,383.212233
3029396,87.022945
3029397,1233.014012
3029398,70.428709


In [32]:
#Converts the dataframe to CSV file for submission to contest
y_combined_full.to_csv('submission.csv',index=True)