# Import Libs

In [27]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


# Import Data

In [2]:
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')

# Data Preprocessing

In [3]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [4]:
# Filter data from after 2016 only
train = train[train['date'].dt.year >= 2016]
test = test[test['date'].dt.year >= 2016]

In [5]:
# Create a new 'holiday' column in the train dataset, marking dates that are holidays based on the holidays dataset. 
train['holiday'] = train['date'].isin(holidays['date'])

# For the test dataset, mark only the specific date '2017-08-24' as a holiday.
test['holiday'] = test['date'] == pd.to_datetime('2017-08-24')

In [6]:
# Extract year, month, day, and weekday from the 'date' column in the train dataset
# and create new columns for each of these components.

train['year'] = train['date'].dt.year 
train['month'] = train['date'].dt.month   
train['day'] = train['date'].dt.day        
train['weekday'] = train['date'].dt.weekday  # Extract the weekday (0 = Monday, 6 = Sunday)

In [8]:
# Same for Test data
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['weekday'] = test['date'].dt.weekday

In [10]:
# dropping date columns from train and test
train = train.drop(columns=['date'])
test = test.drop(columns=['date'])

> One hot encoding

In [11]:
# Identify all categorial columns in the train dataset 
object_cols = train.select_dtypes(include=['object']).columns 

In [15]:
# and apply one-hot encoding to these columns in both the train and test datasets.
# The 'drop_first=True' parameter drops the first category to avoid multicollinearity.

train = pd.get_dummies(train, columns=object_cols, drop_first=True)  
test = pd.get_dummies(test, columns=object_cols, drop_first=True)

In [19]:
# Align the columns of the train and test datasets to ensure they have the same columns.
# Missing columns in either dataset (after alignment) are filled with zeros (fill_value=0).
# This is done to maintain consistency between the train and test sets for modeling.

train, test = train.align(test, join='left', axis=1, fill_value=0)

In [22]:
# Set features (X) and target (y)
# 'X' contains all columns from the train dataset except for 'sales', which is the target variable.
# 'y' contains the target variable 'sales' for the model to predict.

X = train.drop(columns=['sales'])  # Features (all columns except 'sales')
y = train['sales']  # Target variable (sales)

> data normalisation

In [24]:
# Normalising sales data with a log transformation with a skewed distribution (see EDA)
# Makes model fit better, makes data less skewed, takes care of variance. 

y_log = np.log1p(y)

In [26]:
# Split the data into training and validation sets- 80-20 split
# 'random_state=42' ensures same split every time the code is run (hence code is reproducible)

X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [28]:
# Scaling
# Apply MinMaxScaler to scale the features (X) and target (y) values to a range between 0 and 1
# scaling ensures no single feature or target value dominates because of its larger scale.

scaler_X = MinMaxScaler(feature_range=(0, 1))  # features
scaler_y = MinMaxScaler(feature_range=(0, 1))  # target variable

In [32]:
# use scaler_X to scale X_train and X_val
X_train_scaled = scaler_X.fit_transform(X_train)
X_val_scaled = scaler_X.transform(X_val)

In [31]:
# use scaler_Y to scale Y_train and Y_val
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_val_scaled = scaler_y.transform(y_val.values.reshape(-1, 1))

# LSTM Preprocessing

In [33]:
# LSTM models expect the input data to have a specific 3D shape
# number of samples (rows), number of timesteps the number of features.
# Reshape for LSTM

X_train_scaled = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_val_scaled = X_val_scaled.reshape(X_val_scaled.shape[0], 1, X_val_scaled.shape[1])

In [35]:
train

Unnamed: 0,id,store_nbr,sales,onpromotion,holiday,year,month,day,weekday,family_BABY CARE,...,family_MAGAZINES,family_MEATS,family_PERSONAL CARE,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD
1945944,1945944,1,0.000,0,True,2016,1,1,4,False,...,False,False,False,False,False,False,False,False,False,False
1945945,1945945,1,0.000,0,True,2016,1,1,4,True,...,False,False,False,False,False,False,False,False,False,False
1945946,1945946,1,0.000,0,True,2016,1,1,4,False,...,False,False,False,False,False,False,False,False,False,False
1945947,1945947,1,0.000,0,True,2016,1,1,4,False,...,False,False,False,False,False,False,False,False,False,False
1945948,1945948,1,0.000,0,True,2016,1,1,4,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,3000883,9,438.133,0,True,2017,8,15,1,False,...,False,False,False,False,False,True,False,False,False,False
3000884,3000884,9,154.553,1,True,2017,8,15,1,False,...,False,False,False,False,False,False,True,False,False,False
3000885,3000885,9,2419.729,148,True,2017,8,15,1,False,...,False,False,False,False,False,False,False,True,False,False
3000886,3000886,9,121.000,8,True,2017,8,15,1,False,...,False,False,False,False,False,False,False,False,True,False


# Model Training

In [36]:
# Instantiate Model
model = Sequential()

> LSTM Layer

In [37]:
# Add an LSTM layer with 50 units and specify the input shape
# shape 1 is number of timesteps
# shape 2 is number of features for each timestep

model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])))

> Dropout Layer

In [38]:
# Drop 20% of the LSTM units to prevent overfitting - Regularization technique
model.add(Dropout(0.2))  

> Repeat the steps

In [40]:
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))

> output layer

In [41]:
# Add a Dense output layer with 1 unit for the final prediction
model.add(Dense(units=1)) 

In [42]:
# Compile the model with the Adam optimizer and mean squared error as the loss function
model.compile(optimizer='adam', loss='mean_squared_error')

In [43]:
epochs = 100
retrain_recent_data = True
retrain_epochs = 20

In [44]:
# Train the model on the training data and validate it using the validation data
# The model will be trained for the specified number of epochs and with a batch size of 64

history = model.fit(X_train_scaled, y_train_scaled, epochs=epochs, batch_size=64, 
                    validation_data=(X_val_scaled, y_val_scaled), verbose=1)

Epoch 1/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - loss: 0.0120 - val_loss: 0.0076
Epoch 2/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - loss: 0.0079 - val_loss: 0.0069
Epoch 3/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 4ms/step - loss: 0.0073 - val_loss: 0.0066
Epoch 4/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - loss: 0.0071 - val_loss: 0.0064
Epoch 5/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - loss: 0.0069 - val_loss: 0.0063
Epoch 6/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - loss: 0.0068 - val_loss: 0.0062
Epoch 7/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - loss: 0.0067 - val_loss: 0.0060
Epoch 8/100
[1m13187/13187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 4ms/step - loss: 0.0065 - val_loss:

In [45]:
if retrain_recent_data:
    retrain_history = model.fit(X_val_scaled, y_val_scaled, epochs=retrain_epochs, batch_size=64, verbose=1)

Epoch 1/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0047
Epoch 2/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0047
Epoch 3/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0046
Epoch 4/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0046
Epoch 5/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0047
Epoch 6/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0046
Epoch 7/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0046
Epoch 8/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0046
Epoch 9/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0046
Epoch 10/20
[1m3297/3297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

# Predict

> Validation Dataset

In [49]:
# Make predictions on the validation data
y_val_pred_scaled = model.predict(X_val_scaled)

[1m6594/6594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step


In [50]:
# Inverse transform the predicted values back to the original scale
# The predictions and actual target values (y_val_scaled) are scaled, so we need to convert them back
y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled)
y_val = scaler_y.inverse_transform(y_val_scaled)

In [51]:
# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_val, y_val_pred)
print(f'Validation MSE: {mse}')

Validation MSE: 0.5465549446401388


> Test Dataset

In [52]:
# Prepare the test data by dropping the 'sales' column (since we want to predict it)
X_test = test.drop(columns=['sales'])

In [54]:
# Scale the test data using the previously fitted scaler (used for training data)
X_test_scaled = scaler_X.transform(X_test)

In [55]:
# Reshape the test data to fit the LSTM input format (samples, timesteps, features)
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

In [56]:
# Make predictions on the test data using the trained model
test_predictions_scaled = model.predict(X_test_scaled)

[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [57]:
# Convert the predicted 'sales' back to the original scale using the inverse transformation
test['sales'] = np.expm1(scaler_y.inverse_transform(test_predictions_scaled))

In [61]:
# Handle any negative predictions by setting them to 0 (since sales can't be negative)
test['sales'] = np.where(test['sales'] < 0, 0, test['sales'])

In [2]:
# Save the submission DataFrame to a CSV file for submission
submission = test[['id', 'sales']]
submission

NameError: name 'test' is not defined

In [1]:
submission.to_csv('/Users/sanchalisinghal/Desktop/KGGLE/submission.csv', index=False)

NameError: name 'submission' is not defined