In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the Walmart sales CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='walmart_dataset_PhD.csv')

In [4]:
# Drop the variables with more than 90 percent missing values, along with the
# other categorical variables that are not needed. The frame is modified in place.
df.drop(
    columns=[
        'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'wm_yr_wk',
        'weekday', 'wday', 'event_name_1', 'event_type_1',
        'event_name_2', 'event_type_2', 'date',
    ],
    inplace=True,
)

In [5]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,state_id,d,sales,month,year,snap_CA,snap_TX,snap_WI,sell_price
0,CA,d_1,12,1,2011,No,No,No,0.46
1,CA,d_2,15,1,2011,No,No,No,0.46
2,CA,d_3,0,1,2011,No,No,No,0.46
3,CA,d_4,0,2,2011,Yes,Yes,No,0.46
4,CA,d_5,0,2,2011,Yes,No,Yes,0.46


In [6]:
# Add a simple, an exponentially weighted and a cumulative moving average of a
# numeric column (by default the daily 'sales' column, 28-row window) and
# return the DataFrame with the three new columns.

def create_moving_averages(df, column='sales', window=28):
    """Add three moving-average columns computed from ``df[column]``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str, default 'sales'
        Name of the numeric column to average.
    window : int, default 28
        Number of rows in the rolling window; also used as the ``span`` of
        the exponentially weighted average.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``simple_moving_average``,
        ``exp_weighted_moving_average`` and ``cum_moving_average`` added
        (or overwritten if they already exist).

    Notes
    -----
    * All three averages are taken over the frame's current row order and
      are not grouped by any key, so rows that belong to different series
      are averaged together.
    * Every average includes the current row's own value. If ``column`` is
      later used as a prediction target, these columns carry information
      about that target.
    """
    values = df[column]

    # Simple moving average: min_periods=1 yields a value from the first row
    # onwards instead of NaN until the window is full.
    df['simple_moving_average'] = values.rolling(window=window, min_periods=1).mean()

    # Exponentially weighted moving average with span equal to the window.
    df['exp_weighted_moving_average'] = values.ewm(span=window).mean()

    # Cumulative moving average: mean of every row seen so far.
    df['cum_moving_average'] = values.expanding().mean()

    return df

# Attach the moving-average columns to the working frame.
df = create_moving_averages(df=df)

In [7]:
# Revenue per row: units sold times unit price.
# NOTE(review): this column is derived from 'sales', which is used as the
# prediction target further down while only 'sales' itself is dropped from the
# feature matrix -- confirm that keeping it as a feature is intended.
df['total_price'] = df['sales'].mul(df['sell_price'])

In [8]:
# Preview the first five rows, now including the engineered columns.
df.head(n=5)

Unnamed: 0,state_id,d,sales,month,year,snap_CA,snap_TX,snap_WI,sell_price,simple_moving_average,exp_weighted_moving_average,cum_moving_average,total_price
0,CA,d_1,12,1,2011,No,No,No,0.46,12.0,12.0,12.0,5.52
1,CA,d_2,15,1,2011,No,No,No,0.46,13.5,13.553571,13.5,6.9
2,CA,d_3,0,1,2011,No,No,No,0.46,9.0,8.709307,9.0,0.0
3,CA,d_4,0,2,2011,Yes,Yes,No,0.46,6.75,6.293346,6.75,0.0
4,CA,d_5,0,2,2011,Yes,No,Yes,0.46,5.4,4.848689,5.4,0.0


In [9]:
# Integer-encode the state and SNAP flag columns. Each column is factorized on
# its own, so codes follow the order in which values first appear in that column.
df[['state_id', 'snap_CA', 'snap_TX', 'snap_WI']] = (
    df[['state_id', 'snap_CA', 'snap_TX', 'snap_WI']]
    .apply(lambda col: col.factorize()[0])
)

In [10]:
# Preview the first five rows after integer-encoding the categorical columns.
df.head(n=5)

Unnamed: 0,state_id,d,sales,month,year,snap_CA,snap_TX,snap_WI,sell_price,simple_moving_average,exp_weighted_moving_average,cum_moving_average,total_price
0,0,d_1,12,1,2011,0,0,0,0.46,12.0,12.0,12.0,5.52
1,0,d_2,15,1,2011,0,0,0,0.46,13.5,13.553571,13.5,6.9
2,0,d_3,0,1,2011,0,0,0,0.46,9.0,8.709307,9.0,0.0
3,0,d_4,0,2,2011,1,1,0,0.46,6.75,6.293346,6.75,0.0
4,0,d_5,0,2,2011,1,0,1,0.46,5.4,4.848689,5.4,0.0


In [11]:
# Remove the literal 'd_' prefix from the day label (e.g. 'd_12' -> '12').
# str.strip('d_') treats its argument as a set of characters to trim from both
# ends rather than as a prefix, so an anchored replacement is used instead.
df['d'] = df['d'].str.replace(r'^d_', '', regex=True)

In [12]:
# Convert the day label from string to a 64-bit integer.
df['d'] = df['d'].astype(np.int64)

In [13]:
## Downcast 64-bit numeric columns to the smallest dtype pandas will accept.
for column in df.columns:
    if df[column].dtype == 'float64':
        df[column] = pd.to_numeric(df[column], downcast='float')
    elif df[column].dtype == 'int64':
        # Reached only for columns that were not float64 to begin with; a
        # float downcast never produces an int64 column.
        df[column] = pd.to_numeric(df[column], downcast='integer')

In [14]:
# function to create the train and test datasets

def extract_train_test_samples(df, holdout=28):
    """Split ``df`` into train and test samples on the integer day column ``d``.

    Rows with ``d >= d.max() - holdout`` form the test sample; rows with
    ``d`` strictly below that cutoff form the train sample. Both samples are
    returned sorted by ``d``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``d`` column.
    holdout : int, default 28
        Offset subtracted from the largest ``d`` to obtain the cutoff.
        Because the cutoff itself is included in the test sample, the test
        sample spans up to ``holdout + 1`` distinct ``d`` values.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_sample, test_sample)``.
    """
    cutoff = df['d'].max() - holdout

    # Test sample: the cutoff day and everything after it.
    test_sample = df[df['d'] >= cutoff].sort_values(by='d')

    # Train sample: everything strictly before the cutoff. No lower bound is
    # needed because no value of 'd' can be below its own minimum.
    train_sample = df[df['d'] < cutoff].sort_values(by='d')

    return train_sample, test_sample

# Split the working frame into the train and test samples.
train_sample, test_sample = extract_train_test_samples(df=df)

In [15]:
# Show the first rows and the shape of each sample, one item per line.
print(
    train_sample.head(),
    train_sample.shape,
    test_sample.head(),
    test_sample.shape,
    sep='\n',
)

       state_id  d  sales  month  year  snap_CA  snap_TX  snap_WI  sell_price  \
0             0  1     12      1  2011        0        0        0        0.46   
31906         1  1      0      1  2011        0        0        0        3.67   
31899         1  1      0      1  2011        0        0        0       12.12   
31892         1  1      1      1  2011        0        0        0        2.57   
31885         1  1      1      1  2011        0        0        0        6.44   

       simple_moving_average  exp_weighted_moving_average  cum_moving_average  \
0                  12.000000                    12.000000           12.000000   
31906               0.892857                     0.709585            2.601843   
31899               0.750000                     0.653497            2.602226   
31892               1.035714                     0.929516            2.602734   
31885               1.428571                     1.313224            2.603243   

       total_price  
0    

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Aliases for the two samples. Note that X is the full train sample and y is
# the full test sample, not a feature matrix and a target vector.
X = train_sample
y = test_sample

# Separate the 'sales' target from the remaining columns in each sample.
# NOTE(review): only 'sales' is removed, so the moving-average columns and
# 'total_price', which are all computed from 'sales', remain in the features --
# confirm this is intended.
X_train = train_sample.drop(columns=['sales'])
y_train = train_sample.loc[:, 'sales']
X_test = test_sample.drop(columns=['sales'])
y_test = test_sample.loc[:, 'sales']

# Fit the min-max scaler on the training features only, then apply the same
# scaling to both feature sets.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)

In [17]:
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [18]:
# Load the saved network for inference only; compile=False skips compilation.
model = load_model(filepath='NEURAL_NETWORK_MODEL_PHD_EXPERIMENT.h5', compile=False)

In [19]:
# Predict on the scaled *training* features (X_test is not used here).
pred = model.predict(x=X_train)



In [21]:
# Report error metrics for the predictions against the training target.
# Each metric is evaluated right before it is printed, in the order listed.
for label, metric in (
    ('MAE:', mean_absolute_error),
    ('MSE:', mean_squared_error),
    ('RMSE:', lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted))),
    ('R-squared: ', r2_score),
):
    print(label, metric(y_train, pred))

MAE: 0.013706395
MSE: 0.009760606
RMSE: 0.09879578
R-squared:  0.9994776425339094
