#  Predict Future Sales
## Using Deep Neural Net 
## With Categorical Embeddings

### Load Training Data

In [56]:
import pandas as pd
import numpy as np
import sys
from matplotlib import pyplot as plt

stdout = sys.stdout

# Single place to point the notebook at the dataset: change this one value
# when the CSV files live in a different directory.
DATA_DIR = "/Users/djmore/Udacity/machine-learning/projects/capstone"

# Raw tables exactly as read from disk.
sales_train = pd.read_csv(DATA_DIR + "/sales_train.csv", sep=',')
items = pd.read_csv(DATA_DIR + '/items.csv')
item_categories = pd.read_csv(DATA_DIR + '/item_categories.csv')
shops = pd.read_csv(DATA_DIR + '/shops.csv')

# Working frames used by the rest of the notebook. They are built as separate
# DataFrame objects so the raw names above stay bound to the loaded tables.
sales_data = pd.DataFrame(sales_train)
items_data = pd.DataFrame(items)
item_categories_data = pd.DataFrame(item_categories)
shops_data = pd.DataFrame(shops)

# Preview the first rows of the daily sales table.
sales_data.head(5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


### Check if there are blank rows or null values in any data elements

In [57]:
# One boolean per column: True when that column holds at least one null value.
null_flags = sales_data.isnull().any()
display(null_flags)

date              False
date_block_num    False
shop_id           False
item_id           False
item_price        False
item_cnt_day      False
dtype: bool

## Describe data, gather basic stats and check for invalid values.

In [58]:
# (template, value) pairs; the templates carry the padding that lines the
# values up in one column when printed.
summary_rows = [
    ('Shops               - {}', shops_data['shop_id'].count()),
    ('Item Categories     - {}', items_data['item_category_id'].nunique()),
    ('Items               - {}', items_data['item_id'].count()),
    ('Median item price   - {}', sales_data['item_price'].median()),
    ('Min item price      - {}', sales_data['item_price'].min()),
    ('Max item price      - {}', sales_data['item_price'].max()),
    ('Total Rows          - {}', sales_data['shop_id'].count()),
]
for template, value in summary_rows:
    print(template.format(value))
print('')


Shops               - 60
Item Categories     - 84
Items               - 22170
Median item price   - 399.0
Min item price      - -1.0
Max item price      - 307980.0
Total Rows          - 2935849



### Negative item counts may indicate either returns or buy one get one promo. 
### Count how many rows have a negative item count or a negative item price

In [59]:
# Count the rows whose quantity / price is below zero by summing boolean masks.
negative_count_rows = (sales_data['item_cnt_day'] < 0).sum()
negative_price_rows = (sales_data['item_price'] < 0).sum()

print('Negative Item Count = {}'.format(negative_count_rows))
print('Negative Item Price = {}'.format(negative_price_rows))

Negative Item Count = 7356
Negative Item Price = 1


##  Clean up sales data
### Remove negative sales values from data
#### For the purpose of this exercise we will remove the rows whose item_price is not positive.
#### Negative values under item_cnt_day may indicate either a return or a promo such as buy one get one free. 
#### Since we have not been given any information about them, the filter on item_cnt_day is left commented out in the cell below, 
#### so rows with a negative item_cnt_day remain in the data.

In [60]:
# Keep only the rows with a strictly positive item_price.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the raw table (which is what
# triggers pandas' SettingWithCopyWarning).
# The item_cnt_day filter below is disabled, so rows with a negative
# item_cnt_day are still present after this cell.
#sales_data = sales_data[sales_data.item_cnt_day > 0] ## Remove rows that have negative item_cnt_day
sales_data = sales_data[sales_data.item_price > 0].copy()

In [61]:
# Re-run the negative-value check after the clean-up step.
negative_checks = (
    ('Negative Item Count = {}', 'item_cnt_day'),
    ('Negative Item Price = {}', 'item_price'),
)
for template, column in negative_checks:
    print(template.format((sales_data[column] < 0).sum()))

Negative Item Count = 7356
Negative Item Price = 0


# Predict Future Sales
# Using Deep Neural Net

## Break down date column into individual features
### Year, Month, Day, DayofYear, WeekOfYear, DayOfWeek, Quarter
#### This will help us capture weekly, monthly, quarterly and yearly seasonality in the sales data

In [62]:
# Parse the raw date strings into datetimes.
# The file stores dates as DD.MM.YYYY (e.g. "15.01.2013" inside date block 0),
# so the format is given explicitly. Without it pandas reads ambiguous values
# such as "02.01.2013" month-first, which swaps day and month in the derived
# features and corrupts the within-month ordering used for the sort below.
dates = pd.to_datetime(sales_data['date'], format='%d.%m.%Y')
sales_data['date'] = dates

# Calendar features derived from the parsed date.
sales_data['year'] = dates.dt.year
sales_data['month'] = dates.dt.month
sales_data['day'] = dates.dt.day

sales_data['dayofyear'] = dates.dt.dayofyear
# `.dt.weekofyear` no longer exists in pandas 2.x; use the ISO-calendar
# accessor when this pandas version has it and fall back to the old attribute
# otherwise. The cast keeps the column a plain int64 in both cases.
if hasattr(dates.dt, 'isocalendar'):
    sales_data['weekofyear'] = dates.dt.isocalendar().week.astype('int64')
else:
    sales_data['weekofyear'] = dates.dt.weekofyear
sales_data['dayofweek'] = dates.dt.dayofweek
sales_data['quarter'] = dates.dt.quarter

# Put the rows in chronological order: month block first, then calendar date.
sales_data = sales_data.sort_values(['date_block_num', 'date'], ascending=[True, True])

In [63]:
# The raw date column is dropped now that the calendar features exist.
# `axis` is passed by keyword because pandas 2.x rejects it as a positional
# argument, and the result is rebound instead of mutating the frame in place.
sales_data = sales_data.drop(['date'], axis=1)

In [64]:
# Preview the first five rows (five is head()'s default).
sales_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month,day,dayofyear,weekofyear,dayofweek,quarter
7554,0,19,18976,399.0,1.0,2013,1,1,1,1,1,1
7644,0,19,18284,199.0,1.0,2013,1,1,1,1,1,1
7646,0,19,18320,199.0,1.0,2013,1,1,1,1,1,1
7647,0,19,18329,299.0,1.0,2013,1,1,1,1,1,1
7694,0,19,19367,399.0,1.0,2013,1,1,1,1,1,1


In [65]:
# Store the shop and item identifiers as pandas categoricals.
for id_column in ('shop_id', 'item_id'):
    sales_data[id_column] = sales_data[id_column].astype('category')

## Deep Neural Net 

### Prepare the model to run raw daily sales data

#### Scale Numerical columns item_price and item_cnt_day

## Prepare Train-Test-Split Data
###  We will use 
  - The first 28 months of data (date_block_num 0 to 27) as training data 
  - The next 5 months of data (date_block_num 28 to 32) as validation data
  - The final month of data (date_block_num 33) as test data

In [66]:
# Chronological split on date_block_num. For each partition the target
# (item_cnt_day) is separated from the remaining feature columns.
block_num = sales_data.date_block_num
train_rows = block_num < 28
val_rows = (block_num >= 28) & (block_num <= 32)
test_rows = block_num == 33

X_train = sales_data[train_rows].drop('item_cnt_day', axis=1)
y_train = sales_data.loc[train_rows, 'item_cnt_day']

X_val = sales_data[val_rows].drop('item_cnt_day', axis=1)
y_val = sales_data.loc[val_rows, 'item_cnt_day']

X_test = sales_data[test_rows].drop('item_cnt_day', axis=1)
y_test = sales_data.loc[test_rows, 'item_cnt_day']

In [67]:
# Echo the date_block_num range assigned to each partition.
split_notes = (
    'Training Date Block Num from    0 to 27',
    'Validation Date Block Num from  28 to 32',
    'Testing Date Block Num from     33',
)
print('\n'.join(split_notes))

Training Date Block Num from    0 to 27
Validation Date Block Num from  28 to 32
Testing Date Block Num from     33


### Define the Model Architecture
#### With Embeddings for daily sales data

In [81]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Embedding


# Embedding hyper-parameters.
max_features = 400000        # number of rows in the embedding table
maxlen = X_train.shape[1]    # one input position per feature column
embedding_size = 200         # width of each embedding vector

# Layer stack: one embedding shared by every input column, flattened, then two
# 3-unit dense layers (each followed by 50% dropout) and a single-value output.
# NOTE(review): every Dense layer uses activation=None, so apart from dropout
# the dense part of the network is a composition of linear maps.
model = Sequential([
    Embedding(max_features, embedding_size, input_length=maxlen),
    Flatten(input_shape=X_train.shape[1:]),
    Dense(3, activation=None),
    Dropout(0.5),
    Dense(3, activation=None),
    Dropout(0.5),
    Dense(1, activation=None),
])

# Print the layer-by-layer summary.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 11, 200)           80000000  
_________________________________________________________________
flatten_4 (Flatten)          (None, 2200)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 3)                 6603      
_________________________________________________________________
dropout_7 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 12        
_________________________________________________________________
dropout_8 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 4         
Total para

### Compile the Model

In [82]:
# Mean squared error is the training objective; mean absolute error is tracked
# as an additional metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [83]:
# Show the shape of the test features, then of the test target.
for held_out in (X_test, y_test):
    print(held_out.shape)

(53514, 11)
(53514,)


### Evaluate the untrained model on the test set (baseline loss and MAE)

In [84]:
# Score the network on the test month before any training has happened.
# NOTE(review): the model is compiled with loss='mean_squared_error' and
# metrics=['mae'], and evaluate() reports the loss ahead of the metrics, so
# `mae` below receives the MSE loss and `a` the MAE. The names are kept because
# a later cell still reads `mae`.
baseline_scores = model.evaluate(X_test, y_test, verbose=0)
mae, a = baseline_scores

print(mae, a)

97.2060510242 1.31898453403


### Train the Model

In [85]:
from keras.callbacks import ModelCheckpoint   

# Checkpoint callback: with save_best_only=True the weights file is rewritten
# only when the monitored validation quantity improves.
checkpointer = ModelCheckpoint(
    filepath='predictsales.model.best.hdf5',
    save_best_only=True,
    verbose=1,
)

# Fit on the training partition and validate on the validation partition after
# each epoch.
fit_options = dict(
    batch_size=10000,
    epochs=3,
    validation_data=(X_val, y_val),
    callbacks=[checkpointer],
    verbose=1,
)
hist = model.fit(X_train, y_train, **fit_options)

Train on 2610003 samples, validate on 272331 samples
Epoch 1/3

Epoch 00001: val_loss improved from inf to 11.68171, saving model to predictsales.model.best.hdf5
Epoch 2/3

Epoch 00002: val_loss improved from 11.68171 to 11.67580, saving model to predictsales.model.best.hdf5
Epoch 3/3

Epoch 00003: val_loss improved from 11.67580 to 11.66417, saving model to predictsales.model.best.hdf5


### Load the Model Weights with the Lowest Validation Loss

In [86]:
# Restore the checkpoint file written by the training cell.
best_weights_path = 'predictsales.model.best.hdf5'
model.load_weights(best_weights_path)

### Calculate the Model Loss or Accuracy on the Test Set

In [87]:
# Score the trained model on the held-out test month.
# evaluate() returns the compiled loss (mean squared error) followed by the
# compiled metric (mean absolute error), so the names match the values.
mse, mae = model.evaluate(X_test, y_test, verbose=0)

# Print the scores returned by this evaluation.
print(mse, mae)

95.1678670012 0.520533257584


In [88]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Predict the test month and recompute the error metrics with scikit-learn.
y_pred = model.predict(X_test)

meanSquaredError = mean_squared_error(y_true=y_test, y_pred=y_pred)
rootMeanSquaredError = sqrt(meanSquaredError)

# Report both metrics, MSE first.
for label, value in (("MSE:", meanSquaredError), ("RMSE:", rootMeanSquaredError)):
    print(label, value)

MSE: 95.1678571651
RMSE: 9.755401435363234


In [94]:
# Compare the total units in the test month with the (rounded) total predicted.
actual_total = y_test.sum()
predicted_total = round(y_pred.sum())
print('Actual sales in 34th month were {} and model predicted {}'.format(actual_total, predicted_total))

Actual sales in 34th month were 71056.0 and model predicted 54816.0


In [95]:
# Shortfall of the predicted total against the actual total for the test
# month; both figures are the ones printed by the previous cell.
x, y = 71056, 54816
print(x - y)

16240
