In [308]:
import keras
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Setting seed for reproducibility
np.random.seed(1234)  
# NOTE(review): this only binds a Python variable named PYTHONHASHSEED; it does
# not set the PYTHONHASHSEED environment variable, so it does not affect hashing.
PYTHONHASHSEED = 0
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential,model_from_json
from keras.layers import Dense, Dropout, LSTM, Activation
%matplotlib inline
from decimal import Decimal
import warnings
warnings.filterwarnings("ignore")
import pickle
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import Normalizer, MinMaxScaler
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
from sklearn.metrics import mean_absolute_error
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

In [328]:
# Load the SKU sales data and derive calendar features from the 'Date' column.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory.
df = pd.read_csv('C:\\Users\\lengada1\\NCSU\\ten_skus.csv')
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
# `Series.dt.weekday_name` was removed in pandas 1.0. Mapping the integer
# weekday (Monday=0 ... Sunday=6) to its English name yields the same labels
# and works on both old and new pandas versions.
_WEEKDAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday']
df['Day'] = df['Date'].dt.dayofweek.map(dict(enumerate(_WEEKDAY_NAMES)))

# One indicator column per weekday name, appended to the frame; the raw
# 'Day' and 'Date' columns are then dropped.
day_dummy = pd.get_dummies(df.Day)
df = pd.concat([df, day_dummy], axis=1)
df.drop(['Day', 'Date'], inplace=True, axis=1)


In [329]:
def split_df(df):
    """Partition ``df`` into one sub-frame per distinct value of its ``id`` column.

    Returns a dict keyed by each unique id (in the ascending order produced by
    ``np.unique``) whose value is the subset of rows carrying that id, with the
    original row index retained.
    """
    unique_ids = np.unique(df.id.values)
    return {sku_id: df[df.id == sku_id] for sku_id in unique_ids}

# One sub-frame per SKU id, keyed by that id.
df_dict = split_df(df)

In [330]:
# Target per SKU: the 'Sales' column of that SKU's sub-frame.
y = {sku: frame['Sales'] for sku, frame in df_dict.items()}

In [331]:
# Features per SKU: the SKU's sub-frame without the identifier, the target
# and the listed columns that are excluded from modelling.
X = {
    sku: frame.drop(
        ['id', 'DayOfWeek', 'Customers', 'High_Var', 'Luxury', 'Sales'],
        axis=1,
    )
    for sku, frame in df_dict.items()
}

In [333]:
# Add lagged sales as features: Sales_T1 .. Sales_T7 hold the 'Sales' value
# shifted down by 1 .. 7 rows within each SKU.
for sku, frame in df_dict.items():
    sales = frame['Sales']
    for lag in range(1, 8):
        X[sku]["Sales_T%d" % lag] = sales.shift(lag)

In [334]:
# 7-row moving average of sales, shifted by one row so each row only sees
# earlier rows. `pd.rolling_mean` was removed from pandas; `Series.rolling`
# is the replacement and uses the same default `min_periods` (= window).
for sku in list(df_dict.keys()):
    X[sku]["Mov_avg"] = df_dict[sku]['Sales'].rolling(window=7).mean().shift(1)

In [335]:
# Drop the first `cut_lag` rows of every SKU (the lag and moving-average
# columns are NaN there) and renumber the remaining rows from zero.
cut_lag = 7
for sku in df_dict:
    y[sku] = y[sku].iloc[cut_lag:].reset_index(drop=True)
    X[sku] = X[sku].iloc[cut_lag:].reset_index(drop=True)

In [336]:
# Preview the first rows of the feature frame built for SKU 1.
X[1].head()

Unnamed: 0,Open,Promo,SchoolHoliday,Year,Month,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Sales_T1,Sales_T2,Sales_T3,Sales_T4,Sales_T5,Sales_T6,Sales_T7,Mov_avg
0,1,1,1,2013,1,0,0,0,0,0,1,0,7176.0,0.0,4997.0,4486.0,4327.0,5530.0,0.0,3788.0
1,1,1,1,2013,1,0,0,0,0,0,0,1,5580.0,7176.0,0.0,4997.0,4486.0,4327.0,5530.0,4585.142857
2,1,1,1,2013,1,0,0,0,0,1,0,0,5471.0,5580.0,7176.0,0.0,4997.0,4486.0,4327.0,4576.714286
3,1,1,1,2013,1,1,0,0,0,0,0,0,4892.0,5471.0,5580.0,7176.0,0.0,4997.0,4486.0,4657.428571
4,1,0,0,2013,1,0,0,1,0,0,0,0,4881.0,4892.0,5471.0,5580.0,7176.0,0.0,4997.0,4713.857143


In [762]:
def RF_model(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training data and predict ``X_test``.

    The regressor is built with ``random_state=2`` and otherwise default
    settings. ``y_test`` is accepted for a uniform signature with the other
    model helpers but is not used.

    Returns a DataFrame with a fresh 0..n-1 index whose column 0 is renamed
    to 'RF'.
    """
    forest = RandomForestRegressor(random_state=2)
    forest.fit(X_train, y_train)
    predicted = pd.DataFrame(forest.predict(X_test))
    return predicted.reset_index(drop=True).rename(columns={0: 'RF'})

def NN_model(X_train, X_test, y_train, y_test):
    """Fit a small dense network on the training data and predict ``X_test``.

    Architecture: Dense 20 (linear) -> 20 (relu) -> 40 (linear) -> 10 (relu)
    -> 1 (linear), compiled with MSE loss and the Adam optimizer, trained for
    800 epochs with batch size 10. ``y_test`` is accepted for a uniform
    signature with the other model helpers but is not used.

    Returns a DataFrame with a fresh 0..n-1 index whose column 0 is renamed
    to 'NN'.
    """
    n_features = X_train.shape[1]
    network = Sequential([
        Dense(20, input_dim=n_features, activation='linear'),
        Dense(20, activation='relu'),
        Dense(40, activation='linear'),
        Dense(10, activation='relu'),
        Dense(1, activation='linear'),
    ])
    network.compile(loss='mse', optimizer='adam', metrics=['mae', 'mse'])
    network.fit(X_train, y_train, epochs=800, batch_size=10, verbose=0)
    predicted = pd.DataFrame(network.predict(X_test, verbose=0))
    return predicted.reset_index(drop=True).rename(columns={0: 'NN'})

def GB_model(X_train, X_test, y_train, y_test):
    """Fit a gradient-boosting regressor on the training data and predict ``X_test``.

    Uses 500 estimators, ``max_depth=40``, ``min_samples_split=20`` and a
    learning rate of 0.01 with the least-squares loss. ``y_test`` is accepted
    for a uniform signature with the other model helpers but is not used.

    Returns a DataFrame with a fresh 0..n-1 index whose column 0 is renamed
    to 'GB'.
    """
    # The loss is deliberately left at its default: least squares is the
    # default in every scikit-learn release, whereas the explicit spelling
    # 'ls' was renamed to 'squared_error' and is rejected by newer versions.
    params = {'n_estimators': 500, 'max_depth': 40, 'min_samples_split': 20,
              'learning_rate': 0.01}
    model = ensemble.GradientBoostingRegressor(**params)
    model.fit(X_train, y_train)
    pred = pd.DataFrame(model.predict(X_test))
    pred.reset_index(drop=True, inplace=True)
    pred = pred.rename(columns={0: 'GB'})
    return pred

In [763]:
# Per-SKU split by row position: the first `train_size` rows are used for
# training, the remaining rows for testing.
train_size = 800
X_train, X_test, y_train, y_test = {}, {}, {}, {}

for sku in df_dict:
    n_rows = len(X[sku])
    X_train[sku] = X[sku].iloc[:train_size]
    X_test[sku] = X[sku].iloc[train_size:n_rows]
    y_train[sku] = y[sku].iloc[:train_size]
    y_test[sku] = y[sku].iloc[train_size:n_rows]

In [764]:
# Fit Models
# Despite their names, these dicts hold each model's test-set predictions
# (one single-column DataFrame per SKU), not the fitted estimators.
# All random forests are fitted first, then all neural networks.
rf_models = {
    sku: RF_model(X_train[sku], X_test[sku], y_train[sku], y_test[sku])
    for sku in df_dict
}

nn_models = {
    sku: NN_model(X_train[sku], X_test[sku], y_train[sku], y_test[sku])
    for sku in df_dict
}


In [765]:
# Gradient-boosting test-set predictions per SKU (single-column DataFrames).
gb_models = {
    sku: GB_model(X_train[sku], X_test[sku], y_train[sku], y_test[sku])
    for sku in df_dict
}

In [766]:
# Side-by-side table per SKU: the RF, NN and GB predictions next to the
# actual test sales. The test target is re-indexed from zero first so that
# it lines up row-for-row with the prediction frames.
all_pred = {}
for sku in df_dict:
    y_test[sku] = y_test[sku].reset_index(drop=True)
    columns_for_sku = [rf_models[sku], nn_models[sku], gb_models[sku], y_test[sku]]
    all_pred[sku] = pd.concat(columns_for_sku, axis=1)
    

In [767]:
def MAE_score(y_true,pred1, pred2,pred3):
    """Return the rounded mean absolute error of three prediction sets.

    Each of ``pred1``, ``pred2`` and ``pred3`` is scored against ``y_true``
    with ``mean_absolute_error``; the result is a three-element list in the
    same order as the arguments.
    """
    return [
        mean_absolute_error(y_true, candidate).round()
        for candidate in (pred1, pred2, pred3)
    ]
# Example: RF, NN and GB errors for SKU 1.
MAE_score(y_test[1], rf_models[1], nn_models[1], gb_models[1])

[322.0, 767.0, 340.0]

In [768]:
# MAE of every model for every SKU, plus the name of the best (lowest-MAE)
# model per SKU.
#
# Each SKU's actual sales are scored against that SAME SKU's predictions.
# (Previously every SKU was compared with SKU 1's predictions, which made
# the scores, and therefore the "Best" choice, meaningless for other SKUs.)
mae = []
for sku in list(df_dict.keys()):
    sku_table = all_pred[sku]
    mae.append(MAE_score(sku_table['Sales'], sku_table['RF'],
                         sku_table['NN'], sku_table['GB']))

# Index the table by the real SKU ids rather than assuming they are 1..N;
# for ids 1..N this is the same index as before.
mae = pd.DataFrame(mae, index=list(df_dict.keys()))
mae = mae.rename(columns={0: 'RF', 1: 'NN', 2: 'GB'})

# Column label with the smallest error in each row.
best = mae.idxmin(axis=1)
mae["Best"] = best


In [770]:
def best_model(scores,X,y):
    """Refit each SKU's best-scoring model on all of that SKU's data.

    Parameters
    ----------
    scores : DataFrame indexed by SKU id with a "Best" column holding one of
        'RF', 'NN' or 'GB'.
    X : dict mapping SKU id -> feature frame.
    y : dict mapping SKU id -> target series.

    Returns
    -------
    dict mapping SKU id -> single-column DataFrame (column named after the
    chosen model) with predictions for the same rows the model was fitted
    on, i.e. in-sample predictions.

    Raises
    ------
    ValueError
        If a SKU's "Best" label is not 'RF', 'NN' or 'GB'. Previously such a
        SKU was silently left out of the result.
    """
    # Reuse the model helpers so the refit uses exactly the configuration
    # that produced the scores. This also repairs the old inline NN branch,
    # which called `.shape` on the global `X_train` dict, predicted on the
    # global `X_test` dict and trained for a single epoch.
    fitters = {'RF': RF_model, 'NN': NN_model, 'GB': GB_model}

    d = {}
    # Iterate the SKUs actually passed in rather than the global `df_dict`.
    for sku in X:
        label = scores.loc[sku, "Best"]
        if label not in fitters:
            raise ValueError(
                "Unknown best-model label %r for SKU %r" % (label, sku))
        # Train and predict on the same data: fit on everything, return the
        # fitted values for every row.
        d[sku] = fitters[label](X[sku], X[sku], y[sku], y[sku])
    return d

In [771]:
# Predictions of the best model for every SKU, fitted on the full data.
bm_dict = best_model(mae, X, y)

In [772]:
# Combine the per-SKU predictions into one table (one column per SKU, in
# ascending SKU-id order) plus a row-wise 'Total', and write it to disk.
# The SKU ids are taken from `bm_dict` instead of being hard-coded as 1..10,
# so the cell keeps working when the set of SKUs changes; for ids 1..10 the
# output is unchanged.
sku_order = sorted(bm_dict)
sku_pred = pd.concat([bm_dict[sku] for sku in sku_order], axis=1)
sku_pred['Total'] = sku_pred.sum(axis=1)
sku_pred.columns = [str(sku) for sku in sku_order] + ['Total']
# NOTE(review): the output path is absolute and machine-specific.
sku_pred.to_csv('C:\\Users\\lengada1\\NCSU\\prediction_skus.csv')
sku_pred.shape

(935, 11)