In [35]:
import pandas as pd
from datetime import timedelta
import numpy as np
from datetime import datetime
#import plt_cr
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from ipywidgets import *
from IPython.display import display
from IPython.html.widgets import *
%matplotlib inline
from sklearn.externals import joblib
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error, median_absolute_error
TIME_FORMAT = "%Y-%m-%d"
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

In [3]:
def roll(y, min_y, max_y):
    """Min-max scale ``y`` using the bounds ``min_y`` and ``max_y``.

    A value equal to ``min_y`` maps to 0 and a value equal to ``max_y``
    maps to 1. ``y`` may be anything that supports subtraction and
    division with the bounds (a scalar, or an array-like such as the
    pandas Series this notebook passes in).
    """
    span = max_y - min_y
    offset = y - min_y
    return offset / span
def unroll(y, min_y, max_y):
    """Map a min-max scaled value back to the original range.

    Inverse of ``roll``: 0 maps to ``min_y`` and 1 maps to ``max_y``.
    """
    span = max_y - min_y
    stretched = span * y
    return stretched + min_y
# Module-level StandardScaler instance.
# NOTE(review): none of the visible cells fit or apply it — possibly a leftover; confirm before removing.
scaler = StandardScaler()

#### Train and test sets, model

In [26]:
## Train / test sets to use
train = pd.read_csv('../train_test_sets/4_cat_features/regressor-train.csv')
test = pd.read_csv('../train_test_sets/4_cat_features/regressor-test-both.csv')

## Model to evaluate.
## NOTE(review): joblib.load unpickles the file, which can execute arbitrary
## code — only load model files from a trusted source.
model=joblib.load('xgb_regressor_wd_we_bf.pkl')

## Columns removed from the datasets before they are passed to the model
cols_to_drop=['order_date','prod_id', 'sku_id', '_merge', 'rv']

## Fixed seed so that "Restart & Run All" reproduces the same row order:
## the sort below is not guaranteed to be stable, so the order of rows that
## share an order_date depends on the permutation produced by shuffle.
SHUFFLE_SEED = 0
train = shuffle(train, random_state=SHUFFLE_SEED)
test = shuffle(test, random_state=SHUFFLE_SEED)

train = train.sort_values(by=['order_date'])
test = test.sort_values(by=['order_date'])

In [22]:
# Revenue targets of the train and test sets.
Y, Y_Val = train["rv"], test["rv"]

# The scaling bounds span BOTH sets, so a single (min_y, max_y) pair is
# shared by every later roll()/unroll() call in the notebook.
min_y, max_y = min(Y.min(), Y_Val.min()), max(Y.max(), Y_Val.max())

# Only the training target is scaled here; test targets are scaled per
# request inside test_prediction().
Y = roll(Y, min_y=min_y, max_y=max_y)

In [33]:
def test_prediction(prod_id=323407, sku_id=34006140):
    """Score the loaded model on one (product, SKU) pair and build its revenue timeline.

    Relies on notebook-level globals: ``train``, ``test``, ``model``,
    ``cols_to_drop``, ``min_y``, ``max_y``, ``Y`` (the scaled training
    target) and ``TIME_FORMAT``.

    Prints the regression metrics for the selected test rows, first on the
    scaled target and then on the unscaled revenue.

    Parameters
    ----------
    prod_id : int
        Product id to select in ``train`` and ``test``.
    sku_id : int
        SKU id to select in ``train`` and ``test``.

    Returns
    -------
    pandas.DataFrame
        Train and test rows of the pair, summed per
        (prod_id, sku_id, order_date) and indexed by ``order_date``
        formatted with ``TIME_FORMAT``. Columns, in order: ``prod_id``,
        ``sku_id``, ``Y`` (scaled actual revenue), ``p`` (raw model
        output), ``predicted revenue`` (``p`` mapped back with
        ``unroll``) and ``actual revenue`` (the ``rv`` column).

    Raises
    ------
    ValueError
        If ``test`` contains no rows for the requested pair.
    """
    key_cols = ["prod_id", "sku_id", "order_date"]

    subtest = test[(test.prod_id == prod_id) & (test.sku_id == sku_id)]
    if subtest.empty:
        # Fail early with a readable message instead of an error raised
        # from inside the model / metric functions.
        raise ValueError(
            "no test rows for prod_id={}, sku_id={}".format(prod_id, sku_id))

    X_Val = subtest.drop(cols_to_drop, axis=1)
    Y_Val = roll(subtest["rv"], min_y=min_y, max_y=max_y)
    p = model.predict(X_Val)

    # Explicit copy: the columns added below must land on an independent
    # frame, not on a slice of ``subtest``.
    Z_Val = subtest[key_cols + ["rv"]].copy()
    Z_Val["Y"] = Y_Val
    Z_Val["p"] = p
    Z_Val["predicted revenue"] = unroll(p, min_y=min_y, max_y=max_y)

    scaled_metrics = (
        ("r2-score", r2_score),
        ("explained_variance_score", explained_variance_score),
        ("mean_absolute_error", mean_absolute_error),
        ("mean_squared_error", mean_squared_error),
        ("median_absolute_error", median_absolute_error),
    )
    for label, metric in scaled_metrics:
        print("Test predict {}: {}".format(label, metric(Y_Val, p)))

    unscaled_metrics = (
        ("mean_squared_error", mean_squared_error),
        ("median_absolute_error", median_absolute_error),
    )
    for label, metric in unscaled_metrics:
        print("Test predict unscaled revenue {}: {}".format(
            label, metric(Z_Val["rv"], Z_Val["predicted revenue"])))

    in_train = (train.prod_id == prod_id) & (train.sku_id == sku_id)
    s = train.loc[in_train, key_cols + ["rv"]].sort_values(by=["order_date"])
    # ``Y`` is index-aligned with ``train``, so only the selected rows are kept.
    s["Y"] = Y

    # Train rows have no ``p`` / ``predicted revenue``; how those missing
    # values come out of ``sum`` is decided by pandas (0 in current releases).
    s = pd.concat([s, Z_Val])
    s = s.groupby(key_cols).sum().reset_index()

    # ``order_date`` is stored as a number of days since 1970-01-01.
    s["order_date"] = pd.to_datetime(s["order_date"], unit="D").dt.strftime(TIME_FORMAT)

    # Rename and order columns BY NAME: the column order produced by the
    # concatenation differs between pandas versions, so relabelling by
    # position could silently attach the wrong names to the data.
    s = s.set_index("order_date").rename(columns={"rv": "actual revenue"})
    return s[["prod_id", "sku_id", "Y", "p", "predicted revenue", "actual revenue"]]

In [24]:
# Build {prod_id: [sku_id, ...]} from the first two comma-separated fields
# of every line in subset.csv; SKUs keep first-seen order without duplicates.
test_dct = {}
with open ('../train_test_sets/subset.csv') as test_full:
    for row in test_full:
        fields = row.split(',')
        if fields[0] == 'prod_id':
            # header line
            continue
        product, sku = int(fields[0]), int(fields[1])
        known_skus = test_dct.setdefault(product, [])
        if sku not in known_skus:
            known_skus.append(sku)

# Product ids offered by the product dropdown.
p_id = list(test_dct)

In [36]:
# Product selector, initialised to the first entry of p_id.
x_widget = widgets.Dropdown(
    description='Product',
    options=p_id,
    value=p_id[0],
    disabled=False,
)
# SKU selector, initialised with the SKUs of that first product.
y_widget = widgets.Dropdown(
    description='SKU',
    options=test_dct[p_id[0]],
    value=test_dct[p_id[0]][0],
    disabled=False,
)

def update_y(*args):
    """Refresh the SKU dropdown so it lists the SKUs of the selected product."""
    selected_product = x_widget.value
    y_widget.options = test_dct[selected_product]

# Run update_y every time the product dropdown's value changes.
x_widget.observe(update_y, names='value')

def sku_change(product,sku):
    """Plot actual vs. predicted revenue for the given product / SKU pair.

    Calls ``test_prediction`` (which also prints the metrics) and draws
    the two revenue columns of its result on one figure.
    """
    plt.style.use('fivethirtyeight')
    title_font = dict(fontname='Comic Sans MS', fontsize=25)

    revenue = test_prediction(prod_id=product, sku_id=sku)
    axes = revenue[['actual revenue','predicted revenue']].plot(
        style=['ro-','bo-'],
        figsize=(25, 8),
    )

    axes.legend(loc='upper left', fontsize=20)
    axes.set_title("Revenue of SKU {}".format(sku), **title_font)
    plt.xticks(rotation=17, fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()
# NOTE(review): prod_id_lst is not read by any visible cell — presumably a leftover; confirm before removing.
prod_id_lst=[]
# Bind sku_change to the two dropdowns: their current values are passed as `product` and `sku`.
interact(sku_change, product=x_widget, sku=y_widget)

A Jupyter Widget

<function __main__.sku_change>

#### Models (0.7 of the training set was used for training, 0.1 of the test set for validation)

#################################################
#### 1. Model with weekday (indicator shows weekday)
##### Test predict r2-score: 0.5299134246068948
##### Test predict explained_variance_score: 0.531469182765118
##### Test predict mean_absolute_error: 5.6560362707825045e-05
##### Test predict mean_squared_error: 1.260108074665276e-07
##### Test predict median_absolute_error: 1.681574030953925e-05

#################################################
#### 2. Model with weekday feature, order_date has been excluded
##### Test predict r2-score: 0.5742968039683343
##### Test predict explained_variance_score: 0.5744179952970745
##### Test predict mean_absolute_error: 6.162475825623931e-05
##### Test predict mean_squared_error: 1.1434343587423002e-07
##### Test predict median_absolute_error: 2.4781280444585718e-05

#################################################
#### 3. Model with weekday (wd), weekend indicator (we) and "high sales season 24-26 nov" (bf)
##### Test predict r2-score: 0.5896162971036237
##### Test predict explained_variance_score: 0.5898569718185549
##### Test predict mean_absolute_error: 6.183339355173199e-05
##### Test predict mean_squared_error: 1.0911726138334042e-07
##### Test predict median_absolute_error: 2.452127773722168e-05

#################################################
#### 4. Model with weekday (wd), weekend indicator (we), "high sales season 24-26 Nov" (bf), and categorical features of SKU
##### Test predict r2-score: 0.535666973876717
##### Test predict explained_variance_score: 0.5370250930124898
##### Test predict mean_absolute_error: 5.665384438628712e-05
##### Test predict mean_squared_error: 1.2308352349169409e-07
##### Test predict median_absolute_error: 1.7476406355854124e-05