In [14]:
import pandas as pd
import numpy as np
from sklearn import metrics, linear_model
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

Loading the full dataset

In [2]:
# Alternative remote source for the same file name (kept for reference):
#   https://bsa-correlation-one.s3.amazonaws.com/timeseries644.csv
# product_gtin is read as a string so the codes are never parsed as numbers;
# the `date` column is converted to pandas datetimes right after loading.
df_all = (
    pd.read_csv('../data/timeseries644.csv', dtype={'product_gtin': 'str'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

Check the most important product GTINs (ranked by total orders) over the last `last_days` days

In [3]:
# Length, in days, of the look-back window used to rank products by orders.
last_days = 60

# Start of the window, counted back from the most recent date in the data.
latest_date = df_all['date'].max()
check_sales_date = latest_date - pd.DateOffset(last_days)

In [4]:
# Total orders per product inside the look-back window, ten largest first.
recent_sales = df_all[df_all['date'] >= check_sales_date]
orders_by_gtin = recent_sales.groupby(by=['product_gtin'])['orders'].sum().reset_index()
orders_by_gtin.sort_values(by=['orders'], ascending=False).head(10)

Unnamed: 0,product_gtin,orders
556,7908243600137,441.0
626,8800550956404,316.0
235,7892049060281,233.0
494,7899495703106,158.0
581,7909389621864,130.0
386,7898507470104,110.0
128,6951003643197,99.0
80,4206113771707,82.0
499,7899621100984,70.0
385,7898506456345,70.0


Choose the product_gtin to forecast. Optionally, days with orders below a threshold can be dropped (that filter is currently commented out).

In [5]:
# Optional filter (disabled): keep only days with at least `orders_min` orders.
#orders_min = 5
#df = df_all[(df_all['product_gtin'] == '7908243600137') & (df_all['orders'] >= orders_min)]

# Rows of the single product being modelled.
# .copy() gives `df` its own data: later cells assign new columns to `df`, and
# without the copy those writes target a slice of `df_all`
# (pandas' SettingWithCopyWarning case).
df = df_all[df_all['product_gtin'] == '7908243600137'].copy()

Create some features for seasonality and convert them to dummies

In [6]:
# Calendar features taken from the `date` column: month number, day of the
# month and weekday number.
dates = df['date']
df['month'] = dates.dt.month
df['day'] = dates.dt.day
df['weekday'] = dates.dt.weekday

Fill the NaN freight values with the median (we have just one product here)

In [7]:
# Replace missing freight values with the median of the observed ones.
freight_median = df['freight_value'].median()
df['freight_value'] = df['freight_value'].fillna(freight_median)

In [8]:
# Calendar columns to one-hot encode; drop_first removes the first level of
# each so the dummies are not collinear with the intercept.
vars_cat = ['weekday', 'month', 'day']

# dtype is pinned to uint8 (the default before pandas 2.0) so the dummies stay
# numeric 0/1 columns. pandas >= 2.0 returns boolean dummies by default, which
# the float-only Box-Cox step ignores either way, but which formula-based
# models may treat as categorical terms rather than plain numeric regressors.
df_cat_dummy = pd.get_dummies(df, columns=vars_cat, drop_first=True, dtype=np.uint8)

Split training and test samples

In [9]:
# For time series it is not a good idea to split train and test randomly,
# because time itself is an important feature: the last rows are held out instead.
# NOTE(review): assumes the rows are already in chronological order — confirm.
# n_test = number of rows used in the test dataset.
n_test = 60
n_data = df_cat_dummy.shape[0]
n_train = n_data - n_test

train = df_cat_dummy.iloc[:n_train]
test = df_cat_dummy.iloc[n_train:]

Let's try a simple linear regression

In [10]:
# Target column.
y = 'orders'

# Regressors: four numeric columns plus the calendar dummies. The first level
# of each calendar variable (weekday 0, month 1, day 1) has no dummy column.
numeric_features = ['competition_price', 'price', 'freight_value', 'stock_avg']
weekday_dummies = ['weekday_%d' % level for level in range(1, 7)]
month_dummies = ['month_%d' % level for level in range(2, 13)]
day_dummies = ['day_%d' % level for level in range(2, 32)]
x = numeric_features + weekday_dummies + month_dummies + day_dummies

In [12]:
# Ordinary least squares on the training rows, then predictions for the
# held-out rows.
mod0 = linear_model.LinearRegression().fit(train[x], train[y])
mod0_pred = mod0.predict(test[x])

# Evaluating the model

In [18]:
# Hold-out scores for the baseline model; only the median absolute error is printed.
y_test = test[y]
median_absolute_error = metrics.median_absolute_error(y_test, mod0_pred)
r2 = metrics.r2_score(y_test, mod0_pred)
print(median_absolute_error)

5.502205551840806


Let's run our regression without the stock, price and offer aggregates (avg, min and max): there is a lot of multicollinearity among them. How should we deal with offer? Should we drop it? The underlying question is: to forecast the number of orders we need to supply values for the independent variables. Do we have them?

First we define a function to calculate MAPE (mean absolute percentage error), used to evaluate the output of our models

In [None]:
def MAPE(y_true, y_pred):
    """Mean absolute percentage error between observed and predicted values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``, computed over the
        observations whose true value is non-zero. ``nan`` when every true
        value is zero (the percentage error is undefined there).
    """
    # Convert to plain float arrays so pandas inputs are compared by position,
    # not by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # A zero actual would divide by zero and turn the whole mean into inf/nan,
    # so those observations are left out of the average.
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan

    pct_error = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
    return np.mean(pct_error) * 100

In [None]:
# Backward elimination by p-value: fit an OLS model, keep only the regressors
# whose p-value is at or below `threshold`, and refit until a pass drops nothing.
#
# NOTE(review): `smf` is not imported in any visible cell; the call
# `smf.ols(formula=..., data=...)` presumably expects
# `import statsmodels.formula.api as smf` — confirm and add it to the imports cell.

# Columns we do not run in the regression.
var_drop = ['date', 'product_gtin','product_type', 'product_type_group', 'orders','stock_max', 'stock_min',
            'stock_avg', 'price_max', 'price_min', 'price_avg', 'offer_max','offer_min', 'offer_avg', 'commission_product_olist',
            'commission_freight_olist']

# Candidate regressors: every column of the training frame not listed in var_drop.
# `list_var` was never assigned anywhere in the notebook, so it is built here.
list_var = list(train.columns)
y_df = pd.DataFrame(list_var)
y_df = y_df[~y_df[0].isin(var_drop)]

# p-value threshold: regressors above it are removed after each fit.
threshold = 0.05

# Despite the name, `y` holds the regressor names (right-hand side of the formula).
y = y_df[0]

# Left-hand side of the formula; set manually, do not change.
X = 'orders ~ '

# Bookkeeping: one entry per fitted model.
nPar_i = len(y) # number of regressors before a pass
nPar_f = 0      # number of regressors surviving a pass
model_form = []
model_nfeat = []
model_aic = []
model_r2 = []
model_MAPE = []

while nPar_f < nPar_i:
    # Build the formula from the regressors still in play.
    nPar_i = len(y)
    yform = ' + '.join(y)
    formula = X + yform

    # Fit on the training rows, predict the held-out rows.
    model = smf.ols(formula=formula, data=train).fit()
    predictions = model.predict(test)

    # Record this model.
    model_form.append(formula)             # formula used
    model_aic.append(model.aic)            # AIC
    model_r2.append(model.rsquared)        # R2 reported by the fitted (training) model
    model_nfeat.append(len(model.params))  # number of parameters, intercept included
    model_MAPE.append(MAPE(test['orders'], predictions))

    # p-values per term, intercept excluded; keep the terms at or below the threshold.
    params = pd.DataFrame(model.pvalues)
    params = params.drop(['Intercept']).rename(columns = {0: 'p-value'}).reset_index()
    params['abs_p-value'] = abs(params['p-value'])
    y = params['index'][params['abs_p-value'] <= threshold]
    nPar_f = len(y)

    # Every regressor was rejected: the next formula would have an empty
    # right-hand side, so stop instead of refitting.
    if nPar_f == 0:
        break



In [None]:
#print the dataframe with final analysis.
# One row per fitted model: formula, AIC, R2, parameter count and MAPE.
selection_summary = {
    'formula': model_form,
    'aic': model_aic,
    'r2': model_r2,
    'nfeat': model_nfeat,
    'mape': model_MAPE,
}
pd.DataFrame(selection_summary)

In [None]:
# Original author's note (translated from Portuguese): "MAPE is not being
# computed correctly. I can't manage to plot test.orders vs prediction."
#
# NOTE(review): `go` and `iplot` are not imported in any visible cell —
# presumably plotly (`plotly.graph_objs` / `plotly.offline.iplot`); confirm and
# add the imports to the imports cell.

# Observed orders over the held-out rows.
trace0 = go.Scatter(x=test.date,
                    y=test.orders, mode='lines', name='orders')

# Predictions for the same held-out rows. `predictions` comes from
# model.predict(test), so its x axis must be test.date; the original used
# df.date (the whole history) and labelled the trace 'competition price'.
trace1 = go.Scatter(x=test.date,
                    y=predictions, mode='markers', name='predicted orders')

# Draw both traces; the original passed only trace0, so predictions never appeared.
iplot([trace0, trace1])

Transforming float variables using the Box-Cox criterion. The mapping from the fitted lambda to a transformation was taken from: https://www.statisticshowto.com/box-cox-transformation/

In [None]:
# Pick a simple power/log transformation for a column from its fitted Box-Cox lambda.

def transf_boxcox(data, offset=0.0001):
    """Transform ``data`` with the simple power/log function nearest to its Box-Cox lambda.

    ``data + offset`` is passed to ``scipy.stats.boxcox`` to estimate lambda;
    the lambda is then bucketed and the matching transformation is applied to
    ``data + offset``.

    Parameters
    ----------
    data : array-like (e.g. pandas Series or numpy array)
        Values to transform. scipy's Box-Cox fit expects the shifted values
        to be positive.
    offset : float, default 0.0001
        Constant added before fitting and transforming, so that zeros do not
        break the log calculation.

    Returns
    -------
    Same type as ``data + offset``
        lambda < -2.7        -> 1 / v**3
        -2.7 <= lambda < -1.7 -> 1 / v**2
        -1.7 <= lambda < -0.7 -> 1 / v
        -0.7 <= lambda < -0.3 -> 1 / sqrt(v)
        -0.3 <= lambda < 0.3  -> log(v)
        0.3 <= lambda < 0.7   -> sqrt(v)
        0.7 <= lambda < 1.3   -> v (shift only)
        1.3 <= lambda < 2.3   -> v**2
        2.3 <= lambda < 3.3   -> v**3
        otherwise            -> v (shift only)
        where v = data + offset.
    """
    # Shift once and reuse the result for both the fit and the transformation.
    shifted = data + offset
    _, fitted_lambda = stats.boxcox(shifted)

    # (exclusive upper bound on lambda, transformation), checked in ascending order.
    bands = (
        (-2.7, lambda v: 1 / v**3),
        (-1.7, lambda v: 1 / v**2),
        (-0.7, lambda v: 1 / v),
        (-0.3, lambda v: 1 / v**0.5),
        (0.3, np.log),
        (0.7, lambda v: v**0.5),
        (1.3, lambda v: v),
        (2.3, lambda v: v**2),
        (3.3, lambda v: v**3),
    )
    for upper_bound, transform in bands:
        if fitted_lambda < upper_bound:
            return transform(shifted)

    # lambda at or above the last bound (or not comparable, e.g. nan): shift only.
    return shifted

In [None]:
# Work on a copy so the untransformed frame stays available.
df_cat_dummy_t = df_cat_dummy.copy()

# Apply the Box-Cox-guided transformation to every column whose dtype is float.
# NOTE(review): the target `orders` is transformed too if it is stored as
# float — confirm that is intended.
float_columns = [name for name, kind in df_cat_dummy_t.dtypes.items() if kind == "float"]
for name in float_columns:
    df_cat_dummy_t[name] = transf_boxcox(df_cat_dummy_t[name])

In [None]:
# Preview the first rows of the transformed frame instead of dumping all of it.
df_cat_dummy_t.head()

In [None]:
# Same positional split as before, now on the transformed frame: the first
# n_train rows train the model and the remaining rows are held out.
train, test = df_cat_dummy_t.iloc[:n_train], df_cat_dummy_t.iloc[n_train:]

In [None]:
# Backward elimination by p-value on the current train/test frames: fit an OLS
# model, keep only the regressors whose p-value is at or below `threshold`,
# and refit until a pass drops nothing.
#
# NOTE(review): `smf` is not imported in any visible cell; the call
# `smf.ols(formula=..., data=...)` presumably expects
# `import statsmodels.formula.api as smf` — confirm and add it to the imports cell.

# Columns we do not run in the regression.
var_drop = ['date', 'product_gtin','product_type', 'product_type_group', 'orders','stock_max', 'stock_min',
            'stock_avg', 'price_max', 'price_min', 'price_avg', 'offer_max','offer_min', 'offer_avg', 'commission_product_olist',
            'commission_freight_olist']

# Candidate regressors: every column of the training frame not listed in var_drop.
# `list_var` was never assigned anywhere in the notebook, so it is built here.
list_var = list(train.columns)
y_df = pd.DataFrame(list_var)
y_df = y_df[~y_df[0].isin(var_drop)]

# p-value threshold: regressors above it are removed after each fit.
threshold = 0.05

# Despite the name, `y` holds the regressor names (right-hand side of the formula).
y = y_df[0]

# Left-hand side of the formula; set manually, do not change.
X = 'orders ~ '

# Label of the dataset in use (not read anywhere in this cell).
dataset = 'train'

# Bookkeeping: one entry per fitted model.
nPar_i = len(y) # number of regressors before a pass
nPar_f = 0      # number of regressors surviving a pass
model_form = []
model_nfeat = []
model_aic = []
model_r2 = []
model_MAPE = []

while nPar_f < nPar_i:
    # Build the formula from the regressors still in play.
    nPar_i = len(y)
    yform = ' + '.join(y)
    formula = X + yform

    # Fit on the training rows, predict the held-out rows.
    model = smf.ols(formula=formula, data=train).fit()
    predictions = model.predict(test)

    # Record this model.
    model_form.append(formula)             # formula used
    model_aic.append(model.aic)            # AIC
    model_r2.append(model.rsquared)        # R2 reported by the fitted (training) model
    model_nfeat.append(len(model.params))  # number of parameters, intercept included
    model_MAPE.append(MAPE(test['orders'], predictions))

    # p-values per term, intercept excluded; keep the terms at or below the threshold.
    params = pd.DataFrame(model.pvalues)
    params = params.drop(['Intercept']).rename(columns = {0: 'p-value'}).reset_index()
    params['abs_p-value'] = abs(params['p-value'])
    y = params['index'][params['abs_p-value'] <= threshold]
    nPar_f = len(y)

    # Every regressor was rejected: the next formula would have an empty
    # right-hand side, so stop instead of refitting.
    if nPar_f == 0:
        break



In [None]:
#print the dataframe with final analysis.
# One row per fitted model: formula, AIC, R2, parameter count and MAPE.
selection_summary = {
    'formula': model_form,
    'aic': model_aic,
    'r2': model_r2,
    'nfeat': model_nfeat,
    'mape': model_MAPE,
}
pd.DataFrame(selection_summary)