## 4.1.2 - Running regressions for all products in one model after clusterization

In [3]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy import stats
from statistics import mean 

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn import metrics, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


import matplotlib.pyplot       as plt
import seaborn                 as sns

### Loading the cleaned dataset

In [4]:
df = pd.read_csv('../data/celular_over50.csv', dtype={'gtin':'str'})
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,date,competition_price,gtin,item_name,orders,olist_price,freight_value,price_ratio,freight_ratio,weekday_1,...,monthday_22,monthday_23,monthday_24,monthday_25,monthday_26,monthday_27,monthday_28,monthday_29,monthday_30,monthday_31
0,2019-07-14,419.9,6438409014344,Nokia 8110 4g dual chip amarelo .,1.0,419.9,24.23,1.0,0.057704,0,...,0,0,0,0,0,0,0,0,0,0
1,2019-07-15,419.9,6438409014344,Nokia 8110 4g Dual Chip Amarelo,1.0,419.9,24.23,1.0,0.057704,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-07-21,377.91,6438409014344,Nokia 8110 4g Dual Chip Amarelo,1.0,419.9,24.23,1.111111,0.057704,0,...,0,0,0,0,0,0,0,0,0,0
3,2019-07-27,377.91,6438409014344,Nokia 8110 4g dual chip amarelo .,1.0,419.9,42.58,1.111111,0.101405,0,...,0,0,0,0,0,1,0,0,0,0
4,2019-08-05,379.9,6438409014344,Nokia 8110 4g Dual Chip Amarelo,1.0,419.9,20.19,1.105291,0.048083,0,...,0,0,0,0,0,0,0,0,0,0


### Normalizing the prices and freights

In [5]:
df['competition_price_sc'] = df.groupby('gtin').competition_price.transform(lambda x: minmax_scale(x.astype(float)))
df['price_sc'] = df.groupby('gtin').olist_price.transform(lambda x: minmax_scale(x.astype(float)))
df['freight_sc'] = df.groupby('gtin').freight_value.transform(lambda x: minmax_scale(x.astype(float)))

### Group the data, trying to find the most important products

In [6]:
price_ratio = pd.DataFrame(df.groupby('gtin')['price_ratio'].mean())
freight_ratio = pd.DataFrame(df.groupby('gtin')['freight_ratio'].mean())
total_orders = pd.DataFrame(df.groupby('gtin')['orders'].sum())

In [7]:
df_summary = price_ratio.merge(total_orders, on='gtin').sort_values('orders', ascending = False)
df_summary = df_summary.merge(freight_ratio, on='gtin').sort_values('orders', ascending = False)
df_summary.head()

Unnamed: 0_level_0,price_ratio,orders,freight_ratio
gtin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7893299910340,0.981869,3417.0,0.005112
7893299910425,0.978969,3146.0,0.004572
7893299910418,1.015605,1239.0,0.012521
7892509104661,1.015379,1213.0,0.004321
7892509104586,1.00582,1101.0,0.003551


## Product 1

In [10]:
sel_gtin = '7893299910340'
df_c = df[df['gtin'] == sel_gtin]

### Trying some regressions

In [13]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ olist_price + competition_price + freight_value'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)    
#print(model.summary())
    

5.0975776034040186

In [14]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_sc + competition_price_sc + freight_ratio'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)    
#print(model.summary())
    

5.361899218915834

In [15]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_ratio + freight_ratio'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)
#print(model.summary())
    

4.949045070792471

In [16]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_ratio + freight_ratio + competition_price_sc + price_sc + freight_sc + weekday_1 + weekday_2 + weekday_3 +\
        weekday_4 + weekday_5 + weekday_6 + week_2 + week_3 + week_4 + week_5 + week_6 + week_7 + \
        week_8 + week_9 + week_10 + week_11 + week_12 + week_13 + week_14 + week_15 + week_16 + week_17 +\
        week_18 + week_19 + week_20 + week_21 + week_22 + week_23 + week_24 + week_25 + week_26 + week_27 +\
        week_28 + week_29 + week_30 + week_31 + week_32 + week_33 + week_34 + week_35 + week_36 + week_37 +\
        week_38 + week_39 + week_40 + week_41 + week_42 + week_43 + week_44 + week_45 + week_46 + week_47 +\
        week_48 + week_49 + week_50 + week_51 + week_52 + month_2 + month_3 + month_4 + month_5 + month_6 +\
        month_7 + month_8 + month_9 + month_10 + month_11 + month_12 + monthday_2 + monthday_3 + monthday_4 +\
        monthday_5 + monthday_6 + monthday_7 + monthday_8 + monthday_9 + monthday_10 + monthday_11 + monthday_12 +\
        monthday_13 + monthday_14 + monthday_15 + monthday_16 + monthday_17 + monthday_18 + monthday_19 +\
        monthday_20 + monthday_21 + monthday_22 + monthday_23 + monthday_24 + monthday_25 + monthday_26 +\
        monthday_27 + monthday_28 + monthday_29 + monthday_30 + monthday_31'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)   
#print(model.summary())
    

10.18992209136789

### Decision Trees

In [18]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.5869787067197068

In [19]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.5395505992292956

In [20]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.5367134846518279

In [21]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'week_2', 'week_3', 'week_4',
            'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11', 'week_12', 'week_13',
            'week_14', 'week_15', 'week_16', 'week_17', 'week_18', 'week_19', 'week_20', 'week_21', 'week_22',
            'week_23', 'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29', 'week_30', 'week_31', 
            'week_32', 'week_33', 'week_34', 'week_35', 'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 
            'week_41', 'week_42', 'week_43', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48', 'week_49', 
            'week_50', 'week_51', 'week_52']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.5385429025444879

In [23]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_2',
           'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 
           'month_11','month_12']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.5675450996441634

In [24]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
            'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_2',
            'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 
            'month_11','month_12''monthday_2', 'monthday_3', 'monthday_4', 'monthday_5', 'monthday_6', 
            'monthday_7', 'monthday_8', 'monthday_9', 'monthday_10', 'monthday_11', 'monthday_12', 
            'monthday_13', 'monthday_14', 'monthday_15', 'monthday_16', 'monthday_17', 'monthday_18', 
            'monthday_19', 'monthday_20', 'monthday_21', 'monthday_22', 'monthday_23','monthday_24',
            'monthday_25', 'monthday_26', 'monthday_27', 'monthday_28', 'monthday_29', 'monthday_30',
            'monthday_31']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.5514843459594453

### Random Forest

After some hyperparameter adjusts, I found that. It's not better than decicion tree

In [26]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
treeregressor = RandomForestRegressor(n_estimators=1000, max_depth=5)
tree_fit = treeregressor.fit(x_train, y_train)
tree_pred = tree_fit.predict(x_test)
print(metrics.median_absolute_error(y_test, tree_pred))


0.49046123065994157


### SVM

In [27]:
sel_col = ['competition_price', 'olist_price', 'freight_value', 'price_ratio', 
           'freight_ratio', 'competition_price_sc', 'price_sc', 'freight_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

regressor = SVR(C=1.0, epsilon=0.2)
fit = regressor.fit(x_train, y_train)
pred = fit.predict(x_test)
print('MAE: '+ str(metrics.median_absolute_error(y_test, pred)))

MAE: 0.5834451976590613


In [28]:
sel_col = ['competition_price', 'olist_price', 'freight_value', 'price_ratio', 'freight_ratio', 'weekday_1',
            'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'week_2', 'week_3', 'week_4',
            'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11', 'week_12', 'week_13',
            'week_14', 'week_15', 'week_16', 'week_17', 'week_18', 'week_19', 'week_20', 'week_21', 'week_22',
            'week_23', 'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29', 'week_30', 'week_31', 'week_32',
            'week_33', 'week_34', 'week_35', 'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 'week_41', 'week_42',
            'week_43', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48', 'week_49', 'week_50', 'week_51', 'week_52',
            'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
            'month_12', 'monthday_2', 'monthday_3', 'monthday_4', 'monthday_5', 'monthday_6', 'monthday_7', 'monthday_8',
            'monthday_9', 'monthday_10', 'monthday_11', 'monthday_12', 'monthday_13', 'monthday_14', 'monthday_15', 'monthday_16',
            'monthday_17', 'monthday_18', 'monthday_19', 'monthday_20', 'monthday_21', 'monthday_22', 'monthday_23','monthday_24',
            'monthday_25', 'monthday_26', 'monthday_27', 'monthday_28', 'monthday_29', 'monthday_30', 'monthday_31','competition_price_sc',
            'price_sc', 'freight_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

regressor = SVR(C=1.0, epsilon=0.2)
fit = regressor.fit(x_train, y_train)
pred = fit.predict(x_test)
print('MAE: '+ str(metrics.median_absolute_error(y_test, pred)))

MAE: 0.692335942841106


for the most sold product, the results are worst than the cluster and even the all by gtin.

## Product 2

In [29]:
sel_gtin = '7893299910425'
df_c = df[df['gtin'] == sel_gtin]

### Trying some regressions

In [30]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ olist_price + competition_price + freight_value'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)    
#print(model.summary())
    

6.708218839049585

In [31]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_sc + competition_price_sc + freight_ratio'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)    
#print(model.summary())
    

6.902281711550244

In [32]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_ratio + freight_ratio'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)
#print(model.summary())
    

7.030892447594522

In [33]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_ratio + freight_ratio + competition_price_sc + price_sc + freight_sc + weekday_1 + weekday_2 + weekday_3 +\
        weekday_4 + weekday_5 + weekday_6 + week_2 + week_3 + week_4 + week_5 + week_6 + week_7 + \
        week_8 + week_9 + week_10 + week_11 + week_12 + week_13 + week_14 + week_15 + week_16 + week_17 +\
        week_18 + week_19 + week_20 + week_21 + week_22 + week_23 + week_24 + week_25 + week_26 + week_27 +\
        week_28 + week_29 + week_30 + week_31 + week_32 + week_33 + week_34 + week_35 + week_36 + week_37 +\
        week_38 + week_39 + week_40 + week_41 + week_42 + week_43 + week_44 + week_45 + week_46 + week_47 +\
        week_48 + week_49 + week_50 + week_51 + week_52 + month_2 + month_3 + month_4 + month_5 + month_6 +\
        month_7 + month_8 + month_9 + month_10 + month_11 + month_12 + monthday_2 + monthday_3 + monthday_4 +\
        monthday_5 + monthday_6 + monthday_7 + monthday_8 + monthday_9 + monthday_10 + monthday_11 + monthday_12 +\
        monthday_13 + monthday_14 + monthday_15 + monthday_16 + monthday_17 + monthday_18 + monthday_19 +\
        monthday_20 + monthday_21 + monthday_22 + monthday_23 + monthday_24 + monthday_25 + monthday_26 +\
        monthday_27 + monthday_28 + monthday_29 + monthday_30 + monthday_31'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)   
#print(model.summary())
    

824.7140915788691

### Decision Trees

In [34]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

1.2750121630980253

In [35]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

1.2723782816972862

In [36]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

1.303531140217483

In [37]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'week_2', 'week_3', 'week_4',
            'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11', 'week_12', 'week_13',
            'week_14', 'week_15', 'week_16', 'week_17', 'week_18', 'week_19', 'week_20', 'week_21', 'week_22',
            'week_23', 'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29', 'week_30', 'week_31', 
            'week_32', 'week_33', 'week_34', 'week_35', 'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 
            'week_41', 'week_42', 'week_43', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48', 'week_49', 
            'week_50', 'week_51', 'week_52']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

1.2070998217094837

In [38]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_2',
           'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 
           'month_11','month_12']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

1.3229796382054668

In [39]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
            'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_2',
            'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 
            'month_11','month_12''monthday_2', 'monthday_3', 'monthday_4', 'monthday_5', 'monthday_6', 
            'monthday_7', 'monthday_8', 'monthday_9', 'monthday_10', 'monthday_11', 'monthday_12', 
            'monthday_13', 'monthday_14', 'monthday_15', 'monthday_16', 'monthday_17', 'monthday_18', 
            'monthday_19', 'monthday_20', 'monthday_21', 'monthday_22', 'monthday_23','monthday_24',
            'monthday_25', 'monthday_26', 'monthday_27', 'monthday_28', 'monthday_29', 'monthday_30',
            'monthday_31']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

1.3611796941606313

### Random Forest

After some hyperparameter adjusts, I found that. It's not better than decicion tree

In [40]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
treeregressor = RandomForestRegressor(n_estimators=1000, max_depth=5)
tree_fit = treeregressor.fit(x_train, y_train)
tree_pred = tree_fit.predict(x_test)
print(metrics.median_absolute_error(y_test, tree_pred))


1.140330702034054


### SVM

In [41]:
sel_col = ['competition_price', 'olist_price', 'freight_value', 'price_ratio', 
           'freight_ratio', 'competition_price_sc', 'price_sc', 'freight_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

regressor = SVR(C=1.0, epsilon=0.2)
fit = regressor.fit(x_train, y_train)
pred = fit.predict(x_test)
print('MAE: '+ str(metrics.median_absolute_error(y_test, pred)))

MAE: 1.4128847997968155


In [42]:
sel_col = ['competition_price', 'olist_price', 'freight_value', 'price_ratio', 'freight_ratio', 'weekday_1',
            'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'week_2', 'week_3', 'week_4',
            'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11', 'week_12', 'week_13',
            'week_14', 'week_15', 'week_16', 'week_17', 'week_18', 'week_19', 'week_20', 'week_21', 'week_22',
            'week_23', 'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29', 'week_30', 'week_31', 'week_32',
            'week_33', 'week_34', 'week_35', 'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 'week_41', 'week_42',
            'week_43', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48', 'week_49', 'week_50', 'week_51', 'week_52',
            'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
            'month_12', 'monthday_2', 'monthday_3', 'monthday_4', 'monthday_5', 'monthday_6', 'monthday_7', 'monthday_8',
            'monthday_9', 'monthday_10', 'monthday_11', 'monthday_12', 'monthday_13', 'monthday_14', 'monthday_15', 'monthday_16',
            'monthday_17', 'monthday_18', 'monthday_19', 'monthday_20', 'monthday_21', 'monthday_22', 'monthday_23','monthday_24',
            'monthday_25', 'monthday_26', 'monthday_27', 'monthday_28', 'monthday_29', 'monthday_30', 'monthday_31','competition_price_sc',
            'price_sc', 'freight_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

regressor = SVR(C=1.0, epsilon=0.2)
fit = regressor.fit(x_train, y_train)
pred = fit.predict(x_test)
print('MAE: '+ str(metrics.median_absolute_error(y_test, pred)))

MAE: 1.2975983338440897


## Product 3

In [43]:
sel_gtin = '7893299910418'
df_c = df[df['gtin'] == sel_gtin]

### Trying some regressions

In [44]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ olist_price + competition_price + freight_value'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)    
#print(model.summary())
    

2.5247520992208203

In [45]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_sc + competition_price_sc + freight_ratio'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)    
#print(model.summary())
    

2.4208492445377847

In [46]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_ratio + freight_ratio'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)
#print(model.summary())
    

3.306922966803859

In [47]:
mae = []
for i in range(0,100):
    train, test = train_test_split(df_c, test_size=0.8)
    formula = 'orders ~ price_ratio + freight_ratio + competition_price_sc + price_sc + freight_sc + weekday_1 + weekday_2 + weekday_3 +\
        weekday_4 + weekday_5 + weekday_6 + week_2 + week_3 + week_4 + week_5 + week_6 + week_7 + \
        week_8 + week_9 + week_10 + week_11 + week_12 + week_13 + week_14 + week_15 + week_16 + week_17 +\
        week_18 + week_19 + week_20 + week_21 + week_22 + week_23 + week_24 + week_25 + week_26 + week_27 +\
        week_28 + week_29 + week_30 + week_31 + week_32 + week_33 + week_34 + week_35 + week_36 + week_37 +\
        week_38 + week_39 + week_40 + week_41 + week_42 + week_43 + week_44 + week_45 + week_46 + week_47 +\
        week_48 + week_49 + week_50 + week_51 + week_52 + month_2 + month_3 + month_4 + month_5 + month_6 +\
        month_7 + month_8 + month_9 + month_10 + month_11 + month_12 + monthday_2 + monthday_3 + monthday_4 +\
        monthday_5 + monthday_6 + monthday_7 + monthday_8 + monthday_9 + monthday_10 + monthday_11 + monthday_12 +\
        monthday_13 + monthday_14 + monthday_15 + monthday_16 + monthday_17 + monthday_18 + monthday_19 +\
        monthday_20 + monthday_21 + monthday_22 + monthday_23 + monthday_24 + monthday_25 + monthday_26 +\
        monthday_27 + monthday_28 + monthday_29 + monthday_30 + monthday_31'
    model = smf.ols(formula = formula, data = train).fit()
    mae.append(metrics.median_absolute_error(test['orders'], model.predict(test)))

mean(mae)   
#print(model.summary())
    

90.70254817332004

### Decision Trees

In [48]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.5055140914405667

In [49]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.4377837201554403

In [50]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.4238434376452884

In [51]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'week_2', 'week_3', 'week_4',
            'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11', 'week_12', 'week_13',
            'week_14', 'week_15', 'week_16', 'week_17', 'week_18', 'week_19', 'week_20', 'week_21', 'week_22',
            'week_23', 'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29', 'week_30', 'week_31', 
            'week_32', 'week_33', 'week_34', 'week_35', 'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 
            'week_41', 'week_42', 'week_43', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48', 'week_49', 
            'week_50', 'week_51', 'week_52']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.47970736008552856

In [52]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
           'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_2',
           'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 
           'month_11','month_12']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.42597850706642154

In [53]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc', 'price_ratio', 'freight_ratio', 
            'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'month_2',
            'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 
            'month_11','month_12''monthday_2', 'monthday_3', 'monthday_4', 'monthday_5', 'monthday_6', 
            'monthday_7', 'monthday_8', 'monthday_9', 'monthday_10', 'monthday_11', 'monthday_12', 
            'monthday_13', 'monthday_14', 'monthday_15', 'monthday_16', 'monthday_17', 'monthday_18', 
            'monthday_19', 'monthday_20', 'monthday_21', 'monthday_22', 'monthday_23','monthday_24',
            'monthday_25', 'monthday_26', 'monthday_27', 'monthday_28', 'monthday_29', 'monthday_30',
            'monthday_31']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]
mae = []

for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    treeregressor = DecisionTreeRegressor(max_depth = 5)
    tree_fit = treeregressor.fit(x_train, y_train)
    tree_pred = tree_fit.predict(x_test)
    mae.append(metrics.median_absolute_error(y_test, tree_pred))

mean(mae)

0.46470988926970846

### Random Forest

After some hyperparameter adjusts, I found that. It's not better than decicion tree

In [54]:
sel_col = ['competition_price_sc', 'price_sc', 'freight_value_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
treeregressor = RandomForestRegressor(n_estimators=1000, max_depth=5)
tree_fit = treeregressor.fit(x_train, y_train)
tree_pred = tree_fit.predict(x_test)
print(metrics.median_absolute_error(y_test, tree_pred))


0.3883602604379657


### SVM

In [55]:
sel_col = ['competition_price', 'olist_price', 'freight_value', 'price_ratio', 
           'freight_ratio', 'competition_price_sc', 'price_sc', 'freight_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

regressor = SVR(C=1.0, epsilon=0.2)
fit = regressor.fit(x_train, y_train)
pred = fit.predict(x_test)
print('MAE: '+ str(metrics.median_absolute_error(y_test, pred)))

MAE: 0.47109290910771895


In [56]:
sel_col = ['competition_price', 'olist_price', 'freight_value', 'price_ratio', 'freight_ratio', 'weekday_1',
            'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'week_2', 'week_3', 'week_4',
            'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11', 'week_12', 'week_13',
            'week_14', 'week_15', 'week_16', 'week_17', 'week_18', 'week_19', 'week_20', 'week_21', 'week_22',
            'week_23', 'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29', 'week_30', 'week_31', 'week_32',
            'week_33', 'week_34', 'week_35', 'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 'week_41', 'week_42',
            'week_43', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48', 'week_49', 'week_50', 'week_51', 'week_52',
            'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
            'month_12', 'monthday_2', 'monthday_3', 'monthday_4', 'monthday_5', 'monthday_6', 'monthday_7', 'monthday_8',
            'monthday_9', 'monthday_10', 'monthday_11', 'monthday_12', 'monthday_13', 'monthday_14', 'monthday_15', 'monthday_16',
            'monthday_17', 'monthday_18', 'monthday_19', 'monthday_20', 'monthday_21', 'monthday_22', 'monthday_23','monthday_24',
            'monthday_25', 'monthday_26', 'monthday_27', 'monthday_28', 'monthday_29', 'monthday_30', 'monthday_31','competition_price_sc',
            'price_sc', 'freight_sc']
y = df_c['orders']
x = df_c[df_c.columns.intersection(sel_col)]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

regressor = SVR(C=1.0, epsilon=0.2)
fit = regressor.fit(x_train, y_train)
pred = fit.predict(x_test)
print('MAE: '+ str(metrics.median_absolute_error(y_test, pred)))

MAE: 0.4074482131109316
