In [2]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
load_dotenv()

True

In [3]:
def mape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / y_true)
    return np.mean(ape)

def wmape(y_true, y_pred):
    return np.sum(np.abs((y_true - y_pred))) / np.sum(np.abs(y_true))

In [4]:
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

conn_string = "host={0} user={1} dbname={2} password={3}".format(host, user, dbname, password)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

In [5]:
query = "SELECT * FROM core_sale;"
df_salesmans = pd.read_sql(query, conn)



In [6]:
df_sales = df_salesmans.copy()


In [7]:
df_sales_copy = df_sales.copy()

In [8]:
df_sales_copy["year"] = df_sales_copy.apply(lambda row : row["date"].year, axis=1)
df_sales_copy["month"] = df_sales_copy.apply(lambda row : row["date"].month, axis=1)
df_sales_copy["day"] = df_sales_copy.apply(lambda row : row["date"].day, axis=1)
df_sales_copy.drop(columns=["date"], inplace=True)
df_sales_copy.drop(columns=["description"], inplace=True)
df_sales_copy.drop(columns=["status"], inplace=True)
df_sales_machine_learning = df_sales_copy.copy()

In [9]:
df_sales_copy.set_index(['year', 'month', 'day'], inplace=True)

In [10]:
df_groupby_sales = df_sales_copy.groupby(["year", "month"])
sales_data = {
    'income': df_groupby_sales['income'].sum(),
    'count': df_groupby_sales['id'].count()
}

df_sales_per_months = pd.DataFrame(sales_data)
df_sales_per_months

Unnamed: 0_level_0,Unnamed: 1_level_0,income,count
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,11,52361.97,144
2018,12,44421.13,138
2019,1,76548.24,149
2019,2,14674.46,106
2019,3,48730.99,71
2019,4,182488.12,201
2019,5,171914.8,272
2019,6,189254.46,222
2019,7,242337.45,230
2019,8,114204.27,236


In [11]:
df_groupby_sales_year = df_sales_copy.groupby(["year"])
sales_data = {
    'income': df_groupby_sales_year['income'].sum(),
    'count': df_groupby_sales_year['id'].count()
}

df_sales_per_year = pd.DataFrame(sales_data)

In [12]:
df_groupby_sales_day = df_sales_copy.groupby(["year", "month","day"])
sales_data = {
    'income': df_groupby_sales_day['income'].sum(),
    'count': df_groupby_sales_day['id'].count()
}

df_sales_per_day = pd.DataFrame(sales_data)

In [13]:
df_sales_per_day

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income,count
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,11,12,6697.59,12
2018,11,13,4446.46,24
2018,11,14,1062.53,7
2018,11,15,1079.60,2
2018,11,16,3495.26,11
...,...,...,...,...
2022,2,7,11799.68,25
2022,2,8,16709.13,71
2022,2,9,11288.48,50
2022,2,10,9933.31,50


In [14]:

df_sales_per_months

Unnamed: 0_level_0,Unnamed: 1_level_0,income,count
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,11,52361.97,144
2018,12,44421.13,138
2019,1,76548.24,149
2019,2,14674.46,106
2019,3,48730.99,71
2019,4,182488.12,201
2019,5,171914.8,272
2019,6,189254.46,222
2019,7,242337.45,230
2019,8,114204.27,236


In [15]:

df_sales_per_year

Unnamed: 0_level_0,income,count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,96783.1,282
2019,1622842.69,3058
2020,2691034.5,6727
2021,2395357.87,10096
2022,420782.79,1502


In [16]:
df_sales_machine_learning = df_sales_per_day.copy()

In [17]:
split_point = 2022
df_sales_machine_learning.reset_index(inplace=True)

df_sales_machine_learning
data_train = df_sales_machine_learning.loc[df_sales_machine_learning['year'] < split_point].copy()
data_valid = df_sales_machine_learning.loc[df_sales_machine_learning['year'] >= split_point].copy()

In [18]:
month_data_train = data_train.copy()
month_data_valid = data_valid.copy()
month_data_train["sales_next_month_in_day"] = data_train.groupby("day")["income"].shift(-1)
month_data_train["sales_next_month_in_day_count"] = data_train.groupby("day")["count"].shift(-1)
month_data_valid["sales_next_month_in_day"] = data_valid.groupby("day")["income"].shift(-1)
month_data_valid["sales_next_month_in_day_count"] = data_valid.groupby("day")["count"].shift(-1)

month_data_train[month_data_train["day"] == 12]

Unnamed: 0,year,month,day,income,count,sales_next_month_in_day,sales_next_month_in_day_count
0,2018,11,12,6697.59,12,21.49,1.0
20,2018,12,12,21.49,1,1550.4,12.0
49,2019,2,12,1550.4,12,1157.94,5.0
75,2019,4,12,1157.94,5,610.83,2.0
115,2019,6,12,610.83,2,279.29,4.0
138,2019,7,12,279.29,4,2752.49,11.0
159,2019,8,12,2752.49,11,1177.09,17.0
180,2019,9,12,1177.09,17,980.13,2.0
201,2019,10,12,980.13,2,27376.24,16.0
223,2019,11,12,27376.24,16,18656.84,33.0


In [19]:
data_train_day = data_train.copy()
data_valid_day = data_valid.copy()
data_train_day["sales_next_day"] = data_train["income"].shift(-1)
data_train_day["sales_next_day_count"] = data_train["count"].shift(-1)
data_valid_day["sales_next_day"] = data_valid["income"].shift(-1)
data_valid_day["sales_next_day_count"] = data_valid["count"].shift(-1)
data_train_day

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count
0,2018,11,12,6697.59,12,4446.46,24.0
1,2018,11,13,4446.46,24,1062.53,7.0
2,2018,11,14,1062.53,7,1079.60,2.0
3,2018,11,15,1079.60,2,3495.26,11.0
4,2018,11,16,3495.26,11,1057.18,3.0
...,...,...,...,...,...,...,...
781,2021,12,24,6637.56,16,3744.06,20.0
782,2021,12,27,3744.06,20,9308.59,59.0
783,2021,12,28,9308.59,59,5518.38,38.0
784,2021,12,29,5518.38,38,2697.19,24.0


In [20]:
data_train_day.dropna(inplace=True)
data_train_day["lag_sales_next_day"] = data_train_day["income"].shift(1)
data_train_day["lag_sales_next_day_count"] = data_train_day["count"].shift(1)
data_train_day

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count,lag_sales_next_day,lag_sales_next_day_count
0,2018,11,12,6697.59,12,4446.46,24.0,,
1,2018,11,13,4446.46,24,1062.53,7.0,6697.59,12.0
2,2018,11,14,1062.53,7,1079.60,2.0,4446.46,24.0
3,2018,11,15,1079.60,2,3495.26,11.0,1062.53,7.0
4,2018,11,16,3495.26,11,1057.18,3.0,1079.60,2.0
...,...,...,...,...,...,...,...,...,...
780,2021,12,23,14936.40,69,6637.56,16.0,22189.92,103.0
781,2021,12,24,6637.56,16,3744.06,20.0,14936.40,69.0
782,2021,12,27,3744.06,20,9308.59,59.0,6637.56,16.0
783,2021,12,28,9308.59,59,5518.38,38.0,3744.06,20.0


In [21]:
data_valid_day["lag_sales_next_day"] = data_valid_day["income"].shift(1)
data_valid_day["lag_sales_next_day_count"] = data_valid_day["count"].shift(1)
data_valid_day

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count,lag_sales_next_day,lag_sales_next_day_count
786,2022,1,5,6478.88,38,17035.09,90.0,,
787,2022,1,6,17035.09,90,4542.59,31.0,6478.88,38.0
788,2022,1,7,4542.59,31,22410.47,69.0,17035.09,90.0
789,2022,1,10,22410.47,69,22752.23,64.0,4542.59,31.0
790,2022,1,11,22752.23,64,5830.05,11.0,22410.47,69.0
791,2022,1,12,5830.05,11,8913.54,59.0,22752.23,64.0
792,2022,1,13,8913.54,59,3454.24,27.0,5830.05,11.0
793,2022,1,14,3454.24,27,8167.05,28.0,8913.54,59.0
794,2022,1,17,8167.05,28,23315.77,69.0,3454.24,27.0
795,2022,1,18,23315.77,69,30300.34,88.0,8167.05,28.0


In [22]:
data_valid_day["diff_sales_next_day"] = data_valid_day["income"].diff(1)
data_valid_day["diff_sales_next_day_count"] = data_valid_day["count"].diff(1)
data_valid_day

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count,lag_sales_next_day,lag_sales_next_day_count,diff_sales_next_day,diff_sales_next_day_count
786,2022,1,5,6478.88,38,17035.09,90.0,,,,
787,2022,1,6,17035.09,90,4542.59,31.0,6478.88,38.0,10556.21,52.0
788,2022,1,7,4542.59,31,22410.47,69.0,17035.09,90.0,-12492.5,-59.0
789,2022,1,10,22410.47,69,22752.23,64.0,4542.59,31.0,17867.88,38.0
790,2022,1,11,22752.23,64,5830.05,11.0,22410.47,69.0,341.76,-5.0
791,2022,1,12,5830.05,11,8913.54,59.0,22752.23,64.0,-16922.18,-53.0
792,2022,1,13,8913.54,59,3454.24,27.0,5830.05,11.0,3083.49,48.0
793,2022,1,14,3454.24,27,8167.05,28.0,8913.54,59.0,-5459.3,-32.0
794,2022,1,17,8167.05,28,23315.77,69.0,3454.24,27.0,4712.81,1.0
795,2022,1,18,23315.77,69,30300.34,88.0,8167.05,28.0,15148.72,41.0


In [23]:
data_train_day["diff_sales_next_day"] = data_train_day["income"].diff(1)
data_train_day["diff_sales_next_day_count"] = data_train_day["count"].diff(1)
data_train_day

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count,lag_sales_next_day,lag_sales_next_day_count,diff_sales_next_day,diff_sales_next_day_count
0,2018,11,12,6697.59,12,4446.46,24.0,,,,
1,2018,11,13,4446.46,24,1062.53,7.0,6697.59,12.0,-2251.13,12.0
2,2018,11,14,1062.53,7,1079.60,2.0,4446.46,24.0,-3383.93,-17.0
3,2018,11,15,1079.60,2,3495.26,11.0,1062.53,7.0,17.07,-5.0
4,2018,11,16,3495.26,11,1057.18,3.0,1079.60,2.0,2415.66,9.0
...,...,...,...,...,...,...,...,...,...,...,...
780,2021,12,23,14936.40,69,6637.56,16.0,22189.92,103.0,-7253.52,-34.0
781,2021,12,24,6637.56,16,3744.06,20.0,14936.40,69.0,-8298.84,-53.0
782,2021,12,27,3744.06,20,9308.59,59.0,6637.56,16.0,-2893.50,4.0
783,2021,12,28,9308.59,59,5518.38,38.0,3744.06,20.0,5564.53,39.0


In [24]:
y_pred = data_train_day["income"]
y_true = data_train_day['sales_next_day']

In [25]:
mape(y_true, y_pred)

9.039333582313759

In [26]:
wmape(y_true, y_pred)

0.9307289036969174

In [27]:
data_train_day.tail()

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count,lag_sales_next_day,lag_sales_next_day_count,diff_sales_next_day,diff_sales_next_day_count
780,2021,12,23,14936.4,69,6637.56,16.0,22189.92,103.0,-7253.52,-34.0
781,2021,12,24,6637.56,16,3744.06,20.0,14936.4,69.0,-8298.84,-53.0
782,2021,12,27,3744.06,20,9308.59,59.0,6637.56,16.0,-2893.5,4.0
783,2021,12,28,9308.59,59,5518.38,38.0,3744.06,20.0,5564.53,39.0
784,2021,12,29,5518.38,38,2697.19,24.0,9308.59,59.0,-3790.21,-21.0


In [28]:
features = ["income", "count", "lag_sales_next_day", "lag_sales_next_day_count", "diff_sales_next_day", "diff_sales_next_day_count"]

In [29]:
imputer = SimpleImputer()
Xtr_per_day = imputer.fit_transform(data_train_day[features])
ytr_per_day = data_train_day['sales_next_day']


model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(Xtr_per_day, ytr_per_day)


RandomForestRegressor(n_jobs=6, random_state=0)

In [30]:
# Predicting for sales but for every day.
Xval_per_day = imputer.transform(data_valid_day[features])
yval_per_day = data_valid_day['sales_next_day']

prediction_per_day = model.predict(Xval_per_day)

df_forest_per_day = pd.DataFrame({"day": data_valid_day["day"] ,"predicted": prediction_per_day, "true": yval_per_day})
df_forest_per_day

Unnamed: 0,day,predicted,true
786,5,7141.7398,17035.09
787,6,5801.9262,4542.59
788,7,7079.8645,22410.47
789,10,10985.9003,22752.23
790,11,11497.9482,5830.05
791,12,6658.519,8913.54
792,13,6820.7832,3454.24
793,14,4998.4236,8167.05
794,17,9633.9118,23315.77
795,18,9250.4975,30300.34


In [31]:
data_valid_day
mape(yval_per_day, prediction_per_day)

0.7268474176165012

In [32]:
wmape(yval_per_day, prediction_per_day)

0.553518081931691

In [33]:
split_point = 2021
df_sales_machine_learning = df_sales_per_months.copy()
df_sales_machine_learning.reset_index(inplace=True)

data_train = df_sales_machine_learning.loc[df_sales_machine_learning['year'] < split_point].copy()
data_valid = df_sales_machine_learning.loc[df_sales_machine_learning['year'] >= split_point].copy()

df_sales_machine_learning

Unnamed: 0,year,month,income,count
0,2018,11,52361.97,144
1,2018,12,44421.13,138
2,2019,1,76548.24,149
3,2019,2,14674.46,106
4,2019,3,48730.99,71
5,2019,4,182488.12,201
6,2019,5,171914.8,272
7,2019,6,189254.46,222
8,2019,7,242337.45,230
9,2019,8,114204.27,236


In [34]:
month_data_train = data_train.copy()
month_data_valid = data_valid.copy()
month_data_train["sales_next_month"] = data_train["income"].shift(-1)
month_data_train["sales_next_month_count"] = data_train["count"].shift(-1)
month_data_valid["sales_next_month"] = data_valid["income"].shift(-1)
month_data_valid["sales_next_month_count"] = data_valid["count"].shift(-1)

month_data_train

Unnamed: 0,year,month,income,count,sales_next_month,sales_next_month_count
0,2018,11,52361.97,144,44421.13,138.0
1,2018,12,44421.13,138,76548.24,149.0
2,2019,1,76548.24,149,14674.46,106.0
3,2019,2,14674.46,106,48730.99,71.0
4,2019,3,48730.99,71,182488.12,201.0
5,2019,4,182488.12,201,171914.8,272.0
6,2019,5,171914.8,272,189254.46,222.0
7,2019,6,189254.46,222,242337.45,230.0
8,2019,7,242337.45,230,114204.27,236.0
9,2019,8,114204.27,236,86824.01,273.0


In [35]:
month_data_train.dropna(inplace=True)
month_data_train

Unnamed: 0,year,month,income,count,sales_next_month,sales_next_month_count
0,2018,11,52361.97,144,44421.13,138.0
1,2018,12,44421.13,138,76548.24,149.0
2,2019,1,76548.24,149,14674.46,106.0
3,2019,2,14674.46,106,48730.99,71.0
4,2019,3,48730.99,71,182488.12,201.0
5,2019,4,182488.12,201,171914.8,272.0
6,2019,5,171914.8,272,189254.46,222.0
7,2019,6,189254.46,222,242337.45,230.0
8,2019,7,242337.45,230,114204.27,236.0
9,2019,8,114204.27,236,86824.01,273.0


In [36]:
y_pred_month = month_data_train["income"]
y_true_month = month_data_train['sales_next_month']

In [37]:
mape(y_true_month, y_pred_month)

0.5110407403965338

In [38]:
wmape(y_true_month, y_pred_month)

0.35260403881669133

In [39]:
features = ["income", "count"]
imputer = SimpleImputer()
Xtr_per_month = imputer.fit_transform(month_data_train[features])
ytr_per_month = month_data_train['sales_next_month']

model_month = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model_month.fit(Xtr_per_month, ytr_per_month)

RandomForestRegressor(n_jobs=6, random_state=0)

In [40]:
# Predicting for sales but for every month

Xval_per_month = imputer.transform(month_data_valid[features])
yval_per_month = month_data_valid['sales_next_month']

prediction_per_month = model_month.predict(Xval_per_month)

df_forest_per_month = pd.DataFrame({"month": month_data_valid["month"], "predicted": prediction_per_month, "true": yval_per_month})
df_forest_per_month

Unnamed: 0,month,predicted,true
26,1,208830.9746,257297.47
27,2,243455.2343,192503.59
28,3,225232.1706,206944.43
29,4,236827.8707,166545.74
30,5,219313.5532,166283.13
31,6,219313.5532,158943.72
32,7,218138.4688,138120.65
33,8,294315.4564,175135.81
34,9,218281.1689,279933.26
35,10,237505.3135,272847.04


In [41]:
mape(yval_per_month, prediction_per_month)

0.3076442790243136

In [42]:
wmape(yval_per_month, prediction_per_month)

0.27578355787783854

In [48]:
imputer_month_count = SimpleImputer()
Xtr_per_month_count = imputer.fit_transform(month_data_train[features])
ytr_per_month_count = month_data_train['sales_next_month_count']

model_month_count = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model_month_count.fit(Xtr_per_month_count, ytr_per_month_count)

RandomForestRegressor(n_jobs=6, random_state=0)

In [49]:
# Predict sales count for every month

Xval_per_month_count = imputer_month_count.fit_transform(month_data_valid[features])
yval_per_month_count = month_data_valid['count']

prediction_per_month_count = model_month_count.predict(Xval_per_month_count)

df_forest_per_month_count = pd.DataFrame({"month": month_data_valid["month"], "predicted": prediction_per_month_count, "true": yval_per_month_count})

df_forest_per_month_count

Unnamed: 0,month,predicted,true
26,1,601.49,595
27,2,696.96,872
28,3,656.94,777
29,4,649.26,710
30,5,653.56,739
31,6,653.56,745
32,7,651.88,876
33,8,651.88,816
34,9,652.37,908
35,10,701.88,1114


In [45]:

mape(yval_per_month_count, prediction_per_month_count)

0.16201935947843324

In [46]:
wmape(yval_per_month_count, prediction_per_month_count)

0.18418175547508195