In [5]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
load_dotenv()

True

In [4]:
# Equation for mean average percentage error for test the model performance
def mape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / y_true)
    return np.mean(ape)

def wmape(y_true, y_pred):
    return np.sum(np.abs((y_true - y_pred))) / np.sum(np.abs(y_true))

In [7]:
# Database connection and query
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

conn_string = "host={0} user={1} dbname={2} password={3}".format(host, user, dbname, password)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

query = "SELECT * FROM core_sale;"
df_sales = pd.read_sql(query, conn)



In [17]:
# Copy DF, adding a new column with the date of the sale split in year
# month and day, and drop columns will not be used. Also, copy for ML model
# and set sales_copy df index for a future for return some results

df_sales_copy = df_sales.copy()
df_sales_copy["year"] = df_sales_copy.apply(lambda row : row["date"].year, axis=1)
df_sales_copy["month"] = df_sales_copy.apply(lambda row : row["date"].month, axis=1)
df_sales_copy["day"] = df_sales_copy.apply(lambda row : row["date"].day, axis=1)
df_sales_copy.drop(columns=["date"], inplace=True)
df_sales_copy.drop(columns=["description"], inplace=True)
df_sales_copy.drop(columns=["status"], inplace=True)

df_sales_copy.set_index(['year', 'month', 'day'], inplace=True)

In [16]:
# grouping sales per month for return stadistics about this.
# This data frame will be used to plot the graphs for see the historic of sales per day
# This data frame can be use for weeks too

df_groupby_sales_day = df_sales_copy.groupby(["year", "month","day"])
sales_data = {
    'income': df_groupby_sales_day['income'].sum(),
    'count': df_groupby_sales_day['id'].count()
}

df_sales_per_day = pd.DataFrame(sales_data)

df_sales_per_day

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income,count
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,11,12,6697.59,12
2018,11,13,4446.46,24
2018,11,14,1062.53,7
2018,11,15,1079.60,2
2018,11,16,3495.26,11
...,...,...,...,...
2022,2,7,11799.68,25
2022,2,8,16709.13,71
2022,2,9,11288.48,50
2022,2,10,9933.31,50


In [18]:
# Copy df for ML model. The other DF is for chart.
#Then, it just set split point, reset index and
# split the dataframe in two parts, one for train and one for test

df_sales_machine_learning = df_sales_per_day.copy()
split_point = 2022
df_sales_machine_learning.reset_index(inplace=True)

df_sales_machine_learning
data_train = df_sales_machine_learning.loc[df_sales_machine_learning['year'] < split_point].copy()
data_valid = df_sales_machine_learning.loc[df_sales_machine_learning['year'] >= split_point].copy()

# Example for data train DF
data_train

Unnamed: 0,year,month,day,income,count
0,2018,11,12,6697.59,12
1,2018,11,13,4446.46,24
2,2018,11,14,1062.53,7
3,2018,11,15,1079.60,2
4,2018,11,16,3495.26,11
...,...,...,...,...,...
781,2021,12,24,6637.56,16
782,2021,12,27,3744.06,20
783,2021,12,28,9308.59,59
784,2021,12,29,5518.38,38


In [21]:
# Add the columns which gonna be predicted.
# It just set the sales next day using 
# the income of the next sales in the DF

data_train["sales_next_day"] = data_train["income"].shift(-1)
data_train["sales_next_day_count"] = data_train["count"].shift(-1)
data_valid["sales_next_day"] = data_valid["income"].shift(-1)
data_valid["sales_next_day_count"] = data_valid["count"].shift(-1)
data_train

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count
0,2018,11,12,6697.59,12,4446.46,24.0
1,2018,11,13,4446.46,24,1062.53,7.0
2,2018,11,14,1062.53,7,1079.60,2.0
3,2018,11,15,1079.60,2,3495.26,11.0
4,2018,11,16,3495.26,11,1057.18,3.0
...,...,...,...,...,...,...,...
781,2021,12,24,6637.56,16,3744.06,20.0
782,2021,12,27,3744.06,20,9308.59,59.0
783,2021,12,28,9308.59,59,5518.38,38.0
784,2021,12,29,5518.38,38,2697.19,24.0


In [23]:
# Drop NaN values for train, otherwise, the code dont work
# Add lag sales next day in both DF train and valid. It is use
# when some info doesnt exist or there has missing values

data_train.dropna(inplace=True)
data_train["lag_sales_next_day"] = data_train["income"].shift(1)
data_train["lag_sales_next_day_count"] = data_train["count"].shift(1)
data_valid["lag_sales_next_day"] = data_valid["income"].shift(1)
data_valid["lag_sales_next_day_count"] = data_valid["count"].shift(1)
data_train

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count,lag_sales_next_day,lag_sales_next_day_count
1,2018,11,13,4446.46,24,1062.53,7.0,,
2,2018,11,14,1062.53,7,1079.60,2.0,4446.46,24.0
3,2018,11,15,1079.60,2,3495.26,11.0,1062.53,7.0
4,2018,11,16,3495.26,11,1057.18,3.0,1079.60,2.0
5,2018,11,19,1057.18,3,2521.85,15.0,3495.26,11.0
...,...,...,...,...,...,...,...,...,...
780,2021,12,23,14936.40,69,6637.56,16.0,22189.92,103.0
781,2021,12,24,6637.56,16,3744.06,20.0,14936.40,69.0
782,2021,12,27,3744.06,20,9308.59,59.0,6637.56,16.0
783,2021,12,28,9308.59,59,5518.38,38.0,3744.06,20.0


In [24]:
# Set diff between sales and sales next day

data_valid["diff_sales_next_day"] = data_valid["income"].diff(1)
data_valid["diff_sales_next_day_count"] = data_valid["count"].diff(1)
data_train["diff_sales_next_day"] = data_train["income"].diff(1)
data_train["diff_sales_next_day_count"] = data_train["count"].diff(1)
data_train

Unnamed: 0,year,month,day,income,count,sales_next_day,sales_next_day_count,lag_sales_next_day,lag_sales_next_day_count,diff_sales_next_day,diff_sales_next_day_count
1,2018,11,13,4446.46,24,1062.53,7.0,,,,
2,2018,11,14,1062.53,7,1079.60,2.0,4446.46,24.0,-3383.93,-17.0
3,2018,11,15,1079.60,2,3495.26,11.0,1062.53,7.0,17.07,-5.0
4,2018,11,16,3495.26,11,1057.18,3.0,1079.60,2.0,2415.66,9.0
5,2018,11,19,1057.18,3,2521.85,15.0,3495.26,11.0,-2438.08,-8.0
...,...,...,...,...,...,...,...,...,...,...,...
780,2021,12,23,14936.40,69,6637.56,16.0,22189.92,103.0,-7253.52,-34.0
781,2021,12,24,6637.56,16,3744.06,20.0,14936.40,69.0,-8298.84,-53.0
782,2021,12,27,3744.06,20,9308.59,59.0,6637.56,16.0,-2893.50,4.0
783,2021,12,28,9308.59,59,5518.38,38.0,3744.06,20.0,5564.53,39.0


In [25]:
# Baseline for indicators for precission of the model

y_pred = data_train["income"]
y_true = data_train['sales_next_day']

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

9.050217586048285


0.931006659960033

In [26]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained

features = ["income", "count", "lag_sales_next_day", "lag_sales_next_day_count", "diff_sales_next_day", "diff_sales_next_day_count"]
imputer = SimpleImputer()
Xtr_per_day = imputer.fit_transform(data_train[features])
ytr_per_day = data_train['sales_next_day']


model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(Xtr_per_day, ytr_per_day)


RandomForestRegressor(n_jobs=6, random_state=0)

In [36]:
# Predicting sales for everyh day with the model
# This model just predict the income of the next day, not how many sales

# The final data frame is just for seing true values and predicted values

Xval_per_day = imputer.fit_transform(data_valid[features])
yval_per_day = data_valid['sales_next_day']

prediction_per_day = model.predict(Xval_per_day)

df_forest_per_day = pd.DataFrame({"day": data_valid["day"] ,"predicted": prediction_per_day, "true": yval_per_day, "diff": data_valid["diff_sales_next_day"]})
df_forest_per_day

Unnamed: 0,day,predicted,true,diff
786,5,8899.3986,17035.09,
787,6,6139.4021,4542.59,10556.21
788,7,7008.0546,22410.47,-12492.5
789,10,11216.9415,22752.23,17867.88
790,11,11226.0273,5830.05,341.76
791,12,7121.3663,8913.54,-16922.18
792,13,6181.8766,3454.24,3083.49
793,14,4885.3761,8167.05,-5459.3
794,17,10274.1621,23315.77,4712.81
795,18,6418.07,30300.34,15148.72


In [35]:
# see the percentage for the error of the model for sales income

print(mape(yval_per_day, prediction_per_day))
print(wmape(yval_per_day, prediction_per_day))

0.6841780977191902
0.5470511709628807


In [37]:
# Baseline for indicators for precission of the model
# This baseline is for how many sales will be

y_pred_count = data_train["count"]
y_true_count = data_train['sales_next_day_count']

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

9.050217586048285


0.931006659960033

In [39]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained
# Again, same shit, but for sales count

features = ["income", "count", "lag_sales_next_day", "lag_sales_next_day_count", "diff_sales_next_day", "diff_sales_next_day_count"]
imputer_count = SimpleImputer()
Xtr_per_day_count = imputer.fit_transform(data_train[features])
ytr_per_day_count = data_train['sales_next_day_count']


model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(Xtr_per_day_count, ytr_per_day_count)

RandomForestRegressor(n_jobs=6, random_state=0)

In [41]:
# Predicting sales for everyh day with the model
# This model just predict the income of the next day, not how many sales

# The final data frame is just for seing true values and predicted values
# Again, same shit, but for sales count


Xval_per_day_count = imputer.fit_transform(data_valid[features])
yval_per_day_count = data_valid['sales_next_day_count']

prediction_per_day_count = model.predict(Xval_per_day)

df_forest_per_day_count = pd.DataFrame({"day": data_valid["day"] ,"predicted": prediction_per_day_count, "true": yval_per_day_count, "diff": data_valid["diff_sales_next_day_count"]})
df_forest_per_day_count

Unnamed: 0,day,predicted,true,diff
786,5,31.79,90.0,
787,6,30.14,31.0,10556.21
788,7,25.96,69.0,-12492.5
789,10,38.26,64.0,17867.88
790,11,34.59,11.0,341.76
791,12,20.83,59.0,-16922.18
792,13,27.18,27.0,3083.49
793,14,24.43,28.0,-5459.3
794,17,44.85,69.0,4712.81
795,18,20.61,88.0,15148.72


In [43]:
print(mape(yval_per_day_count, prediction_per_day_count))
wmape(yval_per_day_count, prediction_per_day_count)

0.4490422162007576


0.4270833333333333