In [64]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
load_dotenv()

True

In [65]:
# Mean absolute percentage error, used to evaluate model performance.
def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not x100).

    Each error ``y_true - y_pred`` is divided by the matching ``y_true``
    value, the absolute value of that ratio is taken, and the ratios are
    averaged with ``np.mean``.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error))

def wmape(y_true, y_pred):
    """Return the weighted MAPE: total absolute error over total absolute actuals.

    Both totals are computed with ``np.sum``; the result is a fraction
    (not multiplied by 100).
    """
    total_abs_error = np.sum(np.abs(y_true - y_pred))
    total_abs_actual = np.sum(np.abs(y_true))
    return total_abs_error / total_abs_actual

In [66]:
# Database connection and query.
# Connection settings are read from environment variables (load_dotenv() was
# called in the import cell).
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
# NOTE(review): USERNAME is commonly pre-set by the operating system/shell,
# which may shadow the value from .env -- confirm the intended variable name.
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

# Pass the parameters as keywords instead of formatting a DSN string by hand:
# a hand-built "key=value" string breaks as soon as a value (typically the
# password) contains a space, quote or backslash, and it also keeps the
# plaintext password around in an extra variable.
conn = psycopg2.connect(host=host, user=user, dbname=dbname, password=password)
cursor = conn.cursor()

# Load the whole sales table into a DataFrame.
query = "SELECT * FROM core_sale;"
df_sales = pd.read_sql(query, conn)



In [67]:
# Build a working copy of the sales data indexed by (year, month, day).
# The sale date is split into its year/month/day parts, the columns that are
# not used afterwards ("date", "description", "status") are dropped, and the
# three date parts become the index so later cells can group on them.
#
# The date parts are extracted from the "date" column directly instead of
# applying a lambda row by row (axis=1): this avoids building a Series per
# row and also works when the table is empty. df_sales itself is not
# modified, so the cell can be re-run safely.
sale_dates = df_sales["date"]

df_sales_copy = (
    df_sales
    .drop(columns=["date", "description", "status"])
    .assign(
        year=sale_dates.map(lambda d: d.year),
        month=sale_dates.map(lambda d: d.month),
        day=sale_dates.map(lambda d: d.day),
    )
    .set_index(["year", "month", "day"])
)

In [68]:
# Monthly sales statistics: total income and number of sales for every
# (year, month) pair. This frame feeds the historical charts and is also the
# starting point for the ML data set below.
df_groupby_sales = df_sales_copy.groupby(["year", "month"])

monthly_income = df_groupby_sales["income"].sum()
monthly_count = df_groupby_sales["id"].count()
sales_data = {"income": monthly_income, "count": monthly_count}

df_sales_per_months = pd.DataFrame(sales_data)
df_sales_per_months

Unnamed: 0_level_0,Unnamed: 1_level_0,income,count
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,11,52361.97,144
2018,12,44421.13,138
2019,1,76548.24,149
2019,2,14674.46,106
2019,3,48730.99,71
2019,4,182488.12,201
2019,5,171914.8,272
2019,6,189254.46,222
2019,7,242337.45,230
2019,8,114204.27,236


In [69]:
# Data set for the ML model (df_sales_per_months is kept as-is for charts).
# The (year, month) index is turned back into regular columns and the rows
# are split by year: everything before split_point is used for training,
# everything from split_point onwards for validation.
split_point = 2021
df_sales_machine_learning = df_sales_per_months.reset_index()

is_train_year = df_sales_machine_learning["year"] < split_point
is_valid_year = df_sales_machine_learning["year"] >= split_point
data_train = df_sales_machine_learning[is_train_year].copy()
data_valid = df_sales_machine_learning[is_valid_year].copy()

# Show the training frame as an example
data_train

Unnamed: 0,year,month,income,count
0,2018,11,52361.97,144
1,2018,12,44421.13,138
2,2019,1,76548.24,149
3,2019,2,14674.46,106
4,2019,3,48730.99,71
5,2019,4,182488.12,201
6,2019,5,171914.8,272
7,2019,6,189254.46,222
8,2019,7,242337.45,230
9,2019,8,114204.27,236


In [70]:
# Add the target columns to both frames.
# Rows are grouped by calendar month and shifted by -1 inside each group, so
# every row receives the income / sale count of the next row with the same
# month (i.e. the same month one year later when the data has no gaps).
# The last occurrence of each month has no successor and gets NaN.
for frame in (data_train, data_valid):
    same_month = frame.groupby("month")
    next_income = same_month["income"].shift(-1)
    next_count = same_month["count"].shift(-1)
    frame["sales_month_next_year"] = next_income
    frame["sales_month_next_year_count"] = next_count

# Example: every May row in the training frame
data_train[data_train["month"] == 5]

Unnamed: 0,year,month,income,count,sales_month_next_year,sales_month_next_year_count
6,2019,5,171914.8,272,184797.9,379.0
18,2020,5,184797.9,379,,


In [71]:
# Year-over-year difference features.
# First drop the training rows that have no target (the last occurrence of
# each month). Only the target columns are checked so that re-running this
# cell does not also drop the rows whose diff columns are legitimately NaN.
data_train = data_train.dropna(
    subset=["sales_month_next_year", "sales_month_next_year_count"]
).copy()

# For each frame, diff(1) inside a month group gives the change versus the
# previous row with the same month. Each frame is diffed against itself:
# the income diff comes from "income" and the count diff from "count".
# (Previously the validation income diff was taken from data_train, which
# does not share an index with data_valid and therefore produced only NaN,
# and the training income diff was computed from "count".)
for frame in (data_train, data_valid):
    by_month = frame.groupby("month")
    income_diff = by_month["income"].diff(1)
    count_diff = by_month["count"].diff(1)
    frame["diff_sales_month_next_year"] = income_diff
    frame["diff_sales_month_next_year_count"] = count_diff

data_train

Unnamed: 0,year,month,income,count,sales_month_next_year,sales_month_next_year_count,diff_sales_month_next_year,diff_sales_month_next_year_count
0,2018,11,52361.97,144,210125.16,566.0,,
1,2018,12,44421.13,138,163990.1,456.0,,
2,2019,1,76548.24,149,169677.59,498.0,,
3,2019,2,14674.46,106,142474.62,421.0,,
4,2019,3,48730.99,71,384130.64,506.0,,
5,2019,4,182488.12,201,176819.59,383.0,,
6,2019,5,171914.8,272,184797.9,379.0,,
7,2019,6,189254.46,222,167705.86,575.0,,
8,2019,7,242337.45,230,219422.97,649.0,,
9,2019,8,114204.27,236,257356.98,634.0,,


In [72]:
# Baseline accuracy to beat: use the current income of a month as the
# "prediction" for the sales_month_next_year target and score it with both
# error metrics.
y_true = data_train["sales_month_next_year"]
y_pred = data_train["income"]

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

0.442369445393089


0.47168566080112184

In [73]:
# Train the income model.
# `features` lists the input columns; the target is sales_month_next_year.
# The feature matrix goes through a SimpleImputer created with its default
# settings before it is handed to a RandomForestRegressor.
features = ["income", "count"]
ytr_per_month = data_train["sales_month_next_year"]

imputer = SimpleImputer()
imputer.fit(data_train[features])
Xtr_per_month = imputer.transform(data_train[features])

model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(Xtr_per_month, ytr_per_month)


RandomForestRegressor(n_jobs=6, random_state=0)

In [74]:
# Predict the income target for every validation month.
# This model only predicts income, not the number of sales.
#
# The imputer is applied with transform() only: calling fit_transform() here
# would re-learn the fill values from the validation rows, so validation
# data would be preprocessed differently from the data the model was
# trained on (and the shared imputer would be silently refitted).
Xval_per_month = imputer.transform(data_valid[features])
yval_per_month = data_valid["sales_month_next_year"]

prediction_per_month = model.predict(Xval_per_month)

# Side-by-side view of predicted and true values per month.
df_forest_per_month = pd.DataFrame(
    {
        "month": data_valid["month"],
        "predicted": prediction_per_month,
        "true": yval_per_month,
        "diff": data_valid["diff_sales_month_next_year"],
    }
)
df_forest_per_month

Unnamed: 0,month,predicted,true,diff
26,1,222708.1464,260265.91,
27,2,230430.8083,160516.88,
28,3,229962.6586,,
29,4,230639.1909,,
30,5,226788.43,,
31,6,226788.43,,
32,7,222708.1464,,
33,8,221320.8813,,
34,9,225373.3552,,
35,10,230430.8083,,


In [75]:
# Validation error of the income model: MAPE first, then WMAPE.
for metric in (mape, wmape):
    print(metric(yval_per_month, prediction_per_month))

0.2899301714375373
0.25540895315609263


In [79]:
# Train the sales-count model.
# Same setup as the income model, but the target is
# sales_month_next_year_count. The feature matrix goes through a
# SimpleImputer before it is handed to a RandomForestRegressor.
features_count = ["income", "count"]

# Fit the dedicated imputer for this model. Previously `imputer_count` was
# created but never used, and the income model's `imputer` was refitted here
# instead.
imputer_count = SimpleImputer()
Xtr_per_month_count = imputer_count.fit_transform(data_train[features_count])
ytr_per_month_count = data_train['sales_month_next_year_count']


model_count = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model_count.fit(Xtr_per_month_count, ytr_per_month_count)

RandomForestRegressor(n_jobs=6, random_state=0)

In [77]:
# Predict the sales-count target for every validation month.
# This model predicts how many sales there will be, not the income.
#
# The already-fitted imputer is applied with transform() only: calling
# fit_transform() here would re-learn the fill values from the validation
# rows. `features_count` lists the same columns as `features`, so the
# imputer fitted for those columns can be reused.
Xval_per_month_count = imputer.transform(data_valid[features_count])
yval_per_month_count = data_valid['sales_month_next_year_count']

prediction_per_month_count = model_count.predict(Xval_per_month_count)

# Side-by-side view of predicted and true values per month.
df_forest_per_month_count = pd.DataFrame(
    {
        "month": data_valid["month"],
        "predicted": prediction_per_month_count,
        "true": yval_per_month_count,
        "diff": data_valid["diff_sales_month_next_year_count"],
    }
)
df_forest_per_month_count

Unnamed: 0,month,predicted,true,diff
26,1,605.98,1011.0,
27,2,640.13,491.0,
28,3,619.66,,
29,4,632.62,,
30,5,602.74,,
31,6,602.74,,
32,7,605.98,,
33,8,664.39,,
34,9,571.9,,
35,10,640.13,,


In [80]:
# Validation error of the sales-count model: MAPE is printed, WMAPE is the
# cell's displayed value.
count_mape = mape(yval_per_month_count, prediction_per_month_count)
print(count_mape)
wmape(yval_per_month_count, prediction_per_month_count)

0.3521701708900667


0.36894141145139814