In [36]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
load_dotenv()

True

In [37]:
# Equation for mean average percentage error for test the model performance
def mape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / y_true)
    return np.mean(ape)

def wmape(y_true, y_pred):
    return np.sum(np.abs((y_true - y_pred))) / np.sum(np.abs(y_true))

In [38]:
# Database connection and query
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

conn_string = "host={0} user={1} dbname={2} password={3}".format(host, user, dbname, password)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

df_productsales = pd.read_sql("SELECT * FROM core_productsale;", conn)
df_products = pd.read_sql("SELECT * FROM core_product;", conn)
df_sales = pd.read_sql("SELECT * FROM core_sale;", conn)
df_categories = pd.read_sql("SELECT * FROM core_category;", conn)
df_categories



Unnamed: 0,id,name
0,1,NO REFRIGERADO
1,2,SERVICIOS
2,3,REFRIGERADOS
3,4,EMBUTIDOS
4,5,CONGELADOS
5,6,QUESOS
6,8,CREMAS Y NATILLAS
7,13,LECHES Y JUGOS REFRIGERADOS
8,16,CHORIZOS
9,17,JAMONERIA


In [39]:
df_merged_products_productsales = pd.merge(df_productsales, df_products, left_on="product_id", right_on="id", suffixes=("_productsale", "_product"))
df_merged_products_productsales.drop(columns=["brand", "image", "cost_currency", "price_1_currency", "price_2_currency", "price_3_currency", "code", "price_1", "price_2", "price_3", "description", "presentation", "income_currency"], inplace=True)
df_merged_products_productsales.set_index("id_product", inplace=True)
df_merged_products_productsales

Unnamed: 0_level_0,id_productsale,quantity,income,product_id,sale_id,name,cost,category_id
id_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17,16921,4.1,14.35,17,00034279,QUESO TELITA CACHAPERO (E),2.70,23
17,16953,4.0,14.00,17,00034290,QUESO TELITA CACHAPERO (E),2.70,23
17,17217,3.6,12.60,17,00034379,QUESO TELITA CACHAPERO (E),2.70,23
17,17330,3.4,11.90,17,00034415,QUESO TELITA CACHAPERO (E),2.70,23
17,17334,3.4,11.90,17,00034418,QUESO TELITA CACHAPERO (E),2.70,23
...,...,...,...,...,...,...,...,...
6,59532,6.0,16.50,6,004428,ACEITUNAS RE/PIM ENV. 450GR ESCURRIDO,2.75,54
6,81979,11.0,30.25,6,NE010339,ACEITUNAS RE/PIM ENV. 450GR ESCURRIDO,2.75,54
6,82805,6.0,16.50,6,NE010715,ACEITUNAS RE/PIM ENV. 450GR ESCURRIDO,2.75,54
24,62974,4.6,0.00,24,00002780,QUESO CREMA CABRA GRANEL,0.00,26


In [40]:
df_merged_sales_with_products = pd.merge(df_merged_products_productsales, df_sales, left_on="sale_id", right_on="id", suffixes=("_productsale", "_sale"))
df_merged_sales_with_products["year"] = df_merged_sales_with_products.apply(lambda row: row["date"].year, axis=1)
df_merged_sales_with_products["month"] = df_merged_sales_with_products.apply(lambda row: row["date"].month, axis=1)
df_merged_sales_with_products.drop(columns=["id", "description", "income_currency", "status", "date" ], inplace=True)
df_merged_sales_with_products

Unnamed: 0,id_productsale,quantity,income_productsale,product_id,sale_id,name,cost,category_id,income_sale,client_id,salesman_id,year,month
0,16921,4.1,14.35,17,00034279,QUESO TELITA CACHAPERO (E),2.70,23,70.15,390,4,2018,11
1,16923,12.0,55.80,254,00034279,BOLOGNA NORMAL 1KG DEL CORRAL X UNIDAD,3.54,19,70.15,390,4,2018,11
2,16922,16.0,0.00,256,00034279,SALCHIPOLLO 450GR DEL CORRAL X UND,0.00,22,70.15,390,4,2018,11
3,16924,4.3,0.00,489,00034279,NONE,0.00,1056,70.15,390,4,2018,11
4,16953,4.0,14.00,17,00034290,QUESO TELITA CACHAPERO (E),2.70,23,235.15,101,4,2018,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71828,82823,1.0,2.28,195,NE010724,REPOSTERIA CHOCOLATE LECHE 250 GRS,1.88,45,2.28,16,1,2021,12
71829,88368,1.0,2.28,195,001001,REPOSTERIA CHOCOLATE LECHE 250 GRS,1.88,45,2.28,16,1,2021,12
71830,82138,2.0,3.69,198,NE010420,REPOSTERIA COBERTURA LECHE 250 GRS,1.23,1064,3.69,16,1,2021,12
71831,82181,8.0,14.77,198,NE010437,REPOSTERIA COBERTURA LECHE 250 GRS,1.23,1064,14.77,16,1,2021,12


In [41]:
df_merged_with_categories = pd.merge(df_merged_sales_with_products, df_categories, left_on="category_id", right_on="id", suffixes=("_productsale", "_category"))
df_merged_with_categories

Unnamed: 0,id_productsale,quantity,income_productsale,product_id,sale_id,name_productsale,cost,category_id,income_sale,client_id,salesman_id,year,month,id,name_category
0,16921,4.1,14.35,17,00034279,QUESO TELITA CACHAPERO (E),2.70,23,70.15,390,4,2018,11,23,QUESOS FRESCOS
1,16953,4.0,14.00,17,00034290,QUESO TELITA CACHAPERO (E),2.70,23,235.15,101,4,2018,11,23,QUESOS FRESCOS
2,17217,3.6,12.60,17,00034379,QUESO TELITA CACHAPERO (E),2.70,23,425.02,377,4,2018,11,23,QUESOS FRESCOS
3,17330,3.4,11.90,17,00034415,QUESO TELITA CACHAPERO (E),2.70,23,72.74,377,4,2018,12,23,QUESOS FRESCOS
4,17329,7.1,22.84,23,00034415,QUESO BLANCO TIPO GUAYANES (E),2.70,23,72.74,377,4,2018,12,23,QUESOS FRESCOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71828,51977,24.0,14.88,54,043524,TE PAST. DURAZNO 400ML PARMALAT,0.53,29,19.36,234,2,2021,10,29,TE
71829,78903,24.0,14.88,54,NE009171,TE PAST. DURAZNO 400ML PARMALAT,0.53,29,20.48,152,2,2021,10,29,TE
71830,56704,1.0,0.00,94,00003373,MANTECA DE CERDO PLUMROSE 17 KGRS,0.00,40,1580.00,945,1,2020,9,40,VIVERES
71831,56707,1.0,0.00,94,00003376,MANTECA DE CERDO PLUMROSE 17 KGRS,0.00,40,46.40,558,11,2020,9,40,VIVERES


In [42]:
df_groupby_categories = df_merged_with_categories.groupby(["category_id", "year", "month"])

sales_data = {
    'income': df_groupby_categories['income_productsale'].sum(),
    'count': df_groupby_categories['id_productsale'].count(),
    'name': df_groupby_categories['name_category'].first(),
}

df_sales_per_month = pd.DataFrame(sales_data)
df_sales_per_month



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income,count,name
category_id,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,2021,3,67.54,2,EMBUTIDOS
8,2018,11,14144.79,107,CREMAS Y NATILLAS
8,2018,12,7289.16,85,CREMAS Y NATILLAS
8,2019,1,23956.06,235,CREMAS Y NATILLAS
8,2019,2,2753.26,64,CREMAS Y NATILLAS
...,...,...,...,...,...
1064,2021,10,326.44,40,CONFITERIA
1064,2021,11,88.29,17,CONFITERIA
1064,2021,12,136.51,23,CONFITERIA
1064,2022,1,76.93,15,CONFITERIA


In [43]:
df_machine_learning = df_sales_per_month.copy()
df_machine_learning.reset_index(inplace=True)
# df_machine_learning[df_machine_learning["category_id"] == 24]



split_point = 2021

data_train = df_machine_learning[df_machine_learning['year'] < split_point].copy()
data_valid = df_machine_learning[df_machine_learning['year'] >= split_point].copy()
data_train

Unnamed: 0,category_id,year,month,income,count,name
1,8,2018,11,14144.79,107,CREMAS Y NATILLAS
2,8,2018,12,7289.16,85,CREMAS Y NATILLAS
3,8,2019,1,23956.06,235,CREMAS Y NATILLAS
4,8,2019,2,2753.26,64,CREMAS Y NATILLAS
5,8,2019,3,1064.39,28,CREMAS Y NATILLAS
...,...,...,...,...,...,...
828,1061,2020,10,161.91,4,PRODUCTOS CORDERO Y CONEJOS
829,1061,2020,11,165.43,4,PRODUCTOS CORDERO Y CONEJOS
830,1061,2020,12,21.64,3,PRODUCTOS CORDERO Y CONEJOS
832,1062,2020,11,200.33,5,CACACO EN POLVO


In [44]:
# Add the columns which gonna be predicted.
# It just set the products sales next month using 
# the income of the next products sales in the DF

data_train["sales_next_month"] = data_train.groupby("category_id")["income"].shift(-1)
data_train["sales_next_month_count"] = data_train.groupby("category_id")["count"].shift(-1)
data_valid["sales_next_month"] = data_valid.groupby("category_id")["income"].shift(-1)
data_valid["sales_next_month_count"] = data_valid.groupby("category_id")["count"].shift(-1)

In [48]:
# Set diff between sales and sales next day
data_train.dropna(inplace=True)

data_valid["diff_sales_next_month"] = data_valid.groupby("category_id")["income"].diff(1)
data_valid["diff_sales_next_month_count"] = data_valid.groupby("category_id")["count"].diff(1)
data_train["diff_sales_next_month"] = data_train.groupby("category_id")["income"].diff(1)
data_train["diff_sales_next_month_count"] = data_train.groupby("category_id")["count"].diff(1)
data_train

Unnamed: 0,category_id,year,month,income,count,name,sales_next_month,sales_next_month_count,diff_sales_next_month,diff_sales_next_month_count
2,8,2018,12,7289.16,85,CREMAS Y NATILLAS,23956.06,235.0,,
3,8,2019,1,23956.06,235,CREMAS Y NATILLAS,2753.26,64.0,16666.90,150.0
4,8,2019,2,2753.26,64,CREMAS Y NATILLAS,1064.39,28.0,-21202.80,-171.0
5,8,2019,3,1064.39,28,CREMAS Y NATILLAS,7079.94,122.0,-1688.87,-36.0
6,8,2019,4,7079.94,122,CREMAS Y NATILLAS,5285.08,132.0,6015.55,94.0
...,...,...,...,...,...,...,...,...,...,...
825,1061,2020,7,38.01,3,PRODUCTOS CORDERO Y CONEJOS,41.43,6.0,25.75,2.0
826,1061,2020,8,41.43,6,PRODUCTOS CORDERO Y CONEJOS,102.23,8.0,3.42,3.0
827,1061,2020,9,102.23,8,PRODUCTOS CORDERO Y CONEJOS,161.91,4.0,60.80,2.0
828,1061,2020,10,161.91,4,PRODUCTOS CORDERO Y CONEJOS,165.43,4.0,59.68,-4.0


In [49]:
# Baseline for indicators for precission of the model

y_pred = data_train["income"]
y_true = data_train['sales_next_month']

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

1.063834369810746


0.4900397477129018

In [50]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained

features = ["income", "count", "category_id"]
imputer = SimpleImputer()
Xtr_per_month = imputer.fit_transform(data_train[features])
ytr_per_month = data_train['sales_next_month']


model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(Xtr_per_month, ytr_per_month)

RandomForestRegressor(n_jobs=6, random_state=0)

In [59]:
# Predicting sales for everyh day with the model
# This model just predict the income of the next month, not how many sales

# The final data frame is just for seing true values and predicted values

Xval_per_month = imputer.fit_transform(data_valid[features])
yval_per_month = data_valid['sales_next_month'][data_valid['category_id'] == 8]

prediction_per_month = model.predict(Xval_per_month)

df_forest_per_month = pd.DataFrame({"category": data_valid["category_id"] ,"month": data_valid["month"] ,"value": data_valid["income"],"predicted": prediction_per_month, "true": yval_per_month, "diff": data_valid["diff_sales_next_month"]})
df_forest_per_month[df_forest_per_month["category"] == 8]


Unnamed: 0,category,month,value,predicted,true,diff
27,8,1,7551.77,11103.661,8286.06,
28,8,2,8286.06,11211.3871,10409.23,734.29
29,8,3,10409.23,10753.8063,7048.83,2123.17
30,8,4,7048.83,8284.8467,8997.96,-3360.4
31,8,5,8997.96,8605.93,11688.66,1949.13
32,8,6,11688.66,11393.089,14610.52,2690.7
33,8,7,14610.52,14525.5388,14764.34,2921.86
34,8,8,14764.34,14525.5388,15961.41,153.82
35,8,9,15961.41,15124.2662,14979.96,1197.07
36,8,10,14979.96,16089.8796,16411.81,-981.45


In [60]:
print(mape(df_forest_per_month[df_forest_per_month["category"] == 8]["true"], df_forest_per_month[df_forest_per_month["category"] == 8]["predicted"]))
print(wmape(df_forest_per_month[df_forest_per_month["category"] == 8]["true"], df_forest_per_month[df_forest_per_month["category"] == 8]["predicted"]))

0.24557187756314397
0.17820719116930667


In [61]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained
# Again, same shit, but for sales count

features_count = ["income", "count", "month"]
imputer_count = SimpleImputer()
Xtr_per_month_count = imputer.fit_transform(data_train[features_count])
ytr_per_month_count = data_train['sales_next_month_count']


model_count = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model_count.fit(Xtr_per_month_count, ytr_per_month_count)

RandomForestRegressor(n_jobs=6, random_state=0)

In [62]:
# Predicting sales for everyh month with the model
# This model just predict the income of the next month, not how many sales

# The final data frame is just for seing true values and predicted values
# Again, same shit, but for sales count


Xval_per_month_count = imputer.fit_transform(data_valid[features_count])
yval_per_month_count = data_valid['sales_next_month_count']

prediction_per_month_count = model_count.predict(Xval_per_month_count)

df_forest_per_month_count = pd.DataFrame({"category": data_valid["category_id"] ,"month": data_valid["month"] ,"value": data_valid["count"],"predicted": prediction_per_month_count, "true": yval_per_month_count, "diff": data_valid["diff_sales_next_month_count"]})
df_forest_per_month_count[df_forest_per_month_count["category"] == 8]


Unnamed: 0,category,month,value,predicted,true,diff
27,8,1,188,194.51,253.0,
28,8,2,253,111.74,315.0,65.0
29,8,3,315,168.04,208.0,62.0
30,8,4,208,149.8,259.0,-107.0
31,8,5,259,185.97,332.0,51.0
32,8,6,332,258.69,455.0,73.0
33,8,7,455,282.38,470.0,123.0
34,8,8,470,358.65,441.0,15.0
35,8,9,441,348.85,398.0,-29.0
36,8,10,398,274.24,380.0,-43.0


In [65]:
print(mape(df_forest_per_month_count[df_forest_per_month_count["category"] == 8]["true"], df_forest_per_month_count[df_forest_per_month_count["category"] == 8]["predicted"]))
print(wmape(df_forest_per_month_count[df_forest_per_month_count["category"] == 8]["true"], df_forest_per_month_count[df_forest_per_month_count["category"] == 8]["predicted"]))

0.3195654140728198
0.33712681638044917
