In [1]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
load_dotenv()

True

In [2]:
# Error metrics used to evaluate model performance: mean absolute percentage error (MAPE) and weighted MAPE (WMAPE)
def mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Both inputs are coerced to float NumPy arrays, so lists, tuples,
    NumPy arrays and pandas Series are all accepted. Values are compared
    positionally (any pandas index is ignored).

    Parameters
    ----------
    y_true : array-like
        Observed values. Must not contain zeros: a zero target makes the
        corresponding term ``inf`` (or ``nan`` for 0/0), which propagates
        into the returned mean.
    y_pred : array-like
        Predicted values, same shape as (or broadcastable to) ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|`` (0.25 means 25 %).
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    ape = np.abs((actual - predicted) / actual)
    return np.mean(ape)

def wmape(y_true, y_pred):
    """Return the weighted mean absolute percentage error (WMAPE) as a fraction.

    Computed as ``sum(|y_true - y_pred|) / sum(|y_true|)``, so individual
    zero targets are fine as long as the targets are not all zero.

    Both inputs are coerced to float NumPy arrays, so lists, tuples,
    NumPy arrays and pandas Series are all accepted. Values are compared
    positionally (any pandas index is ignored).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as (or broadcastable to) ``y_true``.

    Returns
    -------
    float
        Total absolute error divided by total absolute observed value.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    total_abs_error = np.sum(np.abs(actual - predicted))
    return total_abs_error / np.sum(np.abs(actual))

In [3]:
# Database connection and query
# Connection settings are read from environment variables.
# NOTE(review): "USERNAME" is a predefined OS variable on Windows, and load_dotenv()
# does not override already-set variables by default, so a .env value may be
# ignored there -- confirm on the target machine.
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

# Fail fast with a clear message instead of connecting with the literal text "None".
_db_settings = {
    "HOST": host,
    "DATABASE_NAME": dbname,
    "USERNAME": user,
    "DATABASE_PASSWORD": password,
}
_unset_settings = [name for name, value in _db_settings.items() if value is None]
if _unset_settings:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(_unset_settings)
    )

# make_dsn quotes/escapes each value, so passwords containing spaces, quotes or
# backslashes no longer corrupt the connection string.
conn_string = psycopg2.extensions.make_dsn(
    host=host, user=user, dbname=dbname, password=password
)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# Load each table in full into its own DataFrame.
df_productsales = pd.read_sql("SELECT * FROM core_productsale;", conn)
df_products = pd.read_sql("SELECT * FROM core_product;", conn)
df_sales = pd.read_sql("SELECT * FROM core_sale;", conn)
df_categories = pd.read_sql("SELECT * FROM core_category;", conn)
df_categories



Unnamed: 0,id,name
0,1,NO REFRIGERADO
1,2,SERVICIOS
2,3,REFRIGERADOS
3,4,EMBUTIDOS
4,5,CONGELADOS
5,6,QUESOS
6,8,CREMAS Y NATILLAS
7,13,LECHES Y JUGOS REFRIGERADOS
8,16,CHORIZOS
9,17,JAMONERIA


In [4]:
# Join every product-sale line with its product record. Both tables carry an
# "id" column, so the suffixes turn them into "id_productsale" / "id_product".
# Unused product columns are then discarded and the result is indexed by product id.
df_merged_products_productsales = (
    df_productsales
    .merge(
        df_products,
        left_on="product_id",
        right_on="id",
        suffixes=("_productsale", "_product"),
    )
    .drop(
        columns=[
            "brand",
            "image",
            "cost_currency",
            "price_1_currency",
            "price_2_currency",
            "price_3_currency",
            "code",
            "price_1",
            "price_2",
            "price_3",
            "description",
            "presentation",
            "income_currency",
        ]
    )
    .set_index("id_product")
)
df_merged_products_productsales

Unnamed: 0_level_0,id_productsale,quantity,income,product_id,sale_id,name,cost,category_id
id_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17,74,4.1,14.35,17,00034279,QUESO TELITA CACHAPERO (E),2.70,23
17,106,4.0,14.00,17,00034290,QUESO TELITA CACHAPERO (E),2.70,23
17,370,3.6,12.60,17,00034379,QUESO TELITA CACHAPERO (E),2.70,23
17,483,3.4,11.90,17,00034415,QUESO TELITA CACHAPERO (E),2.70,23
17,487,3.4,11.90,17,00034418,QUESO TELITA CACHAPERO (E),2.70,23
...,...,...,...,...,...,...,...,...
6,65958,6.0,16.50,6,NE010715,ACEITUNAS RE/PIM ENV. 450GR ESCURRIDO,2.75,54
24,46127,4.6,0.00,24,00002780,QUESO CREMA CABRA GRANEL,0.00,26
479,31784,4.0,5.20,479,042686,ADEREZO DE AJO Y CEBOLLA NATUGURT,1.00,1063
481,31786,4.0,5.20,481,042686,ADEREZO DE CILANTRO NATUGURT,1.00,1063


In [6]:
# Attach the parent sale to every product-sale line, derive the calendar year
# and month of the sale, then drop the sale columns that are no longer needed.
# The year/month are extracted with Series.map on the "date" column only, which
# avoids building a full row object per record (DataFrame.apply with axis=1)
# and works for any value exposing .year / .month.
df_merged_sales_with_products = (
    df_merged_products_productsales
    .merge(
        df_sales,
        left_on="sale_id",
        right_on="id",
        suffixes=("_productsale", "_sale"),
    )
    .assign(
        year=lambda frame: frame["date"].map(lambda sale_date: sale_date.year),
        month=lambda frame: frame["date"].map(lambda sale_date: sale_date.month),
    )
    .drop(columns=["id", "description", "income_currency", "status", "date"])
)
df_merged_sales_with_products

Unnamed: 0,id_productsale,quantity,income_productsale,product_id,sale_id,name,cost,category_id,income_sale,client_id,salesman_id,year,month
0,74,4.1,14.35,17,00034279,QUESO TELITA CACHAPERO (E),2.70,23,70.15,475,34,2018,11
1,76,12.0,55.80,254,00034279,BOLOGNA NORMAL 1KG DEL CORRAL X UNIDAD,3.54,19,70.15,475,34,2018,11
2,75,16.0,0.00,256,00034279,SALCHIPOLLO 450GR DEL CORRAL X UND,0.00,22,70.15,475,34,2018,11
3,77,4.3,0.00,489,00034279,NONE,0.00,1056,70.15,475,34,2018,11
4,106,4.0,14.00,17,00034290,QUESO TELITA CACHAPERO (E),2.70,23,235.15,186,34,2018,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71828,65976,1.0,2.28,195,NE010724,REPOSTERIA CHOCOLATE LECHE 250 GRS,1.88,45,2.28,101,31,2021,12
71829,71521,1.0,2.28,195,001001,REPOSTERIA CHOCOLATE LECHE 250 GRS,1.88,45,2.28,101,31,2021,12
71830,65291,2.0,3.69,198,NE010420,REPOSTERIA COBERTURA LECHE 250 GRS,1.23,1064,3.69,101,31,2021,12
71831,65334,8.0,14.77,198,NE010437,REPOSTERIA COBERTURA LECHE 250 GRS,1.23,1064,14.77,101,31,2021,12


In [7]:
# Attach the category record to each row via category_id. Both sides have a
# "name" column, which the suffixes split into "name_productsale" (from the
# left frame) and "name_category" (from df_categories).
df_merged_with_categories = df_merged_sales_with_products.merge(
    df_categories,
    left_on="category_id",
    right_on="id",
    suffixes=("_productsale", "_category"),
)
df_merged_with_categories

Unnamed: 0,id_productsale,quantity,income_productsale,product_id,sale_id,name_productsale,cost,category_id,income_sale,client_id,salesman_id,year,month,id,name_category
0,74,4.1,14.35,17,00034279,QUESO TELITA CACHAPERO (E),2.70,23,70.15,475,34,2018,11,23,QUESOS FRESCOS
1,106,4.0,14.00,17,00034290,QUESO TELITA CACHAPERO (E),2.70,23,235.15,186,34,2018,11,23,QUESOS FRESCOS
2,370,3.6,12.60,17,00034379,QUESO TELITA CACHAPERO (E),2.70,23,425.02,462,34,2018,11,23,QUESOS FRESCOS
3,483,3.4,11.90,17,00034415,QUESO TELITA CACHAPERO (E),2.70,23,72.74,462,34,2018,12,23,QUESOS FRESCOS
4,482,7.1,22.84,23,00034415,QUESO BLANCO TIPO GUAYANES (E),2.70,23,72.74,462,34,2018,12,23,QUESOS FRESCOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71828,35130,24.0,14.88,54,043524,TE PAST. DURAZNO 400ML PARMALAT,0.53,29,19.36,319,32,2021,10,29,TE
71829,62056,24.0,14.88,54,NE009171,TE PAST. DURAZNO 400ML PARMALAT,0.53,29,20.48,237,32,2021,10,29,TE
71830,39860,1.0,0.00,94,00003376,MANTECA DE CERDO PLUMROSE 17 KGRS,0.00,40,46.40,643,41,2020,9,40,VIVERES
71831,39857,1.0,0.00,94,00003373,MANTECA DE CERDO PLUMROSE 17 KGRS,0.00,40,1580.00,1030,31,2020,9,40,VIVERES
