In [39]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
load_dotenv()

True

In [40]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Each error is divided by the matching ``y_true`` value, so a zero in
    ``y_true`` produces an infinite or NaN term.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error))

def wmape(y_true, y_pred):
    """Weighted absolute percentage error: total absolute error over total absolute actuals.

    Returned as a fraction (not multiplied by 100).
    """
    total_abs_error = np.sum(np.abs(y_true - y_pred))
    total_abs_actual = np.sum(np.abs(y_true))
    return total_abs_error / total_abs_actual

In [41]:
# Load the whole `core_sale` table from PostgreSQL into `df_sales`.
# Connection settings are read from environment variables.
# NOTE(review): USERNAME is also a standard OS-level variable on some systems
# (e.g. Windows); confirm the value picked up here is the database user.
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

query = "SELECT * FROM core_sale;"

# Pass the settings as keyword arguments instead of formatting a DSN string by
# hand: a value containing spaces or quotes would break the hand-built string,
# and the password no longer sits in a notebook variable.
conn = psycopg2.connect(host=host, user=user, dbname=dbname, password=password)
try:
    df_sales = pd.read_sql(query, conn)
finally:
    # The data now lives in memory; release the connection even if the query fails.
    conn.close()



In [42]:
# Build `df_sales_copy` from `df_sales` without modifying the original:
#   * split the `date` column into `year`, `month` and `day`,
#   * drop the columns that are not used afterwards,
#   * index the frame by (year, month, day) so later cells can group on it.
#
# The date parts are read from the `date` column only, instead of running a
# row-wise `DataFrame.apply(axis=1)` three times, which builds a full Series
# for every row just to read one field.
# Assumes every value in `date` exposes .year/.month/.day (date, datetime or
# Timestamp) - TODO confirm against the table schema.
sale_dates = df_sales["date"]

df_sales_copy = (
    df_sales
    .assign(
        year=sale_dates.map(lambda d: d.year),
        month=sale_dates.map(lambda d: d.month),
        day=sale_dates.map(lambda d: d.day),
    )
    .drop(columns=["date", "description", "status"])
    .set_index(["year", "month", "day"])
)

In [43]:
# Aggregate the sales per year: total income and number of sales.
# The resulting frame (indexed by year) feeds both the charts and the
# forecasting cells.

df_groupby_sales = df_sales_copy.groupby(["year"])

yearly_income = df_groupby_sales["income"].sum()
yearly_count = df_groupby_sales["id"].count()  # counts non-null ids per year
sales_data = dict(income=yearly_income, count=yearly_count)

df_sales_per_years = pd.DataFrame(sales_data)
df_sales_per_years

Unnamed: 0_level_0,income,count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,96783.1,282
2019,1622842.69,3058
2020,2691034.5,6727
2021,2395357.87,10096
2022,420782.79,1502


In [44]:
# Working frame for the model (the yearly frame itself stays untouched for the
# charts). `year` is moved from the index back into a regular column, then the
# rows are split chronologically: years before `split_point` train the model,
# the remaining years validate it.

split_point = 2021
df_sales_machine_learning = df_sales_per_years.reset_index()

in_train_period = df_sales_machine_learning["year"].lt(split_point)
in_valid_period = df_sales_machine_learning["year"].ge(split_point)

data_train = df_sales_machine_learning[in_train_period].copy()
data_valid = df_sales_machine_learning[in_valid_period].copy()

# Show the training rows
data_train

Unnamed: 0,year,income,count
0,2018,96783.1,282
1,2019,1622842.69,3058
2,2020,2691034.5,6727


In [45]:
# Add the prediction targets: for each row, the income and the sale count of
# the following row (i.e. the next year) within the same frame.
# The last row of each frame has no following row, so its targets are NaN.

for frame in (data_train, data_valid):
    frame["sales_next_year"] = frame["income"].shift(-1)
    frame["sales_next_year_count"] = frame["count"].shift(-1)

data_train

Unnamed: 0,year,income,count,sales_next_year,sales_next_year_count
0,2018,96783.1,282,1622842.69,3058.0
1,2019,1622842.69,3058,2691034.5,6727.0
2,2020,2691034.5,6727,,


In [46]:
# Drop the training rows that have no target, then add year-over-year change
# columns to both frames. `diff(1)` is the current row minus the previous row,
# so the first row of each frame is NaN. Only the training frame is filtered;
# the validation frame keeps its row with a missing target.

data_train.dropna(inplace=True)

for frame in (data_valid, data_train):
    frame["diff_sales_next_year"] = frame["income"].diff(1)
    frame["diff_sales_next_year_count"] = frame["count"].diff(1)

data_train

Unnamed: 0,year,income,count,sales_next_year,sales_next_year_count,diff_sales_next_year,diff_sales_next_year_count
0,2018,96783.1,282,1622842.69,3058.0,,
1,2019,1622842.69,3058,2691034.5,6727.0,1526059.59,2776.0


In [47]:
# Naive baseline to compare the model against: use this year's income as the
# "prediction" for next year's income and measure the error on the training rows.

y_true, y_pred = data_train["sales_next_year"], data_train["income"]

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

0.6686533342098231


0.6013734943622724

In [48]:
# Train the forecasting model.
#   features      - input columns used to predict next year's income
#   imputer       - SimpleImputer fitted on the training features
#   Xtr_per_year  - imputed training feature matrix
#   ytr_per_year  - training target (next year's income)
# The model is a RandomForestRegressor with a fixed seed for reproducibility.

features = ["income", "count"]

imputer = SimpleImputer()
ytr_per_year = data_train["sales_next_year"]
Xtr_per_year = imputer.fit_transform(data_train.loc[:, features])

model = RandomForestRegressor(random_state=0, n_jobs=6, n_estimators=100)
model.fit(Xtr_per_year, ytr_per_year)


RandomForestRegressor(n_jobs=6, random_state=0)

In [49]:
# Predict next year's income for the validation years with the trained model.
# Only income is predicted here, not the number of sales.

# Reuse the imputer exactly as it was fitted on the training data. Calling
# fit_transform here would re-fit it on the validation rows, so validation data
# would be preprocessed differently from what the model was trained on.
Xval_per_year = imputer.transform(data_valid[features])
yval_per_year = data_valid["sales_next_year"]

prediction_per_year = model.predict(Xval_per_year)

# Side-by-side view of predicted vs. true values.
# NOTE(review): the "month" column actually holds the year; the key is kept
# unchanged so anything reading this frame keeps working.
df_forest_per_year = pd.DataFrame({
    "month": data_valid["year"],
    "predicted": prediction_per_year,
    "true": yval_per_year,
    "diff": data_valid["diff_sales_next_year"],
})

data_valid

Unnamed: 0,year,income,count,sales_next_year,sales_next_year_count,diff_sales_next_year,diff_sales_next_year_count
3,2021,2395357.87,10096,420782.79,1502.0,,
4,2022,420782.79,1502,,,-1974575.08,-8594.0


In [50]:
# Model error on the validation years, using the same two metrics as the baseline.
for metric in (mape, wmape):
    print(metric(yval_per_year, prediction_per_year))

4.836817379817267
4.836817379817267
