In [78]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
load_dotenv()

True

In [79]:
# Equation for mean average percentage error for test the model performance
def mape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / y_true)
    return np.mean(ape)

def wmape(y_true, y_pred):
    return np.sum(np.abs((y_true - y_pred))) / np.sum(np.abs(y_true))

In [80]:
# Database connection and query
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

conn_string = "host={0} user={1} dbname={2} password={3}".format(host, user, dbname, password)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

query = "SELECT * FROM core_sale;"
df_sales = pd.read_sql(query, conn)



In [81]:
# Copy DF, adding a new column with the date of the sale split in year
# month and day, and drop columns will not be used. Also, copy for ML model
# and set sales_copy df index for a future for return some results

df_sales_copy = df_sales.copy()
df_sales_copy["year"] = df_sales_copy.apply(lambda row : row["date"].year, axis=1)
df_sales_copy["month"] = df_sales_copy.apply(lambda row : row["date"].month, axis=1)
df_sales_copy["day"] = df_sales_copy.apply(lambda row : row["date"].day, axis=1)
df_sales_copy.drop(columns=["date"], inplace=True)
df_sales_copy.drop(columns=["description"], inplace=True)
df_sales_copy.drop(columns=["status"], inplace=True)

df_sales_copy.set_index(['year', 'month', 'day'], inplace=True)

In [82]:
# grouping sales per month for return stadistics about this.
# This data frame will be used to plot the graphs for see the historic of sales per month in every year
# This data frame can be use for trimester too

df_groupby_sales = df_sales_copy.groupby(["year", "month"])
sales_data = {
    'income': df_groupby_sales['income'].sum(),
    'count': df_groupby_sales['id'].count()
}

df_sales_per_months = pd.DataFrame(sales_data)
df_sales_per_months

Unnamed: 0_level_0,Unnamed: 1_level_0,income,count
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,11,52361.97,144
2018,12,44421.13,138
2019,1,76548.24,149
2019,2,14674.46,106
2019,3,48730.99,71
2019,4,182488.12,201
2019,5,171914.8,272
2019,6,189254.46,222
2019,7,242337.45,230
2019,8,114204.27,236


In [83]:
# Copy df for ML model. The other DF is for chart.
#Then, it just set split point, reset index and
# split the dataframe in two parts, one for train and one for test

df_sales_machine_learning = df_sales_per_months.copy()
split_point = 2021
df_sales_machine_learning.reset_index(inplace=True)

df_sales_machine_learning
data_train = df_sales_machine_learning.loc[df_sales_machine_learning['year'] < split_point].copy()
data_valid = df_sales_machine_learning.loc[df_sales_machine_learning['year'] >= split_point].copy()

# Example for data train DF
data_train

Unnamed: 0,year,month,income,count
0,2018,11,52361.97,144
1,2018,12,44421.13,138
2,2019,1,76548.24,149
3,2019,2,14674.46,106
4,2019,3,48730.99,71
5,2019,4,182488.12,201
6,2019,5,171914.8,272
7,2019,6,189254.46,222
8,2019,7,242337.45,230
9,2019,8,114204.27,236


In [84]:
# Add the columns which gonna be predicted.
# It just set the sales next month using 
# the income of the next sales in the DF

data_train["sales_next_month"] = data_train["income"].shift(-1)
data_train["sales_next_month_count"] = data_train["count"].shift(-1)
data_valid["sales_next_month"] = data_valid["income"].shift(-1)
data_valid["sales_next_month_count"] = data_valid["count"].shift(-1)
data_train

Unnamed: 0,year,month,income,count,sales_next_month,sales_next_month_count
0,2018,11,52361.97,144,44421.13,138.0
1,2018,12,44421.13,138,76548.24,149.0
2,2019,1,76548.24,149,14674.46,106.0
3,2019,2,14674.46,106,48730.99,71.0
4,2019,3,48730.99,71,182488.12,201.0
5,2019,4,182488.12,201,171914.8,272.0
6,2019,5,171914.8,272,189254.46,222.0
7,2019,6,189254.46,222,242337.45,230.0
8,2019,7,242337.45,230,114204.27,236.0
9,2019,8,114204.27,236,86824.01,273.0


In [85]:
# Drop NaN values for train, otherwise, the code dont work
# Add lag sales next day in both DF train and valid. It is use
# when some info doesnt exist or there has missing values

data_train.dropna(inplace=True)
data_train["lag_sales_next_month"] = data_train["income"].shift(1)
data_train["lag_sales_next_month_count"] = data_train["count"].shift(1)
data_valid["lag_sales_next_month"] = data_valid["income"].shift(1)
data_valid["lag_sales_next_month_count"] = data_valid["count"].shift(1)
data_train

Unnamed: 0,year,month,income,count,sales_next_month,sales_next_month_count,lag_sales_next_month,lag_sales_next_month_count
0,2018,11,52361.97,144,44421.13,138.0,,
1,2018,12,44421.13,138,76548.24,149.0,52361.97,144.0
2,2019,1,76548.24,149,14674.46,106.0,44421.13,138.0
3,2019,2,14674.46,106,48730.99,71.0,76548.24,149.0
4,2019,3,48730.99,71,182488.12,201.0,14674.46,106.0
5,2019,4,182488.12,201,171914.8,272.0,48730.99,71.0
6,2019,5,171914.8,272,189254.46,222.0,182488.12,201.0
7,2019,6,189254.46,222,242337.45,230.0,171914.8,272.0
8,2019,7,242337.45,230,114204.27,236.0,189254.46,222.0
9,2019,8,114204.27,236,86824.01,273.0,242337.45,230.0


In [86]:
# Set diff between sales and sales next day

data_valid["diff_sales_next_month"] = data_valid["income"].diff(1)
data_valid["diff_sales_next_month_count"] = data_valid["count"].diff(1)
data_train["diff_sales_next_month"] = data_train["income"].diff(1)
data_train["diff_sales_next_month_count"] = data_train["count"].diff(1)
data_train

Unnamed: 0,year,month,income,count,sales_next_month,sales_next_month_count,lag_sales_next_month,lag_sales_next_month_count,diff_sales_next_month,diff_sales_next_month_count
0,2018,11,52361.97,144,44421.13,138.0,,,,
1,2018,12,44421.13,138,76548.24,149.0,52361.97,144.0,-7940.84,-6.0
2,2019,1,76548.24,149,14674.46,106.0,44421.13,138.0,32127.11,11.0
3,2019,2,14674.46,106,48730.99,71.0,76548.24,149.0,-61873.78,-43.0
4,2019,3,48730.99,71,182488.12,201.0,14674.46,106.0,34056.53,-35.0
5,2019,4,182488.12,201,171914.8,272.0,48730.99,71.0,133757.13,130.0
6,2019,5,171914.8,272,189254.46,222.0,182488.12,201.0,-10573.32,71.0
7,2019,6,189254.46,222,242337.45,230.0,171914.8,272.0,17339.66,-50.0
8,2019,7,242337.45,230,114204.27,236.0,189254.46,222.0,53082.99,8.0
9,2019,8,114204.27,236,86824.01,273.0,242337.45,230.0,-128133.18,6.0


In [87]:
# Baseline for indicators for precission of the model

y_pred = data_train["income"]
y_true = data_train['sales_next_month']

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

0.5110407403965338


0.35260403881669133

In [88]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained

features = ["income", "count", "lag_sales_next_month", "lag_sales_next_month_count", "diff_sales_next_month", "diff_sales_next_month_count"]
imputer = SimpleImputer()
Xtr_per_month = imputer.fit_transform(data_train[features])
ytr_per_month = data_train['sales_next_month']


model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(Xtr_per_month, ytr_per_month)


RandomForestRegressor(n_jobs=6, random_state=0)

In [89]:
# Predicting sales for everyh day with the model
# This model just predict the income of the next month, not how many sales

# The final data frame is just for seing true values and predicted values

Xval_per_month = imputer.fit_transform(data_valid[features])
yval_per_month = data_valid['sales_next_month']

prediction_per_month = model.predict(Xval_per_month)

df_forest_per_month = pd.DataFrame({"month": data_valid["month"] ,"predicted": prediction_per_month, "true": yval_per_month, "diff": data_valid["diff_sales_next_month"]})
df_forest_per_month

Unnamed: 0,month,predicted,true,diff
26,1,242069.6153,257297.47,
27,2,213743.2993,192503.59,98591.39
28,3,231814.5664,206944.43,-64793.88
29,4,258007.4872,166545.74,14440.84
30,5,219713.4398,166283.13,-40398.69
31,6,229921.1299,158943.72,-262.61
32,7,225617.0591,138120.65,-7339.41
33,8,279100.3626,175135.81,-20823.07
34,9,222187.772,279933.26,37015.16
35,10,235743.4233,272847.04,104797.45


In [90]:
# see the percentage for the error of the model for sales income

print(mape(yval_per_month, prediction_per_month))
print(wmape(yval_per_month, prediction_per_month))

0.30011680204686075
0.2624788785957619


In [91]:
# Baseline for indicators for precission of the model
# This baseline is for how many sales will be

y_pred_count = data_train["count"]
y_true_count = data_train['sales_next_month_count']

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

0.5110407403965338


0.35260403881669133

In [92]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained
# Again, same shit, but for sales count

features_count = ["income", "count", "lag_sales_next_month", "lag_sales_next_month_count", "diff_sales_next_month", "diff_sales_next_month_count"]
imputer_count = SimpleImputer()
Xtr_per_month_count = imputer.fit_transform(data_train[features_count])
ytr_per_month_count = data_train['sales_next_month_count']


model_count = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model_count.fit(Xtr_per_month_count, ytr_per_month_count)

RandomForestRegressor(n_jobs=6, random_state=0)

In [93]:
# Predicting sales for everyh month with the model
# This model just predict the income of the next month, not how many sales

# The final data frame is just for seing true values and predicted values
# Again, same shit, but for sales count


Xval_per_month_count = imputer.fit_transform(data_valid[features_count])
yval_per_month_count = data_valid['sales_next_month_count']

prediction_per_month_count = model_count.predict(Xval_per_month)

df_forest_per_month_count = pd.DataFrame({"month": data_valid["month"] ,"predicted": prediction_per_month_count, "true": yval_per_month_count, "diff": data_valid["diff_sales_next_month_count"]})
df_forest_per_month_count

Unnamed: 0,month,predicted,true,diff
26,1,619.36,872.0,
27,2,629.32,777.0,277.0
28,3,656.47,710.0,-95.0
29,4,629.49,739.0,-67.0
30,5,649.1,745.0,29.0
31,6,630.14,876.0,6.0
32,7,632.42,816.0,131.0
33,8,618.81,908.0,-60.0
34,9,625.47,1114.0,92.0
35,10,648.21,1106.0,206.0


In [97]:
print(mape(yval_per_month_count, prediction_per_month_count))
wmape(yval_per_month_count, prediction_per_month_count)

0.2649903871894907


0.27532763791693177