In [71]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
load_dotenv()

True

In [72]:
# Equation for mean average percentage error for test the model performance
def mape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / y_true)
    return np.mean(ape)

def wmape(y_true, y_pred):
    return np.sum(np.abs((y_true - y_pred))) / np.sum(np.abs(y_true))

In [73]:
# Database connection and query
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

conn_string = "host={0} user={1} dbname={2} password={3}".format(host, user, dbname, password)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

df_clients = pd.read_sql_query("SELECT * FROM core_client", conn)
df_sales = pd.read_sql("SELECT * FROM core_sale;", conn)
df_clients




Unnamed: 0,id,identity_card,name,image,address,phone
0,1,-,"INVERSIONES CAREOCA, C.A",,CALLE LIBERTAD ESQUINA CALLE COLINA S/N\n\nSEC...,
1,2,--,VECINO DEFREITAS,,,
2,3,----,KALIFA,,,
3,4,.-,"ARCOFRESCO PRODUCCION Y COMERCIALIZACION, C.A",,Calle General Pelayo entre calles San Roman y ...,
4,5,...,VENTAS DIRECTAS,,,
...,...,...,...,...,...,...
1070,1071,XX,KLEIBER MARTINEZ,,,
1071,1072,-XX,DOLIS DIAZ,,,
1072,1073,XXXX,LA MEGA,,,
1073,1074,XXXXX,JOAQUIN FEREIRA,,,


In [74]:
df_clients.drop(columns=['address', 'phone', 'image'], inplace=True)

In [75]:
df_clients

Unnamed: 0,id,identity_card,name
0,1,-,"INVERSIONES CAREOCA, C.A"
1,2,--,VECINO DEFREITAS
2,3,----,KALIFA
3,4,.-,"ARCOFRESCO PRODUCCION Y COMERCIALIZACION, C.A"
4,5,...,VENTAS DIRECTAS
...,...,...,...
1070,1071,XX,KLEIBER MARTINEZ
1071,1072,-XX,DOLIS DIAZ
1072,1073,XXXX,LA MEGA
1073,1074,XXXXX,JOAQUIN FEREIRA


In [76]:
df_merged = pd.merge(df_clients, df_sales, left_on='id', right_on='client_id', suffixes=('_client', '_sale'))
df_merged.drop(columns=['id_client', 'salesman_id', 'income_currency', 'status', 'description'], inplace=True)
df_merged

Unnamed: 0,identity_card,name,id_sale,income,date,client_id
0,-,"INVERSIONES CAREOCA, C.A",NE00005150,105.33,2021-03-16,1
1,--,VECINO DEFREITAS,00003710,6365.00,2020-10-30,2
2,--,VECINO DEFREITAS,00004109,55.28,2020-12-23,2
3,--,VECINO DEFREITAS,00002943,52.67,2020-08-04,2
4,--,VECINO DEFREITAS,NE00004365,33.20,2021-01-25,2
...,...,...,...,...,...,...
21660,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001419,52.67,2019-09-20,1075
21661,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001646,19.21,2019-11-28,1075
21662,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001656,5.88,2019-12-13,1075
21663,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001979,6320.00,2020-03-17,1075


In [77]:

df_merged['year'] = df_merged.apply(lambda row: row['date'].year, axis=1)
df_merged['month'] = df_merged.apply(lambda row: row['date'].month, axis=1)
df_merged.drop(columns=['date'], inplace=True)
df_merged

Unnamed: 0,identity_card,name,id_sale,income,client_id,year,month
0,-,"INVERSIONES CAREOCA, C.A",NE00005150,105.33,1,2021,3
1,--,VECINO DEFREITAS,00003710,6365.00,2,2020,10
2,--,VECINO DEFREITAS,00004109,55.28,2,2020,12
3,--,VECINO DEFREITAS,00002943,52.67,2,2020,8
4,--,VECINO DEFREITAS,NE00004365,33.20,2,2021,1
...,...,...,...,...,...,...,...
21660,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001419,52.67,1075,2019,9
21661,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001646,19.21,1075,2019,11
21662,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001656,5.88,1075,2019,12
21663,XXXXXXX,GUARDIA NACIONAL ALCABALA LOS MEDANOS,00001979,6320.00,1075,2020,3


In [78]:
df_groupby = df_merged.groupby(["client_id",'year', 'month'])
sales_data = {
    'income': df_groupby['income'].sum(),
    'count': df_groupby['client_id'].count(),
    'name': df_groupby['name'].first(),
}

df_sales_per_month = pd.DataFrame(sales_data)
df_sales_per_month


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income,count,name
client_id,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2021,3,105.33,1,"INVERSIONES CAREOCA, C.A"
2,2020,8,52.67,1,VECINO DEFREITAS
2,2020,10,6365.00,1,VECINO DEFREITAS
2,2020,12,55.28,1,VECINO DEFREITAS
2,2021,1,33.20,1,VECINO DEFREITAS
...,...,...,...,...,...
1075,2019,9,60.46,2,GUARDIA NACIONAL ALCABALA LOS MEDANOS
1075,2019,10,5.84,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS
1075,2019,11,19.21,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS
1075,2019,12,5.88,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS


In [79]:
df_machine_learning = df_sales_per_month.copy()
df_machine_learning.reset_index(inplace=True)


split_point = 2021

data_train = df_machine_learning[df_machine_learning['year'] < split_point].copy()
data_valid = df_machine_learning[df_machine_learning['year'] >= split_point].copy()
data_train

Unnamed: 0,client_id,year,month,income,count,name
1,2,2020,8,52.67,1,VECINO DEFREITAS
2,2,2020,10,6365.00,1,VECINO DEFREITAS
3,2,2020,12,55.28,1,VECINO DEFREITAS
13,3,2019,11,10.90,1,KALIFA
14,4,2019,11,186.07,2,"ARCOFRESCO PRODUCCION Y COMERCIALIZACION, C.A"
...,...,...,...,...,...,...
8608,1075,2019,9,60.46,2,GUARDIA NACIONAL ALCABALA LOS MEDANOS
8609,1075,2019,10,5.84,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS
8610,1075,2019,11,19.21,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS
8611,1075,2019,12,5.88,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS


In [80]:
# Add the columns which gonna be predicted.
# It just set the products sales next month using 
# the income of the next products sales in the DF

data_train["sales_next_month"] = data_train.groupby("client_id")["income"].shift(-1)
data_train["sales_next_month_count"] = data_train.groupby("client_id")["count"].shift(-1)
data_valid["sales_next_month"] = data_valid.groupby("client_id")["income"].shift(-1)
data_valid["sales_next_month_count"] = data_valid.groupby("client_id")["count"].shift(-1)

In [85]:
data_train.dropna(inplace=True)

data_valid["diff_sales_next_month"] = data_valid.groupby("client_id")["income"].diff(1)
data_valid["diff_sales_next_month_count"] = data_valid.groupby("client_id")["count"].diff(1)
data_train["diff_sales_next_month"] = data_train.groupby("client_id")["income"].diff(1)
data_train["diff_sales_next_month_count"] = data_train.groupby("client_id")["count"].diff(1)
data_train

Unnamed: 0,client_id,year,month,income,count,name,sales_next_month,sales_next_month_count,diff_sales_next_month,diff_sales_next_month_count
2,2,2020,10,6365.00,1,VECINO DEFREITAS,55.28,1.0,,
16,5,2019,5,12800.85,5,VENTAS DIRECTAS,7952.67,1.0,,
17,5,2019,6,7952.67,1,VENTAS DIRECTAS,38183.35,5.0,-4848.18,-4.0
18,5,2019,7,38183.35,5,VENTAS DIRECTAS,10289.63,3.0,30230.68,4.0
19,5,2019,8,10289.63,3,VENTAS DIRECTAS,11060.00,1.0,-27893.72,-2.0
...,...,...,...,...,...,...,...,...,...,...
8607,1075,2019,3,7900.00,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS,60.46,2.0,,
8608,1075,2019,9,60.46,2,GUARDIA NACIONAL ALCABALA LOS MEDANOS,5.84,1.0,-7839.54,1.0
8609,1075,2019,10,5.84,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS,19.21,1.0,-54.62,-1.0
8610,1075,2019,11,19.21,1,GUARDIA NACIONAL ALCABALA LOS MEDANOS,5.88,1.0,13.37,0.0


In [82]:
# Baseline for indicators for precission of the model

y_pred = data_train[data_train["client_id"] == 5]["income"]
y_true = data_train[data_train["client_id"] == 5]["sales_next_month"]

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

9.571251695582944


0.8772000650991422

In [83]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained

features = ["income", "count", "client_id", "month"]
imputer = SimpleImputer()
Xtr_per_month = imputer.fit_transform(data_train[features])
ytr_per_month = data_train['sales_next_month']


model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(Xtr_per_month, ytr_per_month)

RandomForestRegressor(n_jobs=6, random_state=0)

In [86]:
# Predicting sales for everyh day with the model
# This model just predict the income of the next month, not how many sales

# The final data frame is just for seing true values and predicted values

Xval_per_month = imputer.fit_transform(data_valid[features])
yval_per_month = data_valid['sales_next_month']

prediction_per_month = model.predict(Xval_per_month)

df_forest_per_month = pd.DataFrame({"client_id": data_valid["client_id"], "client": data_valid["name"] ,"month": data_valid["month"] ,"value": data_valid["income"],"predicted": prediction_per_month, "true": yval_per_month})
df_forest_per_month[data_valid["client_id"] == 1069]


Unnamed: 0,client_id,client,month,value,predicted,true
8592,1069,ARELIS ARGUELLAS,2,19.53,3451.4653,27.81
8593,1069,ARELIS ARGUELLAS,3,27.81,5083.1341,23.44
8594,1069,ARELIS ARGUELLAS,4,23.44,3368.3201,231.7
8595,1069,ARELIS ARGUELLAS,6,231.7,449.2521,52.67
8596,1069,ARELIS ARGUELLAS,1,52.67,3620.9365,27.68
8597,1069,ARELIS ARGUELLAS,2,27.68,3377.5481,


In [88]:
print(mape(df_forest_per_month[data_valid["client_id"] == 1069]["true"], df_forest_per_month[data_valid["client_id"] == 1069]["predicted"]))
print(wmape(df_forest_per_month[data_valid["client_id"] == 1069]["true"], df_forest_per_month[data_valid["client_id"] == 1069]["predicted"]))

97.96944083516959
42.96671648775119


In [89]:
# Set features for ML model, which columns will be use for predict
# Imputer is for transform some data
# Xtr and Ytr are the X and Y values for train the model
# Then the model is just a RandomForestRegressor and with fit it get trained
# Again, same shit, but for sales count

features_count = ["income", "count", "month"]
imputer_count = SimpleImputer()
Xtr_per_month_count = imputer.fit_transform(data_train[features_count])
ytr_per_month_count = data_train['sales_next_month_count']


model_count = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model_count.fit(Xtr_per_month_count, ytr_per_month_count)

RandomForestRegressor(n_jobs=6, random_state=0)

In [92]:
# Predicting sales for everyh month with the model
# This model just predict the income of the next month, not how many sales

# The final data frame is just for seing true values and predicted values
# Again, same shit, but for sales count


Xval_per_month_count = imputer.fit_transform(data_valid[features_count])
yval_per_month_count = data_valid['sales_next_month_count']

prediction_per_month_count = model_count.predict(Xval_per_month_count)

df_forest_per_month_count = pd.DataFrame({"id_client": data_valid["client_id"], "name": data_valid["name"],"month": data_valid["month"] ,"predicted": prediction_per_month_count, "true": yval_per_month_count})
df_forest_per_month_count[df_forest_per_month_count["id_client"] == 1069]

Unnamed: 0,id_client,name,month,predicted,true
8592,1069,ARELIS ARGUELLAS,2,1.81,1.0
8593,1069,ARELIS ARGUELLAS,3,1.72,1.0
8594,1069,ARELIS ARGUELLAS,4,1.06,3.0
8595,1069,ARELIS ARGUELLAS,6,3.61,1.0
8596,1069,ARELIS ARGUELLAS,1,1.420046,1.0
8597,1069,ARELIS ARGUELLAS,2,1.3,


In [95]:
print(mape(df_forest_per_month_count["true"], df_forest_per_month_count["predicted"]))
print(wmape(df_forest_per_month_count["true"], df_forest_per_month_count["predicted"]))

0.7085208710439325
0.5410431130761036
