In [13]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
load_dotenv()

True

In [14]:
# Mean absolute percentage error (MAPE), used to evaluate model performance
def mape(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    The result is ``mean(|(y_true - y_pred) / y_true|)`` expressed as a
    fraction (1.0 means 100 %); it is not multiplied by 100.

    Parameters
    ----------
    y_true : array-like
        Observed values. An entry equal to 0 is divided by as-is, so the
        result is not finite in that case.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_true``.

    Returns
    -------
    float
        Mean of the element-wise absolute relative errors.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error))

def wmape(y_true, y_pred):
    """Return the weighted absolute percentage error (WMAPE).

    Computed as ``sum(|y_true - y_pred|) / sum(|y_true|)`` and returned as a
    fraction (1.0 means 100 %). Because errors are summed before dividing,
    large observations weigh more than small ones.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_true``.

    Returns
    -------
    float
        Total absolute error divided by the total absolute observed value.
    """
    total_abs_error = np.sum(np.abs(y_true - y_pred))
    total_abs_true = np.sum(np.abs(y_true))
    return total_abs_error / total_abs_true

In [15]:
# Database connection and query.
# Connection settings are read from the environment (load_dotenv() has
# already been called in the imports cell).
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
# NOTE(review): USERNAME is predefined by the OS on Windows, and load_dotenv()
# does not override existing variables by default -- confirm that the value
# picked up here is the database user and not the logged-in OS account.
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

# Build the DSN with psycopg2's own helper instead of str.format(): it quotes
# values containing spaces, quotes or backslashes, and it omits settings that
# are None rather than sending the literal text "None" to the server.
conn_string = psycopg2.extensions.make_dsn(
    host=host, user=user, dbname=dbname, password=password
)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

# Load both tables in full; the same reader is used for both queries.
df_salesmans = pd.read_sql_query("SELECT * FROM core_salesman", conn)
df_sales = pd.read_sql_query("SELECT * FROM core_sale;", conn)
# NOTE(review): this displays names, phone numbers and addresses -- clear the
# cell output before sharing the notebook.
df_salesmans




Unnamed: 0,id,identity_card,name,image,phone_1,phone_2,address
0,1,VDIRECTA,VENTAS DIRECTA,,,,
1,2,V2,V2,,,,
2,3,V3,Luis Pirona,,,,
3,4,18700742,Luis Guanipa,,4126675072.0,,
4,5,V5,Jhonatan Salas,,4246883977.0,4165030165.0,
5,6,V6,Victor Naveda,,2682514205.0,4121647209.0,Calle Libertad entre callejon mi Cabaï¿½a y ca...
6,7,V7,Edgado Rojas,,,,
7,8,8,Centro De Distribucion,,4127882192.0,,
8,9,9,Julio Garcia,,,,
9,10,10,Jesus Rodriguez,,,,


In [16]:
# Join every sale to its salesman (pandas' default inner join on
# salesman.id == sale.salesman_id), then discard the columns that are not
# used by the rest of the notebook.
unused_columns = ['client_id', 'income_currency', 'status', 'description', 'address', 'phone_1','phone_2', 'image']
df_merged = (
    df_salesmans
    .merge(df_sales, left_on='id', right_on='salesman_id', suffixes=('_salesman', '_sale'))
    .drop(columns=unused_columns)
)
df_merged

Unnamed: 0,id_salesman,identity_card,name,id_sale,income,date,salesman_id
0,1,VDIRECTA,VENTAS DIRECTA,00034347,1611.32,2018-11-26,1
1,1,VDIRECTA,VENTAS DIRECTA,00034311,837.86,2018-11-19,1
2,1,VDIRECTA,VENTAS DIRECTA,00034396,1895.35,2018-12-03,1
3,1,VDIRECTA,VENTAS DIRECTA,00034397,2026.97,2018-12-03,1
4,1,VDIRECTA,VENTAS DIRECTA,00034357,98.93,2018-11-27,1
...,...,...,...,...,...,...,...
21660,15,15,inactivo,042780,40.51,2021-08-09,15
21661,15,15,inactivo,NE007579,42.19,2021-07-27,15
21662,15,15,inactivo,NE007580,17.18,2021-07-27,15
21663,15,15,inactivo,NE007582,1.79,2021-07-27,15


In [17]:
# Split the sale date into separate year and month columns.
# The dates are converted once and read through the vectorised .dt accessor
# instead of calling a Python lambda on every row (two full row-wise passes).
sale_dates = pd.to_datetime(df_merged['date'])
# astype('int64') keeps the same integer dtype the row-wise version produced.
df_merged['year'] = sale_dates.dt.year.astype('int64')
df_merged['month'] = sale_dates.dt.month.astype('int64')
# NOTE: 'date' is removed in place, so running this cell a second time
# without re-running the merge cell raises a KeyError.
df_merged.drop(columns=['date'], inplace=True)
df_merged

Unnamed: 0,id_salesman,identity_card,name,id_sale,income,salesman_id,year,month
0,1,VDIRECTA,VENTAS DIRECTA,00034347,1611.32,1,2018,11
1,1,VDIRECTA,VENTAS DIRECTA,00034311,837.86,1,2018,11
2,1,VDIRECTA,VENTAS DIRECTA,00034396,1895.35,1,2018,12
3,1,VDIRECTA,VENTAS DIRECTA,00034397,2026.97,1,2018,12
4,1,VDIRECTA,VENTAS DIRECTA,00034357,98.93,1,2018,11
...,...,...,...,...,...,...,...,...
21660,15,15,inactivo,042780,40.51,15,2021,8
21661,15,15,inactivo,NE007579,42.19,15,2021,7
21662,15,15,inactivo,NE007580,17.18,15,2021,7
21663,15,15,inactivo,NE007582,1.79,15,2021,7


In [18]:
# Aggregate the individual sales into one row per salesman and calendar month.
group_keys = ["salesman_id", 'year', 'month']
df_groupby = df_merged.groupby(group_keys)

# income: total income of the month
# count:  number of sales rows in the month
# name:   salesman name (first value found in the group)
sales_data = dict(
    income=df_groupby['income'].sum(),
    count=df_groupby['salesman_id'].count(),
    name=df_groupby['name'].first(),
)

# The three series share the (salesman_id, year, month) index, which becomes
# the index of the resulting frame.
df_sales_per_month = pd.DataFrame(sales_data)
df_sales_per_month

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income,count,name
salesman_id,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2018,11,4222.89,8,VENTAS DIRECTA
1,2018,12,11581.27,13,VENTAS DIRECTA
1,2019,1,8669.05,9,VENTAS DIRECTA
1,2019,2,322.17,4,VENTAS DIRECTA
1,2019,3,11546.92,10,VENTAS DIRECTA
...,...,...,...,...,...
14,2021,12,1615.51,7,Gunner Oviol
14,2022,1,10573.20,38,Gunner Oviol
14,2022,2,5656.32,41,Gunner Oviol
15,2021,7,62.95,4,inactivo


In [19]:
# Turn the (salesman_id, year, month) index levels back into regular columns;
# reset_index() returns a new frame, so df_sales_per_month is left untouched.
df_machine_learning = df_sales_per_month.reset_index()

# Time-based hold-out: years before split_point are used for training,
# split_point and later for validation.
split_point = 2021
is_train_row = df_machine_learning['year'] < split_point
is_valid_row = df_machine_learning['year'] >= split_point

# .copy() so later column assignments do not write into a view.
data_train = df_machine_learning[is_train_row].copy()
data_valid = df_machine_learning[is_valid_row].copy()
data_train

Unnamed: 0,salesman_id,year,month,income,count,name
0,1,2018,11,4222.89,8,VENTAS DIRECTA
1,1,2018,12,11581.27,13,VENTAS DIRECTA
2,1,2019,1,8669.05,9,VENTAS DIRECTA
3,1,2019,2,322.17,4,VENTAS DIRECTA
4,1,2019,3,11546.92,10,VENTAS DIRECTA
...,...,...,...,...,...,...
343,11,2020,12,15170.91,69,CARELIS HERNANDEZ G.
358,12,2020,9,1649.68,1,Edgar D Vargas
359,12,2020,10,4738.04,43,Edgar D Vargas
360,12,2020,11,3461.10,36,Edgar D Vargas


In [21]:
# Add the columns that are going to be predicted.
# For every salesman, the target of a row is the value found in that
# salesman's next row: shift(-1) inside each salesman_id group. The last row
# of every salesman therefore gets NaN.
# NOTE(review): "next row" is only the next calendar month if the salesman has
# a row for every month -- months without any sale produce no row upstream.
next_month_targets = {
    "sales_next_month": "income",
    "sales_next_month_count": "count",
}

for frame in (data_train, data_valid):
    for target_col, source_col in next_month_targets.items():
        frame[target_col] = frame.groupby("salesman_id")[source_col].shift(-1)

In [22]:
# Keep only the training rows that have a next-month target.
# dropna is restricted to the target columns so the cell can be re-run
# safely: a bare dropna() would, on a second run, also remove every row whose
# only NaN is in the diff columns created below (the first row of each
# salesman), shrinking the training set a little more on every execution.
data_train.dropna(subset=["sales_next_month", "sales_next_month_count"], inplace=True)

# Change of the current row versus the same salesman's previous row.
# Despite the "next_month" in the column names, diff(1) looks backwards; the
# first row of every salesman has no previous row and gets NaN.
data_valid["diff_sales_next_month"] = data_valid.groupby("salesman_id")["income"].diff(1)
data_valid["diff_sales_next_month_count"] = data_valid.groupby("salesman_id")["count"].diff(1)
data_train["diff_sales_next_month"] = data_train.groupby("salesman_id")["income"].diff(1)
data_train["diff_sales_next_month_count"] = data_train.groupby("salesman_id")["count"].diff(1)
data_train

Unnamed: 0,salesman_id,year,month,income,count,name,sales_next_month,sales_next_month_count,diff_sales_next_month,diff_sales_next_month_count
0,1,2018,11,4222.89,8,VENTAS DIRECTA,11581.27,13.0,,
1,1,2018,12,11581.27,13,VENTAS DIRECTA,8669.05,9.0,7358.38,5.0
2,1,2019,1,8669.05,9,VENTAS DIRECTA,322.17,4.0,-2912.22,-4.0
3,1,2019,2,322.17,4,VENTAS DIRECTA,11546.92,10.0,-8346.88,-5.0
4,1,2019,3,11546.92,10,VENTAS DIRECTA,42314.75,29.0,11224.75,6.0
...,...,...,...,...,...,...,...,...,...,...
341,11,2020,10,10913.47,69,CARELIS HERNANDEZ G.,26172.94,82.0,-12118.85,-21.0
342,11,2020,11,26172.94,82,CARELIS HERNANDEZ G.,15170.91,69.0,15259.47,13.0
358,12,2020,9,1649.68,1,Edgar D Vargas,4738.04,43.0,,
359,12,2020,10,4738.04,43,Edgar D Vargas,3461.10,36.0,3088.36,42.0


In [24]:
# Baseline for the accuracy indicators of the model.
# Naive forecast for salesman 1 on the training rows: "next month's income
# equals this month's income". Any model should beat these numbers.
salesman_1_train = data_train[data_train["salesman_id"] == 1]

y_pred = salesman_1_train["income"]
y_true = salesman_1_train["sales_next_month"]

print(mape(y_true, y_pred))
wmape(y_true, y_pred)

1.7336118757053516


0.6120303514065554

In [26]:
# Train the next-month income model.
# - features: the columns the model is allowed to see
# - imputer: SimpleImputer with default settings, fills missing feature values
# - Xtr_per_month / ytr_per_month: training inputs and target
# - model: a RandomForestRegressor fitted on the training rows

features = ["income", "count", "salesman_id", "month"]
ytr_per_month = data_train['sales_next_month']

imputer = SimpleImputer()
Xtr_per_month = imputer.fit_transform(data_train[features])

# random_state is fixed so repeated runs build the same forest.
forest_params = dict(n_estimators=100, random_state=0, n_jobs=6)
model = RandomForestRegressor(**forest_params)
model.fit(Xtr_per_month, ytr_per_month)

RandomForestRegressor(n_jobs=6, random_state=0)

In [28]:
# Predict next-month income for every validation month with the trained model.
# This model only predicts the income of the next month, not how many sales.

# The final data frame is only for comparing true values with predictions.

# Only transform() here: the imputer must keep the statistics it learned from
# the training rows. Refitting it on the validation rows would let held-out
# data leak into the preprocessing step.
Xval_per_month = imputer.transform(data_valid[features])
# The last row of every salesman has no following row, so its target is NaN.
yval_per_month = data_valid['sales_next_month']

prediction_per_month = model.predict(Xval_per_month)

df_forest_per_month = pd.DataFrame({"salesman_id": data_valid["salesman_id"], "salesman": data_valid["name"] ,"month": data_valid["month"] ,"value": data_valid["income"],"predicted": prediction_per_month, "true": yval_per_month})
# Filter with the frame's own column rather than a mask from another frame.
df_forest_per_month[df_forest_per_month["salesman_id"] == 1]

Unnamed: 0,salesman_id,salesman,month,value,predicted,true
26,1,VENTAS DIRECTA,1,8308.01,13388.7821,8411.67
27,1,VENTAS DIRECTA,2,8411.67,14736.8594,5003.5
28,1,VENTAS DIRECTA,3,5003.5,14338.3126,5302.34
29,1,VENTAS DIRECTA,4,5302.34,16492.9793,3582.93
30,1,VENTAS DIRECTA,5,3582.93,14280.3814,3594.47
31,1,VENTAS DIRECTA,6,3594.47,14526.2755,4432.23
32,1,VENTAS DIRECTA,7,4432.23,16106.5165,3091.61
33,1,VENTAS DIRECTA,8,3091.61,13188.2847,5875.21
34,1,VENTAS DIRECTA,9,5875.21,16141.7087,44688.51
35,1,VENTAS DIRECTA,10,44688.51,32983.5353,12833.55


In [29]:
# Error of the income forecasts for salesman 1 on the validation rows.
salesman_1_forecast = df_forest_per_month[df_forest_per_month["salesman_id"] == 1]
salesman_1_true = salesman_1_forecast["true"]
salesman_1_predicted = salesman_1_forecast["predicted"]

print(mape(salesman_1_true, salesman_1_predicted))
print(wmape(salesman_1_true, salesman_1_predicted))

16.79897777162625
1.4570552848044105


In [30]:
# Train the next-month sales-count model.
# Same procedure as the income model, but the target is the number of sales:
# - features_count: the columns the model is allowed to see
# - imputer_count: SimpleImputer with default settings for these features
# - Xtr_per_month_count / ytr_per_month_count: training inputs and target
# - model_count: a RandomForestRegressor fitted on the training rows

features_count = ["income", "count", "month"]
imputer_count = SimpleImputer()
# Fit the imputer created for this model. Fitting the income model's
# `imputer` here would overwrite its state with statistics for a different
# feature set.
Xtr_per_month_count = imputer_count.fit_transform(data_train[features_count])
ytr_per_month_count = data_train['sales_next_month_count']


model_count = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model_count.fit(Xtr_per_month_count, ytr_per_month_count)

RandomForestRegressor(n_jobs=6, random_state=0)

In [31]:
# Predict the next-month sales count for every validation month.
# This model only predicts how many sales the next month has, not the income.

# The final data frame is only for comparing true values with predictions.


# The imputer statistics must come from the training rows only, so
# imputer_count is fitted on data_train and the validation rows are only
# transformed. Calling fit_transform() on data_valid would leak held-out data
# into the preprocessing step.
imputer_count.fit(data_train[features_count])
Xval_per_month_count = imputer_count.transform(data_valid[features_count])
# The last row of every salesman has no following row, so its target is NaN.
yval_per_month_count = data_valid['sales_next_month_count']

prediction_per_month_count = model_count.predict(Xval_per_month_count)

df_forest_per_month_count = pd.DataFrame({"id_salesman": data_valid["salesman_id"], "name": data_valid["name"],"month": data_valid["month"] ,"predicted": prediction_per_month_count, "true": yval_per_month_count})
df_forest_per_month_count[df_forest_per_month_count["id_salesman"] == 1]

Unnamed: 0,id_salesman,name,month,predicted,true
26,1,VENTAS DIRECTA,1,33.62,43.0
27,1,VENTAS DIRECTA,2,34.79,18.0
28,1,VENTAS DIRECTA,3,37.57,23.0
29,1,VENTAS DIRECTA,4,27.72,19.0
30,1,VENTAS DIRECTA,5,31.04,19.0
31,1,VENTAS DIRECTA,6,35.04,24.0
32,1,VENTAS DIRECTA,7,27.77,25.0
33,1,VENTAS DIRECTA,8,29.1,34.0
34,1,VENTAS DIRECTA,9,28.14,50.0
35,1,VENTAS DIRECTA,10,49.9,56.0


In [32]:
# Error of the sales-count forecasts over all validation rows (every
# salesman, not only salesman 1).
count_true = df_forest_per_month_count["true"]
count_predicted = df_forest_per_month_count["predicted"]

print(mape(count_true, count_predicted))
print(wmape(count_true, count_predicted))

0.6010330682739863
0.3282958579881657
