In [4]:
import pandas as pd
import numpy as np
import psycopg2
import os
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
load_dotenv()

True

In [5]:
def mape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / y_true)
    return np.mean(ape)

def wmape(y_true, y_pred):
    return np.sum(np.abs((y_true - y_pred))) / np.sum(np.abs(y_true))

In [6]:
host = os.getenv("HOST")
dbname = os.getenv("DATABASE_NAME")
user = os.getenv("USERNAME")
password = os.getenv("DATABASE_PASSWORD")

conn_string = "host={0} user={1} dbname={2} password={3}".format(host, user, dbname, password)
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()

In [7]:
query = "SELECT * FROM core_sale;"
df_sales = pd.read_sql(query, conn)
df_sales_copy = df_sales.copy()



In [8]:
df_sales_copy["year"] = df_sales_copy.apply(lambda row : row["date"].year, axis=1)
df_sales_copy["month"] = df_sales_copy.apply(lambda row : row["date"].month, axis=1)
df_sales_copy["day"] = df_sales_copy.apply(lambda row : row["date"].day, axis=1)
df_sales_copy.drop(columns=["date"], inplace=True)
df_sales_copy.drop(columns=["description"], inplace=True)
df_sales_copy.drop(columns=["status"], inplace=True)
df_sales_machine_learning = df_sales_copy.copy()

In [9]:
df_sales_machine_learning

Unnamed: 0,id,income,client_id,salesman_id,income_currency,year,month,day
0,00034292,0.03,400,34,USD,2018,11,14
1,00034306,0.05,631,33,USD,2018,11,16
2,00034340,112.85,588,37,USD,2018,11,26
3,00034346,982.69,271,32,USD,2018,11,26
4,00034347,1611.32,250,31,USD,2018,11,26
...,...,...,...,...,...,...,...,...
21660,042087,45.44,178,42,USD,2021,4,22
21661,00003484,8.52,264,34,USD,2019,11,21
21662,NE00004937,45.44,178,34,USD,2021,3,4
21663,NE006477,38.00,1043,39,USD,2021,6,2


In [10]:
split_point = 2021
df_sales_machine_learning.reset_index(inplace=True)

df_sales_machine_learning
data_train = df_sales_machine_learning.loc[df_sales_machine_learning['year'] < split_point].copy()
data_valid = df_sales_machine_learning.loc[df_sales_machine_learning['year'] >= split_point].copy()

In [11]:
df_groupby_sales = data_train.groupby(["year", "month"])

data = {
    'income': df_groupby_sales['income'].sum(),
    'count': df_groupby_sales['id'].count()
}

df_sales_per_month_train = pd.DataFrame(data)
df_sales_per_month_train

Unnamed: 0_level_0,Unnamed: 1_level_0,income,count
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,11,52361.97,144
2018,12,44421.13,138
2019,1,76548.24,149
2019,2,14674.46,106
2019,3,48730.99,71
2019,4,182488.12,201
2019,5,171914.8,272
2019,6,189254.46,222
2019,7,242337.45,230
2019,8,114204.27,236


In [12]:
df_groupby_sales = data_valid.groupby(["year", "month"])

data = {
    'income': df_groupby_sales['income'].sum(),
    'count': df_groupby_sales['id'].count()
}

df_sales_per_month_valid = pd.DataFrame(data)
df_sales_per_month_valid

Unnamed: 0_level_0,Unnamed: 1_level_0,income,count
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1
2021,1,158706.08,595
2021,2,257297.47,872
2021,3,192503.59,777
2021,4,206944.43,710
2021,5,166545.74,739
2021,6,166283.13,745
2021,7,158943.72,876
2021,8,138120.65,816
2021,9,175135.81,908
2021,10,279933.26,1114


In [13]:
df_sales_per_month_train.reset_index(inplace=True)
df_sales_per_month_valid.reset_index(inplace=True)

In [14]:
df_sales_per_month_train["sales_next_month"] = df_sales_per_month_train["income"].shift(-1)
df_sales_per_month_valid["sales_next_month"] = df_sales_per_month_valid["income"].shift(-1)

In [15]:

df_sales_per_month_train["sales_next_month_count"] = df_sales_per_month_train["count"].shift(-1)
df_sales_per_month_valid["sales_next_month_count"] = df_sales_per_month_valid["count"].shift(-1)

In [16]:
df_sales_per_month_train.dropna(inplace=True)
df_sales_per_month_train["lag_sales_next_month"] = df_sales_per_month_train["income"].shift(12)
df_sales_per_month_train["lag_sales_next_month_count"] = df_sales_per_month_train["count"].shift(12)


In [17]:
df_sales_per_month_valid["lag_sales_next_month"] = df_sales_per_month_valid["income"].shift(1)
df_sales_per_month_valid["lag_sales_next_month_count"] = df_sales_per_month_valid["count"].shift(1)

In [18]:
df_sales_per_month_train["diff_sales_next_month"] = df_sales_per_month_train["income"].diff(1)
df_sales_per_month_train["diff_sales_next_month_count"] = df_sales_per_month_train["count"].diff(1)

In [19]:
df_sales_per_month_valid["diff_sales_next_month"] = df_sales_per_month_valid["income"].diff(1)
df_sales_per_month_valid["diff_sales_next_month_count"] = df_sales_per_month_valid["count"].diff(1)

In [20]:
y_pred = df_sales_per_month_valid["sales_next_month_count"]
y_true = df_sales_per_month_valid["count"]

In [21]:
mape(y_true, y_pred)

0.17414738123962337

In [22]:
features = ["income", "count", "lag_sales_next_month", "lag_sales_next_month_count", "diff_sales_next_month", "diff_sales_next_month_count"]

In [23]:
imputer = SimpleImputer()
X_train = imputer.fit_transform(df_sales_per_month_train[features])
y_train = df_sales_per_month_train["sales_next_month_count"]

model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=6, random_state=0)

In [24]:
X_validation = imputer.transform(df_sales_per_month_valid[features])
y_validation = df_sales_per_month_valid["sales_next_month_count"]

predicctor = model.predict(X_validation)

df_forest = pd.DataFrame({"predicted": predicctor, "true": y_validation, "diff": y_validation - predicctor})
df_forest

Unnamed: 0,predicted,true,diff
0,591.27,872.0,280.73
1,665.62,777.0,111.38
2,629.72,710.0,80.28
3,618.02,739.0,120.98
4,620.42,745.0,124.58
5,613.04,876.0,262.96
6,634.33,816.0,181.67
7,623.49,908.0,284.51
8,628.13,1114.0,485.87
9,667.37,1106.0,438.63


In [25]:
mape(y_validation, predicctor)

0.27017663763495814

In [26]:
wmape(y_validation, predicctor)

0.28188857584295196