Esse notebook tem por objetivo fazer a relação de empresas que importam somente uma marca, facilitando a relação importador-

In [37]:
# Importing the modules needed
import sys

import pandas as pd

sys.path.append("../src/")

from src.data.dremio_utils import *
# Data Handling
from dotenv import dotenv_values 

import numpy as np
from tqdm import tqdm
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType

from xgboost import XGBRegressor

from sklearn.preprocessing import LabelEncoder

from src.utils.utils import get_logger
import logging

logger = logging.getLogger(__name__)
logger = get_logger(logger=logger)
# import pyspark.pandas as pd

In [38]:
spark = SparkSession.builder.master("local[1]").appName("attributes_dict").getOrCreate()

In [39]:
config = dotenv_values(".env")
bds = BaseDremioService(config)

## 1. Getting Merged Data

In [40]:
grouped_data = pd.read_parquet("../data/processed/average_unity_price_historic.parquet")
grouped_data.dropna(inplace=True)
grouped_data = grouped_data[(grouped_data["importador_uf"] != "") & (grouped_data["importador_municipio"] != "")].copy()

In [41]:
grouped_data.shape

(9390031, 10)

In [42]:
grouped_data = grouped_data.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre'], 
                                    as_index=False).mean('avg_valor_item')

## 2. Feature Engineering

In [43]:
# years_df = pd.DataFrame.from_dict({"ano": [2022, 2023]})
years_df = pd.DataFrame.from_dict({"ano": [2018, 2019, 2020, 2021, 2022, 2023]})
semesters_df = pd.DataFrame.from_dict({"semestre": [1, 2]})
gabarito_datas = years_df.join(semesters_df, how="cross")
gabarito_datas["ano_semestre"] =  gabarito_datas["ano"]*100 + gabarito_datas["semestre"] 
gabarito_datas = gabarito_datas[gabarito_datas["ano_semestre"] <= 202301].copy() 
# gabarito_datas.drop(columns=["ano", "semestre"], inplace=True)

In [44]:
unique_keys = grouped_data[["ncm", "id_pais_origem", "importador_uf", "importador_municipio", "urf"]].drop_duplicates()
df_2b_filled = unique_keys.join(gabarito_datas, how="cross")

In [45]:
grouped_data["ano_semestre"] = grouped_data["ano_semestre"].astype(int)
grouped_data["ano"] = grouped_data["ano"].astype(int)
grouped_data["semestre"] = grouped_data["semestre"].astype(int)

In [46]:
df_filled = df_2b_filled.merge(grouped_data, how='left', 
                               on=["ncm", "id_pais_origem", "importador_uf", "importador_municipio", "urf", "ano_semestre", "ano", "semestre"])

In [47]:
categorical_columns =["id_pais_origem",
"importador_municipio",
"importador_uf",
"ncm",
"urf"]

label_encoders = {}
with tqdm(total=len(categorical_columns)) as pbar:
    for column in categorical_columns:
        pbar.set_description(f"Converting categorical column {column}")
        le = LabelEncoder()
        unique_values = df_filled[column].unique()
        le.fit(unique_values)
        encoded_column = column + "_label"
        df_filled[encoded_column] = le.transform(df_filled[column])
        label_encoders[column] = le
        pbar.update(1)

Converting categorical column urf: 100%|██████████| 5/5 [00:25<00:00,  5.19s/it]                 


In [48]:
df_filled["ano"] = df_filled["ano_semestre"].astype(str).str[:4].astype(int)

In [49]:
df_filled["semestre"] = df_filled["ano_semestre"].astype(str).str[-1:].astype(int)

In [50]:
df_filled = df_filled[df_filled["avg_valor_item"] > 0].copy()

In [51]:
# df_filled = df_filled[df_filled["ano_semestre"] >=20222]

In [52]:

# Correção de estados e municipios
estados_br = ["AC", "AL", "AP", "AM", "BA", "CE", "ES", "GO", "MA", "MT", "MS", "MG", "PA", "PB", "PR", "PE",
              "PI", "RJ", "RN", "RS", "RO", "RR", "SC", "SP", "SE", "TO", "DF"]

df_filled["old_municipio"] = grouped_data["importador_municipio"]

logger.info("\tajustando importador UF")
df_filled["importador_uf_new"] = grouped_data.apply(lambda x: x["importador_uf"] \
    if x["importador_uf"] in estados_br else x["importador_municipio"], axis=1)

logger.info("\tajustando importador municipio")
df_filled["importador_municipio_new"] = grouped_data.apply(lambda x: x["old_municipio"] \
    if x["old_municipio"] not in estados_br else x["importador_uf"], axis=1)

df_filled.drop(columns=['importador_municipio', "importador_uf", "old_municipio"], inplace=True)
df_filled.rename(columns={"importador_municipio_new": "importador_municipio",
                   "importador_uf_new": "importador_uf"}, inplace=True)

2024-03-11 18:15:14,096 - __main__ - INFO - 	ajustando importador UF
2024-03-11 18:15:14,096 - __main__ - INFO - 	ajustando importador UF
2024-03-11 18:15:44,693 - __main__ - INFO - 	ajustando importador municipio
2024-03-11 18:15:44,693 - __main__ - INFO - 	ajustando importador municipio


KeyError: 'old_municipio'

In [None]:
df_filled.dropna(subset="avg_valor_item", axis=0, inplace=True)
x = df_filled[["ncm_label", "id_pais_origem_label", "importador_uf_label", "importador_municipio_label", "urf_label", "ano_semestre",
               "ano", "semestre"]].copy()
y = df_filled["avg_valor_item"]

In [None]:
xgbr = XGBRegressor()
xgbr.fit(x, y)

In [None]:
predict_df = pd.DataFrame.from_dict({"ano": [2023], "semestre": [2]})
predict_df["ano_semestre"] = predict_df["ano"]*10 + predict_df["semestre"] 
unique_keys = df_filled[["ncm_label", "id_pais_origem_label", "importador_uf_label", "importador_municipio_label", "urf_label"]].drop_duplicates()
df_2b_predicted = unique_keys.join(predict_df, how="cross")

In [None]:
x_pred = df_2b_predicted[["ncm_label", "id_pais_origem_label", "importador_uf_label", "importador_municipio_label", "urf_label", "ano_semestre", "ano", "semestre"]]
y_pred = xgbr.predict(x_pred)

In [None]:
predicted_df = pd.DataFrame(x_pred)
with tqdm(total=len(categorical_columns)) as pbar:
    for column in categorical_columns:
        pbar.set_description(f"Unconverting categorical column {column}")
        le = label_encoders[column]
        predicted_df[column] = le.inverse_transform(predicted_df[column + "_label"])
        predicted_df.drop(columns=[column + "_label"], inplace=True)
        pbar.update(1)

In [None]:
predicted_df["avg_valor_item"] = y_pred
predicted_df.head()

In [None]:
predicted_df = pd.concat([df_filled, predicted_df]).head()
predicted_df.sort_values(by=["ncm", "id_pais_origem", "importador_municipio", "urf", "ano", "semestre"], inplace=True)
# Interpolar por grupos

In [None]:
predicted_df.to_parquet("../data/processed/xgboost_trendline_v2.parquet")