Esse notebook tem por objetivo fazer a relação de empresas que importam somente uma marca, facilitando a relação importador-

In [77]:
# Importing the modules needed
import sys

import pandas as pd

sys.path.append("../src/")

from src.data.dremio_utils import *
# Data Handling
from dotenv import dotenv_values 

import numpy as np
from tqdm import tqdm
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType

from xgboost import XGBRegressor

from sklearn.preprocessing import LabelEncoder
# import pyspark.pandas as pd

In [78]:
spark = SparkSession.builder.master("local[1]").appName("attributes_dict").getOrCreate()

In [79]:
config = dotenv_values(".env")
bds = BaseDremioService(config)

## 1. Getting Merged Data

In [80]:
grouped_data = pd.read_parquet("../data/processed/average_unity_price_historic.parquet")
grouped_data.dropna(inplace=True)
grouped_data = grouped_data[(grouped_data["importador_uf"] != "") & (grouped_data["importador_municipio"] != "")].copy()

In [81]:
grouped_data.shape

(9390031, 10)

In [82]:
grouped_data = grouped_data.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre'], 
                                    as_index=False).mean('avg_valor_item')

## 2. Feature Engineering

In [83]:
# years_df = pd.DataFrame.from_dict({"ano": [2022, 2023]})
years_df = pd.DataFrame.from_dict({"ano": [2018, 2019, 2020, 2021, 2022, 2023]})
semesters_df = pd.DataFrame.from_dict({"semestre": [1, 2]})
gabarito_datas = years_df.join(semesters_df, how="cross")
gabarito_datas["ano_semestre"] =  gabarito_datas["ano"]*100 + gabarito_datas["semestre"] 
# gabarito_datas.drop(columns=["ano", "semestre"], inplace=True)

In [84]:
unique_keys = grouped_data[["ncm", "id_pais_origem", "importador_uf", "importador_municipio", "urf"]].drop_duplicates()
df_2b_filled = unique_keys.join(gabarito_datas, how="cross")

In [85]:
grouped_data["ano_semestre"] = grouped_data["ano_semestre"].astype(int)
grouped_data["ano"] = grouped_data["ano"].astype(int)
grouped_data["semestre"] = grouped_data["semestre"].astype(int)

In [86]:
df_filled = df_2b_filled.merge(grouped_data, how='left', 
                               on=["ncm", "id_pais_origem", "importador_uf", "importador_municipio", "urf", "ano_semestre", "ano", "semestre"])

In [87]:
# df_filled["avg_valor_item"] = df_filled["avg_valor_item"].interpolate() 

In [88]:
df_filled.shape

(28499640, 9)

In [89]:
# Saving the interpolation
# df_filled.to_parquet("../data/interim/interpolated_data.parquet")
# df_filled.shape[0]

In [90]:
categorical_columns =["id_pais_origem",
"importador_municipio",
"importador_uf",
"ncm",
"urf"]

label_encoders = {}
with tqdm(total=len(categorical_columns)) as pbar:
    for column in categorical_columns:
        pbar.set_description(f"Converting categorical column {column}")
        le = LabelEncoder()
        unique_values = df_filled[column].unique()
        le.fit(unique_values)
        encoded_column = column + "_label"
        df_filled[encoded_column] = le.transform(df_filled[column])
        label_encoders[column] = le
        pbar.update(1)

Converting categorical column urf: 100%|██████████| 5/5 [00:17<00:00,  3.49s/it]                 


In [91]:
df_filled["ano"] = df_filled["ano_semestre"].astype(str).str[:4].astype(int)

In [92]:
df_filled["semestre"] = df_filled["ano_semestre"].astype(str).str[-1:].astype(int)

In [93]:
df_filled = df_filled[df_filled["avg_valor_item"] > 0].copy()

In [94]:
# df_filled = df_filled[df_filled["ano_semestre"] >=20222]

In [95]:
df_filled.dropna(subset="avg_valor_item", axis=0, inplace=True)
x = df_filled[["ncm_label", "id_pais_origem_label", "importador_uf_label", "importador_municipio_label", "urf_label", "ano_semestre",
               "ano", "semestre"]].copy()
y = df_filled["avg_valor_item"]

In [96]:
xgbr = XGBRegressor()
xgbr.fit(x, y)

In [97]:
predict_df = pd.DataFrame.from_dict({"ano": [2024, 2024], "semestre": [1, 2]})
predict_df["ano_semestre"] = predict_df["ano"]*10 + predict_df["semestre"] 
unique_keys = df_filled[["ncm_label", "id_pais_origem_label", "importador_uf_label", "importador_municipio_label", "urf_label"]].drop_duplicates()
df_2b_predicted = unique_keys.join(predict_df, how="cross")

In [98]:
x_pred = df_2b_predicted[["ncm_label", "id_pais_origem_label", "importador_uf_label", "importador_municipio_label", "urf_label", "ano_semestre", "ano", "semestre"]]
y_pred = xgbr.predict(x_pred)

In [99]:
predicted_df = pd.DataFrame(x_pred)
with tqdm(total=len(categorical_columns)) as pbar:
    for column in categorical_columns:
        pbar.set_description(f"Unconverting categorical column {column}")
        le = label_encoders[column]
        predicted_df[column] = le.inverse_transform(predicted_df[column + "_label"])
        predicted_df.drop(columns=[column + "_label"], inplace=True)
        pbar.update(1)

Unconverting categorical column urf: 100%|██████████| 5/5 [00:01<00:00,  3.63it/s]                 


In [100]:
predicted_df["avg_valor_item"] = y_pred
predicted_df.head()

Unnamed: 0,ano_semestre,ano,semestre,id_pais_origem,importador_municipio,importador_uf,ncm,urf,avg_valor_item
0,20241,2024,1,ARGENTINA,RS,PASSO FUNDO,10011100.0,AEROPORTO INTERNACIONAL DE SAO PAULO/GUARULHOS,-186855.25
1,20242,2024,2,ARGENTINA,RS,PASSO FUNDO,10011100.0,AEROPORTO INTERNACIONAL DE SAO PAULO/GUARULHOS,-103427.773438
2,20241,2024,1,ARGENTINA,RS,PASSO FUNDO,10011100.0,ALF - URUGUAIANA,81168.015625
3,20242,2024,2,ARGENTINA,RS,PASSO FUNDO,10011100.0,ALF - URUGUAIANA,34145.5625
4,20241,2024,1,URUGUAI,RS,PASSO FUNDO,10011100.0,IRF SANTANA DO LIVRAMENTO,-71863.898438


In [101]:
predicted_df.to_parquet("../data/processed/xgboost_trendline.parquet")