In [28]:
import os
import pandas as pd
import numpy as np
from scipy.constants import value
from tqdm import tqdm
import sys

from src.utils.utils import get_logger
import logging

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from xgboost.spark import SparkXGBClassifier

import seaborn as sns
import pandas as pd

logger = logging.getLogger(__name__)
logger = get_logger(logger=logger)

In [29]:
!python --version

Python 3.11.8


In [30]:
# Execution Flags
PRE_INTERPOLATED_DATA = False
REMOVE_OUTLIER_PRICES = False

In [31]:
conf=SparkConf()
spark = SparkSession.builder.master("local[1]").appName("trend_line").getOrCreate()

In [32]:
spark

In [33]:
# Leitura dos dados históricos
if PRE_INTERPOLATED_DATA:
    grouped_data = pd.read_parquet("../data/processed/pd_trended_data_interpolated.parquet")
    grouped_fata = grouped_data[grouped_data["ano_semestre"] < 202400].copy()
else:
    grouped_data = pd.read_parquet("../data/processed/average_unity_price_historic.parquet")

In [34]:
# Deleção dos dados nulos
logger.info("Filtrando e corrigindo dados")
grouped_data.dropna(inplace=True)
grouped_data = grouped_data[(grouped_data["importador_uf"] != "") \
                            & (grouped_data["importador_municipio"] != "")].copy()

2024-03-20 11:49:07,602 - __main__ - INFO - Filtrando e corrigindo dados
2024-03-20 11:49:07,602 - __main__ - INFO - Filtrando e corrigindo dados


## 1. Correção de estados e municipios

In [35]:
ESTADOS_BR = ["AC", "AL", "AP", "AM", "BA", "CE", "ES", "GO", "MA", "MT", "MS", "MG", "PA", "PB", "PR", "PE",
              "PI", "RJ", "RN", "RS", "RO", "RR", "SC", "SP", "SE", "TO", "DF"]

grouped_data["old_municipio"] = grouped_data["importador_municipio"]

logger.info("\tajustando importador UF")
grouped_data["importador_uf_new"] = grouped_data.apply(lambda x: x["importador_uf"] \
    if x["importador_uf"] in ESTADOS_BR else x["importador_municipio"], axis=1)

logger.info("\tajustando importador municipio")
grouped_data["importador_municipio_new"] = grouped_data.apply(lambda x: x["old_municipio"] \
    if x["old_municipio"] not in ESTADOS_BR else x["importador_uf"], axis=1)

grouped_data.drop(columns=['importador_municipio', "importador_uf", "old_municipio"], inplace=True)
grouped_data.rename(columns={"importador_municipio_new": "importador_municipio",
                   "importador_uf_new": "importador_uf"}, inplace=True)

2024-03-20 11:49:23,328 - __main__ - INFO - 	ajustando importador UF
2024-03-20 11:49:23,328 - __main__ - INFO - 	ajustando importador UF
2024-03-20 11:50:37,629 - __main__ - INFO - 	ajustando importador municipio
2024-03-20 11:50:37,629 - __main__ - INFO - 	ajustando importador municipio


## 2. Montagem do gabarito de datas

### 2.1 Criação do gabarito

In [36]:
logger.info("Criando dataframes de datas")
# Creation of historic dates template
years_df = pd.DataFrame.from_dict({"ano": [2018, 2019, 2020, 2021, 2022, 2023]})
# years_df = pd.DataFrame.from_dict({"ano": [2022, 2023]})
semesters_df = pd.DataFrame.from_dict({"semestre": [1, 2]})
dates_template = years_df.join(semesters_df, how="cross")
dates_template["ano_semestre"] = dates_template["ano"] * 100 + dates_template["semestre"]
dates_template.drop(columns=["ano", "semestre"], inplace=True)

2024-03-20 11:51:20,385 - __main__ - INFO - Criando dataframes de datas
2024-03-20 11:51:20,385 - __main__ - INFO - Criando dataframes de datas


### 2.2 Left Join com dados Crus

In [37]:
grouped_data["ncm"] = grouped_data["ncm"].astype(float).astype(int).astype(str)

In [38]:
# Criação de chaves únicas
grouped_data["key"] = grouped_data["ncm"].astype(str) + '-' + grouped_data["id_pais_origem"] + '-' + \
                      grouped_data['importador_municipio'] + '-' + grouped_data['urf']

In [39]:
# Seleção somente do que é recorrente
gp_data = grouped_data.groupby(["key", "ano", "semestre"], as_index=False).mean("avg_valor_item")
df_count = gp_data["key"].value_counts().reset_index()
recurrent_keys = df_count[df_count["key"] >= 4]["index"]
recurrent_keys

0                              29342031-CHINA-ITAJAI-ITAJAI
1         40169990-BÉLGICA-SAO JOSE DOS CAMPOS-PORTO DE ...
2         81089000-FRANÇA-SAO JOSE DOS CAMPOS-AEROPORTO ...
3         83012000-ITÁLIA-SETE LAGOAS-PORTO DO RIO DE JA...
4                     84821010-JAPÃO-SUZANO-PORTO DE SANTOS
                                ...                        
396630    84799090-ALEMANHA-BARRO ALTO-AEROPORTO INTERNA...
396631               96035000-CHINA-CAJAMAR-PORTO DE SANTOS
396632    96035000-CHINA-CAJAMAR-AEROPORTO INTERNACIONAL...
396633    22030000-PARAGUAI-JACAREI-AEROPORTO INTERNACIO...
396634    84823000-MALÁSIA-BARUERI-AEROPORTO INTERNACION...
Name: index, Length: 396635, dtype: object

In [40]:
grouped_data = grouped_data[grouped_data["key"].isin(recurrent_keys)].copy()
grouped_data.to_parquet("../data/interim/grouped_data.parquet", index=False)
unique_keys = grouped_data.drop_duplicates(subset="key")[["key", "ncm", "id_pais_origem", "importador_municipio", "urf"]]
cross_template = unique_keys.merge(dates_template, how="cross")

In [41]:
cross_template.head(5)

Unnamed: 0,key,ncm,id_pais_origem,importador_municipio,urf,ano_semestre
0,83021000-COREIA DO SUL-ITU-AEROPORTO INTERNACI...,83021000,COREIA DO SUL,ITU,AEROPORTO INTERNACIONAL DE VIRACOPOS,201801
1,83021000-COREIA DO SUL-ITU-AEROPORTO INTERNACI...,83021000,COREIA DO SUL,ITU,AEROPORTO INTERNACIONAL DE VIRACOPOS,201802
2,83021000-COREIA DO SUL-ITU-AEROPORTO INTERNACI...,83021000,COREIA DO SUL,ITU,AEROPORTO INTERNACIONAL DE VIRACOPOS,201901
3,83021000-COREIA DO SUL-ITU-AEROPORTO INTERNACI...,83021000,COREIA DO SUL,ITU,AEROPORTO INTERNACIONAL DE VIRACOPOS,201902
4,83021000-COREIA DO SUL-ITU-AEROPORTO INTERNACI...,83021000,COREIA DO SUL,ITU,AEROPORTO INTERNACIONAL DE VIRACOPOS,202001


In [42]:
cross_template.to_parquet("../data/interim/cross_template.parquet", index=False)
del cross_template, grouped_data

In [43]:
cross_template_sp = spark.read.parquet("../data/interim/cross_template.parquet")
grouped_data = spark.read.parquet("../data/interim/grouped_data.parquet")

In [44]:
# Grouped Data Treatment
grouped_data = grouped_data.groupBy(["key", "ano_semestre"]).avg("avg_valor_item")
grouped_data = grouped_data.withColumnRenamed("avg(avg_valor_item)", "avg_valor_item")

In [45]:
# preenchimento dos dados históricos
df_filled = cross_template_sp.join(grouped_data, on=["key","ano_semestre"],  how="left")

In [46]:
df_filled.show(5)

+--------------------+------------+--------+--------------+--------------------+--------------------+------------------+
|                 key|ano_semestre|     ncm|id_pais_origem|importador_municipio|                 urf|    avg_valor_item|
+--------------------+------------+--------+--------------+--------------------+--------------------+------------------+
|83021000-COREIA D...|      201802|83021000| COREIA DO SUL|                 ITU|AEROPORTO INTERNA...| 61.74749916791916|
|83021000-COREIA D...|      201901|83021000| COREIA DO SUL|                 ITU|AEROPORTO INTERNA...|17.366388967964383|
|83021000-COREIA D...|      201902|83021000| COREIA DO SUL|                 ITU|AEROPORTO INTERNA...|21.336110929648083|
|83021000-COREIA D...|      202002|83021000| COREIA DO SUL|                 ITU|AEROPORTO INTERNA...|              NULL|
|83021000-COREIA D...|      201801|83021000| COREIA DO SUL|                 ITU|AEROPORTO INTERNA...| 1.840000033378601|
+--------------------+----------

## 3. Transformação em categórico

### 3.1 Pivot dos semestres

In [47]:
df_filled_pivot = df_filled.groupBy("key", "ncm", "id_pais_origem", "importador_municipio", "urf").pivot("ano_semestre").avg("avg_valor_item")
# df_filled_pivot.show()

In [48]:
# df_filled_pivot.fillna(value=0)
df_filled_pivot.write.parquet('../data/interim/2_interpolate', mode="overwrite")

### 3.2 Criação da função para detectar alta ou queda

In [49]:
from pathlib import Path



In [50]:
# List existing files
from pathlib import Path
COLUMNS_2_INTERPOLATE = ['201801', '201802', '201901', '201902', '202001', '202002', '202101', '202102', '202201', '202202', '202301', '202302']

def get_upsndowns(row):
    if row['202301'] < row['202302']:
        status="aumento"
    elif row['202301'] > row['202302']:
        status="queda"
    else:
        status="manteve"
    return status
data_dir = Path('../data/interim/2_interpolate')
files = [parquet_file for parquet_file in data_dir.glob('*.parquet')]
with tqdm(total=len(files), desc="Interpolating and transforming values") as pbar:
    for file in files:
        df_aux = pd.read_parquet(file)
        df_aux.dropna(subset=['202301', '202302'], inplace=True)
        df_aux[COLUMNS_2_INTERPOLATE] = df_aux[COLUMNS_2_INTERPOLATE].interpolate(axis=1, method="linear")
        df_aux["status"] = df_aux.apply(lambda x: get_upsndowns(x), axis=1)
        # Removing outliers
        # train_df[np.abs((stats.zscore(train_df["valor_fob_da"])) < 3)]
        df_aux.to_parquet(f'../data/interim/interpolated_categorized/{file.name}')
        pbar.update(1)

Interpolating and transforming values: 100%|██████████| 3/3 [00:05<00:00,  1.85s/it]


### 3.3 Melt dos valores para preparação

In [51]:
df_aux.set_index("key", inplace=True)

In [52]:
df_aux.columns

Index(['ncm', 'id_pais_origem', 'importador_municipio', 'urf', '201801',
       '201802', '201901', '201902', '202001', '202002', '202101', '202102',
       '202201', '202202', '202301', '202302', 'status'],
      dtype='object')

In [53]:
## 4. Preparação para treinamento
from sklearn.preprocessing import LabelEncoder

# df_to_train = pd.read_parquet('../data/interim/interpolated_categorized/')
CATEGORICAL_COLUMNS = ['ncm', 'id_pais_origem', 'importador_municipio', 'urf', "status"]

data_dir = Path('../data/interim/interpolated_categorized')
files = [parquet_file for parquet_file in data_dir.glob('*.parquet')]

with tqdm(total=len(files), desc="Interpolating and transforming values") as pbar:
    for file in files:
        df_aux = pd.read_parquet(file)
        # df_aux.set_index("key", inplace=True)
        with tqdm(total=len(CATEGORICAL_COLUMNS), desc=f"\tfor file {file.name}") as pbar_s:
            for column in CATEGORICAL_COLUMNS:
                le = LabelEncoder()
                df_aux[column + "_le"] = le.fit_transform(df_aux[column])
                if column == "status":
                    print(le.classes_)
                df_aux.drop(columns=column, inplace=True)
                pbar_s.update(1)
        df_aux.to_parquet(f'../data/interim/ready_to_train/{file.name}')
        pbar.update(1)

Interpolating and transforming values:   0%|          | 0/9 [00:00<?, ?it/s]
	for file part-00000-0c5d4e99-2137-4330-89e6-3939a8ec780d-c000.snappy.parquet:   0%|          | 0/5 [00:00<?, ?it/s][A
	for file part-00000-0c5d4e99-2137-4330-89e6-3939a8ec780d-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 32.39it/s][A


['aumento' 'manteve' 'queda']


Interpolating and transforming values:  11%|█         | 1/9 [00:00<00:04,  1.87it/s]
	for file part-00000-25fbe3f5-21ee-4e97-8c89-62a54fa8fa1f-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 76.84it/s]
Interpolating and transforming values:  22%|██▏       | 2/9 [00:00<00:02,  2.81it/s]

['aumento' 'manteve' 'queda']



	for file part-00000-bc5dadc7-908c-4b5c-97d6-25dab4e21923-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 73.32it/s]
Interpolating and transforming values:  33%|███▎      | 3/9 [00:00<00:01,  3.36it/s]

['aumento' 'manteve' 'queda']



	for file part-00001-0c5d4e99-2137-4330-89e6-3939a8ec780d-c000.snappy.parquet:   0%|          | 0/5 [00:00<?, ?it/s][A
	for file part-00001-0c5d4e99-2137-4330-89e6-3939a8ec780d-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 36.62it/s][A
Interpolating and transforming values:  44%|████▍     | 4/9 [00:01<00:01,  2.68it/s]

['aumento' 'manteve' 'queda']



	for file part-00001-25fbe3f5-21ee-4e97-8c89-62a54fa8fa1f-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 76.27it/s]
Interpolating and transforming values:  56%|█████▌    | 5/9 [00:01<00:01,  2.99it/s]

['aumento' 'manteve' 'queda']



	for file part-00001-bc5dadc7-908c-4b5c-97d6-25dab4e21923-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 74.80it/s]
Interpolating and transforming values:  67%|██████▋   | 6/9 [00:01<00:00,  3.39it/s]

['aumento' 'manteve' 'queda']



	for file part-00002-0c5d4e99-2137-4330-89e6-3939a8ec780d-c000.snappy.parquet:   0%|          | 0/5 [00:00<?, ?it/s][A
	for file part-00002-0c5d4e99-2137-4330-89e6-3939a8ec780d-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 39.83it/s][A
Interpolating and transforming values:  78%|███████▊  | 7/9 [00:02<00:00,  3.08it/s]

['aumento' 'manteve' 'queda']



	for file part-00002-25fbe3f5-21ee-4e97-8c89-62a54fa8fa1f-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 101.23it/s]
Interpolating and transforming values:  89%|████████▉ | 8/9 [00:02<00:00,  3.58it/s]

['aumento' 'manteve' 'queda']



	for file part-00002-bc5dadc7-908c-4b5c-97d6-25dab4e21923-c000.snappy.parquet:   0%|          | 0/5 [00:00<?, ?it/s][A

['aumento' 'manteve' 'queda']


	for file part-00002-bc5dadc7-908c-4b5c-97d6-25dab4e21923-c000.snappy.parquet: 100%|██████████| 5/5 [00:00<00:00, 96.34it/s]
Interpolating and transforming values: 100%|██████████| 9/9 [00:02<00:00,  3.32it/s]


In [54]:
df_raw = pd.read_parquet(f'../data/interim/ready_to_train/')
# ToDo: Test without the melt value
df_raw = df_raw.melt(id_vars=["key","ncm_le", "id_pais_origem_le", "importador_municipio_le", "urf_le", "status_le"])
df_raw.rename(columns={"variable": "ano_semestre", "value": "avg_valor_unitario"}, inplace=True)
df_raw["ano"] = df_raw["ano_semestre"].str[:4]
df_raw["semestre"] = df_raw["ano_semestre"].str[-1]
df_raw.dropna(subset="avg_valor_unitario", inplace=True)
df_raw.to_parquet("../data/interim/data_ready_to_train.parquet")