In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from src.utils.utils import get_logger
import logging

from IPython.display import clear_output

# Module-level logger, passed through the project's get_logger helper for configuration.
logger = get_logger(logger=logging.getLogger(__name__))

In [2]:
# Load the unit-price history and repair rows whose state (UF) and municipality
# values are stored in each other's column.
grouped_data = pd.read_parquet("../data/processed/average_unity_price_historic.parquet")

logger.info("Filtrando e corrigindo dados")

# Drop rows with any null value, then rows whose state or municipality is an empty string.
grouped_data = grouped_data.dropna()
grouped_data = grouped_data[(grouped_data["importador_uf"] != "")
                            & (grouped_data["importador_municipio"] != "")].copy()

# Valid Brazilian state codes, used to detect swapped state/municipality values.
estados_br = ["AC", "AL", "AP", "AM", "BA", "CE", "ES", "GO", "MA", "MT", "MS", "MG", "PA", "PB", "PR", "PE",
              "PI", "RJ", "RN", "RS", "RO", "RR", "SC", "SP", "SE", "TO", "DF"]

uf_raw = grouped_data["importador_uf"]
municipio_raw = grouped_data["importador_municipio"]

# Vectorised replacement for the former row-wise apply(axis=1) calls: same rule,
# evaluated column-wise instead of through one Python call per row.
logger.info("\tajustando importador UF")
# Keep the UF when it is a valid state code; otherwise take the municipality column's value.
uf_fixed = uf_raw.where(uf_raw.isin(estados_br), municipio_raw)

logger.info("\tajustando importador municipio")
# Keep the municipality unless it holds a state code; in that case take the UF column's value.
municipio_fixed = municipio_raw.where(~municipio_raw.isin(estados_br), uf_raw)

# Replace the raw columns with the corrected ones. They are appended at the end
# in the order (importador_uf, importador_municipio), as before.
grouped_data = (
    grouped_data
    .drop(columns=["importador_municipio", "importador_uf"])
    .assign(importador_uf=uf_fixed, importador_municipio=municipio_fixed)
)
    

2024-03-18 11:06:08,260 - __main__ - INFO - Filtrando e corrigindo dados
2024-03-18 11:06:17,600 - __main__ - INFO - 	ajustando importador UF
2024-03-18 11:07:06,364 - __main__ - INFO - 	ajustando importador municipio


In [3]:
# Collapse duplicates: one row per (ncm, state, municipality, urf, origin country, year, semester).
# The previous `.mean('avg_valor_item')` did not select that column: the string was taken as the
# positional `numeric_only` flag (any non-empty string is truthy), so every numeric column was
# averaged. The flag is now spelled out; the result is the same. Note that `ano_semestre` is
# averaged too and therefore comes out as a float.
group_keys = ['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre']
grouped_data = grouped_data.groupby(group_keys, as_index=False).mean(numeric_only=True)

In [4]:
# Progress marker for the date-template step (the message is Portuguese for "Creating date dataframes").
logger.info("Criando dataframes de datas")
# Template of historical periods: one row per (year, semester), encoded as the
# integer YYYYSS (ano * 100 + semestre), e.g. 2021 / 2nd semester -> 202102.
years_df = pd.DataFrame({"ano": [2018, 2019, 2020, 2021, 2022, 2023]})
# Shorter range, handy for quick experiments:
# years_df = pd.DataFrame({"ano": [2022, 2023]})
semesters_df = pd.DataFrame({"semestre": [1, 2]})
dates_template = years_df.join(semesters_df, how="cross")
dates_template["ano_semestre"] = dates_template["ano"] * 100 + dates_template["semestre"]
# Keep only periods before 2023 / 2nd semester (the period being forecast).
# The bound must use the same six-digit YYYYSS encoding as the column: the former
# five-digit bound (20232) was smaller than every value and emptied the template.
dates_template = dates_template.loc[dates_template["ano_semestre"] < 202302, ["ano_semestre"]]

2024-03-18 11:08:04,843 - __main__ - INFO - Criando dataframes de datas


In [5]:
# Single-row frame describing the period to forecast (2023, 2nd semester),
# with the same YYYYSS integer encoding used for the historical periods.
new_data = dict(ano=[2023], semestre=[2])
df_new_data = pd.DataFrame(new_data).assign(
    ano_semestre=lambda frame: frame["ano"] * 100 + frame["semestre"]
)

In [6]:
logger.info("Filtrando dataframe somente com chaves válidas não processadas")

# Resume support: load previously produced results, if any, and derive the list
# of composite keys (ncm-country-municipality-urf) they already cover.
processed_path = "../data/processed/pd_trended_data_interpolated.parquet"

if not os.path.isfile(processed_path):
    # Nothing processed yet: start from an empty result and an empty key list.
    df_total = pd.DataFrame()
    already_processed = []
else:
    df_total = pd.read_parquet(processed_path)

    # One row per distinct key combination found in the processed data.
    keys_processed = df_total[['id_pais_origem', 'ncm', 'importador_municipio', 'urf']].drop_duplicates()
    keys_processed["key"] = (
        keys_processed["ncm"].astype(str)
        + '-' + keys_processed["id_pais_origem"]
        + '-' + keys_processed['importador_municipio']
        + '-' + keys_processed['urf']
    )

    already_processed = keys_processed["key"].to_list()

2024-03-18 11:08:04,904 - __main__ - INFO - Filtrando dataframe somente com chaves válidas não processadas


In [7]:
# Reset the resume state: no keys are treated as processed and the result frame
# starts empty (this overrides whatever the previous cell loaded).
already_processed, df_total = [], pd.DataFrame()

In [8]:
# Display the aggregated frame to inspect the result of the groupby/mean step above.
grouped_data

Unnamed: 0,ncm,importador_uf,importador_municipio,urf,id_pais_origem,ano,semestre,avg_valor_item,ano_semestre
0,1012100.00,GO,FORMOSA,AEROPORTO INTERNACIONAL DE VIRACOPOS,ESTADOS UNIDOS,2020,2,12900.000000,202002.0
1,1012100.00,GO,FORMOSA,AEROPORTO INTERNACIONAL DE VIRACOPOS,ESTADOS UNIDOS,2021,1,10000.000000,202101.0
2,1012100.00,MG,BOA ESPERANCA,AEROPORTO INTERNACIONAL DE VIRACOPOS,ESTADOS UNIDOS,2019,1,27120.000000,201901.0
3,1012100.00,MG,BOA ESPERANCA,AEROPORTO INTERNACIONAL DE VIRACOPOS,FRANÇA,2021,2,17621.890625,202102.0
4,1012100.00,MG,MATOZINHOS,AEROPORTO INTERNACIONAL DE VIRACOPOS,BÉLGICA,2019,2,1782.250000,201902.0
...,...,...,...,...,...,...,...,...,...
4582695,97069000.00,SP,SAO PAULO,AEROPORTO INTERNACIONAL DE SAO PAULO/GUARULHOS,FRANÇA,2022,2,50000.000000,202202.0
4582696,99999942.00,GO,ANAPOLIS,AEROPORTO INTERNACIONAL DE BRASÍLIA,SUÉCIA,2019,2,197342.593750,201902.0
4582697,99999942.00,GO,ANAPOLIS,AEROPORTO INTERNACIONAL DE BRASÍLIA,SUÉCIA,2021,1,139064.257778,202101.0
4582698,99999942.00,GO,ANAPOLIS,AEROPORTO INTERNACIONAL DE BRASÍLIA,SUÉCIA,2021,2,65031.512500,202102.0


In [9]:
# Build a composite key per row and keep only the history of keys worth modelling.

# TODO: keep only data from 2023 backwards
# grouped_data = grouped_data[(grouped_data["ano_semestre"] < 202302) & (grouped_data["ano_semestre"] > 202201)]

# Composite key: ncm-country-municipality-urf.
# NOTE(review): `ncm` is still float here (it is cast to int in a later cell), so the key text
# looks like "1012100.0-..."; confirm this matches the format of the keys in `already_processed`.
grouped_data["key"] = grouped_data["ncm"].astype(str) + '-' + grouped_data["id_pais_origem"] + '-' + \
                      grouped_data['importador_municipio'] + '-' + grouped_data['urf']

# Keys that have a positive observation in the target period (2023 / 2nd semester).
selected_group = grouped_data[(grouped_data["ano_semestre"] == 202302) & (grouped_data["avg_valor_item"] > 0)]
selected_keys = selected_group["key"].unique()

# From here on, only the history before the target period is kept.
grouped_data = grouped_data[grouped_data["ano_semestre"] < 202302]

# Keys that occur often enough in the history to fit a trend line.
# Counting is done on the Series itself, so it does not depend on how value_counts()
# names its result (that naming changed in pandas 2.0 and broke the former
# `count["key"]` / `reset_index()["index"]` lookups).
threshold_count = 2
key_counts = grouped_data["key"].value_counts()
# `count` stays a one-column frame ("key" = number of occurrences) indexed by the key text.
count = key_counts[key_counts >= threshold_count].rename("key").to_frame()
keys_2_process = count.index.to_list()

# Keep rows whose key is not yet processed, is recurrent, and is present in the target period.
row_keys = grouped_data["key"]
keep_rows = (
    ~row_keys.isin(already_processed)
    & row_keys.isin(keys_2_process)
    & row_keys.isin(selected_keys)
)
grouped_data = grouped_data[keep_rows]

In [10]:
# Start the trend-line accumulator from an empty frame (discards anything previously held in df_total).
df_total = pd.DataFrame()

In [11]:
# Number of rows left after the key filters.
len(grouped_data.index)

1334531

grouped_data[
(grouped_data['id_pais_origem'] == key[0]) &
(grouped_data['ncm'] == key[1]) &
(grouped_data['importador_municipio'] == key[2]) &
(grouped_data['importador_uf'] == key[3]) &
(grouped_data['urf'] == key[4])
].groupby(["ano_semestre"], as_index=False).mean("avg_valor_item")

In [12]:
# Cast the NCM code to an integer. `grouped_data` is the result of boolean filtering at this
# point, so the column is replaced through assign() (which returns a new frame) rather than
# by item assignment, which can raise SettingWithCopyWarning on a filtered frame.
grouped_data = grouped_data.assign(ncm=grouped_data["ncm"].astype(int))

In [14]:
# Fit a linear trend per (country, ncm, municipality, state, urf) group, predict the target
# period with it, and write the history plus prediction to parquet in chunks.
logger.info("Iniciando criacao da linha")

# --- Tunables -----------------------------------------------------------------
OUTPUT_DIR = "../data/processed/trend_values_v4"
CHUNK_SIZE = 1000   # number of processed groups per intermediate parquet file
MAX_GROUPS = 3000   # stop after this many processed groups; set to None to process all of them
# Lower bound (exclusive) on `ano_semestre` for the points used in the fit.
# NOTE(review): this value has five digits while `ano_semestre` is encoded as YYYYSS
# (six digits), so the filter currently keeps every row. It was probably meant to be
# 202201; confirm before changing, since that alters which points feed the trend.
TREND_MIN_ANO_SEMESTRE = 20221

os.makedirs(OUTPUT_DIR, exist_ok=True)

group_cols = ['id_pais_origem', 'ncm', 'importador_municipio', 'importador_uf', 'urf']
grouped = grouped_data.groupby(group_cols)
groups_qtd = grouped.ngroups

# Set membership is O(1); the list could hold a very large number of keys.
processed_keys = set(already_processed)

# Frames waiting to be written; anything already held in df_total goes into the first chunk.
pending_frames = [] if df_total.empty else [df_total]
file_count = 0

with tqdm(total=groups_qtd, desc="Criando linha de tendencia para preco unitario") as pbar:
    for key, df_group in grouped:

        # `df_group` already holds exactly the rows of this group, so there is no need to
        # re-filter the whole frame on every iteration. Average the numeric columns per period.
        df_aux_hist = df_group.groupby("ano_semestre", as_index=False).mean(numeric_only=True)

        # Same layout as the keys in `already_processed`: ncm-country-municipality-urf.
        # (The former country-ncm-municipality-state layout could never match them.)
        group_key = f"{key[1]}-{key[0]}-{key[2]}-{key[4]}"

        if (df_aux_hist.shape[0] > 0) and (group_key not in processed_keys):
            if df_aux_hist["ano_semestre"].nunique() < 2:
                # Too few periods to infer a trend: lay the history over the full period
                # template and interpolate the missing prices.
                df_aux_hist = dates_template.copy().merge(df_aux_hist, on=['ano_semestre'], how='left')
                # Cast to int before slicing so a float key ("202301.0") cannot corrupt the digits.
                period_text = df_aux_hist['ano_semestre'].astype(int).astype(str)
                df_aux_hist["ano"] = period_text.str[:4]
                df_aux_hist["semestre"] = period_text.str[-1:]
                df_aux_hist["avg_valor_item"] = df_aux_hist["avg_valor_item"].interpolate()
                # NOTE(review): this drops rows with a NaN in ANY column, so template rows that
                # had no match (other averaged columns are NaN there) are removed even when
                # their price was interpolated; confirm this is intended.
                df_aux_hist.dropna(axis=0, inplace=True)

            df_aux_trend = df_new_data.copy()
            to_create_trend = df_aux_hist[df_aux_hist["ano_semestre"] > TREND_MIN_ANO_SEMESTRE]
            if to_create_trend.shape[0] > 0:
                # Degree-1 least-squares fit of price against the YYYYSS period number,
                # evaluated at the target period.
                z = np.polyfit(to_create_trend["ano_semestre"], to_create_trend["avg_valor_item"], 1)
                p = np.poly1d(z)
                df_aux_trend["avg_valor_item"] = p(df_aux_trend["ano_semestre"])

                # History followed by the predicted row, stamped with the group's identifiers.
                final_aux = pd.concat([df_aux_hist, df_aux_trend])
                final_aux['id_pais_origem'] = key[0]
                final_aux['ncm'] = key[1]
                final_aux['importador_municipio'] = key[2]
                final_aux['importador_uf'] = key[3]
                final_aux['urf'] = key[4]
                final_aux["ano"] = final_aux["ano"].astype(int)
                final_aux["semestre"] = final_aux["semestre"].astype(int)

                # Collect frames and concatenate once per chunk instead of growing a
                # DataFrame on every iteration.
                pending_frames.append(final_aux)
                file_count += 1

                # Every CHUNK_SIZE processed groups, write the accumulated rows and start over.
                if file_count % CHUNK_SIZE == 0:
                    pd.concat(pending_frames).to_parquet(
                        f"{OUTPUT_DIR}/trend_lines_{file_count // CHUNK_SIZE}.parquet", index=False)
                    pending_frames = []
                    clear_output()
                if MAX_GROUPS is not None and file_count >= MAX_GROUPS:
                    break

        pbar.update(1)

# Write whatever is left since the last full chunk (an empty frame if nothing is pending).
df_total = pd.concat(pending_frames) if pending_frames else pd.DataFrame()
df_total.to_parquet(f"{OUTPUT_DIR}/trend_lines_end.parquet", index=False)

Criando linha de tendencia para preco unitario:   1%|          | 2999/242791 [09:03<12:04:51,  5.51it/s]


In [15]:
# Final step: merge every parquet chunk found in `data_dir` into a single consolidated file.
# NOTE(review): chunks are read from exp_g3/ while the trend loop writes to trend_values_v4/;
# presumably exp_g3/ holds the output of separate executors. Confirm the directory.
from pathlib import Path
import pandas as pd

data_dir = Path("../data/processed/exp_g3/")

# glob() yields files in arbitrary order; sorting makes the row order of the result reproducible.
parquet_files = sorted(data_dir.glob('*.parquet'))
if not parquet_files:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No parquet files found in {data_dir}")

full_df = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in parquet_files
)
full_df.to_parquet('../data/processed/trend_values_v4/trended_datav4_interpolated.parquet')

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType

In [None]:
# Consolidate the parquet chunks into a single-partition parquet dataset using a local Spark session.
spark = SparkSession.builder.master("local[1]").appName("attributes_dict").getOrCreate()
try:
    df = spark.read.parquet("../data/processed/trend_values/")
    # coalesce(1) reduces the data to one partition, so a single part file is written.
    df.coalesce(1).write.parquet('../data/processed/trended_data_interpolated.parquet')
finally:
    # Always release the session (and its underlying SparkContext), even if the read or write fails.
    spark.stop()