Esse notebook tem por objetivo fazer a relação de empresas que importam somente uma marca, facilitando a relação importador-

In [1]:
# Importing the modules needed
import sys

import pandas as pd

sys.path.append("../src/")

from src.data.dremio_utils import *
# Data Handling
from dotenv import dotenv_values 

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType

In [3]:
spark = SparkSession.builder.master("local[1]").appName("attributes_dict").getOrCreate()

In [3]:
config = dotenv_values(".env")
bds = BaseDremioService(config)

## 1. Getting Merged Data

In [4]:
grouped_data = pd.read_parquet("../data/processed/average_unity_price_historic.parquet")

In [5]:
grouped_data = grouped_data.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre'], as_index=False).mean('avg_valor_item')

# 2. Gabarito de datas e combinações 

In [6]:
unique_combinations = grouped_data[['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem']].drop_duplicates()
unique_combinations.dropna(inplace=True)
unique_combinations.head()

Unnamed: 0,ncm,importador_uf,importador_municipio,urf,id_pais_origem
0,1012100.0,,,AEROPORTO INTERNACIONAL DE VIRACOPOS,ALEMANHA
6,1012100.0,,,AEROPORTO INTERNACIONAL DE VIRACOPOS,BRASIL
8,1012100.0,,,AEROPORTO INTERNACIONAL DE VIRACOPOS,BÉLGICA
14,1012100.0,,,AEROPORTO INTERNACIONAL DE VIRACOPOS,CANADÁ
15,1012100.0,,,AEROPORTO INTERNACIONAL DE VIRACOPOS,CHILE


In [7]:
years_df = pd.DataFrame.from_dict({"ano": [2018, 2019, 2020, 2021, 2022, 2023]})
semesters_df = pd.DataFrame.from_dict({"semestre":[1, 2]})
gabarito_datas = years_df.join(semesters_df, how="cross")

In [8]:
gabarito_comb = unique_combinations.join(gabarito_datas, how="cross")
gabarito_comb.head()

MemoryError: Unable to allocate 7.18 GiB for an array with shape (4, 240851532) and data type object

In [10]:
df_2b_filled = gabarito_comb.merge(grouped_data, how="left", on=['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre'])
df_2b_filled.head(15)

MemoryError: cannot allocate memory for array

In [None]:
df_filled = df_2b_filled.copy()
df_filled["avg_valor_item"] = df_filled["avg_valor_item"].interpolate()

In [None]:
df_filled.tail(15)

## 3. Inference of the value for the first semesters of 2024

### 3.1 Inference using linear interpolation

In [None]:
years_df = pd.DataFrame.from_dict({"ano": [2018, 2019, 2020, 2021, 2022, 2023, 2024]})
semesters_df = pd.DataFrame.from_dict({"semestre": [1, 2]})
gabarito_datas = years_df.join(semesters_df, how="cross")
gabarito_comb = unique_combinations.join(gabarito_datas, how="cross")

In [None]:
df_2b_infer = gabarito_comb.merge(df_filled, how="left", on=['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt', 'ano', 'semestre'])
df_2b_infer.head()

In [None]:
df_2b_infer["avg_valor_item"] = df_2b_infer["avg_valor_item"].interpolate()

In [None]:
df_2b_infer["anosem"] = df_2b_infer["ano"].astype(str) + df_2b_infer["semestre"].astype(str) 

In [None]:
for _, df in df_2b_infer.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt']):
    print(df["avg_valor_item"].values)
    df.plot(y="avg_valor_item", x="anosem")
    break

### 3.1 Inference using linear interpolation