Esse notebook tem por objetivo fazer a relação de empresas que importam somente uma marca, facilitando a relação importador-marca.

In [3]:
# Importing the modules needed
import sys

import pandas as pd

sys.path.append("../src/")

from src.data.dremio_utils import *
# Data Handling
from dotenv import dotenv_values 

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType

In [4]:
# Heap size for the driver JVM. In this local session the driver also runs the tasks
# (the recorded failure below reports "executor driver"), and with Spark's default heap
# the join pipeline of this notebook aborted with
# "java.lang.OutOfMemoryError: GC overhead limit exceeded".
# NOTE(review): "4g" is a starting point — tune it to the memory available on your machine.
SPARK_DRIVER_MEMORY = "4g"

# The memory setting only takes effect when this call actually starts the JVM; if a
# session already exists, getOrCreate() returns it as-is — restart the kernel in that case.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("attributes_dict")
    .config("spark.driver.memory", SPARK_DRIVER_MEMORY)
    .getOrCreate()
)

In [5]:
# Load the settings stored in the ".env" file (path is relative to the working directory).
config = dotenv_values(".env")
# Build the service object from those settings (presumably the Dremio client — the class
# comes from the wildcard import of src.data.dremio_utils).
# NOTE(review): `bds` is not referenced by any other cell visible in this notebook — confirm it is still needed.
bds = BaseDremioService(config)

## 1. Getting Merged Data

In [6]:
# Load the historic average unit price dataset (parquet) as a Spark DataFrame.
grouped_data = (
    spark.read
    .format("parquet")
    .load("../data/processed/average_unity_price_historic.parquet")
)

In [7]:
# Average `avg_valor_item` per (ncm, importer UF, importer municipality, URF, origin, year, semester).
#
# The aggregate is renamed back to `avg_valor_item` in the same chain: Spark names it
# "avg(avg_valor_item)" by default, and since this cell overwrites `grouped_data`, a second
# run would otherwise fail because the input column no longer exists under its original name.
# NOTE(review): this looks like an unweighted mean of values that are already averages — confirm that is intended.
grouped_data = (
    grouped_data
    .groupBy(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre'])
    .avg('avg_valor_item')
    .withColumnRenamed("avg(avg_valor_item)", "avg_valor_item")
)

## 2. Gabarito de datas e combinações

In [8]:
# Every distinct (ncm, importer UF, importer municipality, URF, origin country) tuple in the data.
# NOTE(review): in the recorded output of this cell most rows show city names under
# `importador_uf` and state codes under `importador_municipio` — the two columns look
# swapped upstream for part of the data; confirm against the source.
unique_combinations = grouped_data.select(
    ['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem']
).dropDuplicates()
unique_combinations.show(n=5)

+---------+--------------------+--------------------+--------------------+--------------+
|      ncm|       importador_uf|importador_municipio|                 urf|id_pais_origem|
+---------+--------------------+--------------------+--------------------+--------------+
|     null|              TOLEDO|                  PR|     PORTO DE SANTOS|    BANGLADESH|
|2023000.0|             GOIANIA|                  GO|IRF - PORTO DE SUAPE|     AUSTRÁLIA|
|3055310.0|CABO DE SANTO AGO...|                  PE|              ITAJAI|       NORUEGA|
|4071900.0|               BAURU|                  SP|AEROPORTO INTERNA...|ESTADOS UNIDOS|
|6029029.0|                  SP|            HOLAMBRA|AEROPORTO INTERNA...|       HOLANDA|
+---------+--------------------+--------------------+--------------------+--------------+


In [9]:
# Calendar template: one single-column frame for the years 2018-2023 ...
anos_data = [{"ano": ano} for ano in range(2018, 2024)]
anos_schema = StructType([StructField("ano", IntegerType())])
years_df = spark.createDataFrame(data=anos_data, schema=anos_schema)

# ... and one for the two semesters of a year.
semestres_data = [{"semestre": semestre} for semestre in (1, 2)]
semestres_schema = StructType([StructField("semestre", IntegerType())])
semesters_df = spark.createDataFrame(data=semestres_data, schema=semestres_schema)

# Cartesian product of both frames: every (ano, semestre) pair, 12 rows in total.
gabarito_datas = years_df.crossJoin(semesters_df)
gabarito_datas.show(n=5)

+----+--------+
| ano|semestre|
+----+--------+
|2018|       1|
|2018|       2|
|2019|       1|
|2019|       2|
|2020|       1|
+----+--------+


In [10]:
# Pair every unique combination with every (ano, semestre) period. The result is the
# complete grid onto which the observed averages are left-joined in the next cell.
gabarito_comb = unique_combinations.crossJoin(gabarito_datas)
# Preview only the first rows; the full grid has one row per combination and period.
gabarito_comb.show(5)

+----+-------------+--------------------+---------------+--------------+----+--------+
| ncm|importador_uf|importador_municipio|            urf|id_pais_origem| ano|semestre|
+----+-------------+--------------------+---------------+--------------+----+--------+
|null|       TOLEDO|                  PR|PORTO DE SANTOS|    BANGLADESH|2018|       1|
|null|       TOLEDO|                  PR|PORTO DE SANTOS|    BANGLADESH|2018|       2|
|null|       TOLEDO|                  PR|PORTO DE SANTOS|    BANGLADESH|2019|       1|
|null|       TOLEDO|                  PR|PORTO DE SANTOS|    BANGLADESH|2019|       2|
|null|       TOLEDO|                  PR|PORTO DE SANTOS|    BANGLADESH|2020|       1|
+----+-------------+--------------------+---------------+--------------+----+--------+


In [11]:
from pyspark.sql.functions import col

# Left-join the observed averages onto the full (combination x period) grid, so that
# periods without observations appear as rows with a null average.
#
# Keys are compared with eqNullSafe (SQL `<=>`). A join on plain column names uses `=`,
# under which NULL never equals NULL: combinations with a null key (null `ncm` values are
# visible in the previews of the grid) would never match their own observations.
# NOTE(review): the recorded run of this cell aborted with an OutOfMemoryError; the heap
# available to the Spark session may need to be raised for this join to complete.
join_keys = ['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre']

# `gabarito_comb` is derived from `grouped_data`, so both sides get an alias to keep
# the column references unambiguous in this self-join.
grid = gabarito_comb.alias("grid")
observed = grouped_data.alias("observed")

# Every column of `grouped_data` that is not a key, i.e. the aggregated value column.
value_columns = [name for name in grouped_data.columns if name not in join_keys]

df_2b_filled = grid.join(
    observed,
    [col("grid." + key).eqNullSafe(col("observed." + key)) for key in join_keys],
    "left",
).select(
    # Same layout as a join on column names: keys first, then the value column(s).
    [col("grid." + key) for key in join_keys]
    # Backticks are needed because the default aggregate name contains parentheses.
    + [col("observed.`" + name + "`") for name in value_columns]
)
df_2b_filled.head(15)

Py4JJavaError: An error occurred while calling o88.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 11.0 failed 1 times, most recent failure: Lost task 3.0 in stage 11.0 (TID 834, 192.168.0.7, executor driver): java.lang.OutOfMemoryError: GC overhead limit exceeded

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:467)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3450)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3447)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded


In [None]:
# Give the aggregated column a plain name: Spark's default name for `.avg('avg_valor_item')`
# is "avg(avg_valor_item)". withColumnRenamed does nothing when the source column is absent,
# so re-running this cell is harmless.
df_2b_filled = df_2b_filled.withColumnRenamed("avg(avg_valor_item)", "avg_valor_item")

In [None]:
# Persist the filled grid as a parquet dataset (a directory of part files).
# "overwrite" keeps the cell re-runnable: with Spark's default save mode the write
# fails as soon as the output directory already exists from a previous run.
df_2b_filled.write.mode("overwrite").parquet("../data/processed/data_2b_filled")

In [None]:
# Drop the Python references to the Spark DataFrames that were persisted/used above.
# `unique_combinations` is kept: a later cell still reads it. `gabarito_comb` is
# re-created further down for the inference step.
del df_2b_filled, gabarito_comb, grouped_data

In [None]:
import os
from tqdm import tqdm

# Path to the folder containing the parquet part files
caminho = '../data/processed/data_2b_filled/'

# All parquet files in that folder, sorted so the row order of the result is reproducible
# (os.listdir returns entries in arbitrary order).
arquivos_parquet = sorted(arq for arq in os.listdir(caminho) if arq.lower().endswith('.parquet'))

# Read every part first and concatenate once at the end: calling pd.concat inside the
# loop copies the accumulated frame again on every iteration (quadratic in the file count).
partes = [
    pd.read_parquet(os.path.join(caminho, arquivo))
    for arquivo in tqdm(arquivos_parquet, desc="concatenating parquet")
]

# pd.concat raises on an empty list; fall back to an empty frame when no file was found.
concat_df = pd.concat(partes) if partes else pd.DataFrame()

## 3. Inference of the values for the semesters of 2024

### 3.1 Inference using linear interpolation

In [None]:
# pandas version of the calendar template; unlike the Spark template built earlier it
# also contains 2024, the year whose values are to be inferred.
years_df = pd.DataFrame({"ano": list(range(2018, 2025))})
semesters_df = pd.DataFrame({"semestre": [1, 2]})
gabarito_datas = years_df.join(semesters_df, how="cross")

# Full grid: every combination paired with every (ano, semestre) period.
# NOTE(review): this call needs `unique_combinations` to be a pandas DataFrame, but the only
# definition visible in this notebook creates it as a Spark DataFrame — confirm where the
# pandas version comes from.
gabarito_comb = unique_combinations.join(gabarito_datas, how="cross")

In [None]:
# Attach the known averages to the full grid; periods without data keep NaN and are
# the ones to be inferred afterwards.
# NOTE(review): `df_filled` is not defined in any cell visible here (the frame loaded from
# the parquet parts is called `concat_df`), and the key `name_pt` does not appear among the
# columns produced by the Spark cells (they use `id_pais_origem`) — confirm both.
df_2b_infer = gabarito_comb.merge(
    right=df_filled,
    on=['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt', 'ano', 'semestre'],
    how="left",
)
df_2b_infer.head()

In [None]:
# Interpolate each combination's series on its own, in chronological order.
# Interpolating the whole column at once lets values leak across combinations: the gaps
# at the start of one combination would be filled from the previous combination's data.
group_keys = ['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt']
df_2b_infer["avg_valor_item"] = (
    df_2b_infer
    .sort_values(["ano", "semestre"])
    # dropna=False keeps combinations whose key contains a missing value as groups too.
    .groupby(group_keys, dropna=False, sort=False)["avg_valor_item"]
    # With pandas' default settings, gaps after the last observation (e.g. the 2024
    # semesters) are filled forward, while gaps before the first observation stay NaN.
    .transform(lambda serie: serie.interpolate())
)
# The result is index-aligned on assignment, so the original row order is preserved.

In [None]:
# Period label used as the x axis of the plots: year followed by semester, e.g. "20181".
df_2b_infer["anosem"] = df_2b_infer["ano"].astype(str).str.cat(
    df_2b_infer["semestre"].astype(str)
)

In [None]:
# Sanity check on a single combination: take only the first group, print its values
# and plot them over the period label.
first_group = next(
    iter(df_2b_infer.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt'])),
    None,
)
if first_group is not None:
    _, df = first_group
    print(df["avg_valor_item"].values)
    df.plot(y="avg_valor_item", x="anosem")

### 3.1 Inference using linear interpolation