Esse notebook tem por objetivo fazer a relação de empresas que importam somente uma marca, facilitando a relação importador-

In [2]:
# Importing the modules needed
import sys

import pandas as pd

sys.path.append("../src/")

from src.data.dremio_utils import *
# Data Handling
from dotenv import dotenv_values 

import numpy as np
from tqdm import tqdm

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

# import pyspark.pandas as pd

In [3]:
spark = SparkSession.builder.master("local[1]").appName("attributes_dict").getOrCreate()

In [4]:
config = dotenv_values(".env")
bds = BaseDremioService(config)

## 1. Getting Merged Data

In [5]:
grouped_data = spark.read.parquet("../data/processed/average_unity_price_historic.parquet")
grouped_data = grouped_data.dropna()
grouped_data = grouped_data.filter(
    (grouped_data.importador_uf != "") &
    (grouped_data.importador_municipio != "")   
)

```python

# MVP
teste_base = {"ncm": ['123456','123456','123456','123456'],
              "importador_uf": ['123456', '123456','123456','123456',],
              "ano": [2018, 2018, 2019, 2020],
              "semestre": [1,2,1,1],
              "valor_unitario": [1.2, 1.1, 1.3, 1.4]}

df_base = pd.DataFrame(teste_base)
df_base["ano_semestre"] = df_base["ano"]*100 + df_base["semestre"]

new_data = {"ano": [2024, 2024],
            "semestre": [1,2]}
df_new_data = pd.DataFrame(new_data)

df_new_data["ncm"] = "123456"
df_new_data["importador_uf"] = "123456"
df_new_data["ano_semestre"] = df_new_data["ano"]*100 + df_new_data["semestre"]

df_last = pd.concat([df_base, df_new_data])
# df_last["ano_semestre"] = df_last["ano"]*100 + df_last["semestre"] 
z = np.polyfit(df_base["ano_semestre"], df_base["valor_unitario"], 1)
p = np.poly1d(z)
p(df_last["ano_semestre"])
```

## 3.1 Usage in the grouped dataset

### 3.1.1 Create the data to be trended

In [6]:
new_data = [(2024, 1), (2024, 2)]
new_data_schema =StructType([
    StructField("ano", IntegerType()),
    StructField("semestre", IntegerType())
])
 
df_new_data = spark.createDataFrame(data=new_data, schema=new_data_schema)

### 3.1.2 Create the X-axis

In [7]:
df_new_data = df_new_data.withColumn("ano_semestre", df_new_data.ano*100 + df_new_data.semestre)
df_new_data.show(5)
# df_new_data.withColumn("ano_semestre", df_new_data.ano*100 + df_new_data.semestre)
# grouped_data["ano_semestre"] = grouped_data["ano"]*100 + grouped_data["semestre"]

+----+--------+------------+
| ano|semestre|ano_semestre|
+----+--------+------------+
|2024|       1|      202401|
|2024|       2|      202402|
+----+--------+------------+


### 3.1.3 Iter over the groups

In [8]:
@pandas_udf(StructType([
    StructField("ncm", IntegerType()),
    StructField("importador_municipio", IntegerType()),
    StructField("id_pais_origem", IntegerType()),
    StructField("avg_valor_item", DoubleType())
]), PandasUDFType.GROUPED_MAP)
def calculate_trend_line(pdf):
    z = np.polyfit(pdf["ano_semestre"], pdf["avg_valor_item"], 1)
    p = np.poly1d(z)
    pdf["avg_valor_item"] = p(pdf["ano_semestre"])
    return pdf

In [13]:
# Apply the UDF to each group
# window_spec = Window.partitionBy("ncm", "importador_municipio", "id_pais_origem")
grouped_schema = StructType([
    StructField("ncm", IntegerType()),
    StructField("importador_municipio", IntegerType()),
    StructField("id_pais_origem", IntegerType()),
    StructField("avg_valor_item", DoubleType())
])
df_trend = grouped_data.groupBy("ncm", "importador_municipio", "id_pais_origem").applyInPandas(calculate_trend_line, schema=grouped_schema)
# #df_trend = grouped_data.withColumn("row_num", col("ncm")).withColumn("row_num", col("row_num").over(window_spec))


ValueError: Invalid function: pandas_udf with function type GROUPED_MAP or the function in groupby.applyInPandas must take either one argument (data) or two arguments (key, data).

In [None]:
df_trend.show()

In [None]:
# Only the useful data
grouped_count["combinations"] =  grouped_count['ncm'].astype(str) + grouped_count['importador_municipio'] + grouped_count['id_pais_origem']
better_combinations = grouped_count[grouped_count["key"]>2]["combinations"].to_list()

In [None]:
groups_qtd = grouped_data[['ncm', 'importador_municipio', 'id_pais_origem']].drop_duplicates().shape[0]
groups_qtd

In [None]:
"".join(str(key)).replace("(","").replace(")","").replace(", ","").replace("''","_")

In [None]:
grouped_data[['ncm', 'importador_municipio', 'id_pais_origem']].count()

In [None]:
file_id = 0
grouped = grouped_data.copy()
grouped["key"] = grouped['ncm'].astype(str) + grouped['importador_municipio'] + grouped['id_pais_origem']
grou
grouped = grouped_data.groupby(['ncm', 'importador_municipio', 'id_pais_origem'])
grouped["key"] = grouped['ncm'].astype(str) + grouped['importador_municipio'] + grouped['id_pais_origem']

In [None]:
with tqdm(total = groups_qtd, desc="Creating a trend line for unity price") as pbar:
    for key, df_group in grouped:
        # print(key)
        df_aux_hist = grouped_data[
                            (grouped_data['ncm'] == key[0]) &
                            (grouped_data['importador_municipio'] == key[1]) &
                            (grouped_data['id_pais_origem'] == key[2])].copy()
        
        df_aux_trend = df_new_data.copy()
        
        z = np.polyfit(df_aux_hist["ano_semestre"], df_aux_hist["avg_valor_item"], 1)
        p = np.poly1d(z)
        df_aux_trend['ncm'] = key[0]
        df_aux_trend['importador_municipio'] = key[1]
        df_aux_trend['id_pais_origem'] = key[2]
        df_aux_trend["avg_valor_item"] = p(df_aux_trend["ano_semestre"])
        
        df_final = pd.concat([df_group, df_aux_trend])
        key_to_save = "".join(str(key)).replace("(","").replace(")","").replace(", ","").replace("''","_")
        df_final.to_parquet(f"../data/processed/trend_values/{file_id}.parquet")
        file_id += 1
        pbar.update(1)
        # print(df_final)
        # break

# 2. Iteração sobre os grupos 

In [None]:
for key, df_group in grouped_data.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem']):
    df_group = 
    break

In [None]:
df_group['importador_uf'][0]

In [None]:
years_df = pd.DataFrame.from_dict({"ano": [2018, 2019, 2020, 2021, 2022, 2023]})
semesters_df = pd.DataFrame.from_dict({"semestre":[1, 2]})
gabarito_datas = years_df.join(semesters_df, how="cross")

In [None]:
gabarito_comb = unique_combinations.join(gabarito_datas, how="cross")
gabarito_comb.head()

In [None]:
df_2b_filled = gabarito_comb.merge(grouped_data, how="left", on=['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre'])
df_2b_filled.head(15)

In [None]:
df_filled = df_2b_filled.copy()
df_filled["avg_valor_item"] = df_filled["avg_valor_item"].interpolate()

In [None]:
df_filled.tail(15)

## 3. Inference of the value for the first semesters of 2024

### 3.1 Inference using linear interpolation

In [None]:
years_df = pd.DataFrame.from_dict({"ano": [2018, 2019, 2020, 2021, 2022, 2023, 2024]})
semesters_df = pd.DataFrame.from_dict({"semestre": [1, 2]})
gabarito_datas = years_df.join(semesters_df, how="cross")
gabarito_comb = unique_combinations.join(gabarito_datas, how="cross")

In [None]:
df_2b_infer = gabarito_comb.merge(df_filled, how="left", on=['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt', 'ano', 'semestre'])
df_2b_infer.head()

In [None]:
df_2b_infer["avg_valor_item"] = df_2b_infer["avg_valor_item"].interpolate()

In [None]:
df_2b_infer["anosem"] = df_2b_infer["ano"].astype(str) + df_2b_infer["semestre"].astype(str) 

In [None]:
for _, df in df_2b_infer.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt']):
    print(df["avg_valor_item"].values)
    df.plot(y="avg_valor_item", x="anosem")
    break

### 3.1 Inference using linear interpolation