Esse notebook tem por objetivo fazer a relação de empresas que importam somente uma marca, facilitando a relação importador-marca.

In [None]:
# Importing the modules needed
import sys

import pandas as pd

sys.path.append("../src/")

from src.data.dremio_utils import *
# Data Handling
from dotenv import dotenv_values 

import numpy as np
from tqdm import tqdm
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType

# import pyspark.pandas as pd

In [None]:
# Local Spark session restricted to one worker thread; getOrCreate() reuses an
# already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("attributes_dict")
    .getOrCreate()
)

In [None]:
# Read the key/value settings stored in the local ".env" file.
config = dotenv_values(".env")
# BaseDremioService is not imported by name in this notebook — presumably it is
# provided by the star import of src.data.dremio_utils (TODO confirm).
# `bds` is not referenced again in the visible cells.
bds = BaseDremioService(config)

## 1. Getting Merged Data

In [None]:
# Load the historic average unit prices, discard every row that contains a
# null, and keep only the records whose importer UF and municipality are both
# non-empty strings.
price_history = pd.read_parquet("../data/processed/average_unity_price_historic.parquet").dropna()
has_location = (price_history["importador_uf"] != "") & (price_history["importador_municipio"] != "")
grouped_data = price_history[has_location].copy()

In [None]:
# Un-swap UF and municipality (the adjustment itself happens in the next cell).
# Two-letter codes used to decide whether a value is a valid UF.
estados_br = ["AC","AL","AP","AM","BA","CE","ES","GO","MA","MT","MS","MG","PA","PB","PR","PE",
              "PI","RJ","RN","RS","RO","RR","SC","SP","SE","TO","DF"]

# Keep a copy of the municipality column as it was loaded, for use by the adjustment step.
grouped_data["old_municipio"] = grouped_data["importador_municipio"] 

In [None]:
# Vectorised un-swap of UF and municipality (replaces a row-by-row apply).
# A UF value that is not a known state code is replaced by the value found in
# the municipality column; a municipality value that *is* a state code is
# replaced by the value found in the UF column.
print("Ajustando importador UF")
uf_is_valid = grouped_data["importador_uf"].isin(estados_br)
grouped_data["importador_uf_new"] = grouped_data["importador_uf"].where(
    uf_is_valid, grouped_data["importador_municipio"]
)

print("Ajustando importador municipio")
municipio_is_state_code = grouped_data["old_municipio"].isin(estados_br)
grouped_data["importador_municipio_new"] = grouped_data["old_municipio"].where(
    ~municipio_is_state_code, grouped_data["importador_uf"]
)

In [None]:
# Average every numeric column per (ncm, UF, municipality, urf, origin country,
# year, semester).
# GroupBy.mean takes `numeric_only` as its first positional parameter, so
# passing a column name there does not select that column — it merely acts as
# a truthy flag. The flag is spelled out explicitly instead.
# NOTE(review): the grouping keys are the original importador_uf /
# importador_municipio columns; the *_new columns built in the previous cell
# are not used as keys — confirm whether the corrected columns were meant here.
group_keys = ['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre']
grouped_data = grouped_data.groupby(group_keys, as_index=False).mean(numeric_only=True)

```python
# MVP: fit a straight line to a toy price history and evaluate it on the
# historic semesters plus two future ones.
teste_base = dict(
    ncm=['123456'] * 4,
    importador_uf=['123456'] * 4,
    ano=[2018, 2018, 2019, 2020],
    semestre=[1, 2, 1, 1],
    valor_unitario=[1.2, 1.1, 1.3, 1.4],
)
df_base = pd.DataFrame(teste_base).assign(
    ano_semestre=lambda frame: frame["ano"] * 100 + frame["semestre"]
)

# Semesters to predict, tagged with the same ncm / UF as the toy history.
new_data = dict(ano=[2024, 2024], semestre=[1, 2])
df_new_data = pd.DataFrame(new_data).assign(
    ncm="123456",
    importador_uf="123456",
    ano_semestre=lambda frame: frame["ano"] * 100 + frame["semestre"],
)

df_last = pd.concat([df_base, df_new_data])

# Degree-1 least-squares fit of the unit price against the ano*100 + semestre code.
z = np.polyfit(df_base["ano_semestre"], df_base["valor_unitario"], 1)
p = np.poly1d(z)
p(df_last["ano_semestre"])
```

## 3.1 Usage in the grouped dataset

### 3.1.1 Create the data to be trended

In [None]:
# Reference grid with every semester from 2018 to 2023, encoded as
# ano*100 + semestre (e.g. 201801). Only the encoded column is kept.
years_df = pd.DataFrame({"ano": [2018, 2019, 2020, 2021, 2022, 2023]})
semesters_df = pd.DataFrame({"semestre": [1, 2]})
semester_grid = years_df.join(semesters_df, how="cross")
gabarito_datas = (semester_grid["ano"] * 100 + semester_grid["semestre"]).to_frame("ano_semestre")

In [None]:
# The two semesters of 2024: the rows on which the fitted trend is evaluated later.
new_data = dict(ano=[2024, 2024], semestre=[1, 2])
df_new_data = pd.DataFrame(new_data)

### 3.1.2 Create the X-axis

In [None]:
# Encode each target semester as ano*100 + semestre (e.g. 202401); this is the x value fed to the fitted line.
df_new_data["ano_semestre"] = df_new_data["ano"]*100 + df_new_data["semestre"]

### 3.1.3 Iterate over the groups

```python
# Snippet: keep only the (ncm, municipality, origin country) combinations that recur.
combo_cols = ['ncm', 'importador_municipio', 'id_pais_origem']
grouped_count = grouped_data.assign(
    key=lambda frame: frame['ncm'].astype(str) + frame['importador_municipio'] + frame['id_pais_origem']
)
# After the count, "key" holds the number of rows found for each combination.
grouped_count = grouped_count.groupby(combo_cols, as_index=False)["key"].count()
grouped_count.head()
# Only the useful data: combinations that appear more than twice.
grouped_count = grouped_count.assign(
    combinations=lambda frame: frame['ncm'].astype(str) + frame['importador_municipio'] + frame['id_pais_origem']
)
is_recurrent = grouped_count["key"] > 2
better_combinations = grouped_count.loc[is_recurrent, "combinations"].to_list()
```

```python
# Snippet: count how many distinct groups each choice of key fields produces.
groups_qtd = len(grouped_data[['ncm', 'id_pais_origem']].drop_duplicates())
print("Qtt of combinations: ", groups_qtd)
groups_qtd = len(grouped_data[['ncm', 'id_pais_origem', "importador_uf"]].drop_duplicates())
print("Qtt of combinations using more fields: ", groups_qtd)
```

```python
# Creation of the key field, useless for now.
# The key column must be added while `grouped` is still a DataFrame: a GroupBy
# object does not support column assignment, so the frame is grouped only after
# the key exists.
grouped = grouped_data.copy()
grouped["key"] = grouped['ncm'].astype(str) + grouped['importador_municipio'] + grouped['id_pais_origem']
grouped = grouped.groupby(['ncm', 'importador_municipio', 'id_pais_origem'])
```

In [None]:
# Empty accumulator for the trend rows. The following cell rebinds df_total to
# the previously saved parquet, so this value only survives if that cell is skipped.
df_total = pd.DataFrame()

In [None]:
# Resume from the trend lines saved by a previous run. On a first run the file
# does not exist yet, so fall back to an empty frame that already carries the
# two columns the next cell reads.
try:
    df_total = pd.read_parquet("../data/processed/trend_values/trend_lines.parquet")
except FileNotFoundError:
    df_total = pd.DataFrame(columns=["ncm", "id_pais_origem"])

In [None]:
# Build one "ncm-id_pais_origem" string for every distinct pair present in df_total.
keys_processed = (
    df_total[["ncm", "id_pais_origem"]]
    .drop_duplicates()
    .assign(key=lambda pairs: pairs["ncm"].astype(str) + '-' + pairs["id_pais_origem"])
)
already_processed = keys_processed["key"].to_list()

In [None]:
# Keep only the records from years before 2024.
grouped_data = grouped_data[grouped_data["ano"] < 2024]

In [None]:
# Inspection only: list the columns available at this point.
grouped_data.columns

In [None]:
# Composite key: ncm, origin country, UF, municipality and urf joined with "-".
key_parts = [
    grouped_data["ncm"].astype(str),
    grouped_data["id_pais_origem"],
    grouped_data['importador_uf'],
    grouped_data['importador_municipio'],
    grouped_data['urf'],
]
composite_key = key_parts[0]
for part in key_parts[1:]:
    composite_key = composite_key + '-' + part
grouped_data["key"] = composite_key

In [None]:
# Number of rows per composite key, stored as a one-column frame named "key"
# whose (unnamed) index holds the key values.
# value_counts() labels its result differently across pandas versions (the
# Series is named after the source column in 1.x and "count" in 2.x), so both
# the column name and the index name are set explicitly.
count = grouped_data["key"].value_counts().rename("key").rename_axis(None).to_frame()
count["key"].describe()

In [None]:
# Keep only the keys that have at least `threshold_count` rows.
threshold_count = 4
# The counts are the frame's only column and the key values are its index.
# Reading them by position / from the index avoids relying on the labels that
# value_counts() and reset_index() generate, which differ across pandas versions.
count = count[count.iloc[:, 0] >= threshold_count]
keys_2_process = count.index.to_list()

In [None]:
# Drop the groups that already have a saved trend line, then keep only the
# keys with enough history.
# `already_processed` holds "ncm-id_pais_origem" strings, whereas the "key"
# column also contains UF, municipality and urf, so comparing the two directly
# does not exclude anything. The comparison is done on a key built in the same
# two-field format instead.
processed_key = grouped_data["ncm"].astype(str) + '-' + grouped_data["id_pais_origem"]
grouped_data = grouped_data[~processed_key.isin(already_processed)]
grouped_data = grouped_data[grouped_data["key"].isin(keys_2_process)]

In [None]:
# Inspection only: list the columns after the key was added and the filters applied.
grouped_data.columns

In [None]:
# Align the data with the reference semester grid, interpolate the average
# price along the column, then discard incomplete rows.
# The interpolation runs over the whole column (it is not done per key), and a
# grid semester without any matching record still has nulls in its other
# columns, so the final dropna removes it.
grouped_data_inter = gabarito_datas.merge(grouped_data, on=['ano_semestre'], how='left')
grouped_data_inter = grouped_data_inter.assign(
    avg_valor_item=grouped_data_inter["avg_valor_item"].interpolate()
).dropna(axis=0)

In [None]:
# Inspection only: first rows of the aligned frame, ordered by composite key.
grouped_data_inter.sort_values(by='key').head()

In [None]:
# Fit one linear trend per (origin country, ncm, municipality, urf) group and
# evaluate it on the target semesters held in df_new_data. The fitted rows are
# accumulated into df_total and saved to parquet every `checkpoint_every` groups.
#
# NOTE(review): the x axis is ano*100 + semestre, which is not evenly spaced
# (201802 -> 201901 is a step of 99, 201801 -> 201802 a step of 1), so the
# slope is distorted across year boundaries — consider a sequential semester index.
group_cols = ['id_pais_origem', 'ncm', 'importador_municipio', 'urf']
trend_path = "../data/processed/trend_values/trend_lines.parquet"
checkpoint_every = 200

file_count = 0
grouped = grouped_data_inter.groupby(group_cols)
groups_qtd = grouped.ngroups
# The history used for the fit comes from grouped_data, not from the aligned
# frame. Grouping it once avoids re-scanning the whole frame with a boolean
# mask for every group.
history_groups = grouped_data.groupby(group_cols)
processed_keys = set(already_processed)
# Fitted frames not yet merged into df_total; concatenating once per checkpoint
# avoids re-copying df_total on every iteration.
pending_trends = []

with tqdm(total=groups_qtd, desc="Creating a trend line for unity price") as pbar:
    for key, _ in grouped:
        # The key follows the order of group_cols: origin country first, then ncm.
        id_pais_origem, ncm = key[0], key[1]

        try:
            group_history = history_groups.get_group(key)
        except KeyError:
            # No history rows for this group: nothing to fit.
            pbar.update(1)
            continue

        df_aux_hist = group_history.groupby(["ano_semestre"], as_index=False).mean(numeric_only=True)

        # A straight line needs at least two points; already_processed entries
        # are "ncm-id_pais_origem" strings, so the lookup key uses that order.
        if len(df_aux_hist) >= 2 and f"{ncm}-{id_pais_origem}" not in processed_keys:

            df_aux_trend = df_new_data.copy()
            z = np.polyfit(df_aux_hist["ano_semestre"], df_aux_hist["avg_valor_item"], 1)
            p = np.poly1d(z)
            df_aux_trend["avg_valor_item"] = p(df_aux_trend["ano_semestre"])

            df_aux_trend['ncm'] = ncm
            df_aux_trend['id_pais_origem'] = id_pais_origem
            df_aux_trend["ano"] = df_aux_trend["ano"].astype(int)

            pending_trends.append(df_aux_trend)
            file_count += 1

            if file_count % checkpoint_every == 0:
                df_total = pd.concat([df_total, *pending_trends])
                pending_trends.clear()
                df_total.to_parquet(trend_path, index=False)

        pbar.update(1)

# Merge whatever was fitted after the last checkpoint so df_total is complete.
if pending_trends:
    df_total = pd.concat([df_total, *pending_trends])
    pending_trends.clear()

In [None]:
# Persist the accumulated trend rows, overwriting the parquet file (row index not written).
df_total.to_parquet(f"../data/processed/trend_values/trend_lines.parquet", index=False)

In [None]:
# Inspection only: the per-semester history left over from the last loop iteration.
df_aux_hist

In [None]:
# Join the two frames on their shared key columns. pd.concat takes a single
# list of objects and has no `on` argument; a key-based join is pd.merge.
# NOTE(review): `gabarito_aux` is not defined in any visible cell, and
# df_aux_hist is aggregated per ano_semestre in the loop — confirm both inputs
# actually carry 'ncm' and 'id_pais_origem' before relying on this cell.
pd.merge(gabarito_aux, df_aux_hist, on=['ano_semestre', 'ncm', 'id_pais_origem'])

# 2. Iteração sobre os grupos 

In [None]:
# Grab the first group only, for inspection: `key` and `df_group` stay bound
# after the break. An unfinished `df_group = ` assignment in the loop body was
# a syntax error and has been removed.
for key, df_group in grouped_data.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem']):
    break

In [None]:
# First UF value of the inspected group. Positional access is required: the
# group keeps the row labels of grouped_data, so label 0 is not guaranteed to
# be present in it.
df_group['importador_uf'].iloc[0]

In [None]:
# Year x semester grid for 2018-2023, one row per (ano, semestre) pair.
# This rebinds gabarito_datas, which earlier held only the encoded ano_semestre column.
years_df = pd.DataFrame({"ano": list(range(2018, 2024))})
semesters_df = pd.DataFrame({"semestre": [1, 2]})
gabarito_datas = years_df.merge(semesters_df, how="cross")

In [None]:
# Pair every row of unique_combinations with every (ano, semestre) row of the grid.
# NOTE(review): `unique_combinations` is not defined in any visible cell — on a
# fresh kernel this raises NameError; confirm where it is meant to be built.
gabarito_comb = unique_combinations.join(gabarito_datas, how="cross")
gabarito_comb.head()

In [None]:
# Left-join the observed data onto the full combination x semester grid, so
# grid rows without a matching record appear with missing values.
merge_keys = ['ncm', 'importador_uf', 'importador_municipio', 'urf', 'id_pais_origem', 'ano', 'semestre']
df_2b_filled = pd.merge(gabarito_comb, grouped_data, how="left", on=merge_keys)
df_2b_filled.head(15)

In [None]:
# New frame whose average price has its gaps filled by interpolating along the
# column; df_2b_filled itself is left untouched.
df_filled = df_2b_filled.assign(
    avg_valor_item=df_2b_filled["avg_valor_item"].interpolate()
)

In [None]:
# Inspection only: last 15 rows of the interpolated frame.
df_filled.tail(15)

## 3. Inference of the value for the first semesters of 2024

### 3.1 Inference using linear interpolation

In [None]:
# Year x semester grid extended to 2024, crossed with every row of
# unique_combinations. Rebinds years_df, semesters_df, gabarito_datas and gabarito_comb.
years_df = pd.DataFrame({"ano": list(range(2018, 2025))})
semesters_df = pd.DataFrame({"semestre": [1, 2]})
gabarito_datas = years_df.merge(semesters_df, how="cross")
gabarito_comb = unique_combinations.join(gabarito_datas, how="cross")

In [None]:
# Left-join the filled history onto the extended grid.
# NOTE(review): this join uses 'name_pt' as a key, while the earlier merge that
# produced df_2b_filled used 'id_pais_origem' — confirm both frames actually
# carry a 'name_pt' column.
df_2b_infer = gabarito_comb.merge(df_filled, how="left", on=['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt', 'ano', 'semestre'])
df_2b_infer.head()

In [None]:
# Fill missing averages by interpolating along the column; this is done over
# the whole frame, not separately per combination.
df_2b_infer["avg_valor_item"] = df_2b_infer["avg_valor_item"].interpolate()

In [None]:
# Text label made by concatenating year and semester (e.g. "20181"); used as the x axis of the plot below.
df_2b_infer["anosem"] = df_2b_infer["ano"].astype(str) + df_2b_infer["semestre"].astype(str) 

In [None]:
# Print the values and draw a quick plot for the first group only.
first_group = next(
    iter(df_2b_infer.groupby(['ncm', 'importador_uf', 'importador_municipio', 'urf', 'name_pt'])),
    None,
)
if first_group is not None:
    _, df = first_group
    print(df["avg_valor_item"].values)
    df.plot(y="avg_valor_item", x="anosem")

### 3.1 Inference using linear interpolation