In [1]:
# Libraries
from pathlib import Path
import pandas as pd

In [2]:
# Load the two lookup tables used further down. Both files are
# semicolon-separated, use a comma as decimal mark and are latin1-encoded.
ncm_sh = pd.read_csv(
    "output/NCM_SH.csv",
    sep=";",
    decimal=",",
    encoding="latin1",
)
urf = pd.read_csv(
    "data/URF.csv",
    sep=";",
    decimal=",",
    encoding="latin1",
)

In [3]:
# Normalise CO_SH2 to a two-character, zero-padded string (going through the
# nullable Int64 dtype first), then build a CO_SH2 -> CO_NCM_SECROM lookup
# from the first row found for each CO_SH2 value.
ncm_sh["CO_SH2"] = (
    ncm_sh["CO_SH2"]
    .astype("Int64")
    .astype("string")
    .str.zfill(2)
)
sh2_secrom = (
    ncm_sh
    .drop_duplicates(subset=["CO_SH2"])
    .set_index("CO_SH2")["CO_NCM_SECROM"]
    .to_dict()
)

In [4]:
# Process IMP ComexStat base
def process_imp_comexstat(
    input_path: str = "data/IMP_COMPLETA.csv",
    output_path: str = "output/imp_comexstat.csv",
    sh2_secrom_map=None,
    urf_df=None,
    min_year: int = 2010,
    urf_prefix_len: int = 10,
):
    """
    Read the full IMP base, filter and enrich it, and save a cleaned import file.

    Steps:
    1) Keep only records with CO_ANO >= min_year
    2) Ensure CO_NCM is an 8-character zero-padded string
    3) Create CO_SH6, CO_SH4, CO_SH2 from the leading characters of CO_NCM
    4) Map CO_SH2 to CO_NCM_SECROM (only when sh2_secrom_map is given)
    5) Map CO_URF to NO_URF and strip the first urf_prefix_len characters of
       every mapped name (only when urf_df is given)
    6) Drop unused columns (CO_UNID, QT_ESTAT) if they are present
    7) Save to CSV, creating the output folder when needed

    Parameters
    ----------
    input_path : str
        Semicolon-separated, latin1-encoded CSV with at least the columns
        CO_ANO and CO_NCM (plus CO_URF when urf_df is given).
    output_path : str
        Destination CSV, written with the same separator, decimal mark and
        encoding as the input.
    sh2_secrom_map : dict-like, optional
        Lookup keyed by the two-character CO_SH2 string. Codes missing from
        the lookup end up as missing values in CO_NCM_SECROM.
    urf_df : pandas.DataFrame, optional
        Frame with the columns CO_URF and NO_URF.
    min_year : int, default 2010
        Earliest CO_ANO kept in the output.
    urf_prefix_len : int, default 10
        Number of leading characters removed from each mapped NO_URF value.

    Returns
    -------
    None
        The result is only written to output_path.

    Raises
    ------
    KeyError
        If a required column (CO_ANO, CO_NCM, or CO_URF / NO_URF when
        urf_df is given) is missing.
    """

    # Parse CO_NCM as text: letting pandas infer the dtype turns the column
    # into floats as soon as one value is blank, and the later string cast
    # would then produce codes such as "1012100.0".
    imp = pd.read_csv(
        input_path,
        sep=";",
        decimal=",",
        encoding="latin1",
        dtype={"CO_NCM": "string"},
    )

    # Filter by year; copy so the column assignments below act on an
    # independent frame rather than on a slice of the full base.
    imp = imp[imp["CO_ANO"] >= min_year].copy()

    # Normalize CO_NCM as an 8-character string (restores dropped leading zeros)
    imp["CO_NCM"] = imp["CO_NCM"].astype("string").str.zfill(8)

    # Derive the SH6 / SH4 / SH2 codes from the leading characters of CO_NCM
    for column, width in (("CO_SH6", 6), ("CO_SH4", 4), ("CO_SH2", 2)):
        imp[column] = imp["CO_NCM"].str[:width]

    # Map SH2 -> SECROM section code (CO_NCM_SECROM)
    if sh2_secrom_map is not None:
        imp["CO_NCM_SECROM"] = imp["CO_SH2"].map(sh2_secrom_map)

    # Map CO_URF -> NO_URF and strip the leading prefix of each name
    if urf_df is not None:
        urf_map = urf_df.set_index("CO_URF")["NO_URF"].to_dict()
        imp["NO_URF"] = imp["CO_URF"].map(urf_map)

        # Only touch rows that found a name, so unmatched codes stay missing
        # instead of becoming a sliced "nan" string.
        mask = imp["NO_URF"].notna()
        imp.loc[mask, "NO_URF"] = (
            imp.loc[mask, "NO_URF"].astype(str).str[urf_prefix_len:]
        )

    # Drop unused columns (ignored when absent)
    imp = imp.drop(columns=["CO_UNID", "QT_ESTAT"], errors="ignore")

    # Save final file
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    imp.to_csv(output_path, sep=";", decimal=",", index=False, encoding="latin1")


# Run the import pipeline: the first two arguments are the input and output
# paths, followed by the lookups prepared in the earlier cells.
process_imp_comexstat(
    "data/IMP_COMPLETA.csv",
    "output/imp_comexstat.csv",
    sh2_secrom_map=sh2_secrom,
    urf_df=urf,
)