# 01 – Cleaning

Notebook de limpieza sobre `dataset.csv`.


In [1]:
from pathlib import Path
import sys

# Locate the project root: the closest directory, starting at the current
# working directory and walking up through its parents, that holds a `src`
# folder. `next(..., None)` yields None when no such directory exists.
cwd = Path().resolve()
PROJECT_ROOT = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "src").is_dir()),
    None,
)

if PROJECT_ROOT is None:
    raise RuntimeError("No se encontró carpeta 'src' en la jerarquía.")

# Register the root on sys.path only when it is missing, so re-running this
# cell does not add duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Imported after the path setup above so that `src` can be resolved from the
# project root. The settings module exposes its own PROJECT_ROOT, aliased here
# to keep it distinct from the one computed in this cell.
from src.config.settings import (
    PROJECT_ROOT as CFG_PROJECT_ROOT,
    RAW_DATA_PATH,
    PROCESSED_DIR,
    CLEAN_DATASET_NAME,
)

# Echo the configured locations as a quick sanity check.
for label, location in (
    ("PROJECT_ROOT:", CFG_PROJECT_ROOT),
    ("RAW_DATA_PATH:", RAW_DATA_PATH),
    ("OUTPUT_PATH :", PROCESSED_DIR / CLEAN_DATASET_NAME),
):
    print(label, location)


PROJECT_ROOT: D:\Users\dhcertug\OneDrive - Crystal S.A.S\Documentos\HOME\00_PERSONAL\02_CURSOS\PROYECTO\Proyecto_analisis_intermedio_udea\src
RAW_DATA_PATH: D:\Users\dhcertug\OneDrive - Crystal S.A.S\Documentos\HOME\00_PERSONAL\02_CURSOS\PROYECTO\Proyecto_analisis_intermedio_udea\src\data\raw\dataset.csv
OUTPUT_PATH : D:\Users\dhcertug\OneDrive - Crystal S.A.S\Documentos\HOME\00_PERSONAL\02_CURSOS\PROYECTO\Proyecto_analisis_intermedio_udea\src\data\processed\dataset_cleaned.csv


## Imports y configuración básica

Usamos solo las librerías necesarias para inspección y el pipeline de limpieza.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.load_data import load_raw_dataset
from src.data.preprocess import clean_dataset

# Display options: show up to 100 columns, and render floats with thousands
# separators and two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:,.2f}".format)

# `sns.set` is only an alias of `set_theme`, which seaborn documents as the
# preferred interface (the alias may be removed in a future release).
sns.set_theme(style="whitegrid")


## Cargar dataset crudo

Leemos el dataset original desde `data/raw`.


In [3]:

# Read the raw dataset through the project loader.
data_raw = load_raw_dataset()

# Report (rows, columns), then preview the first five rows as the cell output.
print(data_raw.shape)
data_raw.head(n=5)


  return pd.read_csv(path)


(437968, 32)


Unnamed: 0,anio_mes,semana_anio,Tur,planta_id,seccion_id,maq_id,Tipo_TEJ,Tecnologia,Pas,C,producto_id,categoria_producto,estilo_id,Tal,Col,Tal_Fert,Col_Fert,Componentes,g_art_id,mp_id,MP,mp_categoria,Co_Dano,Descr_Dano,Gr_Dano_Dano,Gr_Dano_Secc,Und_1a,Und_2a,Tipo_2a,Rechazo_comp,rechazo_flag,Reprogramado
0,2025-03,10,1,1,7,394,Liso,GOAL,55.0,Nac,209,GLXT1,3,7,9905,7,9905,5,17,1,ALG,ALG,190.0,Segunda no digitada PLC,TEJ,TEJ,109.0,3.0,2a Rep,,0,
1,2025-03,10,3,1,7,394,Liso,GOAL,55.0,Nac,209,GLXT1,3,7,7047,7,9905,5,17,1,ALG,ALG,,,,,70.0,0.0,,2.0,1,
2,2025-03,10,1,1,7,394,Liso,GOAL,55.0,Nac,209,GLXT1,3,7,7047,7,9905,5,17,1,ALG,ALG,,,,,14.0,0.0,,2.0,1,
3,2023-11,47,1,1,7,288,Acanalado,SILV,56.0,Exp,109,FIIUS,10,1,961,1,9905,1,12,1,ALG,ALG,,,,,122.0,0.0,,3.0,1,Por Segundas
4,2024-05,19,1,1,7,288,Acanalado,SILV,56.0,Exp,109,FIMUS,10,2,961,2,9905,1,12,1,ALG,ALG,,,,,20.0,0.0,,3.0,1,


## Aplicar pipeline de limpieza

Se ejecuta `clean_dataset`, que consolida los siguientes pasos:

- Conversión de `anio_mes` a `datetime`.
- Imputación de `Und_1a`, `Und_2a`, `Rechazo_comp`.
- Relleno de `Tipo_2a` con `"Unknown"`.
- Eliminación de `Reprogramado` (si existe).
- Drop de filas con `C`, `MP`, `mp_categoria` nulos.
- Construcción de `total_und` y `Und_2a_percentage`.
- Drop de columnas de daño y `anio_mes`.
- Imputación de `Tecnologia`, `Pas`, `rechazo_flag`.


In [4]:
# Run the project's cleaning pipeline on the raw frame and bind the result
# to `data`, the name the save step below expects.
data = clean_dataset(data_raw)

# Column dtypes and non-null counts, followed by a five-row preview.
data.info()
data.head(n=5)


<class 'pandas.core.frame.DataFrame'>
Index: 364832 entries, 0 to 398006
Data columns (total 27 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   semana_anio         364832 non-null  int64  
 1   Tur                 364832 non-null  int64  
 2   planta_id           364832 non-null  int64  
 3   seccion_id          364832 non-null  int64  
 4   maq_id              364832 non-null  int64  
 5   Tipo_TEJ            364832 non-null  object 
 6   Tecnologia          364832 non-null  object 
 7   Pas                 364832 non-null  float64
 8   C                   364832 non-null  object 
 9   producto_id         364832 non-null  int64  
 10  categoria_producto  364832 non-null  object 
 11  estilo_id           364832 non-null  int64  
 12  Tal                 364832 non-null  int64  
 13  Col                 364832 non-null  int64  
 14  Tal_Fert            364832 non-null  int64  
 15  Col_Fert            364832 non-null  in

Unnamed: 0,semana_anio,Tur,planta_id,seccion_id,maq_id,Tipo_TEJ,Tecnologia,Pas,C,producto_id,categoria_producto,estilo_id,Tal,Col,Tal_Fert,Col_Fert,Componentes,g_art_id,mp_id,MP,mp_categoria,Und_1a,Und_2a,Rechazo_comp,rechazo_flag,total_und,Und_2a_percentage
0,10,1,1,7,394,Liso,GOAL,55.0,Nac,209,GLXT1,3,7,9905,7,9905,5,17,1,ALG,ALG,109.0,3.0,0.0,0,112.0,0.03
1,10,3,1,7,394,Liso,GOAL,55.0,Nac,209,GLXT1,3,7,7047,7,9905,5,17,1,ALG,ALG,70.0,0.0,2.0,1,70.0,0.0
2,10,1,1,7,394,Liso,GOAL,55.0,Nac,209,GLXT1,3,7,7047,7,9905,5,17,1,ALG,ALG,14.0,0.0,2.0,1,14.0,0.0
3,47,1,1,7,288,Acanalado,SILV,56.0,Exp,109,FIIUS,10,1,961,1,9905,1,12,1,ALG,ALG,122.0,0.0,3.0,1,122.0,0.0
4,19,1,1,7,288,Acanalado,SILV,56.0,Exp,109,FIMUS,10,2,961,2,9905,1,12,1,ALG,ALG,20.0,0.0,3.0,1,20.0,0.0


## Guardar dataset limpio

Persistimos el resultado en `data/processed/dataset_cleaned.csv` para su uso en EDA y modelos.


In [5]:
# Destination file for the cleaned dataset.
output_path = PROCESSED_DIR / CLEAN_DATASET_NAME

# Create the output folder (and any missing parents); no error if it exists.
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

# Write the frame without its index column, then show the path as the cell output.
data.to_csv(output_path, index=False)
output_path


WindowsPath('D:/Users/dhcertug/OneDrive - Crystal S.A.S/Documentos/HOME/00_PERSONAL/02_CURSOS/PROYECTO/Proyecto_analisis_intermedio_udea/src/data/processed/dataset_cleaned.csv')