In [13]:
# Importar pandas y otras bibliotecas necesarias
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('AirQualityUCIFase_2.csv')

# Drop the empty columns 'Unnamed: 15' and 'Unnamed: 16';
# errors='ignore' makes this a no-op when they are not present.
df_clean = df.drop(columns=['Unnamed: 15', 'Unnamed: 16'], errors='ignore')

# Separate the features from the target variable
X = df_clean.drop(columns=['CO(GT)'])  # features
y = df_clean['CO(GT)']  # target

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature preprocessing.
# Select every numeric column rather than only float64: a column that pandas
# parsed as an integer dtype would otherwise be left out of the transformer
# and silently discarded (ColumnTransformer drops unlisted columns by default).
numeric_features = X.select_dtypes(include='number').columns

# NOTE(review): SimpleImputer only fills NaN by default. If this CSV still
# encodes missing readings with a sentinel value (the UCI Air Quality data
# conventionally uses -200), they will not be imputed -- confirm upstream cleaning.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        # More transformers for other columns can be added here if needed
    ])

# Fit the preprocessing on the training split only, then reuse the fitted
# statistics on the test split so no test information leaks into the fit.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Rebuild DataFrames from the processed arrays and attach the target.
# The processed arrays are positional, so the new frames get a fresh 0..n-1
# index; `.values` attaches the target by position instead of aligning on the
# shuffled original index carried by y_train / y_test.
processed_train_df = pd.DataFrame(X_train_processed, columns=numeric_features)
processed_train_df['CO(GT)'] = y_train.values

processed_test_df = pd.DataFrame(X_test_processed, columns=numeric_features)
processed_test_df['CO(GT)'] = y_test.values

# Save the processed datasets as CSV files without the index column
processed_train_df.to_csv('Processed_AirQualityUCI_train.csv', index=False)
processed_test_df.to_csv('Processed_AirQualityUCI_test.csv', index=False)



In [16]:
# Example: building the feature-engineering pipeline

import joblib
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns routed to the categorical and to the numeric branch
categorical_features = ['Date', 'Time']

numeric_features = [
    'PT08.S1(CO)',
    'NMHC(GT)',
    'C6H6(GT)',
    'PT08.S2(NMHC)',
    'NOx(GT)',
    'PT08.S3(NOx)',
    'NO2(GT)',
    'PT08.S4(NO2)',
    'PT08.S5(O3)',
    'T',
    'RH',
    'AH',
]

# Numeric branch: fill gaps with the column mean, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches into a single preprocessor
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training features. X_train is not defined in this cell; it must
# already exist in the kernel (it is produced by the train/test split cell).
X_train_processed = preprocessor.fit_transform(X_train)

# Persist the fitted preprocessor to disk
joblib.dump(preprocessor, 'feature_engineering_pipeline.pkl')


['feature_engineering_pipeline.pkl']