#### Refatorando engenharia de recursos (Feature engineering)

*Objetivo: Refatorar o experimento 2 para alinhar as etapas de extracao de recursos em um pipeline scikit-learn.*

Recursos a partir da concentração horária de CO:
- Recursos de data e hora
- Recursos de atraso
- Recursos da janela
- Recursos cíclicos
- Remover dados ausentes


In [1]:
import pandas as pd

from feature_engine.creation import CyclicalFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.timeseries.forecasting import (
    LagFeatures,
    WindowFeatures,
)

from sklearn.pipeline import Pipeline

In [2]:
# Carregar os dados (selecionar colunas, converter index para datetime, ordenar, restrição de datas, remoção de outliers)
def load_data():

    file = "../datasets/air_quality_uci.csv"

    data = pd.read_csv(
        file,
        sep=",",
        usecols=["Date_Time", "CO_sensor", "RH"],
        index_col=["Date_Time"],
    )
    data.index = pd.to_datetime(data.index, format='%d/%m/%Y %H:%M:%S')
    data.columns = data.columns.str.lower()
    data.index.name = data.index.name.lower()

    data.sort_index(inplace=True)
    data = data.loc["2004-04-01":"2005-04-30"]
    data = data.loc[(data["co_sensor"] >= 0) & (data["rh"] >= 0)]

    return data

In [3]:
data = load_data()
data.head()

Unnamed: 0_level_0,co_sensor,rh
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-04-01 00:00:00,1143.0,61.6
2004-04-01 01:00:00,1044.0,63.9
2004-04-01 02:00:00,1034.0,67.2
2004-04-01 03:00:00,956.0,73.1
2004-04-01 04:00:00,909.0,66.6


In [4]:
# Extrair recursos de data/hora automaticamente utilizando feature-engine

dtf = DatetimeFeatures(
    variables="index",
    
    features_to_extract=[
        "month",
        "week",
        "day_of_week",
        "day_of_month",
        "hour",
        "weekend",
    ],
)

data = dtf.fit_transform(data)
data.head()

Unnamed: 0_level_0,co_sensor,rh,month,week,day_of_week,day_of_month,hour,weekend
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-04-01 00:00:00,1143.0,61.6,4,14,3,1,0,0
2004-04-01 01:00:00,1044.0,63.9,4,14,3,1,1,0
2004-04-01 02:00:00,1034.0,67.2,4,14,3,1,2,0
2004-04-01 03:00:00,956.0,73.1,4,14,3,1,3,0
2004-04-01 04:00:00,909.0,66.6,4,14,3,1,4,0


In [170]:
# Criar recursos de Atraso (Lag features) utilizando feature-engine

# Recursos defasados:
# - A concentração de poluentes da hora anterior (t-1).
# - A concentração de poluentes para a mesma hora do dia anterior (t-24).

In [7]:
# adicionar recursos de Atraso (Lag features) utilizando feature-engine:
# - A concentração de poluentes da hora anterior (t-1).
# - A concentração de poluentes para a mesma hora do dia anterior (t-24).

lagf = LagFeatures(
    variables=["co_sensor", "rh"],
    freq=["1H", "24H"],
    missing_values="ignore",
)

data = lagf.fit_transform(data)
data[[v for v in data.columns if "lag" in v]].head(25)

Unnamed: 0_level_0,co_sensor_lag_1H_x,rh_lag_1H_x,co_sensor_lag_24H_x,rh_lag_24H_x,co_sensor_lag_1H_y,rh_lag_1H_y,co_sensor_lag_24H_y,rh_lag_24H_y,co_sensor_lag_1H,rh_lag_1H,co_sensor_lag_24H,rh_lag_24H
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-04-01 00:00:00,,,,,,,,,,,,
2004-04-01 01:00:00,1143.0,61.6,,,1143.0,61.6,,,1143.0,61.6,,
2004-04-01 02:00:00,1044.0,63.9,,,1044.0,63.9,,,1044.0,63.9,,
2004-04-01 03:00:00,1034.0,67.2,,,1034.0,67.2,,,1034.0,67.2,,
2004-04-01 04:00:00,956.0,73.1,,,956.0,73.1,,,956.0,73.1,,
2004-04-01 05:00:00,909.0,66.6,,,909.0,66.6,,,909.0,66.6,,
2004-04-01 06:00:00,996.0,63.7,,,996.0,63.7,,,996.0,63.7,,
2004-04-01 07:00:00,1154.0,68.8,,,1154.0,68.8,,,1154.0,68.8,,
2004-04-01 08:00:00,1510.0,69.6,,,1510.0,69.6,,,1510.0,69.6,,
2004-04-01 09:00:00,1722.0,60.9,,,1722.0,60.9,,,1722.0,60.9,,


In [171]:
# Os valores são adiantados em 1 horas.
variables = ["co_sensor", "rh"]

# Avance 1 hr.
tmp = data[variables].shift(freq="1H")

# Renomear as colunas para indicar o avanço de 1hr
tmp.columns = [f"{v}_lag_1" for v in variables]

data_l1 = data.merge(tmp, left_index=True, right_index=True, how="left")
data_l1.head()

Unnamed: 0_level_0,co_sensor,rh,month,week,day,day_of_week,hour,is_weekend,co_sensor_lag_1,rh_lag_1
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2004-04-01 00:00:00,1143.0,61.6,4,14,1,3,0,0,,
2004-04-01 01:00:00,1044.0,63.9,4,14,1,3,1,0,1143.0,61.6
2004-04-01 02:00:00,1034.0,67.2,4,14,1,3,2,0,1044.0,63.9
2004-04-01 03:00:00,956.0,73.1,4,14,1,3,3,0,1034.0,67.2
2004-04-01 04:00:00,909.0,66.6,4,14,1,3,4,0,956.0,73.1


In [172]:
data_l1.isnull().sum()

co_sensor           0
rh                  0
month               0
week                0
day                 0
day_of_week         0
hour                0
is_weekend          0
co_sensor_lag_1    17
rh_lag_1           17
dtype: int64

In [173]:
# Os valores são adiantados 24 horas.

# Avance 24 hrs.
tmp = data[variables].shift(freq="24H")

# Renomear as colunas para indicar o avanço de 24hrs
tmp.columns = [f"{v}_lag_24" for v in variables]

data_l2 = data_l1.merge(tmp, left_index=True, right_index=True, how="left")
data_l2[["co_sensor", "co_sensor_lag_1", "co_sensor_lag_24"]].head(25)

Unnamed: 0_level_0,co_sensor,co_sensor_lag_1,co_sensor_lag_24
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-04-01 00:00:00,1143.0,,
2004-04-01 01:00:00,1044.0,1143.0,
2004-04-01 02:00:00,1034.0,1044.0,
2004-04-01 03:00:00,956.0,1034.0,
2004-04-01 04:00:00,909.0,956.0,
2004-04-01 05:00:00,996.0,909.0,
2004-04-01 06:00:00,1154.0,996.0,
2004-04-01 07:00:00,1510.0,1154.0,
2004-04-01 08:00:00,1722.0,1510.0,
2004-04-01 09:00:00,1512.0,1722.0,


In [174]:
# Neste procedimento, introduzimos falta dados sempre que não havia dados disponíveis nas 24 horas anteriores.

data_l2.isnull().sum()

co_sensor             0
rh                    0
month                 0
week                  0
day                   0
day_of_week           0
hour                  0
is_weekend            0
co_sensor_lag_1      17
rh_lag_1             17
co_sensor_lag_24    221
rh_lag_24           221
dtype: int64

In [175]:
## Recursos da janela - Window features

# Os recursos de janela são cálculos estatisticos dos valores dos recursos em uma janela de tempo predefinida, 
# antes do horário que queremos prever.

# Ex: média dos 3 valores anteriores do TS para prever o valor atual.

tmp = (
    data[variables]
    .rolling(window="3H")
    .mean()  # calcular a média móvel (janela de 3 horas)
    .shift(freq="1H")  # deslocar 1 hora para frente.
)

# Renomear colunas
tmp.columns = [f"{v}_window" for v in variables]
data_lw = data_l2.merge(tmp, left_index=True, right_index=True, how="left")

data_lw[["co_sensor", "co_sensor_window"]].head()



Unnamed: 0_level_0,co_sensor,co_sensor_window
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-04-01 00:00:00,1143.0,
2004-04-01 01:00:00,1044.0,1143.0
2004-04-01 02:00:00,1034.0,1093.5
2004-04-01 03:00:00,956.0,1073.666667
2004-04-01 04:00:00,909.0,1011.333333


In [176]:
# Manual calculations result
print((1215 + 1224) / 2)
print((1115 + 1215 + 1224) / 3)

1219.5
1184.6666666666667


In [177]:
## Recursos periódicos

# Alguns recursos são periódicos. Por exemplo, horas, meses e dias.

# Podemos codificar esses recursos periódicos usando uma transformação de seno e cosseno com o período do recurso.
# Isso fará com que os valores dos recursos que estão distantes se aproximem. Por exemplo, dezembro (12) está mais 
# próximo de janeiro (1) do que de junho (6). Essa relação não é capturada pela representação numérica desses 
# recursos. Mas poderíamos mudar isso, se transformássemos estas variáveis com seno e cosseno.

# Discutiremos essa técnica mais adiante no curso. Por enquanto, vamos criar esses recursos automaticamente com a 
# biblioteca de código aberto Feature-engine.

In [179]:
# Criar recursos cíclicos com Feature-engine

cyclical = CyclicalFeatures(
    variables=["month", "hour"],  # colunas para transformar.
    drop_original=False,  # deletar as colunas originais.
)

data_lw = cyclical.fit_transform(data_lw)


In [181]:
cyclical_vars = [var for var in data_lw.columns if "sin" in var or "cos" in var]

data_lw[cyclical_vars].head(25)

Unnamed: 0_level_0,month_sin,month_cos,hour_sin,hour_cos
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-04-01 00:00:00,0.866025,-0.5,0.0,1.0
2004-04-01 01:00:00,0.866025,-0.5,0.2697968,0.962917
2004-04-01 02:00:00,0.866025,-0.5,0.519584,0.854419
2004-04-01 03:00:00,0.866025,-0.5,0.730836,0.682553
2004-04-01 04:00:00,0.866025,-0.5,0.8878852,0.460065
2004-04-01 05:00:00,0.866025,-0.5,0.9790841,0.203456
2004-04-01 06:00:00,0.866025,-0.5,0.9976688,-0.068242
2004-04-01 07:00:00,0.866025,-0.5,0.9422609,-0.33488
2004-04-01 08:00:00,0.866025,-0.5,0.8169699,-0.57668
2004-04-01 09:00:00,0.866025,-0.5,0.6310879,-0.775711


In [183]:
data_lw.isnull().sum() / len(data_lw)

co_sensor           0.000000
rh                  0.000000
month               0.000000
week                0.000000
day                 0.000000
day_of_week         0.000000
hour                0.000000
is_weekend          0.000000
co_sensor_lag_1     0.002004
rh_lag_1            0.002004
co_sensor_lag_24    0.026058
rh_lag_24           0.026058
co_sensor_window    0.002004
rh_window           0.002004
month_sin           0.000000
month_cos           0.000000
hour_sin            0.000000
hour_cos            0.000000
dtype: float64

In [184]:
print("data size before")
print(data_lw.shape)

data_lw.dropna(inplace=True)

print("data size after")
print(data_lw.shape)

data size before
(8481, 18)
data size after
(8251, 18)


In [186]:
data_lw.drop("rh", inplace=True, axis=1)

data_lw.head()

Unnamed: 0_level_0,co_sensor,month,week,day,day_of_week,hour,is_weekend,co_sensor_lag_1,rh_lag_1,co_sensor_lag_24,rh_lag_24,co_sensor_window,rh_window,month_sin,month_cos,hour_sin,hour_cos
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2004-04-02 00:00:00,1139.0,4,14,2,4,0,0,1203.0,52.0,1143.0,61.6,1193.666667,48.0,0.866025,-0.5,0.0,1.0
2004-04-02 01:00:00,1072.0,4,14,2,4,1,0,1139.0,54.4,1044.0,63.9,1176.0,51.466667,0.866025,-0.5,0.269797,0.962917
2004-04-02 02:00:00,954.0,4,14,2,4,2,0,1072.0,58.8,1034.0,67.2,1138.0,55.066667,0.866025,-0.5,0.519584,0.854419
2004-04-02 03:00:00,951.0,4,14,2,4,3,0,954.0,60.9,956.0,73.1,1055.0,58.033333,0.866025,-0.5,0.730836,0.682553
2004-04-02 04:00:00,926.0,4,14,2,4,4,0,951.0,64.3,909.0,66.6,992.333333,61.333333,0.866025,-0.5,0.887885,0.460065


In [187]:
# Salvar dataset pre-processado

data_lw.to_csv("../datasets/air_quality_preprocessed.csv", index=True)