#### Engenharia de recursos (Feature engineering)

*Objetivo: criar recursos adequados para prever a concentração de CO na próxima hora.*

In [46]:
import numpy as np
import pandas as pd


In [47]:
def load_data(file="../datasets/air_quality_uci.csv"):
    """Load the CO sensor and relative-humidity time series from a CSV file.

    Only the ``Date_Time``, ``CO_sensor`` and ``RH`` columns are read.
    ``Date_Time`` becomes a ``DatetimeIndex`` parsed with the
    day/month/year format ``%d/%m/%Y %H:%M:%S``.

    Parameters
    ----------
    file : str or path-like, optional
        Location of the CSV file. Defaults to the dataset path used
        throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by timestamp, restricted to 2004-04-01 through
        2005-04-30 (inclusive), keeping only rows where both
        ``CO_sensor`` and ``RH`` are non-negative.
    """
    data = pd.read_csv(
        file,
        sep=",",
        usecols=["Date_Time", "CO_sensor", "RH"],
        index_col=["Date_Time"],
    )
    data.index = pd.to_datetime(data.index, format="%d/%m/%Y %H:%M:%S")

    # Sort before slicing: label-based slicing on a DatetimeIndex needs a
    # monotonic index to select the date range reliably.
    data = data.sort_index().loc["2004-04-01":"2005-04-30"]

    # Drop rows where either reading is negative.
    is_valid = (data["CO_sensor"] >= 0) & (data["RH"] >= 0)
    return data.loc[is_valid]

In [48]:
# Load the filtered series and preview its first five rows.
data = load_data()
data.head(n=5)

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-04-01 00:00:00,1143.0,61.6
2004-04-01 01:00:00,1044.0,63.9
2004-04-01 02:00:00,1034.0,67.2
2004-04-01 03:00:00,956.0,73.1
2004-04-01 04:00:00,909.0,66.6


In [49]:
# Derive calendar features from the DatetimeIndex.
timestamps = data.index

data["Month"] = timestamps.month
data["Week"] = timestamps.isocalendar().week
data["Day"] = timestamps.day
data["Day_of_week"] = timestamps.day_of_week
data["Hour"] = timestamps.hour

# Flag weekends: day_of_week counts Monday as 0, so values above 4
# are Saturday and Sunday.
weekend_mask = data["Day_of_week"] > 4
data["is_weekend"] = np.where(weekend_mask, 1, 0)
data.head()

Unnamed: 0_level_0,CO_sensor,RH,Month,Week,Day,Day_of_week,Hour,is_weekend
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-04-01 00:00:00,1143.0,61.6,4,14,1,3,0,0
2004-04-01 01:00:00,1044.0,63.9,4,14,1,3,1,0
2004-04-01 02:00:00,1034.0,67.2,4,14,1,3,2,0
2004-04-01 03:00:00,956.0,73.1,4,14,1,3,3,0
2004-04-01 04:00:00,909.0,66.6,4,14,1,3,4,0


In [50]:
# Recursos de Atraso (Lag features)

# Os recursos de atraso são valores passados da variável que podemos usar para prever valores futuros.


# Recursos de atraso para prever a concentração de poluentes da próxima hora:

# - A concentração de poluentes da hora anterior (t-1).
# - A concentração de poluentes para a mesma hora do dia anterior (t-24).

# O raciocínio por detrás disto é que as concentrações de poluentes não mudam rapidamente e, 
# como demonstrado anteriormente, têm uma sazonalidade de 24 horas.

# Cuidado: não existem valores para todos os timestamps.
# Por segurança, deslocar os dados usando a frequência do pandas (shift com freq), e não por posição.

In [51]:
# Raw time series for which lag features are built.
variables = ["CO_sensor", "RH"]

# Shift the timestamps forward by 1 hour, so the value observed at t-1 is
# labelled t. Shifting by frequency (not by row position) keeps the lag
# correct even when some hourly timestamps are missing.
# The lowercase "h" alias is used because the uppercase "H" alias is
# deprecated since pandas 2.2.
tmp = data[variables].shift(freq="1h")
print(tmp.head())

# Names for the new variables.
tmp.columns = [f"{v}_lag_1" for v in variables]
print(tmp.head())

# Add the variables to the original data.
print("Antes")
print(data.shape)

# Left join on the index: every original row is kept, and rows whose
# previous hour is absent get NaN in the lag columns.
data_l1 = data.merge(tmp, left_index=True, right_index=True, how="left")

print("Depois")
print(data_l1.shape)

data_l1.head()

                     CO_sensor    RH
Date_Time                           
2004-04-01 01:00:00     1143.0  61.6
2004-04-01 02:00:00     1044.0  63.9
2004-04-01 03:00:00     1034.0  67.2
2004-04-01 04:00:00      956.0  73.1
2004-04-01 05:00:00      909.0  66.6
                     CO_sensor_lag_1  RH_lag_1
Date_Time                                     
2004-04-01 01:00:00           1143.0      61.6
2004-04-01 02:00:00           1044.0      63.9
2004-04-01 03:00:00           1034.0      67.2
2004-04-01 04:00:00            956.0      73.1
2004-04-01 05:00:00            909.0      66.6
Antes
(8481, 8)
Depois
(8481, 10)


Unnamed: 0_level_0,CO_sensor,RH,Month,Week,Day,Day_of_week,Hour,is_weekend,CO_sensor_lag_1,RH_lag_1
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2004-04-01 00:00:00,1143.0,61.6,4,14,1,3,0,0,,
2004-04-01 01:00:00,1044.0,63.9,4,14,1,3,1,0,1143.0,61.6
2004-04-01 02:00:00,1034.0,67.2,4,14,1,3,2,0,1044.0,63.9
2004-04-01 03:00:00,956.0,73.1,4,14,1,3,3,0,1034.0,67.2
2004-04-01 04:00:00,909.0,66.6,4,14,1,3,4,0,956.0,73.1


In [53]:
# Show each CO reading next to its 1-hour lag.
data_l1.loc[:, ["CO_sensor", "CO_sensor_lag_1"]].head()

Unnamed: 0_level_0,CO_sensor,CO_sensor_lag_1
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-04-01 00:00:00,1143.0,
2004-04-01 01:00:00,1044.0,1143.0
2004-04-01 02:00:00,1034.0,1044.0
2004-04-01 03:00:00,956.0,1034.0
2004-04-01 04:00:00,909.0,956.0


In [55]:
# Count missing values per column; the lag columns are NaN wherever the
# timestamp one hour earlier is absent from the index.
data_l1.isna().sum()

CO_sensor           0
RH                  0
Month               0
Week                0
Day                 0
Day_of_week         0
Hour                0
is_weekend          0
CO_sensor_lag_1    17
RH_lag_1           17
dtype: int64

In [None]:
# Now we repeat the exercise, but this time
# the values are moved forward 24 hours.

# Move forward 24 hrs, so the value observed at t-24 is labelled t.
# The lowercase "h" alias is used because the uppercase "H" alias is
# deprecated since pandas 2.2.
tmp = data[variables].shift(freq="24h")

# Rename the variables.
tmp.columns = [f"{v}_lag_24" for v in variables]

# Left join onto the frame that already carries the 1-hour lags.
data_l2 = data_l1.merge(tmp, left_index=True, right_index=True, how="left")

# Inspect the merged frame: the lag columns exist only in data_l2, so
# selecting them from `data` would raise a KeyError.
data_l2[["CO_sensor", "CO_sensor_lag_24"]].head(25)