In [5]:
import numpy as np
import pandas as pd
from feature_engine.creation import CyclicalFeatures
from sklearn.preprocessing import MinMaxScaler

## Load data

In [6]:
# This function summarizes the various steps in
# the previous notebook.

def load_hourly_data():

    filename = "gams_indoor.csv"
    # Load data:
    data = pd.read_csv(
        filename,
        parse_dates=["ts"],
        index_col=["ts"],

            )
    cols_to_scale = data.columns[:]

    # Create a scaler object and fit it on the data
    scaler = MinMaxScaler()
    scaler.fit(data[cols_to_scale])

    # Transform the data and replace the original columns in-place
    data[cols_to_scale] = scaler.transform(data[cols_to_scale])
        # Load data: only the time variable and CO.
    
    data = data.resample('H').mean()
    # Sanity: sort index.
    data.sort_index(inplace=True)
    return data

def load_15min_data():

    filename = "../Datasets/gams_indoor.csv"
    # Load data:
    data = pd.read_csv(
        filename,
        parse_dates=["ts"],
        index_col=["ts"],

            )
    cols_to_scale = data.columns[:]

    # Create a scaler object and fit it on the data
    scaler = MinMaxScaler()
    scaler.fit(data[cols_to_scale])

    # Transform the data and replace the original columns in-place
    data[cols_to_scale] = scaler.transform(data[cols_to_scale])
        # Load data: only the time variable and CO.
    
    data = data.resample('15T').mean()
    # Sanity: sort index.
    data.sort_index(inplace=True)
    return data

In [7]:
# Load data.
pollutants=['co2','humidity','pm10','pm25','temperature','voc']
data = load_hourly_data()
data.head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399


In [20]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Initialize empty list to store results
results = []
df=data.dropna()
# Run ADF test for each column and append results to the list
for col in df.columns:

    result = adfuller(df[col])
    results.append([col, result[0], result[1], result[4]['5%']])

# Convert results list to pandas DataFrame
results_df = pd.DataFrame(results, columns=['Variable', 'ADF Statistic', 'p-value', '5% Critical Value'])

# Print results table
results_df

Unnamed: 0,Variable,ADF Statistic,p-value,5% Critical Value
0,co2,-6.493665,1.209143e-08,-2.862514
1,humidity,-5.658085,9.518321e-07,-2.862513
2,pm10,-9.695736,1.105668e-16,-2.862513
3,pm25,-9.919901,3.010287e-17,-2.862513
4,temperature,-6.93491,1.060737e-09,-2.862514
5,voc,-8.727667,3.255272e-14,-2.862514


## Extract time related features

These are features that capture information from the timestamp.

In [89]:
# Extract date and time features.

data["Month"] = data.index.month
data["Week"] = data.index.isocalendar().week
data["Day"] = data.index.day
data["Day_of_week"] = data.index.day_of_week
data["Hour"] = data.index.hour

# find out if it is a weekend.
data["is_weekend"] = np.where(data["Day_of_week"]>4, 1, 0)

# Show new variables
data.head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc,Month,Week,Day,Day_of_week,Hour,is_weekend
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0,11,47,21,0,0,0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459,11,47,21,0,1,0
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619,11,47,21,0,2,0
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179,11,47,21,0,3,0
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399,11,47,21,0,4,0


## Lag Features
* Lag features refer to past values of a variable that can be used to predict its future values. In this analysis, lag features will be used to predict the pollutant concentration for the next hour.

Specifically, two lag features will be used:

- The pollutant concentration from the previous hour (t-1).
- The pollutant concentration from the same hour on the previous day (t-24).

The rationale behind this approach is that pollutant concentrations tend to change slowly and exhibit a 24-hour seasonality, as demonstrated in an earlier analysis.


In [90]:
# Here, I show how to move the variables forward by 1 hr,
# so that the pollutant concentration from the previous
# hour (t-1) is aligned with the current hour (t),
# which is the forecasting point.

# raw time series
variables = pollutants

# Shift the data forward 1 Hr.
tmp = data[variables].shift(freq="1H")

# Names for the new variables.
tmp.columns = [v + "_lag_1" for v in variables]

# Add the variables to the original data.
print("data size before")
print(data.shape)

data = data.merge(tmp, left_index=True, right_index=True, how="left")

print("data size after")
print(data.shape)

data.head()

data size before
(3058, 12)
data size after
(3058, 18)


Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc,Month,Week,Day,Day_of_week,Hour,is_weekend,co2_lag_1,humidity_lag_1,pm10_lag_1,pm25_lag_1,temperature_lag_1,voc_lag_1
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0,11,47,21,0,0,0,,,,,,
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459,11,47,21,0,1,0,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619,11,47,21,0,2,0,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179,11,47,21,0,3,0,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399,11,47,21,0,4,0,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179


In [91]:
data[variables].head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399


In [92]:
# In this procedure, we introduced missing
# data whenever there was no data available in
# the previous hour.

data.isnull().sum()

co2                  60
humidity             60
pm10                 60
pm25                 60
temperature          60
voc                  60
Month                 0
Week                  0
Day                   0
Day_of_week           0
Hour                  0
is_weekend            0
co2_lag_1            61
humidity_lag_1       61
pm10_lag_1           61
pm25_lag_1           61
temperature_lag_1    61
voc_lag_1            61
dtype: int64

In [93]:
# Move forward 24 hrs.
tmp = data[variables].shift(freq="24H")

# Rename the variables.
tmp.columns = [v + "_lag_24" for v in variables]

# Add the features to the original data.
print("data size before")
print(data.shape)

data = data.merge(tmp, left_index=True, right_index=True, how="left")

print("data size after")
print(data.shape)

data[variables].head(25)

data size before
(3058, 18)
data size after
(3058, 24)


Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399
2016-11-21 05:00:00,0.510435,0.740581,0.069299,0.107438,0.900539,0.000462
2016-11-21 06:00:00,0.596438,0.742474,0.073075,0.113483,0.921834,0.001445
2016-11-21 07:00:00,0.746419,0.747353,0.047713,0.072885,0.950472,0.014052
2016-11-21 08:00:00,0.907359,0.732932,0.028885,0.04143,0.95765,0.007928
2016-11-21 09:00:00,0.97537,0.733449,0.022391,0.03016,0.936674,0.013557


In [94]:
# In this procedure, we introduced missing
# data whenever there was no data available in
# the previous 24 hours.

data.isnull().sum()

co2                   60
humidity              60
pm10                  60
pm25                  60
temperature           60
voc                   60
Month                  0
Week                   0
Day                    0
Day_of_week            0
Hour                   0
is_weekend             0
co2_lag_1             61
humidity_lag_1        61
pm10_lag_1            61
pm25_lag_1            61
temperature_lag_1     61
voc_lag_1             61
co2_lag_24            84
humidity_lag_24       84
pm10_lag_24           84
pm25_lag_24           84
temperature_lag_24    84
voc_lag_24            84
dtype: int64

## Window features

Window features refer to mathematical computations performed on a variable’s values over a pre-defined time window prior to the time being forecasted. For example, one could use the average of the previous 3 values of a time series to predict its current value.

To create this type of feature, one must first calculate the average of the 3 previous values and then shift that value forward in time. This process can be repeated for each timestamp in the data to create a new window feature.

In [95]:
# Use the mean of the 3 previous hours as input variables.

tmp = (
    data[variables]
    .rolling(window="3H")
    .mean()  # Average the last 3 hr values.
    .shift(freq="1H")  # Move the average 1 hour forward
)

# Rename the columns
tmp.columns = [v + "_window" for v in variables]


# view of the result
tmp.head(10)

Unnamed: 0_level_0,co2_window,humidity_window,pm10_window,pm25_window,temperature_window,voc_window
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 01:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 02:00:00,0.158667,0.86071,0.046774,0.07114,0.504606,0.000229
2016-11-21 03:00:00,0.224697,0.826709,0.064708,0.09599,0.602004,0.000359
2016-11-21 04:00:00,0.328654,0.769456,0.093299,0.138362,0.759124,0.001086
2016-11-21 05:00:00,0.425147,0.749633,0.108418,0.162453,0.850279,0.001066
2016-11-21 06:00:00,0.476373,0.743591,0.097992,0.149702,0.884858,0.001013
2016-11-21 07:00:00,0.523252,0.739865,0.074957,0.116036,0.904235,0.000768
2016-11-21 08:00:00,0.617764,0.743469,0.063362,0.097935,0.924282,0.00532
2016-11-21 09:00:00,0.750072,0.74092,0.049891,0.075932,0.943318,0.007808
2016-11-21 10:00:00,0.876382,0.737911,0.032996,0.048158,0.948265,0.011845


In [96]:
# Join the new variables to the original data.
print("data size before")
print(data.shape)

data = data.merge(tmp, left_index=True, right_index=True, how="left")

print("data size after")
print(data.shape)

data[ variables].head()

data size before
(3058, 24)
data size after
(3058, 30)


Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399


## Periodic features

Some features exhibit periodic behavior. For example, hours, months, and days all have a natural cycle that repeats at regular intervals.

One way to encode these periodic features is to use a sine and cosine transformation based on the feature’s period. This has the effect of bringing values that are far apart in their numerical representation closer together. For example, December (12) is closer to January (1) than it is to June (6), but this relationship is not captured by their numerical values. By transforming these variables with sine and cosine functions, this relationship can be better represented.

In [99]:
# Create features that capture the cyclical representation.

cyclical = CyclicalFeatures(
    variables=["Month", "Hour","Day"],  # The features we want to transform.
    drop_original=False,  # Whether to drop the original features.
)

data = cyclical.fit_transform(data)

In [100]:
cyclical_vars = [var for var in data.columns if "sin" in var or "cos" in var]

data[cyclical_vars].head()

Unnamed: 0_level_0,Month_sin,Month_cos,Hour_sin,Hour_cos,Day_sin,Day_cos
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,-0.5,0.866025,0.0,1.0,-0.897805,-0.440394
2016-11-21 01:00:00,-0.5,0.866025,0.269797,0.962917,-0.897805,-0.440394
2016-11-21 02:00:00,-0.5,0.866025,0.519584,0.854419,-0.897805,-0.440394
2016-11-21 03:00:00,-0.5,0.866025,0.730836,0.682553,-0.897805,-0.440394
2016-11-21 04:00:00,-0.5,0.866025,0.887885,0.460065,-0.897805,-0.440394


We can see the newly created features at the end of the dataframe.

## Drop missing data

When creating lag and window features, we introduced missing data. 

In [101]:
# Determine fraction of missing data.

data.isnull().sum() / len(data)

co2                   0.019621
humidity              0.019621
pm10                  0.019621
pm25                  0.019621
temperature           0.019621
voc                   0.019621
Month                 0.000000
Week                  0.000000
Day                   0.000000
Day_of_week           0.000000
Hour                  0.000000
is_weekend            0.000000
co2_lag_1             0.019948
humidity_lag_1        0.019948
pm10_lag_1            0.019948
pm25_lag_1            0.019948
temperature_lag_1     0.019948
voc_lag_1             0.019948
co2_lag_24            0.027469
humidity_lag_24       0.027469
pm10_lag_24           0.027469
pm25_lag_24           0.027469
temperature_lag_24    0.027469
voc_lag_24            0.027469
co2_window            0.015370
humidity_window       0.015370
pm10_window           0.015370
pm25_window           0.015370
temperature_window    0.015370
voc_window            0.015370
Month_sin             0.000000
Month_cos             0.000000
Hour_sin

## Imputation

There is not a lot of data missing, so I will just remove those observations.

In [102]:
print("data size before")
print(data.shape)

data.dropna(inplace=True)

print("data size after")
print(data.shape)

data size before
(3058, 36)
data size after
(2927, 36)


## Save preprocessed data

In [103]:
data.head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc,Month,Week,Day,Day_of_week,...,pm10_window,pm25_window,temperature_window,voc_window,Month_sin,Month_cos,Hour_sin,Hour_cos,Day_sin,Day_cos
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-11-22 00:00:00,0.125142,0.681239,0.022233,0.033495,0.592737,0.000287,11,47,22,1,...,0.026882,0.041626,0.62421,0.000309,-0.5,0.866025,0.0,1.0,-0.968077,-0.250653
2016-11-22 01:00:00,0.159592,0.659048,0.006791,0.007556,0.613171,0.000258,11,47,22,1,...,0.028853,0.044576,0.607663,0.000319,-0.5,0.866025,0.269797,0.962917,-0.968077,-0.250653
2016-11-22 02:00:00,0.316681,0.633366,0.008445,0.009361,0.645052,0.000405,11,47,22,1,...,0.023795,0.03616,0.604318,0.000239,-0.5,0.866025,0.519584,0.854419,-0.968077,-0.250653
2016-11-22 03:00:00,0.443831,0.611913,0.018425,0.024901,0.666608,0.000563,11,47,22,1,...,0.012489,0.016804,0.616987,0.000317,-0.5,0.866025,0.730836,0.682553,-0.968077,-0.250653
2016-11-22 04:00:00,0.532989,0.624371,0.025304,0.033161,0.695313,0.011083,11,47,22,1,...,0.01122,0.013939,0.64161,0.000409,-0.5,0.866025,0.887885,0.460065,-0.968077,-0.250653


In [41]:
# store new dataset

data.to_csv("gams_preprocessed.csv", index=True)