In [1]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    Returns:
        True when the string form of the object returned by ``get_ipython()``
        contains ``"google.colab"``, False otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr

def clone_repository() -> None:
    """Clone the mlfs-book repository and change into the cloned directory.

    The body uses IPython shell (``!``) and magic (``%``) syntax, so it can only
    run inside an IPython/Jupyter kernel, not as plain Python.
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    # %cd moves the kernel's working directory into the checkout, so relative
    # paths used afterwards resolve inside mlfs-book.
    %cd mlfs-book

def install_dependencies() -> None:
    """Install ``uv`` with pip, then use it to install the project's requirements.

    ``pyproject.toml`` is given as a relative path, so the current working
    directory must already be the repository root when this is called.
    """
    # Install (or upgrade) the uv installer itself.
    !pip install --upgrade uv
    # Install the requirements from pyproject.toml, including all extras, into the
    # system interpreter (--system) rather than a virtual environment.
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    # On Colab the repository is not present yet: fetch it and install its dependencies.
    clone_repository()
    install_dependencies()
    # clone_repository() has already changed into the checkout, so the CWD is the repo root.
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_path = Path().absolute()
    # If the notebook was started from <repo>/notebooks/airquality (or from <repo>/notebooks),
    # walk back up so that root_dir ends up pointing at the repository root.
    if root_path.name == 'airquality':
        root_path = root_path.parent
    if root_path.name == 'notebooks':
        root_path = root_path.parent
    root_dir = str(root_path)
    print("Local environment")

# Put the repository root on sys.path so that the `mlfs` package can be imported by later cells.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")
    
# Set the environment variables from the file <root_dir>/.env
# from mlfs import config
# settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Added the following directory to the PYTHONPATH: /Users/hayleychang/Desktop/mlfs-book


<span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 02: Daily Feature Pipeline for Air Quality (aqicn.org) and weather (openmeteo)</span>

## 🗒️ This notebook is divided into the following sections:
1. Download and Parse Data
2. Feature Group Insertion


__This notebook should be scheduled to run daily__

In the book, we use a GitHub Action stored here:
[.github/workflows/air-quality-daily.yml](https://github.com/featurestorebook/mlfs-book/blob/main/.github/workflows/air-quality-daily.yml)

However, you are free to use any Python Orchestration tool to schedule this program to run daily.

### <span style='color:#ff5f27'> 📝 Imports</span>

In [2]:
import datetime
from datetime import timedelta
import time
import requests
import pandas as pd
import hopsworks
from mlfs.airquality import util
from mlfs import config
import json
import os
import warnings
import numpy as np
warnings.filterwarnings("ignore")

## <span style='color:#ff5f27'> 🌍 Get the Sensor URL, Country, City, Street names from Hopsworks </span>

__Update the values in the cell below.__

__These should be the same values as in notebook 1 - the feature backfill notebook__


In [3]:
# Connect to Hopsworks and grab the feature store and the secrets API.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()

# Both secrets must already be registered in Hopsworks; the lookups below fail otherwise.
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value
location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
location = json.loads(location_str)

# Unpack the sensor description into the module-level names used by the later cells.
country, city, street, aqicn_url, latitude, longitude = [
    location[field]
    for field in ('country', 'city', 'street', 'aqicn_url', 'latitude', 'longitude')
]

today = datetime.date.today()

# Display the raw location JSON as the cell output.
location_str

2025-11-14 17:51:20,847 INFO: Initializing external client
2025-11-14 17:51:20,848 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-14 17:51:22,616 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286345


'{"country": "united-states-of-america", "city": "los-angeles", "street": "arroyo-seco-museum-science-magnet-span-4322", "aqicn_url": "https://api.waqi.info/feed/A399199", "latitude": 34.05, "longitude": -118.24}'

### <span style="color:#ff5f27;"> 🔮 Get references to the Feature Groups </span>

In [4]:
# Look up version 1 of the air-quality and weather feature groups.
air_quality_fg = fs.get_feature_group(name='air_quality', version=1)
weather_fg = fs.get_feature_group(name='weather', version=1)

# Fetch the lagged PM2.5 feature group, or create it if it does not exist.
# Rows are keyed by (city, date), `date` is the event time, and no expectation suite is attached.
lagged_pm25 = fs.get_or_create_feature_group(
    name='lagged_pm25',
    version=1,
    description='Lagged PM2.5 measurements',
    primary_key=['city', 'date'],
    event_time="date",
    expectation_suite=None,
)

---

## <span style='color:#ff5f27'> 🌫 Retrieve Today's Air Quality data (PM2.5) from the AQI API</span>


In [5]:
# `requests` and `pandas` are already imported in the notebook's imports cell,
# so they are not imported again here.

# Fetch today's PM2.5 reading for the configured sensor and display it.
aq_today_df = util.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY)
aq_today_df

Unnamed: 0,pm25,country,city,street,date,url
0,35.0,united-states-of-america,los-angeles,arroyo-seco-museum-science-magnet-span-4322,2025-11-14,https://api.waqi.info/feed/A399199


In [6]:
# Show the column dtypes and non-null counts of today's air-quality frame before it is inserted.
aq_today_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   pm25     1 non-null      float32       
 1   country  1 non-null      object        
 2   city     1 non-null      object        
 3   street   1 non-null      object        
 4   date     1 non-null      datetime64[ns]
 5   url      1 non-null      object        
dtypes: datetime64[ns](1), float32(1), object(4)
memory usage: 176.0+ bytes


## <span style='color:#ff5f27'> 🌦 Get Weather Forecast data</span>

In [7]:
# Fetch the hourly forecast, index it by timestamp and discard rows with missing values.
# Written as a chain (no inplace mutation) so the cell builds its frames from scratch on every run.
hourly_df = (
    util.get_hourly_weather_forecast(city, latitude, longitude)
    .set_index('date')
    .dropna()
)

# We only make 1 daily prediction, so replace the hourly forecasts with a single daily
# forecast: keep just the row at 12:00 for each day.
daily_df = hourly_df.between_time('11:59', '12:01').reset_index()
# Drop the time-of-day part but keep `date` as a datetime column.
daily_df['date'] = pd.to_datetime(pd.to_datetime(daily_df['date']).dt.date)
daily_df['city'] = city

# Wind speed squared
daily_df['wind_speed_10m_max_squared'] = daily_df['wind_speed_10m_max'] ** 2

# Wind components: speed * sin(direction) and speed * cos(direction), direction given in degrees
daily_df['wind_u'] = daily_df['wind_speed_10m_max'] * np.sin(np.radians(daily_df['wind_direction_10m_dominant']))
daily_df['wind_v'] = daily_df['wind_speed_10m_max'] * np.cos(np.radians(daily_df['wind_direction_10m_dominant']))

# Temporal signals
daily_df['day_of_week'] = daily_df['date'].dt.dayofweek          # 0=Mon
daily_df['month'] = daily_df['date'].dt.month
daily_df['is_weekend'] = (daily_df['day_of_week'] >= 5).astype(int)
daily_df['day_of_year'] = daily_df['date'].dt.dayofyear

# Cyclical encoding for day/month so models "feel" seasonality
daily_df['day_of_week_sin'] = np.sin(2 * np.pi * daily_df['day_of_week'] / 7)
daily_df['day_of_week_cos'] = np.cos(2 * np.pi * daily_df['day_of_week'] / 7)
daily_df['month_sin'] = np.sin(2 * np.pi * daily_df['month'] / 12)
daily_df['month_cos'] = np.cos(2 * np.pi * daily_df['month'] / 12)

# Weather-derived interactions
daily_df['precipitation_binary'] = (daily_df['precipitation_sum'] > 0).astype(int)
daily_df['temp_wind_interaction'] = daily_df['temperature_2m_mean'] * daily_df['wind_speed_10m_max']
daily_df['precip_wind_interaction'] = daily_df['precipitation_sum'] * daily_df['wind_speed_10m_max']

daily_df["precip_wind_u"] = daily_df["precipitation_sum"] * daily_df["wind_u"]

# Temperature anomaly features.
# Read the stored weather rows for this city from the 30 days before today (today excluded).
history = weather_fg.filter(
    (weather_fg.city == city) &
    (weather_fg.date >= today - timedelta(days=30)) &
    (weather_fg.date < today)
).read()

# Average temperature over that history; with no history, fall back to the first forecast row.
if not history.empty:
    temp_30d_avg = history['temperature_2m_mean'].mean()
else:
    temp_30d_avg = daily_df['temperature_2m_mean'].iloc[0]
# np.float64(...) accepts NumPy scalars and plain Python floats alike, whereas the scalar
# `.astype` method only exists on NumPy scalars.
daily_df["temperature_30d_avg"] = np.float64(temp_30d_avg)
# Difference between each forecast temperature and the 30-day average
daily_df['temperature_anomaly'] = (daily_df['temperature_2m_mean'] - temp_30d_avg).astype('float64')
daily_df["temp_anomaly_wind_speed"] = (daily_df["temperature_anomaly"] * daily_df["wind_speed_10m_max"]).astype('float64')

daily_df

Coordinates 34.0¬∞N -118.25¬∞E
Elevation 87.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.73s) 


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city,wind_speed_10m_max_squared,wind_u,wind_v,day_of_week,...,day_of_week_cos,month_sin,month_cos,precipitation_binary,temp_wind_interaction,precip_wind_interaction,precip_wind_u,temperature_30d_avg,temperature_anomaly,temp_anomaly_wind_speed
0,2025-11-14,15.9,0.3,10.739833,103.570457,los-angeles,115.344009,10.44,-2.520004,4,...,-0.900969,-0.5,0.866025,1,170.763336,3.22195,3.132,18.248003,-2.348003,-25.217164
1,2025-11-15,15.55,2.5,1.08,90.0,los-angeles,1.1664,1.08,-4.72083e-08,5,...,-0.222521,-0.5,0.866025,1,16.794001,2.7,2.7,18.248003,-2.698003,-2.913843
2,2025-11-16,11.15,0.2,3.075841,110.556129,los-angeles,9.4608,2.879999,-1.080004,6,...,0.62349,-0.5,0.866025,1,34.295631,0.615168,0.576,18.248003,-7.098003,-21.832333
3,2025-11-17,13.15,0.3,5.315336,118.300667,los-angeles,28.2528,4.680004,-2.519993,0,...,1.0,-0.5,0.866025,1,69.896667,1.594601,1.404001,18.248003,-5.098003,-27.097602
4,2025-11-18,8.2,0.0,3.259938,83.659904,los-angeles,10.627199,3.24,0.3599944,1,...,0.62349,-0.5,0.866025,0,26.731495,0.0,0.0,18.248003,-10.048003,-32.755872
5,2025-11-19,6.1,0.0,2.620839,344.054535,los-angeles,6.868799,-0.720003,2.519999,2,...,-0.222521,-0.5,0.866025,0,15.98712,0.0,-0.0,18.248003,-12.148003,-31.837963
6,2025-11-20,11.0,0.0,7.23591,95.710503,los-angeles,52.358398,7.200001,-0.7199889,3,...,-0.900969,-0.5,0.866025,0,79.595016,0.0,0.0,18.248003,-7.248003,-52.4459


In [8]:
# Show the column dtypes and non-null counts of the engineered weather frame before it is inserted.
daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         7 non-null      datetime64[ns]
 1   temperature_2m_mean          7 non-null      float32       
 2   precipitation_sum            7 non-null      float32       
 3   wind_speed_10m_max           7 non-null      float32       
 4   wind_direction_10m_dominant  7 non-null      float32       
 5   city                         7 non-null      object        
 6   wind_speed_10m_max_squared   7 non-null      float32       
 7   wind_u                       7 non-null      float32       
 8   wind_v                       7 non-null      float32       
 9   day_of_week                  7 non-null      int32         
 10  month                        7 non-null      int32         
 11  is_weekend                   7 non-null      int6

## <span style="color:#ff5f27;">⬆️ Uploading new data to the Feature Store</span>

In [9]:
# Insert today's air-quality reading into the `air_quality` feature group.
# The call's return value is the cell output; in the saved run it is a
# (job, validation report) pair.
# NOTE(review): no `wait=True` here, unlike the weather insert in the next cell —
# presumably this returns without waiting for the materialization job; confirm that is intended.
air_quality_fg.insert(aq_today_df)

2025-11-14 17:51:31,249 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286345/fs/1273967/fg/1717594


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 1/1 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286345/jobs/named/air_quality_1_offline_fg_materialization/executions


(Job('air_quality_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "pm25",
           "min_value": -0.1,
           "max_value": 500.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 757764
         }
       },
       "result": {
         "observed_value": 35.0,
         "element_count": 1,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-11-14T04:51:31.000248Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
     "successful_expectat

In [10]:
# Insert the engineered daily weather rows into the `weather` feature group and
# wait for the materialization job (wait=True).
# NOTE(review): the saved output of this cell ends in a RestAPIError ("Job not found")
# after the job reported final status KILLED — re-run and confirm the insert succeeds
# before relying on the stored output.
weather_fg.insert(daily_df, wait=True)

2025-11-14 17:52:10,532 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286345/fs/1273967/fg/1717595


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 7/7 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286345/jobs/named/weather_1_offline_fg_materialization/executions
2025-11-14 17:52:27,956 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2025-11-14 17:52:31,942 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-11-14 17:52:38,716 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: KILLED
2025-11-14 17:52:38,892 INFO: Waiting for log aggregation to finish.


RestAPIError: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1286345/jobs/weather_1_offline_fg_materialization/executions/3828158). Server response: 
HTTP code: 404, HTTP reason: Not Found, body: b'{"errorCode":130009,"usrMsg":"jobId:weather_1_offline_fg_materialization","errorMsg":"Job not found."}', error code: 130009, error msg: Job not found., user msg: jobId:weather_1_offline_fg_materialization

### Add lagged data

In [None]:
# NOTE(review): this repeats, with the same arguments, the get_or_create call made in the
# "Get references to the Feature Groups" cell; presumably kept so that this section can be
# re-run on its own — confirm, or drop one of the two.
lagged_pm25 = fs.get_or_create_feature_group(
    name="lagged_pm25",
    version=1,
    description="Lagged PM2.5 measurements",
    primary_key=["city", "date"],
    event_time="date",
    expectation_suite=None,
)

In [None]:
def compute_lagged_pm25_features(pm25, windows):
    """Compute the lagged PM2.5 change and volatility features for one day.

    Args:
        pm25: pandas Series of historical PM2.5 readings in chronological order,
            one row per observation, not including the day being predicted.
        windows: iterable of window lengths, counted in rows of ``pm25``.

    Returns:
        dict mapping ``pm25_change_{w}d`` and ``pm25_std_{w}d`` to a float for
        every ``w`` in ``windows`` (all change keys first, then all std keys).
        A value is NaN whenever there is not enough history to compute it.
    """
    n_rows = len(pm25)
    # With fewer than two rows nothing can be computed (and .iloc[-1] would fail on an
    # empty series), so every feature is NaN.
    has_history = n_rows >= 2

    # NOTE(review): `pm25` already excludes today, and .shift(1) lags by one more row, so
    # these values describe the day before the most recent reading. Kept unchanged because
    # the feature definition must match the backfill notebook, which is not visible here —
    # confirm the extra lag is intended.
    change_features = {
        f"pm25_change_{w}d": (
            pm25.pct_change(periods=w).shift(1).iloc[-1]
            if has_history and n_rows > w
            else np.nan
        )
        for w in windows
    }
    # NOTE(review): for w == 1 the rolling sample std of a single value is undefined, so
    # pm25_std_1d is always NaN; the column is kept so the feature group schema is unchanged.
    std_features = {
        f"pm25_std_{w}d": (
            pm25.rolling(window=w, min_periods=1).std().shift(1).iloc[-1]
            if has_history
            else np.nan
        )
        for w in windows
    }
    return {**change_features, **std_features}


windows = [1, 7, 14, 21, 30]

# pct_change(periods=w) followed by shift(1) needs w + 2 rows to produce a value, so the
# lookback is derived from the largest window. With a fixed 30-day lookback the history
# held at most 30 rows and pm25_change_30d could never be computed.
lookback_days = max(windows) + 2

# Get historical PM2.5 data (EXCLUDING today to avoid data leakage):
# these features must be computed from past data only, not from today's target value.
aq_history = air_quality_fg.filter(
    (air_quality_fg.city == city) &
    (air_quality_fg.date >= today - timedelta(days=lookback_days)) &
    (air_quality_fg.date < today)  # Exclude today!
).read().sort_values('date')

lagged_features = compute_lagged_pm25_features(aq_history["pm25"], windows)

# One row for today; feature columns are float32 to match the feature group schema.
# The same column order is now produced whether or not enough history exists.
df_feat = pd.DataFrame(lagged_features, index=[0]).astype('float32')
df_feat['city'] = city
df_feat['date'] = pd.to_datetime(today)  # Convert date to timestamp

lagged_pm25.insert(df_feat, wait=True)

## <span style="color:#ff5f27;">⏭️ **Next:** Part 03: Training Pipeline</span>

In the following notebook you will read from a feature group and create a training dataset within the feature store.
