In [None]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether this notebook is running inside Google Colab.

    Returns:
        True when the string form of the active IPython shell (as returned by
        ``get_ipython()``) contains ``"google.colab"``, otherwise False.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr

def clone_repository() -> None:
    """Clone the mlfs-book repository and make the clone the working directory.

    Relies on IPython's ``!`` shell escape and ``%cd`` line magic, so it can
    only run inside an IPython/Jupyter kernel.
    """
    # Shell escape: clone into ./mlfs-book, relative to the current working directory.
    !git clone https://github.com/featurestorebook/mlfs-book.git
    # Line magic: move the kernel into the clone so later relative paths
    # (e.g. pyproject.toml) resolve inside the repository.
    %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's dependencies from ``pyproject.toml`` using uv.

    Relies on IPython's ``!`` shell escape, so it can only run inside an
    IPython/Jupyter kernel. ``pyproject.toml`` is given as a relative path,
    so the working directory must already be the repository root.
    """
    # Install (or upgrade) the uv tool itself with pip.
    !pip install --upgrade uv
    # Install the requirements listed in pyproject.toml, including all extras;
    # --system targets the system Python environment instead of a virtualenv.
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the repository root, which depends on where the notebook is running.
if not is_google_colab():
    # Locally the notebook may have been started from <root>/notebooks/airquality
    # or from <root>/notebooks; walk back up to <root> in either case.
    launch_dir = Path().absolute()
    for nested_dir in ('airquality', 'notebooks'):
        if launch_dir.name == nested_dir:
            launch_dir = launch_dir.parent
    root_dir = str(launch_dir)
    print("Local environment")
else:
    # On Colab the repository is cloned first; clone_repository() leaves the
    # working directory inside the clone, so the cwd is the repository root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")

# Put the repository root on the module search path so that the `mlfs`
# package can be imported from the notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)
# Printed whether or not the path was newly appended.
print(f"Added the following directory to the PYTHONPATH: {root_dir}")
    
# Set the environment variables from the file <root_dir>/.env
# from mlfs import config
# settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

<span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 02: Daily Feature Pipeline for Air Quality (aqicn.org) and Weather (Open-Meteo)</span>

## 🗒️ This notebook is divided into the following sections:
1. Download and Parse Data
2. Feature Group Insertion


__This notebook should be scheduled to run daily__

In the book, we use a GitHub Action stored here:
[.github/workflows/air-quality-daily.yml](https://github.com/featurestorebook/mlfs-book/blob/main/.github/workflows/air-quality-daily.yml)

However, you are free to use any Python Orchestration tool to schedule this program to run daily.

### <span style='color:#ff5f27'> üìù Imports

In [None]:
import datetime
from datetime import timedelta
import time
import requests
import pandas as pd
import hopsworks
from mlfs.airquality import util
from mlfs import config
import json
import os
import warnings
import numpy as np
warnings.filterwarnings("ignore")

## <span style='color:#ff5f27'> üåç Get the Sensor URL, Country, City, Street names from Hopsworks </span>

__Update the values in the cell below.__

__These should be the same values as in notebook 1 - the feature backfill notebook__


In [None]:
# Log in to Hopsworks and obtain handles to the feature store and the secrets API.
project = hopsworks.login(engine="python")
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()

# This lookup will fail if AQICN_API_KEY has not been registered as a secret in Hopsworks.
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value

# The sensor location is stored as a JSON document in a second secret.
location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
location = json.loads(location_str)

# Unpack the location fields used by the rest of the notebook.
country, city, street, aqicn_url, latitude, longitude = (
    location[field]
    for field in ('country', 'city', 'street', 'aqicn_url', 'latitude', 'longitude')
)

# Calendar date of this run, taken from the local clock of the machine running the notebook.
today = datetime.date.today()

# Display the raw location JSON as the cell output.
location_str

### <span style="color:#ff5f27;"> 🔮 Get references to the Feature Groups </span>

In [None]:
# Look up version 1 of each feature group by name.
air_quality_fg = fs.get_feature_group(name='air_quality', version=1)
weather_fg = fs.get_feature_group(name='weather', version=1)
# NOTE(review): lagged_fg is not referenced again in the cells visible here; a
# later cell obtains 'lagged_pm25' via fs.get_or_create_feature_group instead.
# Confirm whether this handle is still needed.
lagged_fg = fs.get_feature_group(name='lagged_pm25', version=1)

---

## <span style='color:#ff5f27'> üå´ Retrieve Today's Air Quality data (PM2.5) from the AQI API</span>


In [None]:
# Fetch the PM2.5 data for `today` for the configured sensor via the project's
# helper. `requests` and `pandas` are already imported in the notebook's
# imports cell, so they are not re-imported here.
aq_today_df = util.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY)
# Display the result as the cell output.
aq_today_df

In [None]:
# Sanity check before inserting: print a summary of the air-quality frame
# (presumably a pandas DataFrame, so column names, dtypes and non-null counts).
aq_today_df.info()

## <span style='color:#ff5f27'> üå¶ Get Weather Forecast data</span>

In [None]:
# Fetch the hourly forecast, index it by timestamp (between_time() below needs
# a DatetimeIndex) and drop rows with missing values. Chained rather than
# mutated in place so that re-running the cell always starts from fresh data.
hourly_df = (
    util.get_hourly_weather_forecast(city, latitude, longitude)
    .set_index('date')
    .dropna()
)

# We will only make 1 daily prediction, so we replace the hourly forecasts with
# a single daily forecast: keep only the row at 12:00 for each day.
daily_df = hourly_df.between_time('11:59', '12:01')
daily_df = daily_df.reset_index()
# Truncate the timestamp to its calendar date, then store it as a datetime again.
daily_df['date'] = pd.to_datetime(daily_df['date']).dt.date
daily_df['date'] = pd.to_datetime(daily_df['date'])
daily_df['city'] = city

# Wind speed squared
daily_df['wind_speed_10m_max_squared'] = daily_df['wind_speed_10m_max'] ** 2

# Decompose the wind into components: speed * sin(direction) and speed * cos(direction),
# with the direction converted from degrees to radians.
daily_df['wind_u'] = daily_df['wind_speed_10m_max'] * np.sin(np.radians(daily_df['wind_direction_10m_dominant']))
daily_df['wind_v'] = daily_df['wind_speed_10m_max'] * np.cos(np.radians(daily_df['wind_direction_10m_dominant']))

# Temporal signals
daily_df['day_of_week'] = daily_df['date'].dt.dayofweek          # 0=Mon
daily_df['month'] = daily_df['date'].dt.month
daily_df['is_weekend'] = (daily_df['day_of_week'] >= 5).astype(int)
daily_df['day_of_year'] = daily_df['date'].dt.dayofyear

# Cyclical encoding for day/month so models "feel" seasonality
daily_df['day_of_week_sin'] = np.sin(2 * np.pi * daily_df['day_of_week'] / 7)
daily_df['day_of_week_cos'] = np.cos(2 * np.pi * daily_df['day_of_week'] / 7)
daily_df['month_sin'] = np.sin(2 * np.pi * daily_df['month'] / 12)
daily_df['month_cos'] = np.cos(2 * np.pi * daily_df['month'] / 12)

# Weather-derived interactions
daily_df['precipitation_binary'] = (daily_df['precipitation_sum'] > 0).astype(int)
daily_df['temp_wind_interaction'] = daily_df['temperature_2m_mean'] * daily_df['wind_speed_10m_max']
daily_df['precip_wind_interaction'] = daily_df['precipitation_sum'] * daily_df['wind_speed_10m_max']

daily_df["precip_wind_u"] = daily_df["precipitation_sum"] * daily_df["wind_u"]

# Anomaly weather detection
# Read the stored weather rows for this city from the 30 days before today
# (today itself is excluded).
history = weather_fg.filter(
    (weather_fg.city == city) &
    (weather_fg.date >= today - timedelta(days=30)) &
    (weather_fg.date < today)
).read()

# Average temperature over that history; with no history yet, fall back to the
# first forecast row's temperature.
if not history.empty:
    temp_30d_avg = history['temperature_2m_mean'].mean()
else:
    temp_30d_avg = daily_df['temperature_2m_mean'].iloc[0]
# np.float64(...) accepts NumPy scalars and plain Python floats alike, whereas
# calling .astype() on the scalar only works when it is a NumPy scalar.
daily_df["temperature_30d_avg"] = np.float64(temp_30d_avg)
# Difference between each forecast temperature and the 30-day average
daily_df['temperature_anomaly'] = (daily_df['temperature_2m_mean'] - temp_30d_avg).astype('float64')
daily_df["temp_anomaly_wind_speed"] = (daily_df["temperature_anomaly"] * daily_df["wind_speed_10m_max"]).astype('float64')

# Display the engineered frame as the cell output.
daily_df

In [None]:
# Sanity check before inserting: print a summary of the engineered weather
# frame (column names, dtypes and non-null counts).
daily_df.info()

## <span style="color:#ff5f27;">⬆️ Uploading new data to the Feature Store</span>

In [None]:
# Insert the air-quality frame fetched above into the 'air_quality' feature group.
air_quality_fg.insert(aq_today_df)

In [None]:
# Insert the engineered daily weather frame into the 'weather' feature group.
# NOTE(review): wait=True presumably blocks until the ingestion has finished —
# confirm against the Hopsworks insert() documentation.
weather_fg.insert(daily_df, wait=True)

### Add lagged data

In [None]:
# Obtain version 1 of the 'lagged_pm25' feature group via get_or_create.
# Rows are keyed by (city, date), 'date' is the event-time column, and no
# expectation suite is attached.
lagged_pm25 = fs.get_or_create_feature_group(
    name='lagged_pm25',
    version=1,
    description='Lagged PM2.5 measurements',
    primary_key=['city', 'date'],
    event_time="date",
    expectation_suite=None,
)

In [None]:
# Read this city's PM2.5 rows for the 30 days before today, oldest first.
# Today is deliberately excluded so that the features never see today's target
# value (no data leakage).
aq_history = (
    air_quality_fg
    .filter(
        (air_quality_fg.city == city)
        & (air_quality_fg.date >= today - timedelta(days=30))
        & (air_quality_fg.date < today)  # Exclude today!
    )
    .read()
    .sort_values('date')
)

# Look-back window sizes (in rows of history) for the change/std features.
windows = [1, 7, 14, 21, 30]
n_history = len(aq_history)

if n_history < 2:
    # Fewer than two historical rows: emit a single row whose features are all NaN.
    placeholder_row = {'city': [city], 'date': [today]}
    for w in windows:
        placeholder_row[f'pm25_change_{w}d'] = [np.nan]
        placeholder_row[f'pm25_std_{w}d'] = [np.nan]
    df_feat = pd.DataFrame(placeholder_row)
else:
    pm25_series = aq_history["pm25"]
    feature_row = {}

    # NOTE(review): the history already stops before today, and .shift(1)
    # moves every value one further row down, so .iloc[-1] picks the value
    # computed at the second-most-recent historical row, not the most recent
    # one. Confirm this matches how the training features are computed.

    # --- PERCENT CHANGE FEATURES ---
    # pct_change(periods=w) compares each row with the row w positions earlier;
    # the feature is NaN when there are not more than w rows of history.
    for w in windows:
        if n_history > w:
            feature_row[f"pm25_change_{w}d"] = pm25_series.pct_change(periods=w).shift(1).iloc[-1]
        else:
            feature_row[f"pm25_change_{w}d"] = np.nan

    # --- ROLLING STD FEATURES ---
    # Standard deviation over a rolling window of up to w rows.
    for w in windows:
        feature_row[f"pm25_std_{w}d"] = (
            pm25_series.rolling(window=w, min_periods=1).std().shift(1).iloc[-1]
        )

    # --- COMBINE INTO ONE FEATURES DATAFRAME ---
    df_feat = pd.DataFrame(feature_row, index=[0])
    df_feat['city'] = city
    df_feat["date"] = today

# Write today's feature row to the 'lagged_pm25' feature group.
lagged_pm25.insert(df_feat, wait=True)

## <span style="color:#ff5f27;">⏭️ **Next:** Part 03: Training Pipeline
 </span> 

In the following notebook, you will read from a feature group and create a training dataset within the feature store.
