In [1]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Return True when the active IPython shell's string form mentions ``google.colab``."""
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr

def clone_repository() -> None:
    """Clone the mlfs-book repository into the current directory and change into it.

    Uses IPython shell/line magics, so it only works inside a notebook kernel.
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Upgrade ``uv`` with pip, then install the requirements from ``pyproject.toml``.

    The ``uv`` call installs all extras into the system environment and reads
    ``pyproject.toml`` from the current working directory.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the repository root. On Colab the repository is cloned and entered
# first; locally the notebook may have been started from a subdirectory.
if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_path = Path().absolute()
    # Strip a trailing `airquality` and then a trailing `notebooks` component so
    # that root_dir is the repository root when the notebook was started from
    # <root>/notebooks/airquality or from <root>/notebooks.
    if root_path.name == 'airquality':
        root_path = root_path.parent
    if root_path.name == 'notebooks':
        root_path = root_path.parent
    root_dir = str(root_path)
    print("Local environment")

# Put the root directory on sys.path so the `mlfs` package can be imported below.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
# (this import has to come after the sys.path update above).
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Added the following directory to the PYTHONPATH: /home/ryll/coding/kth/id2223/Lab1ScalableMLDL
HopsworksSettings initialized!


<span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 02: Daily Feature Pipeline for Air Quality (aqicn.org) and weather (openmeteo)</span>

## 🗒️ This notebook is divided into the following sections:
1. Download and Parse Data
2. Feature Group Insertion


__This notebook should be scheduled to run daily__

In the book, we use a GitHub Action stored here:
[.github/workflows/air-quality-daily.yml](https://github.com/featurestorebook/mlfs-book/blob/main/.github/workflows/air-quality-daily.yml)

However, you are free to use any Python Orchestration tool to schedule this program to run daily.

### <span style='color:#ff5f27'> 📝 Imports </span>

In [2]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from mlfs.airquality import util
from mlfs.airquality.sensors import SENSORS
from mlfs import config
import json
import os
import warnings
warnings.filterwarnings("ignore")

## <span style='color:#ff5f27'> 🌍 Get the Sensor URL, Country, City, Street names from Hopsworks </span>

__Update the values in the cell below.__

__These should be the same values as in notebook 1 - the feature backfill notebook__


In [3]:
# Log in to Hopsworks with the Python engine and get handles to the project's
# feature store and secrets API.
project = hopsworks.login(engine="python")
fs = project.get_feature_store() 
secrets = hopsworks.get_secrets_api()

# This line will fail if you have not registered the AQICN_API_KEY as a secret in Hopsworks
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value
# Disabled alternative: read the sensor location from a Hopsworks secret
# instead of using the hard-coded values below.
# location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
# location = json.loads(location_str)


# Hard-coded location shared by all sensors; latitude and longitude are kept as
# strings and passed unchanged to util.get_hourly_weather_forecast later on.
country = "united-kingdom"
city = "glasgow"
latitude =  "55.8523712158203"
longitude = "-4.2244873046875"

# Reference date for this run: used when fetching PM2.5 and when filtering feature groups.
today = datetime.date.today()


2025-11-18 11:29:21,248 INFO: Initializing external client
2025-11-18 11:29:21,250 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-18 11:29:22,812 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286319


### <span style="color:#ff5f27;"> 🔮 Get references to the Feature Groups </span>

In [4]:
# Look up version 1 of the air-quality and weather feature groups in the feature store
air_quality_fg = fs.get_feature_group(name='air_quality_glasgow', version=1)
weather_fg = fs.get_feature_group(name='weather_glasgow', version=1)

---

## <span style='color:#ff5f27'> 🌫 Retrieve Today's Air Quality data (PM2.5) from the AQI API</span>


In [20]:
# Fetch today's PM2.5 frame for every configured sensor, then stack the
# per-sensor frames into one frame with a fresh 0..n-1 index.
today_dfs = [
    util.get_pm25(sensor['aqicn_url'], country, city, sensor['street'], today, AQICN_API_KEY)
    for sensor in SENSORS
]
aq_today_df = pd.concat(today_dfs, ignore_index=True)

In [21]:
# Read the most recently stored readings (rows dated later than two days ago)
# so that their values can be shifted into today's lag features.
batch_data = air_quality_fg.filter(air_quality_fg.date > today - datetime.timedelta(days = 2)).read()

lag_columns = ['pm25_lag1', 'pm25_lag2', 'pm25_lag3']

# The filter window can also contain a row for today (e.g. when the pipeline is
# re-run after today's insert). Use only rows dated before today and keep the
# newest one per street, so each street contributes exactly one lag row.
batch_dates = pd.to_datetime(batch_data['date']).dt.date
previous_rows = (
    batch_data.loc[batch_dates < today]
    .sort_values('date')
    .drop_duplicates(subset='street', keep='last')
)

# prepare lag columns from the previous rows: each stored value moves one lag further back
lags_df = previous_rows[['street', 'pm25', 'pm25_lag1', 'pm25_lag2']].rename(columns={
    'pm25': 'pm25_lag1',
    'pm25_lag1': 'pm25_lag2',
    'pm25_lag2': 'pm25_lag3',
})

# Drop lag columns left over from an earlier run of this cell before merging;
# otherwise a re-run would produce suffixed duplicates (pm25_lag1_x / pm25_lag1_y).
# Streets without a previous row get NaN lags (left join).
aq_today_df = (
    aq_today_df
    .drop(columns=lag_columns, errors='ignore')
    .merge(lags_df, on='street', how='left')
    # ensure numeric dtype consistency
    .astype({col: 'float32' for col in lag_columns})
)

aq_today_df


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.71s) 


Unnamed: 0,pm25,country,city,street,date,url,pm25_lag1,pm25_lag2,pm25_lag3
0,30.0,united-kingdom,glasgow,anderston,2025-11-18,https://api.waqi.info/feed/@6341,20.0,28.0,22.0
1,25.0,united-kingdom,glasgow,broomhill,2025-11-18,https://api.waqi.info/feed/@5997,15.0,26.0,19.0
2,26.0,united-kingdom,glasgow,burgher-street,2025-11-18,https://api.waqi.info/feed/@5998,20.0,24.0,20.0
3,24.0,united-kingdom,glasgow,byres-road,2025-11-18,https://api.waqi.info/feed/@5999,17.0,29.0,23.0
4,22.0,united-kingdom,glasgow,dumbarton-road,2025-11-18,https://api.waqi.info/feed/@6000,16.0,24.0,20.0
5,43.0,united-kingdom,glasgow,high-street,2025-11-18,https://api.waqi.info/feed/@8915/,17.0,20.0,17.0
6,25.0,united-kingdom,glasgow,kerbside,2025-11-18,https://api.waqi.info/feed/@6001,28.0,28.0,35.0
7,33.0,united-kingdom,glasgow,nithsdale-road,2025-11-18,https://api.waqi.info/feed/@6002,22.0,35.0,24.0
8,23.0,united-kingdom,glasgow,townhead,2025-11-18,https://api.waqi.info/feed/@6003,18.0,20.0,18.0


In [22]:
# Show which weather rows dated today or later are already stored in the feature group
weather_fg = fs.get_feature_group(name='weather_glasgow', version=1)
batch_data = weather_fg.filter(weather_fg.date >= today).read()
batch_data

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.72s) 


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city


## <span style='color:#ff5f27'> 🌦 Get Weather Forecast data</span>

In [23]:
# Hourly forecast for the configured location, indexed by timestamp so that
# rows can be selected by time of day.
hourly_df = util.get_hourly_weather_forecast(city, latitude, longitude).set_index('date')

# Only one prediction is made per day, so reduce the hourly forecast to a single
# row per day: the forecast at 12:00. The timestamp is then truncated to a
# midnight datetime and the city name is attached.
daily_df = (
    hourly_df
    .between_time('11:59', '12:01')
    .reset_index()
    .assign(
        date=lambda frame: pd.to_datetime(pd.to_datetime(frame['date']).dt.date),
        city=city,
    )
)
daily_df

Coordinates 55.5¬∞N -4.5¬∞E
Elevation 15.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city
0,2025-11-18,6.65,0.3,11.542478,266.423737,glasgow
1,2025-11-19,3.35,0.0,12.096214,323.471039,glasgow
2,2025-11-20,3.95,0.0,10.144082,332.525665,glasgow
3,2025-11-21,5.2,0.0,15.034041,196.699326,glasgow
4,2025-11-22,8.15,0.2,9.826088,208.442825,glasgow
5,2025-11-23,2.6,0.6,5.804825,29.744795,glasgow
6,2025-11-24,5.65,0.0,4.73506,188.746078,glasgow


In [24]:
# Show column dtypes and non-null counts of the daily forecast frame before it is inserted
daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         7 non-null      datetime64[ns]
 1   temperature_2m_mean          7 non-null      float32       
 2   precipitation_sum            7 non-null      float32       
 3   wind_speed_10m_max           7 non-null      float32       
 4   wind_direction_10m_dominant  7 non-null      float32       
 5   city                         7 non-null      object        
dtypes: datetime64[ns](1), float32(4), object(1)
memory usage: 356.0+ bytes


## <span style="color:#ff5f27;">⬆️ Uploading new data to the Feature Store</span>

In [25]:
# Insert today's air-quality rows (including the lag columns) into the air-quality feature group
air_quality_fg.insert(aq_today_df)

2025-11-18 11:45:30,354 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286319/fs/1273943/fg/1718952


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 9/9 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_glasgow_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286319/jobs/named/air_quality_glasgow_1_offline_fg_materialization/executions


(Job('air_quality_glasgow_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "pm25",
           "min_value": -0.1,
           "max_value": 500.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 759044
         }
       },
       "result": {
         "observed_value": 22.0,
         "element_count": 9,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-11-18T10:45:30.000353Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
     "successful_

In [26]:
# Insert the daily forecast rows into the weather feature group;
# wait=True is passed so the call waits for the insert job before returning
weather_fg.insert(daily_df, wait=True)

2025-11-18 11:47:01,377 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286319/fs/1273943/fg/1718953


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 7/7 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_glasgow_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286319/jobs/named/weather_glasgow_1_offline_fg_materialization/executions
2025-11-18 11:47:19,099 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-11-18 11:47:25,462 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-11-18 11:48:54,476 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-11-18 11:48:54,633 INFO: Waiting for log aggregation to finish.
2025-11-18 11:49:06,591 INFO: Execution finished successfully.


(Job('weather_glasgow_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "precipitation_sum",
           "min_value": -0.1,
           "max_value": 1000.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 759046
         }
       },
       "result": {
         "observed_value": 0.0,
         "element_count": 7,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-11-18T10:47:01.000377Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect

## <span style="color:#ff5f27;">⏭️ **Next:** Part 03: Training Pipeline
 </span> 

In the following notebook you will read from a feature group and create a training dataset within the feature store.
