In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")


root_dir = Path().absolute()
# Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
if root_dir.parts[-1:] == ('notebooks',):
    root_dir = Path(*root_dir.parts[:-1])
root_dir = str(root_dir) 
print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/emaminotti/ID2223-ScalableMLDL_Project
Added the following directory to the PYTHONPATH: /Users/emaminotti/ID2223-ScalableMLDL_Project
HopsworksSettings initialized!


<span style="font-width:bold; font-size: 3rem; color:#333;">- Part 01: Feature Backfill </span>

### <span style='color:#ff5f27'> üìù Imports

In [2]:
import datetime
import requests
import pandas as pd
import hopsworks
import util
import datetime
from pathlib import Path
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")

## Hopsworks API Key
You need to have registered an account on app.hopsworks.ai.

Save the HOPSWORKS_API_KEY  to ~/.env file in the root directory of your project

 * mv .env.example .env
 * edit .env

In the .env file, update HOPSWORKS_API_KEY:

`HOPSWORKS_API_KEY="put API KEY value in this string"`


In [3]:
project = hopsworks.login(engine="python")

2025-12-15 14:39:14,517 INFO: Initializing external client
2025-12-15 14:39:14,517 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-15 14:39:15,867 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279152


In [6]:
from hopsworks import RestAPIError

sensor_config = {
    'api_url': 'https://air-quality-api.open-meteo.com/v1/air-quality',
    'country' : 'Sweden',
    'city' : 'Stockholm',
    'latitude': '59.3345',
    'longitude': '18.0632'
}

# Set Global Location Variables
country = sensor_config['country']
city = sensor_config['city']

print(f"--- Configuration complete. Configured: {list(sensor_config.keys())} ---")

--- Configuration complete. Configured: ['api_url', 'country', 'city', 'latitude', 'longitude'] ---


## <span style='color:#ff5f27'> Read the historical data into a DataFrame </span>

The cell below will read up historical pollen levels data into a Pandas DataFrame

In [11]:
import requests
import pandas as pd
from datetime import date

# Configurazione Parametri
latitude = sensor_config['latitude']
longitude = sensor_config['longitude']

# Definiamo il periodo storico per il backfill
start_date = "2013-01-01"
end_date = date.today().strftime("%Y-%m-%d")

# Variabili relative ai pollini disponibili su Open-Meteo (European Air Quality)
# alder_pollen, birch_pollen, grass_pollen, mugwort_pollen, olive_pollen, ragweed_pollen
pollen_vars = [
    "alder_pollen", 
    "birch_pollen", 
    "grass_pollen", 
    "mugwort_pollen", 
    "olive_pollen", 
    "ragweed_pollen"
]

# Uniamo le variabili in una stringa separata da virgole per l'URL
hourly_params = ",".join(pollen_vars)

# Costruzione della richiesta API
url = (
    f"https://air-quality-api.open-meteo.com/v1/air-quality?"
    f"latitude={latitude}&longitude={longitude}&"
    f"hourly={hourly_params}&"
    f"start_date={start_date}&end_date={end_date}&"
    f"timezone=Europe%2FBerlin" # Imposta il fuso orario corretto
)

print(f"Downloading data from: {url}")

# Chiamata API e creazione DataFrame
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    
    # I dati orari sono contenuti nella chiave 'hourly'
    hourly_data = data['hourly']
    
    # Creazione del DataFrame
    df_pollen = pd.DataFrame(hourly_data)
    
    # Conversione della colonna 'time' in formato datetime
    df_pollen['time'] = pd.to_datetime(df_pollen['time'])
    
    print(f"Download completed! Downloaded {len(df_pollen)} rows.")
    print("--- DataFrame info ---")
    df_pollen.info()

    # Visualizza le prime righe che non sono tutte NaN
    pollen_cols = [col for col in df_pollen.columns if 'pollen' in col]

    # Filtriamo: mostrami le righe dove ALMENO UNA colonna di polline non √® NaN
    df_valid = df_pollen.dropna(subset=pollen_cols, how='all')

    print(f"\n--- First rows with available data ---")
    if not df_valid.empty:
        display(df_valid.head())
    else:
        print("Every data is missing.")

else:
    print(f"Error downloading the data. Code: {response.status_code}")
    print(response.text)

Downloading data from: https://air-quality-api.open-meteo.com/v1/air-quality?latitude=59.3345&longitude=18.0632&hourly=alder_pollen,birch_pollen,grass_pollen,mugwort_pollen,olive_pollen,ragweed_pollen&start_date=2013-01-01&end_date=2025-12-15&timezone=Europe%2FBerlin
Download completed! Downloaded 113568 rows.
--- DataFrame info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113568 entries, 0 to 113567
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   time            113568 non-null  datetime64[ns]
 1   alder_pollen    32136 non-null   float64       
 2   birch_pollen    32712 non-null   float64       
 3   grass_pollen    34944 non-null   float64       
 4   mugwort_pollen  29040 non-null   float64       
 5   olive_pollen    33312 non-null   float64       
 6   ragweed_pollen  34920 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 6.1 MB

--- First rows wit

Unnamed: 0,time,alder_pollen,birch_pollen,grass_pollen,mugwort_pollen,olive_pollen,ragweed_pollen
68065,2020-10-07 01:00:00,,0.0,0.0,,0.0,0.0
68066,2020-10-07 02:00:00,,0.0,0.0,,0.0,0.0
68067,2020-10-07 03:00:00,,0.0,0.0,,0.0,0.0
68068,2020-10-07 04:00:00,,0.0,0.0,,0.0,0.0
68069,2020-10-07 05:00:00,,0.0,0.0,,0.0,0.0


## <span style='color:#ff5f27'> Data cleaning</span>

In [None]:
# 1. Rimuovi eventuali duplicati basati sul timestamp
df_pollen = df_pollen.drop_duplicates(subset=['time'])

# 2. RIMOZIONE COLONNE IRRILEVANTI
# Rimuoviamo Olive (non presente a Stoccolma) e Ragweed (trascurabile)
# Usiamo una list comprehension per evitare errori se le colonne non fossero presenti
cols_to_drop = ['olive_pollen', 'ragweed_pollen']
df_pollen = df_pollen.drop(columns=[c for c in cols_to_drop if c in df_pollen.columns])

print(f"Colonne rimosse: {[c for c in cols_to_drop if c not in df_pollen.columns]}")

# 3. Identifica le colonne dei pollini RIMANENTI (quelle che finiscono con _pollen)
pollen_cols = [col for col in df_pollen.columns if '_pollen' in col]
print(f"Target rimasti per il modello: {pollen_cols}")

# 4. Gestione dei valori mancanti (NaN)
# Interpolazione lineare per mantenere la tendenza della curva nei buchi di dati
df_pollen[pollen_cols] = df_pollen[pollen_cols].interpolate(method='linear', limit_direction='both')

# Se rimangono NaN, riempiamo con 0
df_pollen[pollen_cols] = df_pollen[pollen_cols].fillna(0.0)

# 5. Sanity Check: I valori di polline non possono essere negativi
df_pollen[pollen_cols] = df_pollen[pollen_cols].clip(lower=0.0)

# 6. Ordinamento temporale
df_pollen = df_pollen.sort_values('time').reset_index(drop=True)

# 7. Preparazione colonne per Hopsworks Feature Store
# Aggiungiamo unix_time (event time) e city (partition/primary key)
df_pollen['unix_time'] = df_pollen['time'].astype('int64') // 10**6  # Convert to milliseconds
df_pollen['city'] = 'Stockholm'

# Visualizza statistiche post-pulizia
print(f"\nDataset pulito e filtrato. Dimensioni: {df_pollen.shape}")
print("Statistiche descrittive:")
display(df_pollen[pollen_cols].describe())

# Check finale: verifica se ci sono ancora NaN
if df_pollen.isnull().values.any():
    print("\nATTENZIONE: Ci sono ancora valori NaN nel dataset!")
    print(df_pollen.isnull().sum())
else:
    print("\nOttimo: Nessun valore NaN presente.")

Colonne rimosse: ['olive_pollen', 'ragweed_pollen']
Target rimasti per il modello: ['alder_pollen', 'birch_pollen', 'grass_pollen', 'mugwort_pollen']

Dataset pulito e filtrato. Dimensioni: (113568, 7)
Statistiche descrittive:


Unnamed: 0,alder_pollen,birch_pollen,grass_pollen,mugwort_pollen
count,113568.0,113568.0,113568.0,113568.0
mean,0.045648,5.958871,0.525137,0.232718
std,0.901749,49.149702,2.507345,1.535307
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,148.1,1880.5,41.7,48.0



Ottimo: Nessun valore NaN presente.


In [15]:
# Casting dei target a float32 per risparmiare memoria
df_pollen[pollen_cols] = df_pollen[pollen_cols].astype('float32')

# Verifica del cambiamento
print("Tipi di dato dopo il casting:")
print(df_pollen[pollen_cols].dtypes)

Tipi di dato dopo il casting:
alder_pollen      float32
birch_pollen      float32
grass_pollen      float32
mugwort_pollen    float32
dtype: object


## <span style='color:#ff5f27'> Add country and city to the DataFrame </span>

In [16]:
# Aggiunta colonne per identificazione geografica (utili per Primary/Partition Key)
df_pollen['country'] = "Sweden"
df_pollen['city'] = "Stockholm"

# Verifica rapida
print("Nuove colonne aggiunte:")
display(df_pollen[['country', 'city']].head(3))

Nuove colonne aggiunte:


Unnamed: 0,country,city
0,Sweden,Stockholm
1,Sweden,Stockholm
2,Sweden,Stockholm


### Adding lagged features

To capture short-term temporal dependencies, we add three new features representing the pollen levels of the previous 1, 2, and 3 days.
These lagged values will help the model learn patterns over time.

In [18]:
# 1. Ordinamento fondamentale
# I dati devono essere ordinati temporalmente per citt√† affinch√© lo shift abbia senso
df_pollen = df_pollen.sort_values(['city', 'time'])

# 2. Definizione dei ritardi (lags) in giorni
lags = [1, 2, 3]

print(f"Creazione feature lagged per: {pollen_cols}")

# 3. Creazione delle colonne lagged
for col in pollen_cols:
    for lag in lags:
        # Nome della nuova colonna, es: birch_pollen_lag_1
        new_col_name = f"{col}_lag_{lag}"
        
        # Eseguiamo lo shift raggruppando per citt√† 
        # (cos√¨ se in futuro aggiungi altre citt√†, i dati non si mischiano)
        df_pollen[new_col_name] = df_pollen.groupby('city')[col].shift(lag)

# 4. Rimozione dei valori nulli generati
# Le prime 3 righe di ogni citt√† avranno NaN perch√© non esiste un "giorno precedente".
# Dato che abbiamo anni di dati, eliminare 3 righe √® la scelta pi√π pulita.
df_pollen = df_pollen.dropna()

# 5. Casting opzionale a float32 per le nuove colonne (per risparmiare memoria)
lag_cols = [c for c in df_pollen.columns if '_lag_' in c]
df_pollen[lag_cols] = df_pollen[lag_cols].astype('float32')

# Verifica
print(f"\nNuove dimensioni del dataset: {df_pollen.shape}")
print("Esempio delle nuove colonne (Betulla):")
# Mostriamo le colonne relative alla betulla per vedere l'effetto "scala"
birch_lag_cols = [c for c in df_pollen.columns if 'birch_pollen' in c]
display(df_pollen[birch_lag_cols].head(6))

Creazione feature lagged per: ['alder_pollen', 'birch_pollen', 'grass_pollen', 'mugwort_pollen']

Nuove dimensioni del dataset: (113565, 20)
Esempio delle nuove colonne (Betulla):


Unnamed: 0,birch_pollen,birch_pollen_lag_1,birch_pollen_lag_2,birch_pollen_lag_3
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0


---

## <span style='color:#ff5f27'> Loading Weather Data from [Open Meteo](https://open-meteo.com/en/docs)

## <span style='color:#ff5f27'> Download the Historical Weather Data </span>


We will download the historical weather data for your `city` by first extracting the earliest date from your DataFrame containing the historical air quality measurements.

We will download all daily historical weather data measurements for your `city` from the earliest date in your air quality measurement DataFrame. It doesn't matter if there are missing days of air quality measurements. We can store all of the daily weather measurements, and when we build our training dataset, we will join up the air quality measurements for a given day to its weather features for that day. 


In [19]:
import requests
import pandas as pd

# 1. Estrazione della data di inizio dai dati dei pollini esistenti
# Assumiamo che df_pollen sia il dataframe creato nei passaggi precedenti
min_date = df_pollen['time'].min().strftime('%Y-%m-%d')
max_date = df_pollen['time'].max().strftime('%Y-%m-%d')

print(f"Scaricamento meteo per Stoccolma dal {min_date} al {max_date}...")

# 2. Selezione delle Feature Meteo (Strategia: Ampia selezione per Feature Selection successiva)
# Queste sono le variabili pi√π promettenti per la predizione dei pollini:
weather_features = [
    "temperature_2m_max",        # Fondamentale: il caldo innesca la fioritura
    "temperature_2m_min",        # Importante: gelate tardive bloccano i pollini
    "temperature_2m_mean",       # Media generale
    "precipitation_sum",         # La pioggia "lava" l'aria (riduce il polline)
    "rain_sum",
    "showers_sum",
    "snowfall_sum",
    "precipitation_hours",
    "wind_speed_10m_max",        # Il vento trasporta il polline (aumenta i livelli)
    "wind_gusts_10m_max",
    "wind_direction_10m_dominant", # La direzione conta se la fonte (foresta) √® a sud/nord
    "shortwave_radiation_sum",   # Luce solare: stimola la fotosintesi/fioritura
    "et0_fao_evapotranspiration" # Indice di "secchezza" del terreno/aria
]

# Uniamo le feature per l'URL
daily_params = ",".join(weather_features)

# 3. Chiamata API (Open-Meteo Historical Weather)
# Nota: usiamo l'endpoint 'archive' per i dati storici
url_weather = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}&" # Usa le variabili lat/lon gi√† definite
    f"start_date={min_date}&end_date={max_date}&"
    f"daily={daily_params}&"
    f"timezone=Europe%2FBerlin"
)

response_weather = requests.get(url_weather)

if response_weather.status_code == 200:
    data_weather = response_weather.json()
    
    # Creazione DataFrame
    df_weather = pd.DataFrame(data_weather['daily'])
    
    # Conversione data
    df_weather['time'] = pd.to_datetime(df_weather['time'])
    
    # Aggiunta chiavi per il join futuro
    df_weather['city'] = 'Stockholm'
    df_weather['unix_time'] = df_weather['time'].astype('int64') // 10**6
    
    # Casting a float32 per le colonne numeriche (esclusa time, city, unix_time)
    numeric_cols = [c for c in df_weather.columns if c not in ['time', 'city', 'unix_time']]
    df_weather[numeric_cols] = df_weather[numeric_cols].astype('float32')
    
    print(f"Meteo scaricato: {df_weather.shape[0]} giorni.")
    display(df_weather.head())
    
else:
    print(f"Errore download meteo: {response_weather.status_code}")
    print(response_weather.text)

Scaricamento meteo per Stoccolma dal 2013-01-01 al 2025-12-15...
Meteo scaricato: 4732 giorni.


Unnamed: 0,time,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,rain_sum,showers_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,city,unix_time
0,2013-01-01,4.8,1.1,3.3,1.3,1.3,0.0,0.0,5.0,25.0,49.299999,224.0,0.82,0.17,Stockholm,1356998400000
1,2013-01-02,1.4,-2.6,0.1,0.0,0.0,0.0,0.0,0.0,16.700001,33.799999,252.0,0.87,0.08,Stockholm,1357084800000
2,2013-01-03,1.5,-4.1,-0.9,1.0,0.4,0.0,0.42,4.0,19.4,36.0,264.0,0.39,0.07,Stockholm,1357171200000
3,2013-01-04,2.7,-0.3,1.6,0.0,0.0,0.0,0.0,0.0,23.200001,45.0,309.0,1.32,0.41,Stockholm,1357257600000
4,2013-01-05,-0.4,-3.6,-2.2,0.0,0.0,0.0,0.0,0.0,16.9,35.299999,322.0,1.25,0.06,Stockholm,1357344000000


In [20]:
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4732 entries, 0 to 4731
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   time                         4732 non-null   datetime64[ns]
 1   temperature_2m_max           4732 non-null   float32       
 2   temperature_2m_min           4732 non-null   float32       
 3   temperature_2m_mean          4732 non-null   float32       
 4   precipitation_sum            4732 non-null   float32       
 5   rain_sum                     4732 non-null   float32       
 6   showers_sum                  4732 non-null   float32       
 7   snowfall_sum                 4732 non-null   float32       
 8   precipitation_hours          4732 non-null   float32       
 9   wind_speed_10m_max           4732 non-null   float32       
 10  wind_gusts_10m_max           4732 non-null   float32       
 11  wind_direction_10m_dominant  4732 non-null 

### <span style="color:#ff5f27;"> Connect to Hopsworks and save the sensor country and city names as a secret</span>

In [21]:
fs = project.get_feature_store() 

### <span style="color:#ff5f27;"> Create the Feature Groups and insert the DataFrames in them </span>

In [22]:
import great_expectations as ge
from hopsworks import RestAPIError

# 1. Connessione al Feature Store
fs = project.get_feature_store()

# 2. Creazione (o recupero) del Feature Group per i Pollini
# Definiamo la chiave primaria (city + timestamp) e l'event time
try:
    pollen_fg = fs.get_or_create_feature_group(
        name="pollen_measurements",
        version=1,
        description="Hourly pollen levels measurements for Stockholm (Open-Meteo Source)",
        primary_key=['city', 'unix_time'], 
        event_time='unix_time',
        online_enabled=True # Abilita l'accesso rapido per l'inferenza online
    )
    print("Feature Group 'pollen_measurements' recuperato o creato.")
except Exception as e:
    print(f"Errore nella creazione del Feature Group: {e}")

# 3. Definizione della Suite di Validazione (Great Expectations)
# Creiamo una suite di regole che i dati devono rispettare
expectation_suite = ge.core.ExpectationSuite(expectation_suite_name="pollen_validations")

# Impostiamo un valore massimo teorico di sicurezza (es. 5000) per evitare outlier estremi
# I valori reali raramente superano i 2000, ma 5000 √® un buon "sanity check"
max_pollen_value = 5000.0

print(f"Configurazione validazioni per le colonne: {pollen_cols}")

for col in pollen_cols:
    # Regola: Il valore deve essere compreso tra 0 e max_pollen_value
    expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={
                "column": col,
                "min_value": 0.0,
                "max_value": max_pollen_value
            }
        )
    )
    
    # Regola opzionale: La colonna non deve essere nulla (gi√† gestito nel cleaning, ma utile ribadire)
    expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": col}
        )
    )

# 4. Attacca la Suite al Feature Group su Hopsworks
# validation_ingestion_policy="STRICT" -> Blocca l'inserimento se i dati sono "sporchi"
# validation_ingestion_policy="ALWAYS" -> Inserisce comunque ma logga l'errore (utile in dev)
pollen_fg.save_expectation_suite(
    expectation_suite=expectation_suite, 
    validation_ingestion_policy="STRICT"
)
print("Expectation Suite salvata e attivata sul Feature Group.")

# 5. Inserimento dei dati (Scrittura su Hopsworks)
# Questo processo avvier√† la validazione. Se i dati del df_pollen violano le regole, riceverai un errore.
print("Inizio caricamento dati su Hopsworks...")
try:
    pollen_fg.insert(df_pollen)
    print("Caricamento completato con successo! I dati sono stati validati e salvati.")
except Exception as e:
    print(f"Errore durante l'inserimento (probabile violazione delle validazioni): {e}")

Feature Group 'pollen_measurements' recuperato o creato.
Configurazione validazioni per le colonne: ['alder_pollen', 'birch_pollen', 'grass_pollen', 'mugwort_pollen']
Expectation Suite salvata e attivata sul Feature Group.
Inizio caricamento dati su Hopsworks...
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1279152/fs/1258616/fg/1867084
2025-12-15 15:53:10,380 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279152/fs/1258616/fg/1867084


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 113565/113565 | Elapsed Time: 00:18 | Remaining Time: 00:00


Launching job: pollen_measurements_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279152/jobs/named/pollen_measurements_1_offline_fg_materialization/executions
Caricamento completato con successo! I dati sono stati validati e salvati.


#### Enter a description for each feature in the Feature Group

In [24]:
# 1. ID and Temporal Features
descriptions = {
    "city": "City of the measurement (e.g., Stockholm). Used as Primary Key / Partition Key.",
    "unix_time": "Event timestamp in milliseconds (Epoch). Used as Event Time.",
    "time": "Date and time of the measurement (datetime format).",
    "country": "Country of the measurement (e.g., Sweden)."
}

# 2. Target Features (Pollen Concentrations) - Unit: grains/m¬≥
# Base descriptions for pollen types
pollen_desc_base = {
    "alder_pollen": "Concentration of Alder pollen",
    "birch_pollen": "Concentration of Birch pollen",
    "grass_pollen": "Concentration of Grass pollen",
    "mugwort_pollen": "Concentration of Mugwort pollen"
}

# Add pollen targets to the main dictionary
for col, desc in pollen_desc_base.items():
    descriptions[col] = f"{desc} in grains/m¬≥. Source: CAMS/Open-Meteo."

# 3. Weather Features - Add units
weather_desc = {
    "temperature_2m_max": "Maximum daily temperature at 2 meters above ground (¬∞C).",
    "temperature_2m_min": "Minimum daily temperature at 2 meters above ground (¬∞C).",
    "temperature_2m_mean": "Mean daily temperature at 2 meters above ground (¬∞C).",
    "precipitation_sum": "Daily sum of precipitation (rain + snow) in mm.",
    "rain_sum": "Daily sum of rain in mm.",
    "showers_sum": "Daily sum of showers in mm.",
    "snowfall_sum": "Daily sum of snowfall in cm.",
    "precipitation_hours": "Total hours of precipitation during the day.",
    "wind_speed_10m_max": "Maximum wind speed at 10 meters (km/h).",
    "wind_gusts_10m_max": "Maximum wind gusts at 10 meters (km/h).",
    "wind_direction_10m_dominant": "Dominant wind direction at 10 meters (¬∞).",
    "shortwave_radiation_sum": "Shortwave solar radiation sum (MJ/m¬≤). Indicates solar energy received.",
    "et0_fao_evapotranspiration": "Reference evapotranspiration (ET0) in mm. Indicates environmental dryness."
}
descriptions.update(weather_desc)

# 4. Lagged Features (Auto-generated)
# Generate descriptions for _lag_1, _lag_2, _lag_3 columns
lags = [1, 2, 3]
for col_name, base_desc in pollen_desc_base.items():
    for lag in lags:
        lag_col_name = f"{col_name}_lag_{lag}"
        descriptions[lag_col_name] = f"{base_desc} recorded {lag} day(s) before (grains/m¬≥)."

# ---------------------------------------------------------
# Update Descriptions on Hopsworks
# ---------------------------------------------------------
print("Updating feature descriptions on Hopsworks...")

for feature_name, description in descriptions.items():
    try:
        # Check if feature exists in the FG before updating
        _ = pollen_fg.get_feature(feature_name)
        
        # Update description
        pollen_fg.update_feature_description(feature_name, description)        
    except Exception as e:
        # Skip if feature was dropped during cleaning
        pass

print("\n--- Feature descriptions successfully updated! ---")

Updating feature descriptions on Hopsworks...

--- Feature descriptions successfully updated! ---


### <span style='color:#ff5f27'> Weather Data

In [25]:
import great_expectations as ge

# 1. Recupera o Crea il Feature Group per il Meteo
try:
    weather_fg = fs.get_or_create_feature_group(
        name="weather_measurements",
        version=1,
        description="Daily weather measurements for Stockholm (Open-Meteo Source)",
        primary_key=['city', 'unix_time'],
        event_time='unix_time',
        online_enabled=True
    )
    print("Feature Group 'weather_measurements' recuperato o creato.")
except Exception as e:
    print(f"Errore creazione FG Meteo: {e}")

# 2. Definizione delle Validazioni (Expectation Suite)
weather_suite = ge.core.ExpectationSuite(expectation_suite_name="weather_validations")

# Regola A: La pioggia e il vento non possono essere negativi
non_negative_cols = [
    "precipitation_sum", "rain_sum", "showers_sum", "snowfall_sum", 
    "wind_speed_10m_max", "wind_gusts_10m_max", "shortwave_radiation_sum",
    "et0_fao_evapotranspiration"
]

for col in non_negative_cols:
    if col in df_weather.columns:
        weather_suite.add_expectation(
            ge.core.ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_between",
                kwargs={"column": col, "min_value": 0.0, "max_value": 10000.0} # Max alto generico
            )
        )

# Regola B: Temperature "fisicamente possibili" a Stoccolma (es. tra -50¬∞C e +50¬∞C)
temp_cols = ["temperature_2m_max", "temperature_2m_min", "temperature_2m_mean"]
for col in temp_cols:
    if col in df_weather.columns:
        weather_suite.add_expectation(
            ge.core.ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_between",
                kwargs={"column": col, "min_value": -50.0, "max_value": 50.0}
            )
        )

# 3. Attacca la Suite al Feature Group
weather_fg.save_expectation_suite(
    expectation_suite=weather_suite, 
    validation_ingestion_policy="STRICT"
)
print("Validazioni meteo configurate.")

# 4. Inserimento dei dati
print("Caricamento dati meteo su Hopsworks...")
try:
    weather_fg.insert(df_weather)
    print("Dati meteo caricati e validati con successo!")
except Exception as e:
    print(f"Errore inserimento meteo: {e}")

Feature Group 'weather_measurements' recuperato o creato.
Validazioni meteo configurate.
Caricamento dati meteo su Hopsworks...
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1279152/fs/1258616/fg/1866063
2025-12-15 15:57:33,417 INFO: 	11 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279152/fs/1258616/fg/1866063


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 4732/4732 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_measurements_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279152/jobs/named/weather_measurements_1_offline_fg_materialization/executions
Dati meteo caricati e validati con successo!


#### Enter a description for each feature in the Feature Group

In [27]:
# Ridefiniamo il dizionario per sicurezza (caso in cui non hai eseguito la cella precedente)
weather_desc = {
    "city": "City of the measurement (e.g., Stockholm). Partition Key.",
    "unix_time": "Event timestamp in milliseconds (Epoch).",
    "time": "Date of the measurement.",
    "temperature_2m_max": "Maximum daily temperature at 2 meters (¬∞C).",
    "temperature_2m_min": "Minimum daily temperature at 2 meters (¬∞C).",
    "temperature_2m_mean": "Mean daily temperature at 2 meters (¬∞C).",
    "precipitation_sum": "Daily sum of precipitation (rain + snow) in mm.",
    "rain_sum": "Daily sum of rain in mm.",
    "showers_sum": "Daily sum of showers in mm.",
    "snowfall_sum": "Daily sum of snowfall in cm.",
    "precipitation_hours": "Total hours of precipitation during the day.",
    "wind_speed_10m_max": "Maximum wind speed at 10 meters (km/h).",
    "wind_gusts_10m_max": "Maximum wind gusts at 10 meters (km/h).",
    "wind_direction_10m_dominant": "Dominant wind direction at 10 meters (¬∞).",
    "shortwave_radiation_sum": "Shortwave solar radiation sum (MJ/m¬≤).",
    "et0_fao_evapotranspiration": "Reference evapotranspiration (ET0) in mm."
}

print("Aggiornamento descrizioni feature meteo...")
for feature_name, description in weather_desc.items():
    try:
        # Verifica se la feature esiste nel DataFrame caricato
        if feature_name in df_weather.columns:
            weather_fg.update_feature_description(feature_name, description)
    except Exception as e:
        print(f"Skip {feature_name}: {e}")

print("--- Descrizioni Meteo Aggiornate ---")

Aggiornamento descrizioni feature meteo...
--- Descrizioni Meteo Aggiornate ---
