In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

# Resolve the project root: if the notebook was started from a directory
# named `notebooks`, the root is that directory's parent; otherwise it is
# the current working directory itself.
cwd = Path().absolute()
root_dir = str(cwd.parent if cwd.name == "notebooks" else cwd)
print("Local environment")

print(f"Root dir: {root_dir}")

# Extend the module search path with the project root (only once), so the
# `import config` below is resolved against it.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Build the settings object from <root_dir>/.env.
# The import is deliberately placed after the sys.path update above.
import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /Users/woland02/VsCode/ID2223-ScalableMLDL_Project
Added the following directory to the PYTHONPATH: /Users/woland02/VsCode/ID2223-ScalableMLDL_Project
HopsworksSettings initialized!


### <span style='color:#ff5f27'> 📝 Imports </span>

In [2]:
# Imports used by the rest of the notebook.
# `util` is presumably a project-local module found via the sys.path entry
# added in the first cell — TODO confirm.
import datetime
import requests
import pandas as pd
import hopsworks
import util
import datetime  # NOTE(review): duplicate of the first import; harmless but redundant
from pathlib import Path
import json
import re
import os
import warnings
# Suppress ALL warnings from here on (broader than the IPython-only filter
# installed in the first cell).
warnings.filterwarnings("ignore")

## Hopsworks API Key
You need to have registered an account on app.hopsworks.ai.

Save the HOPSWORKS_API_KEY to the `.env` file in the root directory of your project.

In the .env file, update HOPSWORKS_API_KEY:

`HOPSWORKS_API_KEY="put API KEY value in this string"`


In [3]:
# Log in to Hopsworks; `project` is the handle later cells use to reach the
# feature store.
project = hopsworks.login()

2025-12-20 16:31:56,856 INFO: Initializing external client
2025-12-20 16:31:56,858 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-12-20 16:31:58,664 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1267872


In [4]:
from hopsworks import RestAPIError  # NOTE(review): not referenced in the visible cells

# Open-Meteo air-quality endpoint; queried below for the hourly pollen variables.
api_url = "https://air-quality-api.open-meteo.com/v1/air-quality"
# Location metadata and coordinates shared by the pollen and weather requests.
country = "Sweden"
city = "Stockholm"
latitude = 59.3346 
longitude = 18.0632

## <span style='color:#ff5f27'> Read the historical data into a DataFrame </span>

The cell below reads the historical hourly pollen levels into a pandas DataFrame.

In [5]:
# Fetch the hourly pollen history for the configured location into `df_pollen`.
# The cell fails loudly on any non-200 response so that later cells never run
# against a missing (or stale, from a previous run) `df_pollen`.

# Date range for the historical data: fixed start, up to today
start_date = "2013-01-01"
end_date = datetime.date.today().strftime("%Y-%m-%d")

# Pollen variables requested from Open-Meteo.
# We don't consider olive and ragweed because they are negligible
pollen_vars = [
    "alder_pollen",
    "birch_pollen",
    "grass_pollen",
    "mugwort_pollen",
]

# Query parameters for the air-quality endpoint
params = {
    "latitude": latitude,
    "longitude": longitude,
    "hourly": ",".join(pollen_vars),  # Request all pollen types
    "start_date": start_date,
    "end_date": end_date,
    "timezone": "auto",  # Automatically match the city's timezone
}

print(f"Fetching pollen data for {city} from {start_date} to {end_date}...")
# `requests` never times out by default; an explicit timeout prevents the
# cell from hanging indefinitely on a stalled connection.
response = requests.get(api_url, params=params, timeout=60)

if response.status_code != 200:
    # Show the diagnostics, then stop the notebook here.
    print(f"Failed to fetch data. Status code: {response.status_code}")
    print(response.text)
    raise RuntimeError(
        f"Pollen request failed with HTTP status {response.status_code}"
    )

response_json = response.json()

# Convert the 'hourly' payload into a DataFrame (one row per hour)
hourly_data = response_json["hourly"]
df_pollen = pd.DataFrame(hourly_data)

# Parse the 'time' column into datetimes; it stays a regular column here
# (it is set as the index later, in the cleaning cell).
df_pollen["time"] = pd.to_datetime(df_pollen["time"])

print("Data fetched successfully!")
print(df_pollen.head())
print(f"Total rows: {len(df_pollen)}")

Fetching pollen data for Stockholm from 2013-01-01 to 2025-12-20...
Data fetched successfully!
                 time  alder_pollen  birch_pollen  grass_pollen  \
0 2013-01-01 00:00:00           NaN           NaN           NaN   
1 2013-01-01 01:00:00           NaN           NaN           NaN   
2 2013-01-01 02:00:00           NaN           NaN           NaN   
3 2013-01-01 03:00:00           NaN           NaN           NaN   
4 2013-01-01 04:00:00           NaN           NaN           NaN   

   mugwort_pollen  
0             NaN  
1             NaN  
2             NaN  
3             NaN  
4             NaN  
Total rows: 113688


## <span style='color:#ff5f27'> Data cleaning</span>

In [6]:
# Build the daily, gap-free pollen table `df_pollen_daily` from the hourly
# `df_pollen` frame.
pollen_cols = ['alder_pollen', 'birch_pollen', 'grass_pollen', 'mugwort_pollen']
cols_to_keep = ['time', 'alder_pollen', 'birch_pollen', 'grass_pollen', 'mugwort_pollen']

# Hourly frame restricted to the relevant columns and indexed by timestamp
df_pollen_clean = df_pollen[cols_to_keep].set_index('time')

# NOTE(review): the fetch output above shows NaN for the earliest hours
# (2013-01-01) while the lagged-feature output shows 0.0 from 2013-01-04, so
# limit_direction='both' appears to back-fill the leading missing period with
# the first valid value rather than dropping it — confirm this is intended.
df_pollen_daily = (
    df_pollen_clean
    .resample('D').mean()                                   # hourly -> daily mean
    .interpolate(method='time', limit_direction='both')     # fill gaps in both directions
    .dropna()                                               # drop anything still missing
    .astype({name: 'float32' for name in pollen_cols})      # downcast the measurements
    .reset_index()
    .rename(columns={'time': 'date'})
    .assign(country=country, city=city)                     # location metadata columns
)

print("Data cleaning and interpolation complete.")
print(f"Remaining missing values: {df_pollen_daily.isna().sum().sum()}")

Data cleaning and interpolation complete.
Remaining missing values: 0


### Adding lagged features

To capture short-term temporal dependencies, we add three lagged features for each pollen type (twelve new columns in total), holding the pollen level of the previous 1, 2, and 3 days.
These lagged values help the model learn patterns over time.

In [7]:
# Pollen columns for which 1-, 2- and 3-day lags are generated
pollen_cols = ['alder_pollen', 'birch_pollen', 'grass_pollen', 'mugwort_pollen']

# One shifted copy per (column, lag) pair, named e.g. 'alder_pollen_lag_1'.
# Insertion order matches the nested loop: all lags of a column, column by column.
lagged_features = {
    f"{name}_lag_{days}": df_pollen_daily[name].shift(days)
    for name in pollen_cols
    for days in range(1, 4)
}

# Attach the lag columns, discard the leading rows whose lags are NaN
# (there is no earlier day to shift from), and renumber the rows.
df_pollen_daily = (
    df_pollen_daily
    .assign(**lagged_features)
    .dropna()
    .reset_index(drop=True)
)

# Check the result
print("Data with lagged features:")
preview_cols = ['date', 'birch_pollen', 'birch_pollen_lag_1', 'birch_pollen_lag_2']
print(df_pollen_daily[preview_cols].head())
print(f"\nNew DataFrame shape: {df_pollen_daily.shape}")

Data with lagged features:
        date  birch_pollen  birch_pollen_lag_1  birch_pollen_lag_2
0 2013-01-04           0.0                 0.0                 0.0
1 2013-01-05           0.0                 0.0                 0.0
2 2013-01-06           0.0                 0.0                 0.0
3 2013-01-07           0.0                 0.0                 0.0
4 2013-01-08           0.0                 0.0                 0.0

New DataFrame shape: (4734, 19)


---

## <span style='color:#ff5f27'> Loading Weather Data from [Open Meteo](https://open-meteo.com/en/docs) </span>

## <span style='color:#ff5f27'> Download the Historical Weather Data </span>


We will download the historical weather data for your `city` by first extracting the earliest date from the DataFrame containing the historical pollen measurements.

We will download all daily historical weather measurements for your `city` from that earliest date up to today. It doesn't matter if there are missing days of pollen measurements. We can store all of the daily weather measurements, and when we build our training dataset, we will join the pollen measurements for a given day to the weather features for that day. 


In [8]:
# Fetch the daily weather history for the configured location into `df_weather`.
# The cell fails loudly on any non-200 response so that later cells never run
# against a missing (or stale, from a previous run) `df_weather`.

# Cover the same period as the pollen table: earliest pollen date up to today
start_date = df_pollen_daily['date'].min().strftime('%Y-%m-%d')
end_date = datetime.date.today().strftime('%Y-%m-%d')

print(f"Fetching historical weather data from {start_date} to {end_date}...")

# Open-Meteo Archive API (historical weather)
weather_api_url = "https://archive-api.open-meteo.com/v1/archive"

# Daily weather variables considered relevant for pollen prediction
weather_vars = [
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "wind_speed_10m_max",
    "wind_direction_10m_dominant",
    "weather_code",
]

# Query parameters for the archive endpoint
params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": start_date,
    "end_date": end_date,
    "daily": ",".join(weather_vars),
    "timezone": "auto",
}

# `requests` never times out by default; an explicit timeout prevents the
# cell from hanging indefinitely on a stalled connection.
response = requests.get(weather_api_url, params=params, timeout=60)

if response.status_code != 200:
    # Show the diagnostics, then stop the notebook here.
    print(f"Failed to fetch weather data. Status: {response.status_code}")
    print(response.text)
    raise RuntimeError(
        f"Weather request failed with HTTP status {response.status_code}"
    )

response_json = response.json()

# Convert the 'daily' payload into a DataFrame (one row per day)
daily_data = response_json["daily"]
df_weather = pd.DataFrame(daily_data)

# Parse 'time' into datetimes and expose it under the name 'date'
df_weather["time"] = pd.to_datetime(df_weather["time"])
df_weather.rename(columns={"time": "date"}, inplace=True)

# Cast every measurement column (everything except 'date') to float32 to save memory
float_cols = [c for c in df_weather.columns if c != "date"]
df_weather[float_cols] = df_weather[float_cols].astype("float32")

# Location metadata columns
df_weather["country"] = country
df_weather["city"] = city

print("Weather data fetched successfully!")
print(df_weather.head())
print(f"Shape: {df_weather.shape}")

Fetching historical weather data from 2013-01-04 to 2025-12-20...
Weather data fetched successfully!
        date  temperature_2m_max  temperature_2m_min  temperature_2m_mean  \
0 2013-01-04                 2.7                -0.3                  1.6   
1 2013-01-05                -0.4                -3.6                 -2.2   
2 2013-01-06                -0.2                -3.8                 -1.7   
3 2013-01-07                 0.8                -4.8                 -1.2   
4 2013-01-08                 1.3                -2.5                  0.0   

   precipitation_sum  rain_sum  snowfall_sum  wind_speed_10m_max  \
0                0.0       0.0          0.00           23.200001   
1                0.0       0.0          0.00           16.900000   
2                0.0       0.0          0.00            9.400000   
3                0.0       0.0          0.00           15.100000   
4                2.5       0.1          1.75           18.400000   

   wind_direction_10m_domin

In [9]:
# Print the column dtypes and non-null counts of the weather frame as a sanity check.
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4734 entries, 0 to 4733
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         4734 non-null   datetime64[ns]
 1   temperature_2m_max           4734 non-null   float32       
 2   temperature_2m_min           4734 non-null   float32       
 3   temperature_2m_mean          4734 non-null   float32       
 4   precipitation_sum            4734 non-null   float32       
 5   rain_sum                     4734 non-null   float32       
 6   snowfall_sum                 4734 non-null   float32       
 7   wind_speed_10m_max           4734 non-null   float32       
 8   wind_direction_10m_dominant  4734 non-null   float32       
 9   weather_code                 4734 non-null   float32       
 10  country                      4734 non-null   object        
 11  city                         4734 non-null 

In [10]:
# Feature store handle of the logged-in project; used below to get/create feature groups.
fs = project.get_feature_store() 

### <span style="color:#ff5f27;"> Create the Feature Groups and insert the DataFrames in them </span>

In [20]:
import pandas as pd

# Shape the frame for the feature group:
#   * `date` as a timestamp (used as event time),
#   * `date_str` (YYYY-MM-DD) as the primary-key column,
#   * `country` / `city` removed if present (date_str must be kept!).
df_pollen_daily = (
    df_pollen_daily
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .assign(date_str=lambda frame: frame["date"].dt.strftime("%Y-%m-%d"))
    .drop(columns=["country", "city"], errors="ignore")
)

# IMPORTANT: stay on the existing schema — per the original author, version 1
# of this feature group expects `date_str` as its primary key.
pollen_fg = fs.get_or_create_feature_group(
    name="pollen_measurements",
    version=1,
    description="Daily average pollen levels for Stockholm",
    primary_key=["date_str"],
    event_time="date",
)

# Upload the rows; left as the last expression so the notebook displays the
# value returned by insert().
pollen_fg.insert(df_pollen_daily)


2025-12-20 16:39:56,001 INFO: 	4 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1267872/fs/1254483/fg/1867187


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 4734/4734 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: pollen_measurements_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1267872/jobs/named/pollen_measurements_1_offline_fg_materialization/executions


(Job('pollen_measurements_1_offline_fg_materialization', 'PYSPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_values_to_be_between",
         "kwargs": {
           "column": "mugwort_pollen",
           "min_value": 0.0,
           "max_value": 10000.0
         },
         "meta": {
           "expectationId": 794654
         }
       },
       "result": {
         "element_count": 4734,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-12-20T03:39:56.000000Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exc

In [21]:
# Attach human-readable descriptions to the pollen feature group's columns.
# The unit is written with a proper superscript three ("m³"); the previous
# text contained a mis-encoded character sequence in its place.
pollen_feature_descriptions = {
    "date": "Date of the pollen measurement (Timestamp)",
    "date_str": "Date of the pollen measurement (YYYY-MM-DD, primary key)",
    "alder_pollen": "Daily average Alder pollen concentration (grains/m³)",
    "birch_pollen": "Daily average Birch pollen concentration (grains/m³)",
    "grass_pollen": "Daily average Grass pollen concentration (grains/m³)",
    "mugwort_pollen": "Daily average Mugwort pollen concentration (grains/m³)",
}
for feature_name, feature_description in pollen_feature_descriptions.items():
    pollen_fg.update_feature_description(feature_name, feature_description)

# Lagged features: only describe the lag columns actually present in the frame
for pollen_type in ["alder_pollen", "birch_pollen", "grass_pollen", "mugwort_pollen"]:
    for i in range(1, 4):
        col_name = f"{pollen_type}_lag_{i}"
        if col_name in df_pollen_daily.columns:
            pollen_fg.update_feature_description(
                col_name,
                f"{pollen_type} concentration {i} day(s) prior"
            )


### <span style='color:#ff5f27'> Weather Data </span>

In [33]:
import great_expectations as ge
import pandas as pd

# -----------------------------
# Prepare / clean data (stable)
# -----------------------------
df_weather["date"] = pd.to_datetime(df_weather["date"])

# Daily identifier (YYYY-MM-DD) used as the feature group's primary key
df_weather["date_str"] = df_weather["date"].dt.strftime("%Y-%m-%d")

# Align the frame with the feature group schema and make inserts idempotent:
#   * country/city are not part of the stored schema, so they are dropped;
#   * duplicate days are removed so the primary key is really unique
#     (previously only the index was reset, which does not deduplicate).
df_weather = (
    df_weather
    .drop(columns=["country", "city"], errors="ignore")
    .drop_duplicates(subset=["date_str"], keep="last")
    .reset_index(drop=True)
)

# Get the Feature Store handle
fs = project.get_feature_store()

# -----------------------------
# Feature Group (stable schema)
# -----------------------------
# The primary key must only name columns that exist in the inserted frame.
# It previously listed country/city, which are dropped above; that is ignored
# when version 1 already exists but would break creation on a fresh project.
weather_fg = fs.get_or_create_feature_group(
    name="weather_measurements",
    version=1,
    primary_key=["date_str"],   # daily PK, consistent with the pollen feature group
    event_time="date",          # event time for point-in-time joins
    description="Daily weather data for Stockholm",
    online_enabled=True
)

# -----------------------------
# Expectation Suite (idempotent)
# -----------------------------
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)


def _add_range_expectation(column, min_value, max_value):
    """Add an inclusive [min_value, max_value] range check for `column` to the suite."""
    weather_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={"column": column, "min_value": min_value, "max_value": max_value}
        )
    )


# --- Temperature (generous bounds, in °C) ---
for temp_col in ["temperature_2m_max", "temperature_2m_min", "temperature_2m_mean"]:
    _add_range_expectation(temp_col, -50.0, 50.0)

# --- Non-negative physical quantities ---
non_negative_cols = ["precipitation_sum", "rain_sum", "snowfall_sum", "wind_speed_10m_max"]
for col in non_negative_cols:
    _add_range_expectation(col, 0.0, 1000.0)

# --- Wind Direction (0-360 degrees) ---
_add_range_expectation("wind_direction_10m_dominant", 0.0, 360.0)

# Save / update the suite. If one is already attached, replace it; any other
# error is re-raised unchanged.
try:
    weather_fg.save_expectation_suite(expectation_suite=weather_expectation_suite)
    print("Expectation Suite created and attached.")
except Exception as e:
    msg = str(e)
    if ("already attached" in msg) or ("409" in msg) or ("Conflict" in msg):
        print("Expectation Suite already exists -> deleting & re-attaching...")
        # No update call is used here (per the original author it is not
        # available in this HSFS version), so delete and save again.
        weather_fg.delete_expectation_suite()
        weather_fg.save_expectation_suite(expectation_suite=weather_expectation_suite)
        print("Expectation Suite re-attached.")
    else:
        raise

# -----------------------------
# Insert data (rerunnable)
# -----------------------------
# IMPORTANT: avoid overwrite=True here (can trigger backend clear/metadata ops)
weather_fg.insert(df_weather)

# -----------------------------
# Feature descriptions
# -----------------------------
# Degree signs are written as "°"; the previous text contained a mis-encoded
# character sequence in their place.
weather_feature_descriptions = {
    "date": "Date of the weather measurement (Timestamp)",
    "date_str": "Date string (YYYY-MM-DD) used as daily primary key",
    "temperature_2m_max": "Maximum daily air temperature at 2 meters (°C)",
    "temperature_2m_min": "Minimum daily air temperature at 2 meters (°C)",
    "temperature_2m_mean": "Mean daily air temperature at 2 meters (°C)",
    "precipitation_sum": "Sum of daily precipitation (rain + showers + snow) (mm)",
    "rain_sum": "Sum of daily rain (mm)",
    "snowfall_sum": "Sum of daily snowfall (cm)",
    "wind_speed_10m_max": "Maximum wind speed at 10 meters (km/h)",
    "wind_direction_10m_dominant": "Dominant wind direction at 10 meters (°)",
    "weather_code": "WMO Weather code (0-99) indicating general conditions",
}
for feature_name, feature_description in weather_feature_descriptions.items():
    weather_fg.update_feature_description(feature_name, feature_description)


Expectation Suite already exists -> deleting & re-attaching...
Attached expectation suite to Feature Group, edit it at https://c.app.hopsworks.ai:443/p/1267872/fs/1254483/fg/1840724
Expectation Suite re-attached.
2025-12-20 16:50:41,033 INFO: 	8 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1267872/fs/1254483/fg/1840724


Uploading Dataframe: 100.00% |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| Rows 4734/4734 | Elapsed Time: 00:03 | Remaining Time: 00:00


<hsfs.feature_group.FeatureGroup at 0x163091990>