In [None]:
import sys
from pathlib import Path

root_dir = Path().absolute()
# If the notebook was started from a directory named 'notebooks' and/or
# 'airquality', step up to the parent each time and add that parent to sys.path.
for launch_dir in ('notebooks', 'airquality'):
    if root_dir.name == launch_dir:
        root_dir = root_dir.parent
        sys.path.append(str(root_dir))
root_dir = str(root_dir)

print(f"Root dir: {root_dir}")

# Build the settings object from the file <root_dir>/.env
# (imported here, after sys.path has been adjusted above)
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

# <span style="font-weight:bold; font-size: 3rem; color:#333;">Training Pipeline</span>

## 🗒️ This notebook is divided into the following sections:

1. Select features for the model and create a Feature View with the selected features
2. Create training data using the feature view
3. Train model
4. Evaluate model performance
5. Save model to model registry

### <span style='color:#ff5f27'> 📝 Imports </span>

In [None]:
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import hopsworks
from airquality import util
import json
import os

import warnings
# Install a blanket "ignore" filter so warnings do not clutter the notebook output.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

## <span style="color:#ff5f27;"> 📡 Connect to Hopsworks Feature Store </span>

In [None]:
# Log in to Hopsworks and get a handle to the project's feature store
project = hopsworks.login()
fs = project.get_feature_store()

# The sensor location is read from the "SENSOR_LOCATION_JSON" secret and parsed as JSON
secrets = hopsworks.get_secrets_api()
location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
location = json.loads(location_str)
country, city, street = (location[key] for key in ('country', 'city', 'street'))

In [None]:
# Look up version 1 of the two feature groups used to build the training data
air_quality_fg = fs.get_feature_group(name='air_quality', version=1)
weather_fg = fs.get_feature_group(name='weather', version=1)

--- 

## <span style="color:#ff5f27;"> 🖍 Feature View Creation and Retrieving </span>

In [None]:
# Build the training query: 'pm25' and 'date' from the air quality feature
# group, joined on 'city' with the features selected from the weather group.
air_quality_query = air_quality_fg.select(['pm25', 'date'])
weather_query = weather_fg.select_features()
selected_features = air_quality_query.join(weather_query, on=['city'])

### Feature Views

`Feature Views` are selections of features from different **Feature Groups** that make up the input and output API (or schema) for a model. A **Feature View** can create **Training Data** and can also be used at inference time to retrieve inference data.

A Feature View defines a schema in the form of a query with filters, a model target feature/label, and optional transformation functions (declarative feature encoding).

To create a Feature View, we can use the `FeatureStore.get_or_create_feature_view()` method.

You can specify the following parameters:

- `name` - name of the feature view.

- `version` - version of the feature view.

- `labels`- our target variable.

- `transformation_functions` - declarative feature encoding (not used here)

- `query` - selected features/labels for the model 

In [None]:
# Fetch version 1 of 'air_quality_fv', or create it from the selected query
# with 'pm25' declared as the label column.
feature_view = fs.get_or_create_feature_view(
    name='air_quality_fv',
    version=1,
    query=selected_features,
    labels=['pm25'],
    description="weather features with air quality as the target",
)

## <span style="color:#ff5f27;">🪝 Split the training data into train/test data sets </span>

We use a time-series split here. The split is parameterized by `test_days` (number of days for test set).

The script dynamically calculates the test start date based on the most recent data available.

In [None]:
# Parameters: read from environment variables, falling back to defaults
test_days = int(os.getenv('TEST_DAYS', '30'))
min_train_days = int(os.getenv('MIN_TRAIN_DAYS', '180'))

# A test window shorter than one day would place test_start after the newest
# row and leave the test set empty, so fail early with a clear message.
if test_days < 1:
    raise ValueError(f"TEST_DAYS must be at least 1, got {test_days}")

print("Training parameters:")
print(f"  Test days: {test_days}")
print(f"  Minimum training days: {min_train_days}")

# Read the feature view's query result to determine the available date range
print(f"\nCalculating train/test split with test_days={test_days}...")
query = feature_view.query
df_temp = query.read()

if df_temp.empty:
    raise ValueError("No data available in feature view")

# Ensure date column is datetime so the date arithmetic below works
df_temp['date'] = pd.to_datetime(df_temp['date'])

# Oldest and newest dates present; min()/max() do not require sorted input
max_date = df_temp['date'].max()
min_date = df_temp['date'].min()

# min()/max() skip missing values, so NaT here means no usable date at all
if pd.isna(max_date) or pd.isna(min_date):
    raise ValueError("No valid dates available in feature view")

total_days = (max_date - min_date).days

print(f"Data available from {min_date.date()} to {max_date.date()} ({total_days} days)")

# Calculate test_start date: take the last test_days from the most recent available date
test_start = max_date - timedelta(days=test_days - 1)  # -1 to include max_date in test set

# Days between the oldest row and the start of the test window
train_days = (test_start - min_date).days

print("Proposed split:")
print(f"  Training: {min_date.date()} to {test_start.date()} ({train_days} days)")
print(f"  Testing:  {test_start.date()} to {max_date.date()} ({test_days} days)")

# Validation: refuse to continue when the training window is too short
if train_days < min_train_days:
    raise ValueError(
        f"Insufficient training data. Requested {test_days} test days leaves only "
        f"{train_days} training days, but minimum {min_train_days} required. "
        f"Total available: {total_days} days. "
        f"Try reducing test_days or min_train_days."
    )

print(f"\nUsing test_start date: {test_start.date()}")

In [None]:
# Ask the feature view for train/test features and labels, split at test_start
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_start=test_start)

In [None]:
# Display the training features returned by the train/test split
X_train

In [None]:
# Model inputs exclude 'date'; X_train and X_test themselves keep the column
non_feature_columns = ['date']
X_features = X_train.drop(columns=non_feature_columns)
X_test_features = X_test.drop(columns=non_feature_columns)

In [None]:
# Display the training labels returned by the train/test split
y_train

The `Feature View` is now saved in Hopsworks and you can retrieve it using `FeatureStore.get_feature_view(name='...', version=1)`.

---

## <span style="color:#ff5f27;">🧬 Modeling</span>

We will train a regression model to predict pm25 using our 4 features (wind_speed, wind_dir, temp, precipitation)

In [None]:
# Train an XGBoost regressor with default hyperparameters on the training data.
# fit() returns the estimator itself, so the fitted model is bound directly.
xgb_regressor = XGBRegressor().fit(X_features, y_train)


In [None]:
# Score the test features with the trained model
y_pred = xgb_regressor.predict(X_test_features)

# First column of y_test: the observed values the predictions are compared to
y_true = y_test.iloc[:, 0]

# Mean squared error between observed and predicted values
mse = mean_squared_error(y_true, y_pred)
print("MSE:", mse)

# Coefficient of determination for the same pair
r2 = r2_score(y_true, y_pred)
print("R squared:", r2)

In [None]:
# Work on a copy: a plain `df = y_test` would alias the label frame, so the
# columns added to df would also appear in y_test.
df = y_test.copy()
df['predicted_pm25'] = y_pred

In [None]:
# Attach the date of each test row (pandas aligns on the index), then order
# the rows chronologically and preview the first five.
df['date'] = X_test['date']
df = df.sort_values('date')
df.head()

In [None]:
# Create the model artifact directory and its images subdirectory.
# makedirs(exist_ok=True) creates any missing parent and does nothing when the
# directories already exist, so the cell can be re-run without an exists-check.
model_dir = "air_quality_model"
images_dir = model_dir + "/images"
os.makedirs(images_dir, exist_ok=True)

In [None]:
file_path = images_dir + "/pm25_hindcast.png"
# Bind the helper's return value to its own name rather than `plt`, so the
# matplotlib.pyplot alias imported at the top of the notebook is not overwritten
# for the cells that follow.
# NOTE(review): the helper lives outside this notebook; its return value is
# only assumed to expose .show(), exactly as the previous code assumed.
hindcast_plot = util.plot_air_quality_forecast(city, street, df, file_path, hindcast=True)
hindcast_plot.show()

In [None]:
# Draw XGBoost's feature-importance chart for the trained model, write it to
# the images directory, then render it in the notebook.
feature_importance_path = f"{images_dir}/feature_importance.png"
plot_importance(xgb_regressor)
plt.savefig(feature_importance_path)
plt.show()

---

## <span style='color:#ff5f27'>🗄 Model Registry</span>

One of the features in Hopsworks is the model registry. This is where you can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints.

In [None]:
# Write the trained regressor as JSON into the model directory
model_path = f"{model_dir}/model.json"
xgb_regressor.save_model(model_path)

In [None]:
import numpy as np

# Dates of the test rows, parsed once and reused for the span calculation
test_dates = pd.to_datetime(X_test['date'])

# Evaluation metrics to attach to the registered model; every value is a string
res_dict = {
    "MSE": str(mse),
    # An undefined (NaN) R squared is recorded as "0.0"
    "R squared": str(r2) if not np.isnan(r2) else "0.0",
    # Inclusive day count: both the first and the last test date belong to the
    # test set, so the span is (max - min) + 1 days.
    "test_days": str((test_dates.max() - test_dates.min()).days + 1),
    "train_samples": str(len(X_train)),
    "test_samples": str(len(X_test)),
}

In [None]:
# Handle to the project's model registry
mr = project.get_model_registry()

# Define the registry entry 'air_quality_xgboost_model', attaching the
# evaluation metrics and the feature view used for training
aq_model = mr.python.create_model(
    name="air_quality_xgboost_model",
    description="Air Quality (PM2.5) predictor",
    feature_view=feature_view,
    metrics=res_dict,
)

# Save the model, passing the local directory that holds its artifacts
aq_model.save(model_dir)

---
## <span style="color:#ff5f27;">⏭️ **Next:** Part 04: Batch Inference</span>

In the following notebook you will use your model for Batch Inference.
