### Imports

In [1]:
import datetime
import io
import requests
import pandas as pd
import hopsworks
import os
import warnings
import great_expectations as ge
from dotenv import load_dotenv 

# Load variables from a .env file (if one is present) into the environment so
# the Hopsworks credential does not have to be written into the notebook.
load_dotenv()

# Evaluates to None when HOPSWORKS_API_KEY is not defined.
api_key = os.environ.get("HOPSWORKS_API_KEY")

# Open a session against the Hopsworks cluster; `project` is the handle the
# rest of the notebook uses to reach the feature store.
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    project="jim",
    api_key_value=api_key,
)

# Silence all Python warnings from this point on (applied after login).
warnings.filterwarnings("ignore")

2025-12-18 20:41:43,887 INFO: Initializing external client
2025-12-18 20:41:43,889 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-18 20:41:45,286 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/120


In [2]:
# Feature store handle of the logged-in project; the cells below use it to
# create (or fetch) the feature groups.
fs = project.get_feature_store()

## Fetch NOAA GML Global monthly mean CO₂ 🌫️

We use NOAA GML's public text file for **Global monthly mean CO₂** and parse it into a clean Pandas DataFrame.

Source: NOAA GML CO₂ trends (Global monthly mean).


In [3]:
NOAA_CO2_URL = "https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_mm_gl.txt"
NOAA_MISSING_VALUE = -99.99  # sentinel NOAA writes where a value is missing

# Always pass a timeout: without one the request can block forever on a
# stalled connection and hang "Run All".
response = requests.get(NOAA_CO2_URL, timeout=30)
response.raise_for_status()

# Columns in this file (see header in the NOAA text):
# year, month, decimal_date, average, average_unc, trend, trend_unc
#
# The descriptive header lines start with '#', so the parser skips them via
# comment="#" (blank lines are skipped by default). sep=r"\s+" is the
# supported replacement for the deprecated delim_whitespace=True.
co2_df = pd.read_csv(
    io.StringIO(response.text),
    sep=r"\s+",
    comment="#",
    header=None,
    names=["year", "month", "decimal_date", "average", "average_unc", "trend", "trend_unc"],
)

# Build a proper datetime (first day of each month)
co2_df["date"] = pd.to_datetime(
    {
        "year": co2_df["year"].astype(int),
        "month": co2_df["month"].astype(int),
        "day": 1,
    }
)

# Replace the NOAA missing-value marker with NaN. A float NaN is used rather
# than pd.NA: pd.NA would turn the column into object dtype and make the
# float32 cast raise as soon as a marker is actually present.
for col in ["average", "trend", "average_unc", "trend_unc"]:
    co2_df[col] = co2_df[col].replace(NOAA_MISSING_VALUE, float("nan")).astype("float32")

# Rows without a monthly average are unusable for the target series
co2_df = co2_df.dropna(subset=["average"]).copy()

# Keep a tidy subset of columns we care about, in chronological order
co2_df = co2_df[["date", "average", "trend", "average_unc", "trend_unc"]].sort_values("date").reset_index(drop=True)

co2_df.head()


Unnamed: 0,date,average,trend,average_unc,trend_unc
0,1979-01-01,336.559998,335.920013,0.11,0.09
1,1979-02-01,337.290009,336.26001,0.09,0.09
2,1979-03-01,337.880005,336.51001,0.11,0.09
3,1979-04-01,338.320007,336.720001,0.13,0.1
4,1979-05-01,338.26001,336.709991,0.04,0.1


### Add lag and rolling mean features

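We enrich the monthly CO₂ series with simple lag features and rolling means on the `average` and `trend` series, which the model can later use for forecasting. The rolling means are computed over past months only, so they never include the current month's value.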


In [4]:
import numpy as np

# shift() and rolling() work on row position, so put the rows in
# chronological order before deriving anything from them.
co2_df = co2_df.sort_values("date").reset_index(drop=True)

source_series = ("average", "trend")

# Lag features: the value of each series k rows (months) earlier.
for months_back in (1, 2, 3, 6, 12):
    for series_name in source_series:
        co2_df[f"{series_name}_lag_{months_back}"] = co2_df[series_name].shift(months_back)

# Rolling means over each series.
# IMPORTANT: shift by 1 first so every window ends at the previous month and
# the current month's value never leaks into its own feature.
for series_name in source_series:
    previous_months = co2_df[series_name].shift(1)
    for window_size in (3, 12):
        co2_df[f"{series_name}_roll_{window_size}"] = previous_months.rolling(window=window_size).mean()

# dropna() without a subset removes every row holding a NaN in any column,
# which includes the leading rows that lack full lag/rolling history.
co2_df = co2_df.dropna().reset_index(drop=True)

co2_df.head()


Unnamed: 0,date,average,trend,average_unc,trend_unc,average_lag_1,trend_lag_1,average_lag_2,trend_lag_2,average_lag_3,trend_lag_3,average_lag_6,trend_lag_6,average_lag_12,trend_lag_12,average_roll_3,average_roll_12,trend_roll_3,trend_roll_12
0,1980-01-01,338.579987,337.940002,0.13,0.1,337.890015,337.660004,337.359985,337.679993,336.359985,337.709991,335.570007,336.410004,336.559998,335.920013,337.203328,336.855835,337.683329,336.855835
1,1980-02-01,339.26001,338.230011,0.09,0.09,338.579987,337.940002,337.890015,337.660004,337.359985,337.679993,334.369995,336.660004,337.290009,336.26001,337.943329,337.024167,337.76,337.024167
2,1980-03-01,339.600006,338.230011,0.08,0.09,339.26001,338.230011,338.579987,337.940002,337.890015,337.660004,335.019989,337.410004,337.880005,336.51001,338.57667,337.188334,337.943339,337.188334
3,1980-04-01,340.0,338.399994,0.08,0.08,339.600006,338.230011,339.26001,338.230011,338.579987,337.940002,336.359985,337.709991,338.320007,336.720001,339.146667,337.331668,338.133341,337.331668
4,1980-05-01,340.440002,338.890015,0.11,0.07,340.0,338.399994,339.600006,338.230011,339.26001,338.230011,337.359985,337.679993,338.26001,336.709991,339.620005,337.471667,338.286672,337.471667


### Create Hopsworks Feature Group for Global CO₂

We now create (or get) a single feature group `global_co2` and insert the enriched `co2_df` DataFrame into Hopsworks.


In [5]:
# Create the feature group, or fetch it if this name/version already exists.
# Note: online_enabled=False is required for this project (online storage
# fails), so it is passed explicitly instead of relying on the library default.
co2_fg = fs.get_or_create_feature_group(
    name="global_co2",
    version=3,
    description="Global monthly mean atmospheric CO2 from NOAA GML with lags and rolling means",
    primary_key=["date"],
    event_time="date",
    online_enabled=False,
)

# Insert the engineered monthly rows into the feature group
co2_fg.insert(co2_df)

Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/120/fs/68/fg/3151
2025-12-18 20:42:09,159 INFO: Computing insert statistics


(None, None)

### Add Feature Descriptions

We add descriptions for each feature to improve documentation and discoverability in Hopsworks.


In [6]:
# Attach a human-readable description to every feature of the CO2 feature
# group so the columns are self-explanatory in the Hopsworks UI.
co2_fg.update_feature_description("date", "Date of measurement (first day of each month)")
co2_fg.update_feature_description("average", "Global monthly mean atmospheric CO2 concentration in ppm (parts per million)")
co2_fg.update_feature_description("trend", "Deseasonalized global monthly mean CO2 concentration in ppm (seasonal cycle removed)")
co2_fg.update_feature_description("average_unc", "Uncertainty (standard deviation) of the average CO2 measurement in ppm")
co2_fg.update_feature_description("trend_unc", "Uncertainty (standard deviation) of the trend CO2 measurement in ppm")

# Lag features
for k in [1, 2, 3, 6, 12]:
    co2_fg.update_feature_description(f"average_lag_{k}", f"Average CO2 concentration {k} month(s) ago")
    co2_fg.update_feature_description(f"trend_lag_{k}", f"Trend CO2 concentration {k} month(s) ago")

# Rolling mean features. They are computed on the series shifted by one month,
# so each window covers past months only; the descriptions say so explicitly
# to avoid the features being mistaken for windows that include the current month.
co2_fg.update_feature_description("average_roll_3", "3-month rolling mean of past average CO2 concentration (excluding current month)")
co2_fg.update_feature_description("average_roll_12", "12-month rolling mean of average CO2 concentration over the previous 12 months (excluding current month)")
co2_fg.update_feature_description("trend_roll_3", "3-month rolling mean of past trend CO2 concentration (excluding current month)")
co2_fg.update_feature_description("trend_roll_12", "12-month rolling mean of trend CO2 concentration over the previous 12 months (excluding current month)")

<hsfs.feature_group.FeatureGroup at 0x732ed9850dc0>

## Fetch NASA GISS Global Monthly Mean Temperature Anomaly 🌡️

We download the **global monthly surface temperature anomaly** time series (GISTEMP v4, `GLB.Ts+dSST`) as a CSV from NASA GISS and prepare it for feature engineering.


In [7]:
# Fetch NASA GISTEMP global land–ocean monthly temperature anomalies (GLB.Ts+dSST)
GISTEMP_URL = "https://data.giss.nasa.gov/gistemp/tabledata_v4/GLB.Ts+dSST.csv"

# Always pass a timeout: without one the request can block forever on a
# stalled connection and hang "Run All".
response = requests.get(GISTEMP_URL, timeout=30)
response.raise_for_status()

# Read CSV, skipping the first descriptive line so the header row is used.
# Missing months are marked as *** in the original file; na_values converts
# them to NaN while parsing, so the month columns come back numeric.
wide_df = pd.read_csv(io.StringIO(response.text), skiprows=1, na_values="***")
# Strip any whitespace from column names
wide_df.columns = [c.strip() for c in wide_df.columns]

# Expected monthly columns in GISTEMP table (other columns in the file are dropped)
month_cols = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
wide_df = wide_df[["Year"] + month_cols]

# Reshape to long format: one row per (year, month)
long_df = wide_df.melt(id_vars="Year", value_vars=month_cols,
                       var_name="month", value_name="temp_anomaly")

# Drop the months that have no published value
long_df = long_df.dropna(subset=["temp_anomaly"]).copy()

# Map month names to month numbers
month_map = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
             "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}
long_df["Year"] = long_df["Year"].astype(int)
long_df["month_num"] = long_df["month"].map(month_map)

# Build a proper datetime (first day of each month)
long_df["date"] = pd.to_datetime({
    "year": long_df["Year"],
    "month": long_df["month_num"],
    "day": 1,
})

# Convert anomaly to float (values are in °C anomalies)
long_df["temp_anomaly"] = long_df["temp_anomaly"].astype("float32")

# Final tidy DataFrame, in chronological order
temp_df = long_df[["date", "temp_anomaly"]].sort_values("date").reset_index(drop=True)

temp_df.head()


Unnamed: 0,date,temp_anomaly
0,1880-01-01,-0.19
1,1880-02-01,-0.25
2,1880-03-01,-0.1
3,1880-04-01,-0.17
4,1880-05-01,-0.11


### Add lag and rolling mean features for Global Temperature

We enrich the global temperature anomaly series with lagged values and rolling means, similar to the CO₂ pipeline.


In [8]:
# shift() and rolling() work on row position, so put the rows in
# chronological order before deriving anything from them.
temp_df = temp_df.sort_values("date").reset_index(drop=True)

anomaly = temp_df["temp_anomaly"]

# Lag features: the anomaly k rows (months) earlier.
for months_back in (1, 2, 3, 6, 12):
    temp_df[f"temp_anomaly_lag_{months_back}"] = anomaly.shift(months_back)

# Rolling means over the anomaly series.
# IMPORTANT: shift by 1 first so every window ends at the previous month and
# the current month's value never leaks into its own feature.
previous_months = anomaly.shift(1)
for window_size in (3, 12):
    temp_df[f"temp_anomaly_roll_{window_size}"] = previous_months.rolling(window=window_size).mean()

# dropna() without a subset removes every row holding a NaN in any column,
# which includes the leading rows that lack full lag/rolling history.
temp_df = temp_df.dropna().reset_index(drop=True)

temp_df.head()


Unnamed: 0,date,temp_anomaly,temp_anomaly_lag_1,temp_anomaly_lag_2,temp_anomaly_lag_3,temp_anomaly_lag_6,temp_anomaly_lag_12,temp_anomaly_roll_3,temp_anomaly_roll_12
0,1881-01-01,-0.2,-0.18,-0.23,-0.24,-0.19,-0.19,-0.216667,-0.178333
1,1881-02-01,-0.14,-0.2,-0.18,-0.23,-0.11,-0.25,-0.203333,-0.179167
2,1881-03-01,0.03,-0.14,-0.2,-0.18,-0.15,-0.1,-0.173333,-0.17
3,1881-04-01,0.05,0.03,-0.14,-0.2,-0.24,-0.17,-0.103333,-0.159167
4,1881-05-01,0.06,0.05,0.03,-0.14,-0.23,-0.11,-0.02,-0.140833


### Create Hopsworks Feature Group for Global Temperature

We first build a Great Expectations suite that validates the engineered `temp_df`, then create (or get) a `global_temperature` feature group and insert `temp_df` into Hopsworks.


In [9]:
# Great Expectations suite for the global_temperature feature group.
# The expectations are recorded against a copy of the engineered frame, so
# temp_df itself is not the object being wrapped.
ge_temp_df = ge.from_pandas(temp_df.copy())

# Key column: always present and one row per date
ge_temp_df.expect_column_values_to_not_be_null("date")
ge_temp_df.expect_column_values_to_be_unique("date")

# Anomalies outside +/-10 °C are treated as implausible
ge_temp_df.expect_column_values_to_be_between(
    "temp_anomaly", min_value=-10, max_value=10
)

# Every engineered column must be fully populated (feature engineering ends
# with a dropna, so a null here means that step did not run as expected)
engineered_columns = [f"temp_anomaly_lag_{lag}" for lag in (1, 2, 3, 6, 12)]
engineered_columns += ["temp_anomaly_roll_3", "temp_anomaly_roll_12"]
for column_name in engineered_columns:
    ge_temp_df.expect_column_values_to_not_be_null(column_name)

# Export the recorded expectations as a suite object for use when the
# feature group is created
temperature_expectation_suite = ge_temp_df.get_expectation_suite()

print("Great Expectations suite built for global_temperature.")

2025-12-18 20:42:33,907 INFO: 	10 expectation(s) included in expectation_suite. result_format settings filtered.
Great Expectations suite built for global_temperature.


In [10]:
# Create the feature group for global temperature, or fetch it if this
# name/version already exists.
# Attach the Great Expectations suite built in the previous cell. Without the
# expectation_suite argument the suite is never registered and inserts are not
# validated against it.
# NOTE(review): creation arguments presumably only take effect when the group
# is first created; confirm how an already-existing version 4 gets the suite.
temp_fg = fs.get_or_create_feature_group(
    name="global_temperature",
    version=4,
    description="Global monthly mean surface temperature anomaly with lags and rolling means",
    primary_key=["date"],
    event_time="date",
    expectation_suite=temperature_expectation_suite,
)

# Insert the engineered monthly rows into the feature group
temp_fg.insert(temp_df)

temp_fg


Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/120/fs/68/fg/3152
2025-12-18 20:42:45,203 INFO: Computing insert statistics


<hsfs.feature_group.FeatureGroup at 0x732ed268b610>

In [11]:
# Add feature descriptions for the global_temperature feature group.
# The unit is written as a real degree sign ("°C"); the previous strings
# carried a mis-encoded byte sequence that rendered as garbage in the UI.

temp_fg.update_feature_description("date", "Date of measurement (first day of each month)")
temp_fg.update_feature_description("temp_anomaly", "Global monthly mean surface temperature anomaly relative to 1951-1980 (°C)")

# Lag features
for k in [1, 2, 3, 6, 12]:
    temp_fg.update_feature_description(f"temp_anomaly_lag_{k}", f"Global temperature anomaly {k} month(s) ago (°C)")

# Rolling mean features (based on past values only)
temp_fg.update_feature_description("temp_anomaly_roll_3", "3-month rolling mean of past global temperature anomalies (excluding current month)")
temp_fg.update_feature_description("temp_anomaly_roll_12", "12-month rolling mean of global temperature anomalies over the previous 12 months (excluding current month)")

<hsfs.feature_group.FeatureGroup at 0x732ed268b610>