### Imports

In [None]:
import io
import requests
import pandas as pd
import hopsworks
import os
import warnings
import great_expectations as ge
import numpy as np
from dotenv import load_dotenv 


load_dotenv()

api_key = os.getenv("HOPSWORKS_API_KEY")

project = hopsworks.login(host="eu-west.cloud.hopsworks.ai",
project="EarthTamagotchi", api_key_value=api_key)

warnings.filterwarnings("ignore")

2025-12-22 14:31:12,194 INFO: Initializing external client
2025-12-22 14:31:12,194 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-22 14:31:13,166 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2177


In [2]:
fs = project.get_feature_store()

## Fetch NOAA GML Global monthly mean CO‚ÇÇ üå´Ô∏è

We use NOAA GML's public text file for **Global monthly mean CO‚ÇÇ** and parse it into a clean Pandas DataFrame.

Source: NOAA GML CO‚ÇÇ trends (Global monthly mean).


In [3]:
NOAA_CO2_URL = "https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_mm_mlo.txt"

response = requests.get(NOAA_CO2_URL)
response.raise_for_status()

# NOAA file has commented header lines starting with '#'
lines = response.text.splitlines()
data_lines = [ln for ln in lines if ln.strip() and not ln.startswith("#")]

raw_text = "\n".join(data_lines)

# Columns in Mauna Loa file (see header in the NOAA text):
# year, month, decimal_date, average, trend, #days, st.dev, unc. of mon mean
# Note: trend_unc is not available in Mauna Loa data, so we don't include it in our feature group
co2_df = pd.read_csv(
    io.StringIO(raw_text),
    delim_whitespace=True,
    header=None,
    names=["year", "month", "decimal_date", "average", "trend", "ndays", "stdev", "average_unc"],
)

# Build a proper datetime (first day of each month)
co2_df["date"] = pd.to_datetime(
    {
        "year": co2_df["year"].astype(int),
        "month": co2_df["month"].astype(int),
        "day": 1,
    }
)

# Replace NOAA missing value marker (-99.99 or -9.99) with NaN
for col in ["average", "trend", "average_unc"]:
    co2_df[col] = co2_df[col].replace([-99.99, -9.99], pd.NA).astype("float32")

co2_df = co2_df.dropna(subset=["average"]).copy()

# Keep a tidy subset of columns we care about
# Note: trend_unc is not available in Mauna Loa data, so we don't include it
co2_df = co2_df[["date", "average", "trend", "average_unc"]].sort_values("date").reset_index(drop=True)

co2_df.head()


Unnamed: 0,date,average,trend,average_unc
0,1958-03-01,315.709991,314.440002,-0.99
1,1958-04-01,317.450012,315.160004,-0.99
2,1958-05-01,317.51001,314.690002,-0.99
3,1958-06-01,317.269989,315.149994,-0.99
4,1958-07-01,315.869995,315.200012,-0.99


### Add lag and rolling mean features

We enrich the monthly CO‚ÇÇ series with simple lag features and rolling means on the `average` series, which the model can later use for forecasting.


In [4]:
# Sort just in case and reset index
co2_df = co2_df.sort_values("date").reset_index(drop=True)

# Add lag features for the main target series (average CO‚ÇÇ) and for the trend
for k in [1, 2, 3, 6, 12]:
    co2_df[f"average_lag_{k}"] = co2_df["average"].shift(k)
    co2_df[f"trend_lag_{k}"] = co2_df["trend"].shift(k)

# Add rolling means over the average and trend series
# IMPORTANT: shift by 1 so rolling windows use only *past* months (no leakage of current month)
shifted_avg = co2_df["average"].shift(1)
shifted_trend = co2_df["trend"].shift(1)
co2_df["average_roll_3"] = shifted_avg.rolling(window=3).mean()
co2_df["average_roll_12"] = shifted_avg.rolling(window=12).mean()
co2_df["trend_roll_3"] = shifted_trend.rolling(window=3).mean()
co2_df["trend_roll_12"] = shifted_trend.rolling(window=12).mean()

# Drop rows that don't have full history for all lags/rolls
co2_df = co2_df.dropna().reset_index(drop=True)

# Add time-based features for trend modeling
co2_df.head()


Unnamed: 0,date,average,trend,average_unc,average_lag_1,trend_lag_1,average_lag_2,trend_lag_2,average_lag_3,trend_lag_3,average_lag_6,trend_lag_6,average_lag_12,trend_lag_12,average_roll_3,average_roll_12,trend_roll_3,trend_roll_12
0,1959-03-01,316.649994,315.369995,-0.99,316.48999,315.839996,315.579987,315.519989,314.670013,315.429993,313.209991,316.109985,315.709991,314.440002,315.579997,315.369998,315.596659,315.364164
1,1959-04-01,317.720001,315.420013,-0.99,316.649994,315.369995,316.48999,315.839996,315.579987,315.519989,312.420013,315.410004,317.450012,315.160004,316.23999,315.448331,315.57666,315.441663
2,1959-05-01,318.290009,315.459991,-0.99,317.720001,315.420013,316.649994,315.369995,316.48999,315.839996,313.329987,315.209991,317.51001,314.690002,316.953328,315.47083,315.543335,315.463331
3,1959-06-01,318.149994,316.0,-0.99,318.290009,315.459991,317.720001,315.420013,316.649994,315.369995,314.670013,315.429993,317.269989,315.149994,317.553335,315.53583,315.416667,315.527496
4,1959-07-01,316.540009,315.869995,-0.99,318.149994,316.0,318.290009,315.459991,317.720001,315.420013,315.579987,315.519989,315.869995,315.200012,318.053335,315.609164,315.626668,315.59833


### Add time-based features for trend modeling

In [5]:
# Add time-based features for trend modeling
co2_df['year'] = pd.to_datetime(co2_df['date']).dt.year
co2_df['month'] = pd.to_datetime(co2_df['date']).dt.month
# Create cyclical month features (sin/cos for seasonal patterns)
co2_df['month_sin'] = np.sin(2 * np.pi * co2_df['month'] / 12)
co2_df['month_cos'] = np.cos(2 * np.pi * co2_df['month'] / 12)
# Normalize year for trend modeling (relative to first year in dataset)
year_min = co2_df['year'].min()
year_max = co2_df['year'].max()
co2_df['year_normalized'] = (co2_df['year'] - year_min) / (year_max - year_min)
# Add polynomial year term to capture acceleration
co2_df['year_normalized_squared'] = co2_df['year_normalized'] ** 2
# Add interaction terms: year * seasonality
co2_df['year_month_sin'] = co2_df['year_normalized'] * co2_df['month_sin']
co2_df['year_month_cos'] = co2_df['year_normalized'] * co2_df['month_cos']

co2_df.head()

Unnamed: 0,date,average,trend,average_unc,average_lag_1,trend_lag_1,average_lag_2,trend_lag_2,average_lag_3,trend_lag_3,...,trend_roll_3,trend_roll_12,year,month,month_sin,month_cos,year_normalized,year_normalized_squared,year_month_sin,year_month_cos
0,1959-03-01,316.649994,315.369995,-0.99,316.48999,315.839996,315.579987,315.519989,314.670013,315.429993,...,315.596659,315.364164,1959,3,1.0,6.123234000000001e-17,0.0,0.0,0.0,0.0
1,1959-04-01,317.720001,315.420013,-0.99,316.649994,315.369995,316.48999,315.839996,315.579987,315.519989,...,315.57666,315.441663,1959,4,0.8660254,-0.5,0.0,0.0,0.0,-0.0
2,1959-05-01,318.290009,315.459991,-0.99,317.720001,315.420013,316.649994,315.369995,316.48999,315.839996,...,315.543335,315.463331,1959,5,0.5,-0.8660254,0.0,0.0,0.0,-0.0
3,1959-06-01,318.149994,316.0,-0.99,318.290009,315.459991,317.720001,315.420013,316.649994,315.369995,...,315.416667,315.527496,1959,6,1.224647e-16,-1.0,0.0,0.0,0.0,-0.0
4,1959-07-01,316.540009,315.869995,-0.99,318.149994,316.0,318.290009,315.459991,317.720001,315.420013,...,315.626668,315.59833,1959,7,-0.5,-0.8660254,0.0,0.0,-0.0,-0.0


### Create Hopsworks Feature Group for Global CO‚ÇÇ 

We now create (or get) a single feature group `global_co2` and insert the enriched `co2_df` DataFrame into Hopsworks.


In [6]:
# Create or get the feature group
# Note: online_enabled=False is required for this project (online storage fails)
# Increment version since we added time-based features
co2_fg = fs.get_or_create_feature_group(
    name="global_co2",
    version=1,
    description="Global monthly mean atmospheric CO2 from NOAA GML with lags, rolling means, and time features",
    primary_key=["date"],
    event_time="date",
)

# Insert data into the feature group
co2_fg.insert(co2_df)

2025-12-22 14:31:27,866 INFO: Computing insert statistics


(None, None)

### Add Feature Descriptions

We add descriptions for each feature to improve documentation and discoverability in Hopsworks.


In [7]:
# Update feature descriptions
co2_fg.update_feature_description("date", "Date of measurement (first day of each month)")
co2_fg.update_feature_description("average", "Global monthly mean atmospheric CO2 concentration in ppm (parts per million)")
co2_fg.update_feature_description("trend", "Deseasonalized global monthly mean CO2 concentration in ppm (seasonal cycle removed)")
co2_fg.update_feature_description("average_unc", "Uncertainty (standard deviation) of the average CO2 measurement in ppm")
# Note: trend_unc is not included as it's not available in Mauna Loa data

# Lag features
for k in [1, 2, 3, 6, 12]:
    co2_fg.update_feature_description(f"average_lag_{k}", f"Average CO2 concentration {k} month(s) ago")
    co2_fg.update_feature_description(f"trend_lag_{k}", f"Trend CO2 concentration {k} month(s) ago")

# Rolling mean features
co2_fg.update_feature_description("average_roll_3", "3-month rolling mean of average CO2 concentration")
co2_fg.update_feature_description("average_roll_12", "12-month rolling mean of average CO2 concentration")
co2_fg.update_feature_description("trend_roll_3", "3-month rolling mean of trend CO2 concentration")
co2_fg.update_feature_description("trend_roll_12", "12-month rolling mean of trend CO2 concentration")

# Time-based features
co2_fg.update_feature_description("year", "Year of measurement")
co2_fg.update_feature_description("month", "Month of measurement (1-12)")
co2_fg.update_feature_description("month_sin", "Sine transformation of month for cyclical seasonality (sin(2pi*month/12))")
co2_fg.update_feature_description("month_cos", "Cosine transformation of month for cyclical seasonality (cos(2pi*month/12))")
co2_fg.update_feature_description("year_normalized", "Normalized year (0-1) relative to first year in dataset, for trend modeling")
co2_fg.update_feature_description("year_normalized_squared", "Squared normalized year term to capture acceleration in CO2 increase")
co2_fg.update_feature_description("year_month_sin", "Interaction term: year_normalized * month_sin (captures evolving seasonality)")
co2_fg.update_feature_description("year_month_cos", "Interaction term: year_normalized * month_cos (captures evolving seasonality)")

<hsfs.feature_group.FeatureGroup at 0x310330470>

## Fetch NASA GISS Global Monthly Mean Temperature Anomaly üå°Ô∏è

We load a **global monthly surface temperature anomaly** time series from a local CSV file and prepare it for feature engineering.


In [8]:
# Fetch NASA GISTEMP global land‚Äìocean monthly temperature anomalies (GLB.Ts+dSST)
GISTEMP_URL = "https://data.giss.nasa.gov/gistemp/tabledata_v4/GLB.Ts+dSST.csv"

response = requests.get(GISTEMP_URL)
response.raise_for_status()

# Read CSV, skipping the first descriptive line so the header row is used
wide_df = pd.read_csv(io.StringIO(response.text), skiprows=1)
# Strip any whitespace from column names
wide_df.columns = [c.strip() for c in wide_df.columns]

# Expected monthly columns in GISTEMP table
month_cols = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
wide_df = wide_df[["Year"] + month_cols]

# Reshape to long format: one row per (year, month)
long_df = wide_df.melt(id_vars="Year", value_vars=month_cols,
                       var_name="month", value_name="temp_anomaly")

# Drop missing values (marked as *** in original file)
long_df = long_df.replace("***", pd.NA).dropna(subset=["temp_anomaly"]).copy()

# Map month names to month numbers
month_map = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
             "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}
long_df["Year"] = long_df["Year"].astype(int)
long_df["month_num"] = long_df["month"].map(month_map)

# Build a proper datetime (first day of each month)
long_df["date"] = pd.to_datetime({
    "year": long_df["Year"],
    "month": long_df["month_num"],
    "day": 1,
})

# Convert anomaly to float (values are in ¬∞C anomalies)
long_df["temp_anomaly"] = long_df["temp_anomaly"].astype("float32")

# Final tidy DataFrame
temp_df = long_df[["date", "temp_anomaly"]].sort_values("date").reset_index(drop=True)

temp_df.head()


Unnamed: 0,date,temp_anomaly
0,1880-01-01,-0.19
1,1880-02-01,-0.25
2,1880-03-01,-0.1
3,1880-04-01,-0.17
4,1880-05-01,-0.11


### Add lag and rolling mean features for Global Temperature

We enrich the global temperature anomaly series with lagged values and rolling means, similar to the CO‚ÇÇ pipeline.


In [9]:
# Ensure the DataFrame is sorted by date
temp_df = temp_df.sort_values("date").reset_index(drop=True)

# Add lag features for the main target series (temperature anomaly)
for k in [1, 2, 3, 6, 12]:
    temp_df[f"temp_anomaly_lag_{k}"] = temp_df["temp_anomaly"].shift(k)

# Add rolling means over the temperature anomaly series
# IMPORTANT: shift by 1 so the rolling window uses only *past* months (no leakage of current month)
shifted_temp = temp_df["temp_anomaly"].shift(1)
temp_df["temp_anomaly_roll_3"] = shifted_temp.rolling(window=3).mean()
temp_df["temp_anomaly_roll_12"] = shifted_temp.rolling(window=12).mean()

# Drop rows that don't have full history for all lags/rolls
temp_df = temp_df.dropna().reset_index(drop=True)

# Add time-based features for trend modeling (similar to CO‚ÇÇ)
temp_df['year'] = pd.to_datetime(temp_df['date']).dt.year
temp_df['month'] = pd.to_datetime(temp_df['date']).dt.month
# Create cyclical month features (sin/cos for seasonal patterns)
temp_df['month_sin'] = np.sin(2 * np.pi * temp_df['month'] / 12)
temp_df['month_cos'] = np.cos(2 * np.pi * temp_df['month'] / 12)
# Normalize year for trend modeling (relative to first year in dataset)
year_min_temp = temp_df['year'].min()
year_max_temp = temp_df['year'].max()
temp_df['year_normalized'] = (temp_df['year'] - year_min_temp) / (year_max_temp - year_min_temp)
# Add polynomial year term to capture acceleration
temp_df['year_normalized_squared'] = temp_df['year_normalized'] ** 2
# Add interaction terms: year * seasonality
temp_df['year_month_sin'] = temp_df['year_normalized'] * temp_df['month_sin']
temp_df['year_month_cos'] = temp_df['year_normalized'] * temp_df['month_cos']

temp_df.head()


Unnamed: 0,date,temp_anomaly,temp_anomaly_lag_1,temp_anomaly_lag_2,temp_anomaly_lag_3,temp_anomaly_lag_6,temp_anomaly_lag_12,temp_anomaly_roll_3,temp_anomaly_roll_12,year,month,month_sin,month_cos,year_normalized,year_normalized_squared,year_month_sin,year_month_cos
0,1881-01-01,-0.2,-0.18,-0.23,-0.24,-0.19,-0.19,-0.216667,-0.178333,1881,1,0.5,0.8660254,0.0,0.0,0.0,0.0
1,1881-02-01,-0.14,-0.2,-0.18,-0.23,-0.11,-0.25,-0.203333,-0.179167,1881,2,0.866025,0.5,0.0,0.0,0.0,0.0
2,1881-03-01,0.03,-0.14,-0.2,-0.18,-0.15,-0.1,-0.173333,-0.17,1881,3,1.0,6.123234000000001e-17,0.0,0.0,0.0,0.0
3,1881-04-01,0.05,0.03,-0.14,-0.2,-0.24,-0.17,-0.103333,-0.159167,1881,4,0.866025,-0.5,0.0,0.0,0.0,-0.0
4,1881-05-01,0.06,0.05,0.03,-0.14,-0.23,-0.11,-0.02,-0.140833,1881,5,0.5,-0.8660254,0.0,0.0,0.0,-0.0


### Create Hopsworks Feature Group for Global Temperature

We now create (or get) a `global_temperature` feature group and insert the enriched `temp_df` DataFrame into Hopsworks.


In [10]:
# Build a Great Expectations suite for the global_temperature feature group

# Build expectations on the engineered temp_df
ge_temp_df = ge.from_pandas(temp_df.copy())

# Basic sanity checks
ge_temp_df.expect_column_values_to_not_be_null("date")
ge_temp_df.expect_column_values_to_be_unique("date")

# Temperature anomaly should be within a reasonable physical range (¬∞C)
ge_temp_df.expect_column_values_to_be_between(
    "temp_anomaly", min_value=-10, max_value=10
)

# Lags and rolling means should be non-null (after dropna in feature engineering)
for k in [1, 2, 3, 6, 12]:
    ge_temp_df.expect_column_values_to_not_be_null(f"temp_anomaly_lag_{k}")

ge_temp_df.expect_column_values_to_not_be_null("temp_anomaly_roll_3")
ge_temp_df.expect_column_values_to_not_be_null("temp_anomaly_roll_12")

# Export expectation suite so we can attach it when creating the feature group
temperature_expectation_suite = ge_temp_df.get_expectation_suite()

print("Great Expectations suite built for global_temperature.")

2025-12-22 14:31:54,577 INFO: 	10 expectation(s) included in expectation_suite. result_format settings filtered.
Great Expectations suite built for global_temperature.


In [11]:
# Create or get the feature group for global temperature
# Attach the Great Expectations suite we built above
temp_fg = fs.get_or_create_feature_group(
    name="global_temperature",
    version=1,
    description="Global monthly mean surface temperature anomaly with lags and rolling means",
    primary_key=["date"],
    event_time="date",
)

# Insert data into the feature group
temp_fg.insert(temp_df)

temp_fg


Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/2177/fs/2128/fg/3177
2025-12-22 14:32:07,422 INFO: Computing insert statistics


<hsfs.feature_group.FeatureGroup at 0x31084b860>

In [12]:
# Add feature descriptions for the global_temperature feature group

temp_fg.update_feature_description("date", "Date of measurement (first day of each month)")
temp_fg.update_feature_description("temp_anomaly", "Global monthly mean surface temperature anomaly relative to 1951-1980 (¬∞C)")

# Lag features
for k in [1, 2, 3, 6, 12]:
    temp_fg.update_feature_description(f"temp_anomaly_lag_{k}", f"Global temperature anomaly {k} month(s) ago (¬∞C)")

# Rolling mean features (based on past values only)
temp_fg.update_feature_description("temp_anomaly_roll_3", "3-month rolling mean of past global temperature anomalies (excluding current month)")
temp_fg.update_feature_description("temp_anomaly_roll_12", "12-month rolling mean of global temperature anomalies over the previous 12 months (excluding current month)")

# Time-based features (for temperature trend modeling)
temp_fg.update_feature_description("year", "Year of measurement")
temp_fg.update_feature_description("month", "Month of measurement (1-12)")
temp_fg.update_feature_description("month_sin", "Sine transformation of month for cyclical seasonality (sin(2pi*month/12))")
temp_fg.update_feature_description("month_cos", "Cosine transformation of month for cyclical seasonality (cos(2pi*month/12))")
temp_fg.update_feature_description("year_normalized", "Normalized year (0-1) relative to first year in dataset, for temperature trend modeling")
temp_fg.update_feature_description("year_normalized_squared", "Squared normalized year term to capture acceleration in temperature trend")
temp_fg.update_feature_description("year_month_sin", "Interaction term: year_normalized * month_sin (captures evolving temperature seasonality)")
temp_fg.update_feature_description("year_month_cos", "Interaction term: year_normalized * month_cos (captures evolving temperature seasonality)")

<hsfs.feature_group.FeatureGroup at 0x31084b860>