<a href="https://colab.research.google.com/github/fishee82oo/nfs-oil-price-prediction/blob/main/GDELT_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q google-cloud-storage pyarrow pycountry tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import importlib.util, sys, os, io, json, gzip
from datetime import datetime, timedelta
import pandas as pd, numpy as np
from tqdm import tqdm

In [3]:
try:
    from google.colab import auth
    auth.authenticate_user()
except Exception:
    pass
import google.auth
from google.cloud import storage
credentials, default_project = google.auth.default()
client = storage.Client(project=default_project, credentials=credentials)

# Import data from GCS

In [4]:
spec = importlib.util.spec_from_file_location("gdelt_module", "/mnt/data/gdelt_data_engineering_clean.ipynb")
gdelt_module = None
if spec and spec.loader:
    gm = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(gm)
    gdelt_module = gm
if gdelt_module and hasattr(gdelt_module, "GCS_BUCKET_NAME"):
    BUCKET_NAME = getattr(gdelt_module, "GCS_BUCKET_NAME")
else:
    BUCKET_NAME = os.environ.get("GCS_BUCKET_NAME", "gdelt_raw_3_years")
if gdelt_module and hasattr(gdelt_module, "GCS_PROCESSED_PATH"):
    PROCESSED_PREFIX = getattr(gdelt_module, "GCS_PROCESSED_PATH")
else:
    PROCESSED_PREFIX = os.environ.get("GCS_PROCESSED_PATH", "processed_data/")
bucket = client.bucket(BUCKET_NAME)
blobs = list(client.list_blobs(BUCKET_NAME, prefix=PROCESSED_PREFIX))
final_blobs = [b for b in blobs if b.name.startswith(f"{PROCESSED_PREFIX}final_aligned_data_") and b.name.endswith(".json.gz")]
final_blobs_sorted = sorted(final_blobs, key=lambda b: b.name, reverse=True)
if len(final_blobs_sorted)==0:
    raise SystemExit("No final_aligned_data_*.json.gz found under the processed prefix")
latest_blob = final_blobs_sorted[0]
local_download_path = "/tmp/latest_final_aligned_data.json.gz"
with open(local_download_path, "wb") as f:
    f.write(latest_blob.download_as_bytes())

# Forming Graph

In [5]:
import os

if os.path.exists(local_download_path):
    file_size = os.path.getsize(local_download_path) / (1024*1024)
    print(f"File downloaded: {local_download_path}")
    print(f"Size: {file_size:.2f} MB")

else:
    print("File NOT found locally")

print(f"Downloaded from GCS: {latest_blob.name}")
print(f"Bucket: {BUCKET_NAME}")
print(f"Last modified: {latest_blob.updated}")

with gzip.open(local_download_path, "rt", encoding="utf-8") as f:
    sample = f.read(500)
    print(f"Data preview:\n{sample[:200]}...")

File downloaded: /tmp/latest_final_aligned_data.json.gz
Size: 2.37 MB
Downloaded from GCS: processed_data/final_aligned_data_20250908.json.gz
Bucket: gdelt_raw_3_years
Last modified: 2025-09-08 01:30:57.993000+00:00
Data preview:
[{"date": "20220825", "country": "US", "event_count": 2301, "avg_sentiment": -0.023938528465884453, "unique_sources": 728, "wti_price": 93.33, "brent_price": 98.81, "theme_energy": 45, "theme_conflict...


In [6]:
with gzip.open(local_download_path, "rt", encoding="utf-8") as f:
    raw = f.read()
records = json.loads(raw)
df = pd.DataFrame.from_records(records)
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)
df.shape

(143401, 13)

In [7]:
import pycountry
def to_iso3(name):
    try:
        c = pycountry.countries.lookup(name)
        return c.alpha_3
    except Exception:
        s = str(name).upper()
        s2 = "".join([c for c in s if c.isalpha() or c==" "]).strip().replace(" ", "_")
        return s2
df["country_iso3"] = df["country"].fillna("UNKNOWN").apply(to_iso3)
df["node_id"] = df["country_iso3"].astype(str) + "_" + df["date"].dt.strftime("%Y%m%d")
df.shape

(143401, 15)

In [8]:
price_by_date = df[["date","wti_price"]].drop_duplicates().set_index("date").sort_index()
price_by_date["wti_price"] = pd.to_numeric(price_by_date["wti_price"], errors="coerce")
price_by_date = price_by_date.sort_index()
price_by_date["wti_price_next"] = price_by_date["wti_price"].shift(-1)
price_by_date["wti_delta_next"] = price_by_date["wti_price_next"] - price_by_date["wti_price"]
price_by_date["wti_ret_next"] = price_by_date["wti_delta_next"] / price_by_date["wti_price"]
price_map = price_by_date.to_dict(orient="index")
def attach_targets(row):
    pdx = row["date"]
    v = price_map.get(pdx)
    if v is None:
        return pd.Series([np.nan,np.nan])
    return pd.Series([v.get("wti_delta_next"), v.get("wti_ret_next")])
df[["wti_delta_next","wti_ret_next"]] = df.apply(attach_targets, axis=1)

In [9]:
feature_cols = [c for c in df.columns if c not in ["country","date","node_id","country_iso3","wti_delta_next","wti_ret_next"]]
feature_cols = [c for c in feature_cols if df[c].dtype != "object" or c.startswith("theme_")]
node_features = df[["node_id","country","country_iso3","date","wti_price","brent_price","wti_delta_next","wti_ret_next"] + [c for c in df.columns if c in feature_cols]]
node_features = node_features.fillna(0)

In [10]:
opec_members = ["Venezuela","Saudi Arabia","Iran","Iraq","Kuwait","UAE","Qatar","Algeria","Angola","Libya","Nigeria","Ecuador","Gabon","Republic of the Congo","Equatorial Guinea"]
iso_map = {}
for name in opec_members:
    try:
        iso_map[name] = pycountry.countries.lookup(name).alpha_3
    except Exception:
        iso_map[name] = name.upper().replace(" ","_")
opec_iso = list(iso_map.values())
from itertools import combinations
static_edges = []
for a,b in combinations(opec_iso,2):
    static_edges.append({"source":a,"target":b,"edge_type":"opec_member"})
static_edges_df = pd.DataFrame(static_edges)

In [11]:
dyn_edges_df = pd.DataFrame(columns=["source","target","edge_type","timestamp","date"])
potential_actor_cols = [c for c in df.columns if "actor" in c.lower() or "actor1" in c.lower() or "actor2" in c.lower()]
if len(potential_actor_cols)>0:
    actors = []
    for idx,row in df.iterrows():
        for k in potential_actor_cols:
            v = row.get(k)
        if row.get("actor1") and row.get("actor2"):
            s = str(row.get("actor1"))
            t = str(row.get("actor2"))
            dyn_edges_df.loc[len(dyn_edges_df)] = [s,t,"gdelt_event",row.get("date"),row.get("date")]


In [12]:
df = df.loc[:, ~df.columns.duplicated()]

feature_cols = [c for c in df.columns if c not in ["country","date","node_id","country_iso3","wti_delta_next","wti_ret_next","wti_price","brent_price"]]
feature_cols = [c for c in feature_cols if df[c].dtype != "object" or c.startswith("theme_")]
node_features = df[["node_id","country","country_iso3","date","wti_price","brent_price","wti_delta_next","wti_ret_next"] + [c for c in df.columns if c in feature_cols]]
node_features = node_features.loc[:, ~node_features.columns.duplicated()]
node_features = node_features.fillna(0)

out_dir_local = "/tmp/graph_export"
os.makedirs(out_dir_local, exist_ok=True)
nodes_out = os.path.join(out_dir_local, "nodes.parquet")
static_edges_out = os.path.join(out_dir_local, "edges_static.parquet")
dyn_edges_out = os.path.join(out_dir_local, "edges_dynamic.parquet")
node_features.to_parquet(nodes_out, index=False)
static_edges_df.to_parquet(static_edges_out, index=False)
dyn_edges_df.to_parquet(dyn_edges_out, index=False)

In [13]:
gcs_prefix = PROCESSED_PREFIX + "graph_dataset/"
for p in [nodes_out, static_edges_out, dyn_edges_out]:
    bn = os.path.basename(p)
    blob = bucket.blob(f"{gcs_prefix}{bn}")
    with open(p,"rb") as f:
        blob.upload_from_file(f)
meta = {"nodes":nodes_out,"edges_static":static_edges_out,"edges_dynamic":dyn_edges_out,"uploaded_at":datetime.utcnow().isoformat()}
meta_blob = bucket.blob(f"{gcs_prefix}metadata.json")
meta_blob.upload_from_string(json.dumps(meta), content_type="application/json")

  meta = {"nodes":nodes_out,"edges_static":static_edges_out,"edges_dynamic":dyn_edges_out,"uploaded_at":datetime.utcnow().isoformat()}


In [14]:
unique_dates = sorted(node_features["date"].dt.date.unique())
for d in tqdm(unique_dates):
    sub = node_features[node_features["date"].dt.date==d].copy()
    if sub.shape[0]==0:
        continue
    fn = f"node_features_{d.strftime('%Y%m%d')}.parquet"
    localp = os.path.join(out_dir_local,fn)
    sub.to_parquet(localp, index=False)
    blob = bucket.blob(f"{gcs_prefix}{fn}")
    with open(localp,"rb") as f:
        blob.upload_from_file(f)

100%|██████████| 733/733 [09:19<00:00,  1.31it/s]


In [None]:
print("done")
print("bucket",BUCKET_NAME)
print("processed prefix",PROCESSED_PREFIX)
print("latest aligned blob", latest_blob.name)
print("graph files uploaded to", PROCESSED_PREFIX + "graph_dataset/")

done
bucket gdelt_raw_3_years
processed prefix processed_data/
latest aligned blob processed_data/final_aligned_data_20250908.json.gz
graph files uploaded to processed_data/graph_dataset/


# Baseline Model: XGBoost

In [21]:
!pip install tqdm



In [23]:
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor
import numpy as np

In [24]:
modeling_df = node_features.copy()
target_col = 'wti_delta_next'
modeling_df = modeling_df.dropna(subset=[target_col]).sort_values('date').reset_index(drop=True)

In [25]:
X_all = modeling_df[feature_cols].fillna(0.0).values
y_all = modeling_df[target_col].astype(float).values

if modeling_df['date'].nunique() > 30:
    cutoff_date = modeling_df['date'].max() - pd.Timedelta(days=90)
    train_mask = modeling_df['date'] < cutoff_date
    if train_mask.sum() == 0 or train_mask.sum() == len(modeling_df):
        train_mask = np.arange(len(modeling_df)) < int(0.8 * len(modeling_df))
else:
    train_mask = np.arange(len(modeling_df)) < int(0.8 * len(modeling_df))

In [26]:
X_train, X_test = X_all[train_mask], X_all[~train_mask]
y_train, y_test = y_all[train_mask], y_all[~train_mask]

if len(y_test) == 0:
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42, shuffle=False
    )

In [27]:
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42
)

param_grid = {
    'n_estimators': [200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'reg_lambda': [0.1, 1.0, 5.0]
}

In [28]:
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# 4. Set up the GridSearchCV
# We use 'r2' as the scoring metric to optimize for.
# n_jobs=-1 uses all available CPU cores to speed up the search.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='r2',
    cv=tscv,  # <--- Use TimeSeriesSplit here!
    verbose=2,
    n_jobs=-1
)

In [None]:
grid_search.fit(X_all, y_all)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [30]:
print(f"Best R-squared score found: {grid_search.best_score_:.4f}")
print("Best parameters found:")
print(grid_search.best_params_)

Best R-squared score found: -0.0034
Best parameters found:
{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'reg_lambda': 0.1, 'subsample': 0.7}


In [34]:
best_model = grid_search.best_estimator_

In [35]:
y_pred = best_model.predict(X_test)
metrics = {
    'MAE': mean_absolute_error(y_test, y_pred),
    'RMSE': mean_squared_error(y_test, y_pred),
    'R2': r2_score(y_test, y_pred)
}
print('Test metrics:')
for name, value in metrics.items():
    print(f'  {name}: {value:.4f}')

Test metrics:
  MAE: 1.0686
  RMSE: 2.2176
  R2: -0.0030


# GNN Model

In [None]:
!pip install torch-geometric -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from collections import defaultdict
from dataclasses import dataclass

In [None]:
node_features['date'] = pd.to_datetime(node_features['date'])
price_by_date.index = pd.to_datetime(price_by_date.index)
if not dyn_edges_df.empty:
    dyn_edges_df['date'] = pd.to_datetime(dyn_edges_df['date'], errors='coerce')
    if 'timestamp' in dyn_edges_df.columns:
        dyn_edges_df['timestamp'] = pd.to_datetime(dyn_edges_df['timestamp'], errors='coerce')
    else:
        dyn_edges_df['timestamp'] = dyn_edges_df['date']
    dyn_edges_df['source_iso3'] = dyn_edges_df['source'].apply(to_iso3)
    dyn_edges_df['target_iso3'] = dyn_edges_df['target'].apply(to_iso3)
    dyn_edges_df = dyn_edges_df.dropna(subset=['source_iso3','target_iso3'])
else:
    dyn_edges_df['timestamp'] = pd.NaT
    dyn_edges_df['source_iso3'] = []
    dyn_edges_df['target_iso3'] = []

In [None]:
feature_columns_graph = [c for c in node_features.columns if c not in ['node_id','country','country_iso3','date','wti_price','brent_price','wti_delta_next','wti_ret_next']]
feature_columns_graph = [c for c in feature_columns_graph if node_features[c].dtype != 'object']
feature_columns_graph = sorted(dict.fromkeys(feature_columns_graph))

market_indicator_candidates = []
for col in node_features.columns:
    low = col.lower()
    if any(key in low for key in ['spread','usd','inventory','fx','dxy','dollar','inflation']):
        if node_features[col].dtype != 'object':
            market_indicator_candidates.append(col)
market_indicator_cols = ['wti_price','brent_price'] + sorted(dict.fromkeys(market_indicator_candidates))

In [None]:
@dataclass
class GraphSnapshot:
    date: pd.Timestamp
    nodes: pd.DataFrame
    adjacency: np.ndarray
    node_embeddings: np.ndarray
    global_vector: np.ndarray
    target: float

def _normalize_adjacency(adj: np.ndarray) -> np.ndarray:
    deg = adj.sum(axis=1, keepdims=True)
    deg[deg == 0] = 1.0
    return adj / deg

def build_adjacency_matrix(nodes_df: pd.DataFrame, static_edges_df: pd.DataFrame, dyn_edges_df: pd.DataFrame, snapshot_time: pd.Timestamp, lookback_hours: int = 24) -> np.ndarray:
    n = len(nodes_df)
    adjacency = np.zeros((n, n), dtype=float)
    iso_to_indices = defaultdict(list)
    for idx, iso in enumerate(nodes_df['country_iso3']):
        iso_to_indices[iso].append(idx)
    if not static_edges_df.empty:
        for _, edge in static_edges_df.iterrows():
            src_nodes = iso_to_indices.get(edge['source'], [])
            tgt_nodes = iso_to_indices.get(edge['target'], [])
            for i in src_nodes:
                for j in tgt_nodes:
                    adjacency[i, j] = 1.0
                    adjacency[j, i] = 1.0
    if not dyn_edges_df.empty and snapshot_time is not None:
        window_start = snapshot_time - pd.Timedelta(hours=lookback_hours)
        mask = dyn_edges_df['timestamp'].between(window_start, snapshot_time)
        for _, edge in dyn_edges_df.loc[mask].iterrows():
            src_nodes = iso_to_indices.get(edge['source_iso3'], [])
            tgt_nodes = iso_to_indices.get(edge['target_iso3'], [])
            for i in src_nodes:
                for j in tgt_nodes:
                    adjacency[i, j] += 1.0
                    adjacency[j, i] += 1.0
    if n > 0:
        np.fill_diagonal(adjacency, 1.0)
    return adjacency

def run_message_passing(nodes_df: pd.DataFrame, adjacency: np.ndarray, feature_columns: list, steps: int = 2) -> np.ndarray:
    if len(nodes_df) == 0:
        return np.empty((0, 0))
    base_features = nodes_df[feature_columns].astype(float).to_numpy()
    mean = np.nanmean(base_features, axis=0, keepdims=True)
    std = np.nanstd(base_features, axis=0, keepdims=True)
    std[std == 0] = 1.0
    h = (base_features - mean) / std
    norm_adj = _normalize_adjacency(adjacency)
    for _ in range(steps):
        neighbor_messages = norm_adj @ h
        h = np.tanh(h + neighbor_messages)
    embeddings = np.hstack([h, neighbor_messages])
    embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)
    return embeddings

def attention_readout(nodes_df: pd.DataFrame, node_embeddings: np.ndarray, weight_keys: tuple = ('tone','tone_score','goldstein')) -> tuple:
    if node_embeddings.size == 0:
        return np.array([]), []
    n_nodes = len(nodes_df)
    scores = np.zeros(n_nodes, dtype=float)
    for key in weight_keys:
        candidates = [c for c in nodes_df.columns if key in c.lower() and nodes_df[c].dtype != 'object']
        if candidates:
            scores += nodes_df[candidates].astype(float).fillna(0.0).sum(axis=1).to_numpy()
    if np.all(scores == 0):
        scores = np.ones_like(scores)
    scores = scores - scores.max()
    weights = np.exp(scores)
    weights = weights / weights.sum()
    aggregated = weights @ node_embeddings
    feature_names = [f"gnn_embedding_{i}" for i in range(node_embeddings.shape[1])]
    return aggregated, feature_names

def compute_market_features(nodes_df: pd.DataFrame, indicator_cols: list) -> tuple:
    values = []
    names = []
    for col in indicator_cols:
        if col not in nodes_df.columns:
            continue
        col_values = pd.to_numeric(nodes_df[col], errors='coerce')
        values.extend([
            np.nanmean(col_values.values),
            np.nanstd(col_values.values)
        ])
        names.extend([f"{col}_mean", f"{col}_std"])
    if 'wti_price' in nodes_df.columns:
        latest_price = pd.to_numeric(nodes_df['wti_price'], errors='coerce').iloc[-1]
        values.append(latest_price)
        names.append('wti_price_current')
    if 'brent_price' in nodes_df.columns:
        latest_brent = pd.to_numeric(nodes_df['brent_price'], errors='coerce').iloc[-1]
        values.append(latest_brent)
        names.append('brent_price_current')
    values = np.nan_to_num(np.array(values, dtype=float), nan=0.0, posinf=0.0, neginf=0.0)
    return values, names

def build_global_vector(snapshot_time: pd.Timestamp, nodes_df: pd.DataFrame, adjacency: np.ndarray) -> tuple:
    node_embeddings = run_message_passing(nodes_df, adjacency, feature_columns_graph)
    graph_vector, graph_names = attention_readout(nodes_df, node_embeddings)
    market_vector, market_names = compute_market_features(nodes_df, market_indicator_cols)
    combined = np.concatenate([graph_vector, market_vector])
    feature_names = graph_names + market_names
    return node_embeddings, combined, feature_names

def build_snapshots(node_features: pd.DataFrame, static_edges_df: pd.DataFrame, dyn_edges_df: pd.DataFrame, price_by_date: pd.DataFrame, lookback_hours: int = 24) -> tuple:
    rows = []
    targets = []
    dates = []
    snapshot_records = []
    feature_names = None
    for ts in sorted(node_features['date'].unique()):
        ts = pd.Timestamp(ts)
        nodes_df = node_features[node_features['date'] == ts].copy()
        if nodes_df.empty:
            continue
        adjacency = build_adjacency_matrix(nodes_df, static_edges_df, dyn_edges_df, ts, lookback_hours=lookback_hours)
        node_embeddings, global_vector, names = build_global_vector(ts, nodes_df, adjacency)
        target = price_by_date['wti_ret_next'].reindex([ts]).iloc[0] if ts in price_by_date.index else np.nan
        if pd.isna(target):
            continue
        rows.append(global_vector)
        targets.append(float(target))
        dates.append(ts)
        snapshot_records.append(GraphSnapshot(ts, nodes_df, adjacency, node_embeddings, global_vector, float(target)))
        if feature_names is None:
            feature_names = names
    if not rows:
        raise ValueError('No graph snapshots were built; verify that node features and price targets overlap.')
    data_matrix = np.vstack(rows)
    targets = np.array(targets, dtype=float)
    return data_matrix, targets, dates, feature_names, snapshot_records

In [None]:
class DecisionStump:
      def __init__(self, max_thresholds: int = 32):
          self.max_thresholds = max_thresholds
          self.feature_index = 0
          self.threshold = 0.0
          self.left_value = 0.0
          self.right_value = 0.0

      def fit(self, X: np.ndarray, y: np.ndarray):
          n_samples, n_features = X.shape
          best_error = np.inf
          overall_mean = y.mean() if len(y) > 0 else 0.0
          if n_samples == 0:
              self.left_value = overall_mean
              self.right_value = overall_mean
              return
          for feature in range(n_features):
              column = X[:, feature]
              unique_vals = np.unique(column)
              if len(unique_vals) > self.max_thresholds:
                  thresholds = np.linspace(unique_vals.min(), unique_vals.max(), self.max_thresholds)
              else:
                  thresholds = unique_vals
              for thr in thresholds:
                  left_mask = column <= thr
                  right_mask = ~left_mask
                  if left_mask.sum() == 0 or right_mask.sum() == 0:
                      continue
                  left_value = y[left_mask].mean()
                  right_value = y[right_mask].mean()
                  predictions = np.where(left_mask, left_value, right_value)
                  error = np.mean((y - predictions) ** 2)
                  if error < best_error:
                      best_error = error
                      self.feature_index = feature
                      self.threshold = thr
                      self.left_value = left_value
                      self.right_value = right_value
          if best_error == np.inf:
              self.left_value = overall_mean
              self.right_value = overall_mean

      def predict(self, X: np.ndarray) -> np.ndarray:
          if X.size == 0:
              return np.array([])
          mask = X[:, self.feature_index] <= self.threshold
          return np.where(mask, self.left_value, self.right_value)

In [None]:
class GradientBoostedStumps:
      def __init__(self, n_estimators: int = 200, learning_rate: float = 0.05, max_thresholds: int = 32):
          self.n_estimators = n_estimators
          self.learning_rate = learning_rate
          self.max_thresholds = max_thresholds
          self.estimators_ = []
          self.init_value_ = 0.0
          self.feature_counts_ = None

      def fit(self, X: np.ndarray, y: np.ndarray):
          self.estimators_ = []
          self.feature_counts_ = np.zeros(X.shape[1], dtype=float)
          self.init_value_ = y.mean() if len(y) > 0 else 0.0
          y_pred = np.full_like(y, self.init_value_, dtype=float)
          for _ in range(self.n_estimators):
              residual = y - y_pred
              stump = DecisionStump(max_thresholds=self.max_thresholds)
              stump.fit(X, residual)
              update = stump.predict(X)
              y_pred += self.learning_rate * update
              self.estimators_.append(stump)
              self.feature_counts_[stump.feature_index] += 1

      def predict(self, X: np.ndarray) -> np.ndarray:
          if X.size == 0:
              return np.array([])
          y_pred = np.full(X.shape[0], self.init_value_, dtype=float)
          for stump in self.estimators_:
              y_pred += self.learning_rate * stump.predict(X)
          return y_pred

      def feature_importances(self) -> np.ndarray:
          if self.feature_counts_ is None:
              return np.array([])
          total = self.feature_counts_.sum()
          if total == 0:
              return np.zeros_like(self.feature_counts_)
          return self.feature_counts_ / total

In [None]:
def train_gnn_lgbm_pipeline(node_features: pd.DataFrame, static_edges_df: pd.DataFrame, dyn_edges_df: pd.DataFrame, price_by_date: pd.DataFrame, lookback_hours: int = 24, test_fraction: float = 0.2):
      X, y, dates, feature_names, snapshots = build_snapshots(node_features, static_edges_df, dyn_edges_df, price_by_date, lookback_hours=lookback_hours)
      n_samples = len(X)
      split_index = max(1, int(n_samples * (1 - test_fraction)))
      X_train, X_test = X[:split_index], X[split_index:]
      y_train, y_test = y[:split_index], y[split_index:]
      dates_train, dates_test = dates[:split_index], dates[split_index:]

      model = GradientBoostedStumps(n_estimators=200, learning_rate=0.05, max_thresholds=32)
      model.fit(X_train, y_train)
      train_pred = model.predict(X_train)
      test_pred = model.predict(X_test) if len(X_test) > 0 else np.array([])

      def mae(targets, preds):
          return float(np.mean(np.abs(targets - preds))) if len(targets) else float('nan')

      def rmse(targets, preds):
          return float(np.sqrt(np.mean((targets - preds) ** 2))) if len(targets) else float('nan')

      def directional_accuracy(targets, preds):
          if len(targets) == 0:
              return float('nan')
          mask = (targets != 0) & (preds != 0)
          if mask.sum() == 0:
              return float('nan')
          return float(np.mean(np.sign(targets[mask]) == np.sign(preds[mask])))

      metrics = {
          'train_mae': mae(y_train, train_pred),
          'train_rmse': rmse(y_train, train_pred),
          'train_directional_accuracy': directional_accuracy(y_train, train_pred),
          'test_mae': mae(y_test, test_pred),
          'test_rmse': rmse(y_test, test_pred),
          'test_directional_accuracy': directional_accuracy(y_test, test_pred)
      }

      results = {
          'model': model,
          'feature_names': feature_names,
          'metrics': metrics,
          'train_predictions': pd.DataFrame({'date': dates_train, 'target': y_train, 'prediction': train_pred}),
          'test_predictions': pd.DataFrame({'date': dates_test, 'target': y_test, 'prediction': test_pred}),
          'snapshots': snapshots
      }
      return results

In [None]:
pipeline_results = train_gnn_lgbm_pipeline(
    node_features=node_features,
    static_edges_df=static_edges_df,
    dyn_edges_df=dyn_edges_df,
    price_by_date=price_by_date,
    lookback_hours=24,
    test_fraction=0.2
)

metrics_table = pd.DataFrame([pipeline_results['metrics']])
metrics_table

Unnamed: 0,train_mae,train_rmse,train_directional_accuracy,test_mae,test_rmse,test_directional_accuracy
0,0.015557,0.019464,0.605489,0.017943,0.023429,0.517007


In [None]:
# --- CELL TO SAVE MODEL ARTIFACTS TO GCS ---

import joblib
import json
import numpy as np
import os
import io

print("Saving model and evaluation data to GCS...")

# --- 1. Define Paths ---
# Use a temporary local directory for staging files
local_tmp_dir = "/tmp/model_export"
os.makedirs(local_tmp_dir, exist_ok=True)

# Define the GCS prefix where artifacts will be stored
gcs_model_prefix = PROCESSED_PREFIX + "model_artifacts/"

# Define local paths
local_model_path = os.path.join(local_tmp_dir, "gbs_model.joblib")
local_x_test_path = os.path.join(local_tmp_dir, "X_test.npy")
local_y_test_path = os.path.join(local_tmp_dir, "y_test.npy")
local_features_path = os.path.join(local_tmp_dir, "feature_names.json")

# --- 2. Retrieve Data to Save ---
model_to_save = pipeline_results['model']
feature_names_to_save = pipeline_results['feature_names']

# Re-build snapshots to get the correct test set split
X_all, y_all, dates_all, _, _ = build_snapshots(
    node_features, static_edges_df, dyn_edges_df, price_by_date, lookback_hours=24
)
n_samples = len(X_all)
split_index = max(1, int(n_samples * (1 - 0.2)))
X_test = X_all[split_index:]
y_test = y_all[split_index:]


# --- 3. Save Files Locally ---
print(f"Staging files in {local_tmp_dir}...")
# 1. Save the model
joblib.dump(model_to_save, local_model_path)

# 2. Save the test data
np.save(local_x_test_path, X_test)
np.save(local_y_test_path, y_test)

# 3. Save the feature names
with open(local_features_path, 'w') as f:
    json.dump(feature_names_to_save, f)


# --- 4. Upload Files to GCS ---
print(f"Uploading artifacts to GCS bucket '{BUCKET_NAME}' at prefix '{gcs_model_prefix}'...")

files_to_upload = [
    ("gbs_model.joblib", local_model_path),
    ("X_test.npy", local_x_test_path),
    ("y_test.npy", local_y_test_path),
    ("feature_names.json", local_features_path)
]

gcs_paths = {}

for filename, local_path in files_to_upload:
    try:
        gcs_path = f"{gcs_model_prefix}{filename}"
        blob = bucket.blob(gcs_path)
        blob.upload_from_filename(local_path)
        gcs_paths[filename] = f"gs://{BUCKET_NAME}/{gcs_path}"
    except Exception as e:
        print(f"ERROR uploading {filename}: {e}")

print("\n--- Upload Complete ---")
print(f"Artifacts saved to GCS:")
print(f"  - Model: {gcs_paths.get('gbs_model.joblib')}")
print(f"  - X_test: {gcs_paths.get('X_test.npy')} (shape: {X_test.shape})")
print(f"  - y_test: {gcs_paths.get('y_test.npy')} (shape: {y_test.shape})")
print(f"  - Features: {gcs_paths.get('feature_names.json')} (count: {len(feature_names_to_save)})")

Saving model and evaluation data to GCS...
Staging files in /tmp/model_export...
Uploading artifacts to GCS bucket 'gdelt_raw_3_years' at prefix 'processed_data/model_artifacts/'...

--- Upload Complete ---
Artifacts saved to GCS:
  - Model: gs://gdelt_raw_3_years/processed_data/model_artifacts/gbs_model.joblib
  - X_test: gs://gdelt_raw_3_years/processed_data/model_artifacts/X_test.npy (shape: (147, 24))
  - y_test: gs://gdelt_raw_3_years/processed_data/model_artifacts/y_test.npy (shape: (147,))
  - Features: gs://gdelt_raw_3_years/processed_data/model_artifacts/feature_names.json (count: 24)

Cleaned up temporary directory: /tmp/model_export
