<a href="https://colab.research.google.com/github/fishee82oo/nfs-oil-price-prediction/blob/main/GDELT_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install -q google-cloud-storage pyarrow pycountry tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import importlib.util, sys, os, io, json, gzip
from datetime import datetime, timedelta
import pandas as pd, numpy as np
from tqdm import tqdm

In [2]:
try:
    from google.colab import auth
    auth.authenticate_user()
except Exception:
    pass
import google.auth
from google.cloud import storage
credentials, default_project = google.auth.default()
client = storage.Client(project=default_project, credentials=credentials)

# Import data from GCS

In [3]:
spec = importlib.util.spec_from_file_location("gdelt_module", "/mnt/data/gdelt_data_engineering_clean.ipynb")
gdelt_module = None
if spec and spec.loader:
    gm = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(gm)
    gdelt_module = gm
if gdelt_module and hasattr(gdelt_module, "GCS_BUCKET_NAME"):
    BUCKET_NAME = getattr(gdelt_module, "GCS_BUCKET_NAME")
else:
    BUCKET_NAME = os.environ.get("GCS_BUCKET_NAME", "gdelt_raw_3_years")
if gdelt_module and hasattr(gdelt_module, "GCS_PROCESSED_PATH"):
    PROCESSED_PREFIX = getattr(gdelt_module, "GCS_PROCESSED_PATH")
else:
    PROCESSED_PREFIX = os.environ.get("GCS_PROCESSED_PATH", "processed_data/")
bucket = client.bucket(BUCKET_NAME)
blobs = list(client.list_blobs(BUCKET_NAME, prefix=PROCESSED_PREFIX))
final_blobs = [b for b in blobs if b.name.startswith(f"{PROCESSED_PREFIX}final_aligned_data_") and b.name.endswith(".json.gz")]
final_blobs_sorted = sorted(final_blobs, key=lambda b: b.name, reverse=True)
if len(final_blobs_sorted)==0:
    raise SystemExit("No final_aligned_data_*.json.gz found under the processed prefix")
latest_blob = final_blobs_sorted[0]
local_download_path = "/tmp/latest_final_aligned_data.json.gz"
with open(local_download_path, "wb") as f:
    f.write(latest_blob.download_as_bytes())

# Forming Graph

In [4]:
import os

if os.path.exists(local_download_path):
    file_size = os.path.getsize(local_download_path) / (1024*1024)
    print(f"File downloaded: {local_download_path}")
    print(f"Size: {file_size:.2f} MB")

else:
    print("File NOT found locally")

print(f"Downloaded from GCS: {latest_blob.name}")
print(f"Bucket: {BUCKET_NAME}")
print(f"Last modified: {latest_blob.updated}")

with gzip.open(local_download_path, "rt", encoding="utf-8") as f:
    sample = f.read(500)
    print(f"Data preview:\n{sample[:200]}...")

File downloaded: /tmp/latest_final_aligned_data.json.gz
Size: 2.37 MB
Downloaded from GCS: processed_data/final_aligned_data_20250908.json.gz
Bucket: gdelt_raw_3_years
Last modified: 2025-09-08 01:30:57.993000+00:00
Data preview:
[{"date": "20220825", "country": "US", "event_count": 2301, "avg_sentiment": -0.023938528465884453, "unique_sources": 728, "wti_price": 93.33, "brent_price": 98.81, "theme_energy": 45, "theme_conflict...


In [5]:
with gzip.open(local_download_path, "rt", encoding="utf-8") as f:
    raw = f.read()
records = json.loads(raw)
df = pd.DataFrame.from_records(records)
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)
df.shape

(143401, 13)

In [8]:
import pycountry
def to_iso3(name):
    try:
        c = pycountry.countries.lookup(name)
        return c.alpha_3
    except Exception:
        s = str(name).upper()
        s2 = "".join([c for c in s if c.isalpha() or c==" "]).strip().replace(" ", "_")
        return s2
df["country_iso3"] = df["country"].fillna("UNKNOWN").apply(to_iso3)
df["node_id"] = df["country_iso3"].astype(str) + "_" + df["date"].dt.strftime("%Y%m%d")
df.shape

(143401, 15)

In [9]:
price_by_date = df[["date","wti_price"]].drop_duplicates().set_index("date").sort_index()
price_by_date["wti_price"] = pd.to_numeric(price_by_date["wti_price"], errors="coerce")
price_by_date = price_by_date.sort_index()
price_by_date["wti_price_next"] = price_by_date["wti_price"].shift(-1)
price_by_date["wti_delta_next"] = price_by_date["wti_price_next"] - price_by_date["wti_price"]
price_by_date["wti_ret_next"] = price_by_date["wti_delta_next"] / price_by_date["wti_price"]
price_map = price_by_date.to_dict(orient="index")
def attach_targets(row):
    pdx = row["date"]
    v = price_map.get(pdx)
    if v is None:
        return pd.Series([np.nan,np.nan])
    return pd.Series([v.get("wti_delta_next"), v.get("wti_ret_next")])
df[["wti_delta_next","wti_ret_next"]] = df.apply(attach_targets, axis=1)

In [10]:
price_lag_features = price_by_date[['wti_price']].copy()

price_lag_features['wti_price_lag_1'] = price_lag_features['wti_price'].shift(1)
price_lag_features['wti_price_lag_2'] = price_lag_features['wti_price'].shift(2)

price_lag_features['wti_delta_prev_1'] = price_lag_features['wti_price'] - price_lag_features['wti_price_lag_1']
price_lag_features['wti_delta_prev_3'] = price_lag_features['wti_price'] - price_lag_features['wti_price'].shift(3)

price_lag_features['wti_ret_prev_1'] = price_lag_features['wti_delta_prev_1'] / price_lag_features['wti_price_lag_1'].replace(0, np.nan)
price_lag_features['wti_ret_prev_3'] = price_lag_features['wti_price'].div(price_lag_features['wti_price'].shift(3)) - 1

price_lag_features['wti_rolling_mean_3'] = price_lag_features['wti_price'].rolling(window=3, min_periods=1).mean()
price_lag_features['wti_rolling_std_3'] = price_lag_features['wti_price'].rolling(window=3, min_periods=2).std()
price_lag_features['wti_rolling_mean_7'] = price_lag_features['wti_price'].rolling(window=7, min_periods=1).mean()
price_lag_features['wti_rolling_std_7'] = price_lag_features['wti_price'].rolling(window=7, min_periods=2).std()

price_lag_features = price_lag_features.drop(columns=['wti_price'])
lag_feature_cols = list(price_lag_features.columns)

In [11]:
feature_cols = [c for c in df.columns if c not in ["country","date","node_id","country_iso3","wti_delta_next","wti_ret_next"]]
feature_cols = [c for c in feature_cols if df[c].dtype != "object" or c.startswith("theme_")]
node_features = df[["node_id","country","country_iso3","date","wti_price","brent_price","wti_delta_next","wti_ret_next"] + [c for c in df.columns if c in feature_cols]]
node_features = node_features.fillna(0)

In [12]:
opec_members = ["Venezuela","Saudi Arabia","Iran","Iraq","Kuwait","UAE","Qatar","Algeria","Angola","Libya","Nigeria","Ecuador","Gabon","Republic of the Congo","Equatorial Guinea"]
iso_map = {}
for name in opec_members:
    try:
        iso_map[name] = pycountry.countries.lookup(name).alpha_3
    except Exception:
        iso_map[name] = name.upper().replace(" ","_")
opec_iso = list(iso_map.values())
from itertools import combinations
static_edges = []
for a,b in combinations(opec_iso,2):
    static_edges.append({"source":a,"target":b,"edge_type":"opec_member"})
static_edges_df = pd.DataFrame(static_edges)

In [13]:
dyn_edges_df = pd.DataFrame(columns=["source","target","edge_type","timestamp","date"])
potential_actor_cols = [c for c in df.columns if "actor" in c.lower() or "actor1" in c.lower() or "actor2" in c.lower()]
if len(potential_actor_cols)>0:
    actors = []
    for idx,row in df.iterrows():
        for k in potential_actor_cols:
            v = row.get(k)
        if row.get("actor1") and row.get("actor2"):
            s = str(row.get("actor1"))
            t = str(row.get("actor2"))
            dyn_edges_df.loc[len(dyn_edges_df)] = [s,t,"gdelt_event",row.get("date"),row.get("date")]


In [14]:
df = df.loc[:, ~df.columns.duplicated()]

feature_cols = [c for c in df.columns if c not in ["country","date","node_id","country_iso3","wti_delta_next","wti_ret_next","wti_price","brent_price"]]
feature_cols = [c for c in feature_cols if df[c].dtype != "object" or c.startswith("theme_")]
node_features = df[["node_id","country","country_iso3","date","wti_price","brent_price","wti_delta_next","wti_ret_next"] + [c for c in df.columns if c in feature_cols]]
node_features = node_features.loc[:, ~node_features.columns.duplicated()]
node_features = node_features.merge(price_lag_features, left_on='date', right_index=True, how='left')
node_features = node_features.fillna(0)

feature_cols = list(dict.fromkeys(feature_cols + lag_feature_cols))

out_dir_local = "/tmp/graph_export"
os.makedirs(out_dir_local, exist_ok=True)
nodes_out = os.path.join(out_dir_local, "nodes.parquet")
static_edges_out = os.path.join(out_dir_local, "edges_static.parquet")
dyn_edges_out = os.path.join(out_dir_local, "edges_dynamic.parquet")
node_features.to_parquet(nodes_out, index=False)
static_edges_df.to_parquet(static_edges_out, index=False)
dyn_edges_df.to_parquet(dyn_edges_out, index=False)

In [15]:
gcs_prefix = PROCESSED_PREFIX + "graph_dataset/"
for p in [nodes_out, static_edges_out, dyn_edges_out]:
    bn = os.path.basename(p)
    blob = bucket.blob(f"{gcs_prefix}{bn}")
    with open(p,"rb") as f:
        blob.upload_from_file(f)
meta = {"nodes":nodes_out,"edges_static":static_edges_out,"edges_dynamic":dyn_edges_out,"uploaded_at":datetime.utcnow().isoformat()}
meta_blob = bucket.blob(f"{gcs_prefix}metadata.json")
meta_blob.upload_from_string(json.dumps(meta), content_type="application/json")

  meta = {"nodes":nodes_out,"edges_static":static_edges_out,"edges_dynamic":dyn_edges_out,"uploaded_at":datetime.utcnow().isoformat()}


In [16]:
unique_dates = sorted(node_features["date"].dt.date.unique())
for d in tqdm(unique_dates):
    sub = node_features[node_features["date"].dt.date==d].copy()
    if sub.shape[0]==0:
        continue
    fn = f"node_features_{d.strftime('%Y%m%d')}.parquet"
    localp = os.path.join(out_dir_local,fn)
    sub.to_parquet(localp, index=False)
    blob = bucket.blob(f"{gcs_prefix}{fn}")
    with open(localp,"rb") as f:
        blob.upload_from_file(f)

100%|██████████| 733/733 [09:04<00:00,  1.35it/s]


In [17]:
print("done")
print("bucket",BUCKET_NAME)
print("processed prefix",PROCESSED_PREFIX)
print("latest aligned blob", latest_blob.name)
print("graph files uploaded to", PROCESSED_PREFIX + "graph_dataset/")

done
bucket gdelt_raw_3_years
processed prefix processed_data/
latest aligned blob processed_data/final_aligned_data_20250908.json.gz
graph files uploaded to processed_data/graph_dataset/


# Baseline Model: XGBoost

In [16]:
!pip install tqdm



In [50]:
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor
import numpy as np

In [18]:
modeling_df = node_features.copy()
target_col = 'wti_delta_next'
modeling_df = modeling_df.dropna(subset=[target_col]).sort_values('date').reset_index(drop=True)

In [19]:
X_all = modeling_df[feature_cols].fillna(0.0).values
y_all = modeling_df[target_col].astype(float).values

if modeling_df['date'].nunique() > 30:
    cutoff_date = modeling_df['date'].max() - pd.Timedelta(days=90)
    train_mask = modeling_df['date'] < cutoff_date
    if train_mask.sum() == 0 or train_mask.sum() == len(modeling_df):
        train_mask = np.arange(len(modeling_df)) < int(0.8 * len(modeling_df))
else:
    train_mask = np.arange(len(modeling_df)) < int(0.8 * len(modeling_df))

In [20]:
X_train, X_test = X_all[train_mask], X_all[~train_mask]
y_train, y_test = y_all[train_mask], y_all[~train_mask]

if len(y_test) == 0:
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42, shuffle=False
    )

In [21]:
xgb = XGBRegressor(
    objective='reg:squarederror',
    random_state=42
)

param_grid = {
    'n_estimators': [200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'reg_lambda': [0.1, 1.0, 5.0]
}

In [22]:
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# 4. Set up the GridSearchCV
# We use 'r2' as the scoring metric to optimize for.
# n_jobs=-1 uses all available CPU cores to speed up the search.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='r2',
    cv=tscv,  # <--- Use TimeSeriesSplit here!
    verbose=2,
    n_jobs=-1
)

In [23]:
grid_search.fit(X_all, y_all)

print(f"Best R-squared score found: {grid_search.best_score_:.4f}")
print("Best parameters found:")
print(grid_search.best_params_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best R-squared score found: -0.0187
Best parameters found:
{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'reg_lambda': 0.1, 'subsample': 0.8}


In [26]:
model = grid_search.best_estimator_
y_pred = model.predict(X_test)
metrics = {
    'MAE': mean_absolute_error(y_test, y_pred),
    'RMSE': mean_squared_error(y_test, y_pred),
    'R2': r2_score(y_test, y_pred)
}
print('Test metrics:')
for name, value in metrics.items():
    print(f'  {name}: {value:.4f}')

Test metrics:
  MAE: 0.9734
  RMSE: 1.5267
  R2: 0.3095


# GNN Model

In [18]:
!pip install torch-geometric -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [48]:
from collections import defaultdict
from dataclasses import dataclass
import lightgbm as lgb

In [20]:
node_features['date'] = pd.to_datetime(node_features['date'])
price_by_date.index = pd.to_datetime(price_by_date.index)
if not dyn_edges_df.empty:
    dyn_edges_df['date'] = pd.to_datetime(dyn_edges_df['date'], errors='coerce')
    if 'timestamp' in dyn_edges_df.columns:
        dyn_edges_df['timestamp'] = pd.to_datetime(dyn_edges_df['timestamp'], errors='coerce')
    else:
        dyn_edges_df['timestamp'] = dyn_edges_df['date']
    dyn_edges_df['source_iso3'] = dyn_edges_df['source'].apply(to_iso3)
    dyn_edges_df['target_iso3'] = dyn_edges_df['target'].apply(to_iso3)
    dyn_edges_df = dyn_edges_df.dropna(subset=['source_iso3','target_iso3'])
else:
    dyn_edges_df['timestamp'] = pd.NaT
    dyn_edges_df['source_iso3'] = []
    dyn_edges_df['target_iso3'] = []

In [22]:
node_features.shape

(143401, 27)

In [46]:
feature_columns_graph = [c for c in node_features.columns if c not in ['node_id','country','country_iso3','date','wti_price','brent_price','wti_delta_next','wti_ret_next']]
feature_columns_graph = [c for c in feature_columns_graph if node_features[c].dtype != 'object']
feature_columns_graph = sorted(dict.fromkeys(feature_columns_graph))

market_indicator_candidates = []
for col in node_features.columns:
    low = col.lower()
    if any(key in low for key in ['spread','usd','inventory','fx','dxy','dollar','inflation']):
        if node_features[col].dtype != 'object':
            market_indicator_candidates.append(col)
market_indicator_cols = ['wti_price','brent_price'] + sorted(dict.fromkeys(market_indicator_candidates))
engineered_feature_cols = [c for c in lag_feature_cols if c in node_features.columns]
engineered_feature_cols = sorted(dict.fromkeys(engineered_feature_cols))

In [35]:
@dataclass
class GraphSnapshot:
    date: pd.Timestamp
    node_features: np.ndarray
    adjacency: np.ndarray
    engineered_features: np.ndarray
    target: float

def _normalize_adjacency(adj: np.ndarray) -> np.ndarray:
    deg = adj.sum(axis=1, keepdims=True)
    deg[deg == 0] = 1.0
    return adj / deg

def build_adjacency_matrix(nodes_df: pd.DataFrame, static_edges_df: pd.DataFrame, dyn_edges_df: pd.DataFrame, snapshot_time: pd.Timestamp, lookback_hours: int = 24) -> np.ndarray:
    n = len(nodes_df)
    adjacency = np.zeros((n, n), dtype=float)
    iso_to_indices = defaultdict(list)
    for idx, iso in enumerate(nodes_df['country_iso3']):
        iso_to_indices[iso].append(idx)
    if not static_edges_df.empty:
        for _, edge in static_edges_df.iterrows():
            src_nodes = iso_to_indices.get(edge['source'], [])
            tgt_nodes = iso_to_indices.get(edge['target'], [])
            for i in src_nodes:
                for j in tgt_nodes:
                    adjacency[i, j] = 1.0
                    adjacency[j, i] = 1.0
    if not dyn_edges_df.empty and snapshot_time is not None:
        window_start = snapshot_time - pd.Timedelta(hours=lookback_hours)
        mask = dyn_edges_df['timestamp'].between(window_start, snapshot_time)
        for _, edge in dyn_edges_df.loc[mask].iterrows():
            src_nodes = iso_to_indices.get(edge['source_iso3'], [])
            tgt_nodes = iso_to_indices.get(edge['target_iso3'], [])
            for i in src_nodes:
                for j in tgt_nodes:
                    adjacency[i, j] += 1.0
                    adjacency[j, i] += 1.0
    if n > 0:
        np.fill_diagonal(adjacency, 1.0)
    return _normalize_adjacency(adjacency)

def compute_market_features(nodes_df: pd.DataFrame, indicator_cols: list) -> tuple:
    values = []
    names = []
    for col in indicator_cols:
        if col not in nodes_df.columns:
            continue
        col_values = pd.to_numeric(nodes_df[col], errors='coerce')
        values.extend([
            np.nanmean(col_values.values),
            np.nanstd(col_values.values)
        ])
        names.extend([f"{col}_mean", f"{col}_std"])
    if 'wti_price' in nodes_df.columns:
        latest_price = pd.to_numeric(nodes_df['wti_price'], errors='coerce').iloc[-1]
        values.append(latest_price)
        names.append('wti_price_current')
    if 'brent_price' in nodes_df.columns:
        latest_brent = pd.to_numeric(nodes_df['brent_price'], errors='coerce').iloc[-1]
        values.append(latest_brent)
        names.append('brent_price_current')
    values = np.nan_to_num(np.array(values, dtype=float), nan=0.0, posinf=0.0, neginf=0.0)
    return values, names

def extract_engineered_features(nodes_df: pd.DataFrame, engineered_cols: list) -> tuple:
    values = []
    names = []
    for col in engineered_cols:
        if col not in nodes_df.columns:
            continue
        col_values = pd.to_numeric(nodes_df[col], errors='coerce')
        if col_values.notna().any():
            values.append(float(col_values.dropna().iloc[0]))
        else:
            values.append(0.0)
        names.append(col)
    if not values:
        return np.array([], dtype=float), []
    values = np.nan_to_num(np.array(values, dtype=float), nan=0.0, posinf=0.0, neginf=0.0)
    return values, names

def build_engineered_feature_vector(nodes_df: pd.DataFrame) -> tuple:
    market_vector, market_names = compute_market_features(nodes_df, market_indicator_cols)
    engineered_vector, engineered_names = extract_engineered_features(nodes_df, engineered_feature_cols)
    feature_blocks = [block for block in [market_vector, engineered_vector] if block.size > 0]
    if not feature_blocks:
        return np.array([], dtype=float), []
    combined = np.concatenate(feature_blocks)
    feature_names = market_names + engineered_names
    return combined, feature_names

def build_snapshots(node_features: pd.DataFrame, static_edges_df: pd.DataFrame, dyn_edges_df: pd.DataFrame, price_by_date: pd.DataFrame, lookback_hours: int = 24) -> tuple:
    engineered_rows = []
    targets = []
    dates = []
    snapshots = []
    feature_names = None
    for ts in sorted(node_features['date'].unique()):
        ts = pd.Timestamp(ts)
        nodes_df = node_features[node_features['date'] == ts].copy()
        if nodes_df.empty:
            continue
        adjacency = build_adjacency_matrix(nodes_df, static_edges_df, dyn_edges_df, ts, lookback_hours=lookback_hours)
        node_matrix = nodes_df[feature_columns_graph].apply(pd.to_numeric, errors='coerce').fillna(0.0).to_numpy(dtype=float)
        if node_matrix.size == 0:
            continue
        engineered_vector, names = build_engineered_feature_vector(nodes_df)
        if engineered_vector.size == 0:
            continue
        target_series = price_by_date['wti_ret_next'].reindex([ts]) if 'wti_ret_next' in price_by_date.columns else price_by_date['wti_delta_next'].reindex([ts])
        target = target_series.iloc[0] if ts in target_series.index else np.nan
        if pd.isna(target):
            continue
        if feature_names is None:
            feature_names = names
        engineered_rows.append(engineered_vector.astype(np.float32))
        targets.append(float(target))
        dates.append(ts)
        snapshots.append(
            GraphSnapshot(
                date=ts,
                node_features=node_matrix.astype(np.float32),
                adjacency=adjacency.astype(np.float32),
                engineered_features=engineered_vector.astype(np.float32),
                target=float(target)
            )
        )
    if not engineered_rows:
        raise ValueError('No graph snapshots were built; verify that node features and price targets overlap.')
    engineered_matrix = np.vstack(engineered_rows)
    targets = np.array(targets, dtype=np.float32)
    feature_names = feature_names or []
    return engineered_matrix, targets, dates, feature_names, snapshots

In [52]:
def compute_regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)
  if y_true.size == 0 or y_pred.size == 0:
      return {
          'mae': float('nan'),
          'rmse': float('nan'),
          'r2': float('nan'),
          'directional_accuracy': float('nan')
      }
  mae = mean_absolute_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred)
  r2 = r2_score(y_true, y_pred) if y_true.size > 1 else float('nan')
  mask = (y_true != 0) | (y_pred != 0)
  if mask.any():
      da = float((np.sign(y_true[mask]) == np.sign(y_pred[mask])).mean())
  else:
      da = float('nan')
  return {
      'mae': float(mae),
      'rmse': float(rmse),
      'r2': float(r2),
      'directional_accuracy': da
  }


def train_tabular_baseline(engineered_matrix: np.ndarray, targets: np.ndarray, dates: list, feature_names: list, split_index: int) -> dict:
  if engineered_matrix.ndim == 1:
      engineered_matrix = engineered_matrix.reshape(-1, 1)
  X_train = engineered_matrix[:split_index]
  X_test = engineered_matrix[split_index:]
  y_train = targets[:split_index]
  y_test = targets[split_index:]
  train_dates = dates[:split_index]
  test_dates = dates[split_index:]

  model = lgb.LGBMRegressor(
      learning_rate=0.05,
      n_estimators=400,
      num_leaves=31,
      feature_fraction=0.9,
      bagging_fraction=0.8,
      bagging_freq=5,
      min_data_in_leaf=10,
      random_state=42
  )
  model.fit(X_train, y_train)

  train_pred = model.predict(X_train)
  test_pred = model.predict(X_test) if len(X_test) else np.array([], dtype=float)

  metrics = {
      'train': compute_regression_metrics(y_train, train_pred),
      'test': compute_regression_metrics(y_test, test_pred)
  }

  feature_importance = pd.DataFrame()
  if feature_names:
      booster = model.booster_
      feature_importance = pd.DataFrame({
          'feature': feature_names,
          'importance_gain': booster.feature_importance(importance_type='gain'),
          'importance_split': booster.feature_importance(importance_type='split')
      }).sort_values('importance_gain', ascending=False).reset_index(drop=True)

  predictions = {
      'train': pd.DataFrame({'date': train_dates, 'target': y_train, 'prediction': train_pred}),
      'test': pd.DataFrame({'date': test_dates, 'target': y_test, 'prediction': test_pred})
  }

In [39]:
import torch
from torch import nn

class SimpleGCN(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int = 64, embedding_dim: int = 32):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.lin1 = nn.Linear(input_dim, hidden_dim)
        self.lin2 = nn.Linear(hidden_dim, embedding_dim)
        self.readout = nn.Linear(embedding_dim, 1)

    def forward(self, features: torch.Tensor, adjacency: torch.Tensor) -> tuple:
        h = adjacency @ features
        h = torch.relu(self.lin1(h))
        h = adjacency @ h
        h = torch.relu(self.lin2(h))
        graph_embedding = h.mean(dim=0)
        output = self.readout(graph_embedding)
        return output.squeeze(-1), graph_embedding

def _predict_with_gnn(model: SimpleGCN, snapshots: list) -> tuple:
    predictions = []
    embeddings = []
    dates = []
    for snap in snapshots:
        features = torch.from_numpy(snap.node_features).float()
        adjacency = torch.from_numpy(snap.adjacency).float()
        with torch.no_grad():
            pred, embedding = model(features, adjacency)
        predictions.append(pred.item())
        embeddings.append(embedding.detach().cpu().numpy())
        dates.append(snap.date)
    if embeddings:
        embedding_matrix = np.vstack(embeddings).astype(np.float32)
    else:
        embedding_matrix = np.empty((0, getattr(model, 'embedding_dim', 0)), dtype=np.float32)
    predictions = np.array(predictions, dtype=np.float32)
    return predictions, embedding_matrix, dates

def train_gnn_model(snapshots: list, split_index: int, epochs: int = 200, learning_rate: float = 1e-3, hidden_dim: int = 64, embedding_dim: int = 32, weight_decay: float = 1e-4) -> dict:
    if not snapshots:
        raise ValueError('No graph snapshots provided for GNN training.')

    input_dim = snapshots[0].node_features.shape[1]
    model = SimpleGCN(input_dim, hidden_dim=hidden_dim, embedding_dim=embedding_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    loss_fn = nn.MSELoss()

    train_snapshots = snapshots[:split_index]
    test_snapshots = snapshots[split_index:]

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for snap in train_snapshots:
            features = torch.from_numpy(snap.node_features).float()
            adjacency = torch.from_numpy(snap.adjacency).float()
            target = torch.tensor(snap.target, dtype=torch.float32)

            optimizer.zero_grad()
            prediction, _ = model(features, adjacency)
            loss = loss_fn(prediction, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if (epoch + 1) % max(1, epochs // 5) == 0:
            avg_loss = total_loss / max(1, len(train_snapshots))
            print(f"Epoch {epoch + 1}/{epochs} - Train loss: {avg_loss:.4f}")

    model.eval()
    train_pred, train_embeddings, train_dates = _predict_with_gnn(model, train_snapshots)
    test_pred, test_embeddings, test_dates = _predict_with_gnn(model, test_snapshots)
    all_pred, all_embeddings, all_dates = _predict_with_gnn(model, snapshots)

    train_targets = np.array([snap.target for snap in train_snapshots], dtype=np.float32)
    test_targets = np.array([snap.target for snap in test_snapshots], dtype=np.float32)

    metrics = {
        'train': compute_regression_metrics(train_targets, train_pred),
        'test': compute_regression_metrics(test_targets, test_pred)
    }

    predictions = {
        'train': pd.DataFrame({'date': train_dates, 'target': train_targets, 'prediction': train_pred}),
        'test': pd.DataFrame({'date': test_dates, 'target': test_targets, 'prediction': test_pred})
    }

    embedding_names = [f'gnn_embedding_{i}' for i in range(all_embeddings.shape[1])] if all_embeddings.size else []

    return {
        'model': model,
        'metrics': metrics,
        'predictions': predictions,
        'embeddings': {
            'train': train_embeddings,
            'test': test_embeddings,
            'all': all_embeddings,
            'dates': all_dates,
            'feature_names': embedding_names
        }
    }

In [61]:
from typing import Optional

def train_hybrid_lightgbm(engineered_matrix: np.ndarray, gnn_embeddings: np.ndarray, targets: np.ndarray, dates: list, engineered_feature_names: list, gnn_feature_names: list, split_index: int) -> dict:
    if engineered_matrix.ndim == 1:
        engineered_matrix = engineered_matrix.reshape(-1, 1)
    if gnn_embeddings.ndim == 1:
        gnn_embeddings = gnn_embeddings.reshape(-1, 1)
    if engineered_matrix.shape[0] != gnn_embeddings.shape[0]:
        raise ValueError('Engineered features and GNN embeddings must align on samples.')

    combined_features = np.hstack([engineered_matrix, gnn_embeddings])
    feature_names = list(engineered_feature_names) + list(gnn_feature_names)
    if not feature_names:
        feature_names = [f'feature_{i}' for i in range(combined_features.shape[1])]

    X_train = combined_features[:split_index]
    X_test = combined_features[split_index:]
    y_train = targets[:split_index]
    y_test = targets[split_index:]
    train_dates = dates[:split_index]
    test_dates = dates[split_index:]

    X_train_df = pd.DataFrame(X_train, columns=feature_names)
    X_test_df = pd.DataFrame(X_test, columns=feature_names)

    validation_fraction = 0.2
    min_train_points = 5
    if len(X_train_df) >= min_train_points:
        val_size = max(1, int(len(X_train_df) * validation_fraction))
        if val_size >= len(X_train_df):
            val_size = len(X_train_df) - 1
        val_start = len(X_train_df) - val_size
        X_fit_df = X_train_df.iloc[:val_start]
        y_fit = y_train[:val_start]
        X_val_df = X_train_df.iloc[val_start:]
        y_val = y_train[val_start:]
        eval_set = [(X_val_df, y_val)]
        callbacks = [lgb.early_stopping(50, verbose=False), lgb.log_evaluation(period=0)]
    else:
        X_fit_df = X_train_df
        y_fit = y_train
        eval_set = None
        callbacks = None

    model = lgb.LGBMRegressor(
        learning_rate=0.05,
        n_estimators=500,
        num_leaves=31,
        subsample=0.8,
        subsample_freq=5,
        colsample_bytree=0.9,
        min_data_in_leaf=10,
        random_state=42
    )

    fit_kwargs = {'feature_name': feature_names}
    if eval_set:
        fit_kwargs.update({
            'eval_set': eval_set,
            'eval_names': ['validation'],
            'eval_metric': 'l2',
            'callbacks': callbacks
        })

    model.fit(X_fit_df, y_fit, **fit_kwargs)

    predict_kwargs = {}
    if getattr(model, 'best_iteration_', None) is not None:
        predict_kwargs['num_iteration'] = model.best_iteration_

    train_pred = model.predict(X_train_df, **predict_kwargs)
    test_pred = model.predict(X_test_df, **predict_kwargs) if len(X_test_df) else np.array([], dtype=float)

    metrics = {
        'train': compute_regression_metrics(y_train, train_pred),
        'test': compute_regression_metrics(y_test, test_pred)
    }

    booster = model.booster_
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance_gain': booster.feature_importance(importance_type='gain'),
        'importance_split': booster.feature_importance(importance_type='split')
    }).sort_values('importance_gain', ascending=False).reset_index(drop=True)

    predictions = {
        'train': pd.DataFrame({'date': train_dates, 'target': y_train, 'prediction': train_pred}),
        'test': pd.DataFrame({'date': test_dates, 'target': y_test, 'prediction': test_pred})
    }

    data_splits = {
        'X_train': X_train_df,
        'X_test': X_test_df,
        'y_train': y_train,
        'y_test': y_test,
        'train_dates': train_dates,
        'test_dates': test_dates,
        'feature_names': feature_names,
        'best_iteration': getattr(model, 'best_iteration_', None)
    }

    return {
        'model': model,
        'feature_names': feature_names,
        'metrics': metrics,
        'feature_importance': feature_importance,
        'predictions': predictions,
        'data': data_splits
    }

def run_modeling_pipeline(node_features: pd.DataFrame, static_edges_df: pd.DataFrame, dyn_edges_df: pd.DataFrame, price_by_date: pd.DataFrame, lookback_hours: int = 24, test_fraction: float = 0.2, gnn_params: Optional[dict] = None) -> dict:
    engineered_matrix, targets, dates, engineered_feature_names, snapshots = build_snapshots(
        node_features, static_edges_df, dyn_edges_df, price_by_date, lookback_hours=lookback_hours
    )
    n_samples = len(targets)
    if n_samples == 0:
        raise ValueError('No samples available for training pipeline.')

    split_index = int(n_samples * (1 - test_fraction))
    split_index = max(1, min(split_index, n_samples))


    gnn_kwargs = gnn_params or {}
    gnn_results = train_gnn_model(snapshots, split_index, **gnn_kwargs)
    gnn_embeddings = gnn_results['embeddings']['all']
    gnn_feature_names = gnn_results['embeddings']['feature_names']

    hybrid_results = train_hybrid_lightgbm(
        engineered_matrix,
        gnn_embeddings,
        targets,
        dates,
        engineered_feature_names,
        gnn_feature_names,
        split_index
    )

    return {
        'engineered_matrix': engineered_matrix,
        'targets': targets,
        'dates': dates,
        'snapshots': snapshots,
        'split_index': split_index,
        'gnn': gnn_results,
        'hybrid': hybrid_results
    }

In [62]:
pipeline_results = run_modeling_pipeline(
    node_features=node_features,
    static_edges_df=static_edges_df,
    dyn_edges_df=dyn_edges_df,
    price_by_date=price_by_date,
    lookback_hours=24,
    test_fraction=0.2,
    gnn_params={'epochs': 150, 'hidden_dim': 64, 'embedding_dim': 32, 'learning_rate': 1e-3}
)

Epoch 30/150 - Train loss: 0.0004
Epoch 60/150 - Train loss: 0.0004
Epoch 90/150 - Train loss: 0.0004
Epoch 120/150 - Train loss: 0.0004
Epoch 150/150 - Train loss: 0.0004
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2293
[LightGBM] [Info] Number of data points in the train set: 468, number of used features: 17
[LightGBM] [Info] Start training from score -0.000008


In [63]:
metrics_records = []
missing_models = []
for model_name in ['hybrid']:
    stage_result = pipeline_results.get(model_name)
    if not stage_result:
        missing_models.append(model_name)
        continue
    metrics = stage_result.get('metrics')
    if not isinstance(metrics, dict):
        raise ValueError(f"Metrics for '{model_name}' stage are unavailable.")
    for split_name, metric_values in metrics.items():
        if not isinstance(metric_values, dict):
            continue
        record = {'model': model_name, 'split': split_name}
        record.update(metric_values)
        metrics_records.append(record)

if missing_models:
    raise ValueError(
        'No results were produced for the following stages: ' + ', '.join(missing_models) + '. Check earlier logs for training errors.'
    )

metrics_table = pd.DataFrame(metrics_records)
metrics_table

Unnamed: 0,model,split,mae,rmse,r2,directional_accuracy
0,hybrid,train,0.014741,0.000353,0.182487,0.748718
1,hybrid,test,0.016244,0.000442,-0.002574,0.537415


In [None]:
# --- CELL TO SAVE MODEL ARTIFACTS TO GCS ---

import joblib
import json
import numpy as np
import os
import io

print("Saving model and evaluation data to GCS...")

# --- 1. Define Paths ---
# Use a temporary local directory for staging files
local_tmp_dir = "/tmp/model_export"
os.makedirs(local_tmp_dir, exist_ok=True)

# Define the GCS prefix where artifacts will be stored
gcs_model_prefix = PROCESSED_PREFIX + "model_artifacts/"

# Define local paths
local_model_path = os.path.join(local_tmp_dir, "gbs_model.joblib")
local_x_test_path = os.path.join(local_tmp_dir, "X_test.npy")
local_y_test_path = os.path.join(local_tmp_dir, "y_test.npy")
local_features_path = os.path.join(local_tmp_dir, "feature_names.json")

# --- 2. Retrieve Data to Save ---
model_to_save = pipeline_results['model']
feature_names_to_save = pipeline_results['feature_names']

# Re-build snapshots to get the correct test set split
X_all, y_all, dates_all, _, _ = build_snapshots(
    node_features, static_edges_df, dyn_edges_df, price_by_date, lookback_hours=24
)
n_samples = len(X_all)
split_index = max(1, int(n_samples * (1 - 0.2)))
X_test = X_all[split_index:]
y_test = y_all[split_index:]


# --- 3. Save Files Locally ---
print(f"Staging files in {local_tmp_dir}...")
# 1. Save the model
joblib.dump(model_to_save, local_model_path)

# 2. Save the test data
np.save(local_x_test_path, X_test)
np.save(local_y_test_path, y_test)

# 3. Save the feature names
with open(local_features_path, 'w') as f:
    json.dump(feature_names_to_save, f)


# --- 4. Upload Files to GCS ---
print(f"Uploading artifacts to GCS bucket '{BUCKET_NAME}' at prefix '{gcs_model_prefix}'...")

files_to_upload = [
    ("gbs_model.joblib", local_model_path),
    ("X_test.npy", local_x_test_path),
    ("y_test.npy", local_y_test_path),
    ("feature_names.json", local_features_path)
]

gcs_paths = {}

for filename, local_path in files_to_upload:
    try:
        gcs_path = f"{gcs_model_prefix}{filename}"
        blob = bucket.blob(gcs_path)
        blob.upload_from_filename(local_path)
        gcs_paths[filename] = f"gs://{BUCKET_NAME}/{gcs_path}"
    except Exception as e:
        print(f"ERROR uploading {filename}: {e}")

print("\n--- Upload Complete ---")
print(f"Artifacts saved to GCS:")
print(f"  - Model: {gcs_paths.get('gbs_model.joblib')}")
print(f"  - X_test: {gcs_paths.get('X_test.npy')} (shape: {X_test.shape})")
print(f"  - y_test: {gcs_paths.get('y_test.npy')} (shape: {y_test.shape})")
print(f"  - Features: {gcs_paths.get('feature_names.json')} (count: {len(feature_names_to_save)})")

Saving model and evaluation data to GCS...
Staging files in /tmp/model_export...
Uploading artifacts to GCS bucket 'gdelt_raw_3_years' at prefix 'processed_data/model_artifacts/'...

--- Upload Complete ---
Artifacts saved to GCS:
  - Model: gs://gdelt_raw_3_years/processed_data/model_artifacts/gbs_model.joblib
  - X_test: gs://gdelt_raw_3_years/processed_data/model_artifacts/X_test.npy (shape: (147, 24))
  - y_test: gs://gdelt_raw_3_years/processed_data/model_artifacts/y_test.npy (shape: (147,))
  - Features: gs://gdelt_raw_3_years/processed_data/model_artifacts/feature_names.json (count: 24)

Cleaned up temporary directory: /tmp/model_export
