In [5]:
# ================================
# FX Buying Price Forecast Toolkit
# SARIMAX + XGBoost + TS-CV + SHAP
# ================================

# --- 0) Imports & Toggles ---
import pathlib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Optional heavy steps
RUN_SARIMAX = True        # set False if statsmodels too slow
RUN_XGBOOST = True        # set False to skip XGB (will fallback to RF)
RUN_SHAP = True           # set False to skip SHAP
N_SPLITS = 4

In [8]:
# Try XGBoost; fallback to RandomForest
XGB_AVAILABLE, SHAP_AVAILABLE = False, False
try:
    from xgboost import XGBRegressor
    XGB_AVAILABLE = True
except Exception:
    pass

try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    pass

# SARIMAX
if RUN_SARIMAX:
    try:
        from statsmodels.tsa.statespace.sarimax import SARIMAX
    except Exception:
        RUN_SARIMAX = False
        print("SARIMAX import failed; skipping.")

In [9]:
# --- 1) Load Data ---
DATA_PATH = "datasets/cleaned/merged_fx_dataset.csv"  # change if needed
assert os.path.exists(DATA_PATH), f"File not found: {DATA_PATH}"

raw = pd.read_csv(DATA_PATH)
raw.columns = [str(c).strip() for c in raw.columns]

In [10]:
# Date handling
date_col = [c for c in raw.columns if c.lower() == "date"]
assert len(date_col) == 1, "Expected a single 'Date' column."
DATE_COL = date_col[0]

raw[DATE_COL] = pd.to_datetime(raw[DATE_COL], errors="coerce")
raw = raw.dropna(subset=[DATE_COL]).sort_values(DATE_COL).set_index(DATE_COL)

# Numeric coercion
for c in raw.columns:
    raw[c] = pd.to_numeric(raw[c], errors="coerce")

# Fill slow-moving macro (monthly/quarterly)
df = raw.ffill().bfill()

In [11]:
def make_features(data: pd.DataFrame):
    d = data.copy()
    assert "Buying" in d.columns, "Target column 'Buying' not found."

    # Lags & rolling stats of target
    lag_list = [1, 5, 10, 21, 63]
    for L in lag_list:
        d[f"Buy_lag{L}"] = d["Buying"].shift(L)

    for W in [5, 21, 63]:
        d[f"Buy_roll_mean_{W}"] = d["Buying"].shift(1).rolling(W).mean()
        d[f"Buy_roll_std_{W}"] = d["Buying"].shift(1).rolling(W).std()

    # Spreads (GH - US)
    if "GhInflationRate" in d.columns and "USInflationRate" in d.columns:
        d["Inflation_Spread"] = d["GhInflationRate"] - d["USInflationRate"]
    if "GhInterestRate" in d.columns and "USInterestRate" in d.columns:
        d["Rate_Spread"] = d["GhInterestRate"] - d["USInterestRate"]

    # Commodity log returns
    for col in ["BrentOil", "Gold", "Cocoa"]:
        if col in d.columns:
            d[f"{col}_ret"] = np.log(d[col].replace(0, np.nan)).diff()

    # Macro deltas (fast changes)
    for col in ["NIR", "NetForeignAssets", "Imports", "Exports"]:
        if col in d.columns:
            d[f"{col}_chg"] = d[col].diff()

    # Calendar
    d["Month"] = d.index.month
    d["Quarter"] = d.index.quarter
    d["Dow"] = d.index.dayofweek

    # Level exogenous that might help (will be lagged)
    base_levels = [c for c in [
        "MPR", "GhInflationRate", "USInflationRate", "GhInterestRate", "USInterestRate",
        "NIR", "NetForeignAssets", "Imports", "Exports"
    ] if c in d.columns]

    # XGBoost feature set (wide)
    feat_cols = []
    feat_cols += [f"Buy_lag{L}" for L in lag_list]
    feat_cols += [f"Buy_roll_mean_{W}" for W in [5, 21, 63]]
    feat_cols += [f"Buy_roll_std_{W}" for W in [5, 21, 63]]
    feat_cols += [c for c in ["Inflation_Spread",
                              "Rate_Spread"] if c in d.columns]
    feat_cols += [f"{c}_ret" for c in ["BrentOil",
                                       "Gold", "Cocoa"] if f"{c}_ret" in d.columns]
    feat_cols += [f"{c}_chg" for c in ["NIR", "NetForeignAssets",
                                       "Imports", "Exports"] if f"{c}_chg" in d.columns]
    feat_cols += base_levels + ["Month", "Quarter", "Dow"]

    # Shift ALL features by 1 step: features(t-1) -> predict Buying(t)
    d[feat_cols] = d[feat_cols].shift(1)

    # Target
    d["y"] = d["Buying"]

    d = d.dropna()

    # Reduced exog for SARIMAX (stable set)
    sarimax_exog = [c for c in [
        "Inflation_Spread", "Rate_Spread", "BrentOil_ret", "Gold_ret", "Cocoa_ret", "MPR", "NIR"
    ] if c in d.columns]

    return d, feat_cols, sarimax_exog

In [12]:
work, XFEATS, SARIMAX_EXOG = make_features(df)
X = work[XFEATS].copy()
y = work["y"].copy()

In [13]:
# --- 3) Helpers: Metrics & CV ---
def eval_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

In [14]:
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

records = []
pred_logs = []   # to plot last fold
models_trained = {}

In [22]:
if RUN_XGBOOST:
    if XGB_AVAILABLE:
        xgb = XGBRegressor(
            n_estimators=500, learning_rate=0.05, max_depth=5,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            random_state=42, objective="reg:squarederror"
        )
        xgb_name = "XGBoost"
    else:
        from sklearn.ensemble import RandomForestRegressor
        xgb = RandomForestRegressor(
            n_estimators=500, random_state=42, n_jobs=-1)
        xgb_name = "RandomForest(fallback)"

    fold = 0
    for tr, va in tscv.split(X):
        fold += 1
        Xtr, Xva = X.iloc[tr], X.iloc[va]
        ytr, yva = y.iloc[tr], y.iloc[va]

        xgb.fit(Xtr, ytr)
        yhat = xgb.predict(Xva)

        mae, rmse, r2 = eval_metrics(yva, yhat)
        records.append({"model": xgb_name, "fold": fold,
                       "MAE": mae, "RMSE": rmse, "R2": r2})
        pred_logs.append({"model": xgb_name, "fold": fold,
                         "dates": Xva.index, "y_true": yva, "y_pred": yhat})

    models_trained[xgb_name] = (xgb, XFEATS)

In [16]:
if RUN_SARIMAX and len(SARIMAX_EXOG) > 0:
    fold = 0
    for tr, va in tscv.split(work):
        fold += 1
        ytr, yva = y.iloc[tr], y.iloc[va]
        Xtr_ex, Xva_ex = work[SARIMAX_EXOG].iloc[tr], work[SARIMAX_EXOG].iloc[va]

        try:
            sarimax = SARIMAX(
                endog=ytr, exog=Xtr_ex,
                order=(1, 1, 1),
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            res = sarimax.fit(disp=False)
            yhat = res.forecast(steps=len(va), exog=Xva_ex)

            mae, rmse, r2 = eval_metrics(yva, yhat)
            records.append({"model": "SARIMAX(1,1,1)+exog",
                           "fold": fold, "MAE": mae, "RMSE": rmse, "R2": r2})
            pred_logs.append({"model": "SARIMAX(1,1,1)+exog", "fold": fold,
                             "dates": yva.index, "y_true": yva, "y_pred": yhat.values})
        except Exception as e:
            records.append({"model": "SARIMAX(1,1,1)+exog", "fold": fold,
                           "MAE": np.nan, "RMSE": np.nan, "R2": np.nan, "error": str(e)})

In [23]:
# --- 6) Metrics summary ---
metrics_df = pd.DataFrame(records)
summary = (
    metrics_df.groupby("model")[["MAE", "RMSE", "R2"]
                                ].mean().sort_values("RMSE")
    if not metrics_df.empty else pd.DataFrame(columns=["MAE", "RMSE", "R2"])
)
print("\n=== Cross-validated Performance (averaged) ===")
print(summary)


=== Cross-validated Performance (averaged) ===
                          MAE      RMSE        R2
model                                            
SARIMAX(1,1,1)+exog  0.736586  1.058551 -2.573688
XGBoost              0.811151  1.109448 -2.935091


In [17]:
out_dir = pathlib.Path("./fx_outputs")
out_dir.mkdir(parents=True, exist_ok=True)

In [24]:
last_fold = {}
for rec in pred_logs:
    last_fold[rec["model"]] = rec  # keep last occurrence (latest fold)

plot_paths = []
for model_name, rec in last_fold.items():
    dates = pd.to_datetime(rec["dates"])
    yt = np.asarray(rec["y_true"], float)
    yp = np.asarray(rec["y_pred"], float)

    plt.figure()
    plt.plot(dates, yt, label="Actual")
    plt.plot(dates, yp, label="Predicted")
    plt.title(f"{model_name} — Last Fold Predictions")
    plt.xlabel("Date")
    plt.ylabel("Buying")
    plt.legend()
    fp = out_dir / \
        f"{model_name.replace(' ', '_').replace('(', '').replace(')', '').replace('+', '_')}_last_fold.png"
    plt.savefig(fp, bbox_inches="tight")
    plt.close()
    plot_paths.append(str(fp))

print("\nSaved prediction plots:")
for p in plot_paths:
    print(" -", p)


Saved prediction plots:
 - fx_outputs/XGBoost_last_fold.png
 - fx_outputs/SARIMAX1,1,1_exog_last_fold.png


In [25]:
if RUN_XGBOOST:
    model_name = list(models_trained.keys())[0]
    model, feat_cols = models_trained[model_name]

    # Built-in feature importance (Top 20)
    if hasattr(model, "feature_importances_"):
        fi = pd.DataFrame(
            {"feature": feat_cols, "importance": model.feature_importances_})
        fi = fi.sort_values("importance", ascending=False)
        fi_path = out_dir / "feature_importance_top20.csv"
        fi.head(20).to_csv(fi_path, index=False)

        top = fi.head(20).iloc[::-1]
        plt.figure()
        plt.barh(top["feature"], top["importance"])
        plt.title(f"{model_name} Feature Importance (Top 20)")
        plt.xlabel("Importance")
        plt.ylabel("Feature")
        fp = out_dir / "feature_importance_top20.png"
        plt.savefig(fp, bbox_inches="tight")
        plt.close()
        print("\nFeature importance saved:", fi_path, "and", fp)

    # SHAP (optional)
    if RUN_SHAP and SHAP_AVAILABLE and XGB_AVAILABLE and model_name.startswith("XGBoost"):
        sample_n = min(2000, X.shape[0])
        X_sample = X.sample(sample_n, random_state=42)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer(X_sample)

        plt.figure()
        shap.summary_plot(shap_values, X_sample, plot_type="bar", show=False)
        fp = out_dir / "shap_summary_bar.png"
        plt.savefig(fp, bbox_inches="tight")
        plt.close()
        print("SHAP bar summary saved:", fp)


Feature importance saved: fx_outputs/feature_importance_top20.csv and fx_outputs/feature_importance_top20.png
SHAP bar summary saved: fx_outputs/shap_summary_bar.png


In [26]:
# --- 9) Save full metrics & per-timestamp predictions ---
metrics_csv = out_dir / "fx_model_metrics_all_folds.csv"
metrics_df.to_csv(metrics_csv, index=False)

pred_rows = []
for rec in pred_logs:
    for dt, yt, yp in zip(rec["dates"], rec["y_true"], rec["y_pred"]):
        pred_rows.append({
            "model": rec["model"],
            "fold": rec["fold"],
            "date": pd.to_datetime(dt),
            "y_true": float(yt),
            "y_pred": float(yp)
        })
pred_df = pd.DataFrame(pred_rows).sort_values(["model", "fold", "date"])
pred_csv = out_dir / "fx_model_predictions.csv"
pred_df.to_csv(pred_csv, index=False)

print("\n=== Files saved ===")
print(" - Metrics (all folds):", metrics_csv)
print(" - Predictions (all folds):", pred_csv)


=== Files saved ===
 - Metrics (all folds): fx_outputs/fx_model_metrics_all_folds.csv
 - Predictions (all folds): fx_outputs/fx_model_predictions.csv


In [None]:








# --- 4) XGBoost (or RF fallback) ---



# --- 5) SARIMAX ---






# --- 7) Plots (last fold of each model) ---



# --- 8) Feature importance & SHAP (tree models) ---





KeyboardInterrupt: 

In [4]:
# ============================================================
# GradientBoostingRegressor for USD/GHS "Buying" (time-series)
# - Leak-safe lags/rolls
# - TimeSeriesSplit CV (MAE/RMSE/R2)
# - Last-fold Actual vs Predicted plot
# - Feature importance (CSV + bar chart)
# ============================================================

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")


DATA_PATH = "datasets/cleaned/merged_fx_dataset.csv"   # <-- change if needed
OUT_DIR = "./fx_gbr_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# ---------- 1) Load ----------
raw = pd.read_csv(DATA_PATH)
raw.columns = [str(c).strip() for c in raw.columns]
date_col = [c for c in raw.columns if c.lower() == "date"]
assert len(date_col) == 1, "Expected a single 'Date' column."
DATE = date_col[0]

raw[DATE] = pd.to_datetime(raw[DATE], errors="coerce")
raw = raw.dropna(subset=[DATE]).sort_values(DATE).set_index(DATE)

# numeric coercion + sensible fill for slow-moving series
for c in raw.columns:
    raw[c] = pd.to_numeric(raw[c], errors="coerce")
df = raw.ffill().bfill()

# ---------- 2) Features (NO LEAKAGE) ----------


def make_features(data: pd.DataFrame):
    d = data.copy()
    if "Buying" not in d.columns:
        raise ValueError("Target 'Buying' not found.")

    # target lags
    for L in [1, 5, 10, 21]:
        d[f"Buy_lag{L}"] = d["Buying"].shift(L)

    # rolling stats (shifted to avoid leakage)
    for W in [5, 21]:
        d[f"Buy_roll_mean_{W}"] = d["Buying"].shift(1).rolling(W).mean()
        d[f"Buy_roll_std_{W}"] = d["Buying"].shift(1).rolling(W).std()

    # spreads (GH - US)
    if "GhInflationRate" in d.columns and "USInflationRate" in d.columns:
        d["Inflation_Spread"] = d["GhInflationRate"] - d["USInflationRate"]
    if "GhInterestRate" in d.columns and "USInterestRate" in d.columns:
        d["Rate_Spread"] = d["GhInterestRate"] - d["USInterestRate"]

    # commodity log returns
    for col in ["BrentOil", "Gold", "Cocoa"]:
        if col in d.columns:
            d[f"{col}_ret"] = np.log(d[col].replace(0, np.nan)).diff()

    # macro deltas
    for col in ["NIR", "NetForeignAssets", "Imports", "Exports"]:
        if col in d.columns:
            d[f"{col}_chg"] = d[col].diff()

    # calendar
    d["Month"] = d.index.month
    d["Quarter"] = d.index.quarter
    d["Dow"] = d.index.dayofweek

    feat_cols = []
    feat_cols += [f"Buy_lag{L}" for L in [1, 5, 10, 21]]
    feat_cols += [f"Buy_roll_mean_{W}" for W in [5, 21]]
    feat_cols += [f"Buy_roll_std_{W}" for W in [5, 21]]
    feat_cols += [c for c in ["Inflation_Spread",
                              "Rate_Spread"] if c in d.columns]
    feat_cols += [f"{c}_ret" for c in ["BrentOil",
                                       "Gold", "Cocoa"] if f"{c}_ret" in d.columns]
    feat_cols += [f"{c}_chg" for c in ["NIR", "NetForeignAssets",
                                       "Imports", "Exports"] if f"{c}_chg" in d.columns]
    feat_cols += ["Month", "Quarter", "Dow"]

    # shift ALL features by 1 so we predict y_t with info available at t-1
    d[feat_cols] = d[feat_cols].shift(1)

    d["y"] = d["Buying"]
    d = d.dropna()
    return d, feat_cols


work, FEATURES = make_features(df)
X, y = work[FEATURES], work["y"]

# ---------- 3) Model & CV ----------
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.9,
    random_state=42
)

tscv = TimeSeriesSplit(n_splits=3)  # expand to 4–5 if you want
rows, preds = [], []
fold = 0
for tr, va in tscv.split(X):
    fold += 1
    Xtr, Xva = X.iloc[tr], X.iloc[va]
    ytr, yva = y.iloc[tr], y.iloc[va]

    gbr.fit(Xtr, ytr)
    yhat = gbr.predict(Xva)

    mae = mean_absolute_error(yva, yhat)
    rmse = root_mean_squared_error(yva, yhat)
    r2 = r2_score(yva, yhat)

    rows.append({"model": "GradientBoostingRegressor",
                "fold": fold, "MAE": mae, "RMSE": rmse, "R2": r2})
    preds.append({"fold": fold, "dates": Xva.index, "y_true": yva,
                 "y_pred": pd.Series(yhat, index=Xva.index)})

metrics = pd.DataFrame(rows)
metrics.to_csv(os.path.join(OUT_DIR, "gbr_metrics_all_folds.csv"), index=False)
print(metrics.groupby("model")[["MAE", "RMSE", "R2"]].mean())

# ---------- 4) Plot last-fold ----------
last = preds[-1]
plt.figure()
plt.plot(last["dates"], last["y_true"].values, label="Actual")
plt.plot(last["dates"], last["y_pred"].values, label="Predicted")
plt.title("GradientBoostingRegressor — Last Fold Predictions")
plt.xlabel("Date")
plt.ylabel("Buying")
plt.legend()
plt.savefig(os.path.join(OUT_DIR, "gbr_last_fold_predictions.png"),
            bbox_inches="tight")
plt.close()

# ---------- 5) Feature importance ----------
fi = pd.DataFrame({"feature": FEATURES, "importance": gbr.feature_importances_}
                  ).sort_values("importance", ascending=False)
fi.to_csv(os.path.join(OUT_DIR, "gbr_feature_importance.csv"), index=False)

top = fi.head(20).iloc[::-1]
plt.figure()
plt.barh(top["feature"], top["importance"])
plt.title("GBR Feature Importance (Top 20)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.savefig(os.path.join(
    OUT_DIR, "gbr_feature_importance_top20.png"), bbox_inches="tight")
plt.close()

# ---------- 6) (Optional) Fit on all data & predict next step ----------
# Train on all history:
gbr.fit(X, y)

# Prepare one-step-ahead features from the most recent row (already lagged/shifted)
X_last = X.iloc[[-1]]
next_pred = gbr.predict(X_last)[0]
print(f"One-step ahead forecast (level): {next_pred:.4f}")

                                MAE     RMSE        R2
model                                                 
GradientBoostingRegressor  1.925321  2.50554 -1.540633
One-step ahead forecast (level): 14.6898
