Set up the path

In [5]:
# Ensure the repository root (the folder that contains "src/") is on sys.path
import sys
from pathlib import Path

CWD = Path.cwd().resolve()

# Candidates are the working directory followed by up to five of its ancestors,
# nearest first. The first one that contains "src" wins.
# If none matches, stay on CWD instead of drifting to an arbitrary ancestor, so
# an unrelated directory is never pushed to the front of sys.path.
_MAX_PARENT_LEVELS = 5
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (CWD, *list(CWD.parents)[:_MAX_PARENT_LEVELS])
        if (candidate / "src").exists()
    ),
    CWD,
)

# Insert at position 0 so the project's "src" package takes precedence; the
# membership check keeps re-running this cell from adding duplicates.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)
print("Has src?   :", (PROJECT_ROOT / "src").exists())


Project root: /Users/corentinperdrizet/Documents/Finance/deep-learning-market-prediction
Has src?   : True


Imports

In [6]:
# Core data and baseline imports
# These "src.*" imports resolve only when the folder containing "src/" is on sys.path.
from src.data.dataset import prepare_dataset

# Optional config bridge (if present)
# NOTE(review): `except Exception` catches more than a missing module - any error
# raised while importing config_bridge also leaves make_default_cfg set to None.
try:
    from src.data.config_bridge import make_default_cfg
except Exception:
    make_default_cfg = None

# Optional price loader (if present)
# Same best-effort pattern: on any import-time failure load_prices is None and
# later code must check for that before calling it.
try:
    from src.data.loaders import load_prices
except Exception:
    load_prices = None

from src.training.evaluate import run_baselines


Build a config and prepare a dataset

In [7]:
# Assemble the pipeline configuration.
# - Without the project's make_default_cfg helper, fall back to a minimal
#   hand-written dict of defaults.
# - With it, delegate to the helper so the notebook follows the codebase.
if make_default_cfg is None:
    cfg = {
        "ticker": "BTC-USD",
        "interval": "1d",
        "label_type": "direction",
        "horizon": 1,
        "seq_len": 64,
        "scaler": "standard",
        "val_start": None,
        "test_start": None,
    }
else:
    cfg = make_default_cfg(ticker="BTC-USD", interval="1d")

# prepare_dataset takes the config as an explicit argument.
D = prepare_dataset(cfg)
D_keys = [*D.keys()]
print("Dataset keys:", D_keys)
print("Meta:", D.get("meta", {}))


[Quality] Missing per col:
 Price      Ticker 
Adj Close  BTC-USD    0
Close      BTC-USD    0
High       BTC-USD    0
Low        BTC-USD    0
Open       BTC-USD    0
Volume     BTC-USD    0
dtype: int64
[Quality] Duplicate timestamps: 0
Dataset keys: ['X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test', 'features', 'idx', 'meta']
Meta: {'ticker': 'BTC-USD', 'interval': '1d', 'label_type': 'direction', 'horizon': 1, 'seq_len': 64}


  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


Load prices (optional) and coerce to a Series with a DatetimeIndex

In [8]:
import pandas as pd

px = None

# First choice: the project's price loader, when it could be imported.
# Ticker and interval come from the dataset metadata, with defaults if absent.
if load_prices is not None:
    try:
        meta = D.get("meta", {})
        ticker = meta.get("ticker", "BTC-USD")
        interval = meta.get("interval", "1d")
        px = load_prices(ticker, interval)
    except Exception as e:
        print("[notebook] load_prices failed:", e)

# Second choice: a price-like entry stored directly in the dataset dict.
# The first matching key wins; a failing lookup is skipped (best effort).
if px is None:
    for key in ("prices", "px", "close", "close_price"):
        if key not in D:
            continue
        try:
            px = D[key]
        except Exception:
            continue
        break

def coerce_to_series_datetimeindex(prices):
    """Coerce input to a pd.Series with a DatetimeIndex if possible; else return None.

    Parameters
    ----------
    prices : pd.Series, pd.DataFrame or None
        A Series is used directly. For a DataFrame, the first of "Close",
        "Adj Close", "close", "adj_close" present in the columns is selected,
        falling back to the first column. Any other type yields None.

    Returns
    -------
    pd.Series or None
        A Series indexed by a DatetimeIndex. A Series that already has a
        DatetimeIndex is returned as-is; otherwise a copy is returned so the
        caller's object is never modified. None is returned when the input is
        None, of an unsupported type, has no columns, or when its index cannot
        be converted with pd.to_datetime.
    """
    if prices is None:
        return None

    if isinstance(prices, pd.Series):
        if isinstance(prices.index, pd.DatetimeIndex):
            return prices
        # Copy before re-indexing so the caller's Series is left untouched.
        series = prices.copy()
    elif isinstance(prices, pd.DataFrame):
        if prices.shape[1] == 0:
            # Nothing to select from; iloc[:, 0] would raise IndexError.
            return None
        for col in ("Close", "Adj Close", "close", "adj_close"):
            if col in prices.columns:
                selected = prices[col]
                break
        else:
            # Fallback to first column
            selected = prices.iloc[:, 0]
        if isinstance(selected, pd.DataFrame):
            # MultiIndex or duplicated column labels make a label lookup return
            # a sub-frame; reduce it to its first column to honour the
            # "returns a Series" contract.
            if selected.shape[1] == 0:
                return None
            selected = selected.iloc[:, 0]
        series = selected.copy()
        if series.name is None:
            series.name = "close"
    else:
        return None

    if not isinstance(series.index, pd.DatetimeIndex):
        try:
            series.index = pd.to_datetime(series.index)
        except Exception:
            # Best-effort contract: an unparseable index means "no usable series".
            return None
    return series

# Normalise whatever was loaded; the result is either a Series or None.
px = coerce_to_series_datetimeindex(px)
# Last expression of the cell: evaluates to the first rows when a series is
# available; otherwise print() emits the notice and the expression evaluates to None.
px.head() if px is not None else print("[notebook] No valid price series available; SMA baseline will be skipped.")


Date
2014-09-17    457.334015
2014-09-18    424.440002
2014-09-19    394.795990
2014-09-20    408.903992
2014-09-21    398.821014
Name: BTC-USD, dtype: float64

Run baselines and display

In [9]:
# Evaluate the baseline models on the prepared dataset.
# use_xgb stays False because XGBoost is optional; switch it to True if the
# package is installed.
df = run_baselines(
    D,
    prices=px,
    use_xgb=False,
    pooling_lr="last",
)
df


Unnamed: 0,model,split,accuracy,f1_pos,roc_auc,pr_auc,brier,pr_mean_precision
0,buy_hold,val,0.452991,0.623529,0.5,0.452991,0.252655,0.726496
1,buy_hold,test,0.509298,0.67488,0.5,0.509298,0.250094,0.754649
2,logreg[last],val,0.42735,0.48062,0.452241,0.429304,0.254644,0.414148
3,logreg[last],test,0.512397,0.594502,0.527477,0.538758,0.249315,0.535516
4,sma_50_200,val,0.547009,0.0,0.5,0.452991,0.452991,0.726496
5,sma_50_200,test,0.490702,0.0,0.5,0.509298,0.509298,0.754649


Save results

In [None]:
import pandas as pd

# Write the baseline metrics to <project root>/data/artifacts/ as CSV.
out = PROJECT_ROOT.joinpath("data", "artifacts", "baselines_metrics.csv")
# Create any missing parent folders; an existing folder is not an error.
out.parent.mkdir(exist_ok=True, parents=True)
# index=False leaves the row index out of the file.
pd.DataFrame(df).to_csv(out, index=False)
print("Saved:", out)
