# Fit the SVM model

In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from glob import glob
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler

# Paths
input_folder = "data/processed/stock_data"
summary_folder = "summary"
model_folder = os.path.join(summary_folder, "svm")  # Changed from "random_forest"
os.makedirs(model_folder, exist_ok=True)

summary_results = []

for file_path in glob(os.path.join(input_folder, "*.csv")):
    stock_name = os.path.basename(file_path).replace("_features.csv", "")
    print(f"\n=== Processing {stock_name} ===")

    # Load data
    df = pd.read_csv(file_path)
    df.dropna(inplace=True)

    # Create binary target: 1 if price goes up tomorrow, 0 if down
    df['Direction'] = (df['Close'].shift(-1) > df['Close']).astype(int)
    df.dropna(inplace=True)

    # Features
    X = df.drop(columns=[
        "Date", "Close", "Target", "Direction",
        "High", "Low", "Open", "High_lag1", "Low_lag1", "Open_lag1"
    ])
    y = df['Direction']

    # Split
    train_size = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    # Scale features (important for SVM)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # SVM Classifier with probability estimation enabled
    svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
    svm_model.fit(X_train_scaled, y_train)
    y_pred = svm_model.predict(X_test_scaled)
    y_proba = svm_model.predict_proba(X_test_scaled)[:, 1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"SVM Accuracy: {acc:.4f}, AUC: {auc:.4f}")
    print(classification_report(y_test, y_pred))

    # Save model and scaler
    model_path = os.path.join(model_folder, f"{stock_name}_svm_model.pkl")
    scaler_path = os.path.join(model_folder, f"{stock_name}_scaler.pkl")
    joblib.dump(svm_model, model_path)
    joblib.dump(scaler, scaler_path)
    print(f"Saved model to {model_path} and scaler to {scaler_path}")

    # Append metrics to summary
    summary_results.append({
        "Stock": stock_name,
        "Accuracy": acc,
        "AUC": auc
    })

# Save summary
summary_df = pd.DataFrame(summary_results)
summary_file = os.path.join(summary_folder, "stock_data_model_summary_svm.csv")
summary_df.to_csv(summary_file, index=False)
print(f"\nSummary saved to {summary_file}")



=== Processing AAPL_daily ===
SVM Accuracy: 0.4664, AUC: 0.4872
              precision    recall  f1-score   support

           0       0.46      0.97      0.63      1040
           1       0.54      0.04      0.07      1205

    accuracy                           0.47      2245
   macro avg       0.50      0.50      0.35      2245
weighted avg       0.51      0.47      0.33      2245

Saved model to summary\svm\AAPL_daily_svm_model.pkl and scaler to summary\svm\AAPL_daily_scaler.pkl

=== Processing GE_daily ===
SVM Accuracy: 0.4966, AUC: 0.5004
              precision    recall  f1-score   support

           0       0.49      0.58      0.53      1582
           1       0.50      0.42      0.46      1614

    accuracy                           0.50      3196
   macro avg       0.50      0.50      0.49      3196
weighted avg       0.50      0.50      0.49      3196

Saved model to summary\svm\GE_daily_svm_model.pkl and scaler to summary\svm\GE_daily_scaler.pkl

=== Processing IBM_da

# Test the SVM model

In [1]:
# evaluate_svm_donchian.py
import os
import warnings
from glob import glob

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import joblib

warnings.filterwarnings("ignore")

# -------------------------------
# Config
# -------------------------------
INPUT_FOLDER   = "data/processed/stock_data"
SUMMARY_FOLDER = "summary"
MODEL_FOLDER   = os.path.join(SUMMARY_FOLDER, "svm")
RESULTS_FOLDER = "backtest_results_svm_donchian"

DONCHIAN_PERIOD = 20
CONFIDENCE_THRESHOLD = 0.0      # raise if you want to filter by higher model confidence
TEST_SPLIT = 0.2                # last 20% as test
TRADING_DAYS = 252
RISK_FREE_ANNUAL = 0.0          # set e.g. 0.03 for 3%
N_PERMUTATIONS = 500            # MC permutations per stock

os.makedirs(MODEL_FOLDER, exist_ok=True)
os.makedirs(SUMMARY_FOLDER, exist_ok=True)
os.makedirs(RESULTS_FOLDER, exist_ok=True)

# -------------------------------
# Helpers
# -------------------------------
def sortino_ratio(returns, rf_annual=0.0, periods=TRADING_DAYS):
    """Annualized Sortino: (mean - rf)/downside_std * sqrt(periods)."""
    if len(returns) == 0:
        return 0.0
    rf_daily = rf_annual / periods
    downside = returns[returns < rf_daily]
    if downside.size == 0:
        # no downside volatility -> treat as very high risk-adjusted perf
        return np.inf
    downside_std = downside.std(ddof=0)
    if downside_std == 0:
        return np.inf
    return ((returns.mean() - rf_daily) / downside_std) * np.sqrt(periods)

def sharpe_ratio(returns, rf_annual=0.0, periods=TRADING_DAYS):
    """Annualized Sharpe using population std (ddof=0) to be robust in small samples."""
    if len(returns) == 0:
        return 0.0
    std = returns.std(ddof=0)
    if std == 0:
        return 0.0
    rf_daily = rf_annual / periods
    return ((returns.mean() - rf_daily) / std) * np.sqrt(periods)

def profit_factor(returns):
    gains = returns[returns > 0].sum()
    losses = np.abs(returns[returns < 0].sum())
    if losses == 0:
        return np.inf if gains > 0 else 0.0
    return gains / losses

def donchian_breakout_strategy(df, preds, conf, donchian_period=DONCHIAN_PERIOD, conf_th=CONFIDENCE_THRESHOLD):
    """
    Long if breakout above Donchian high and model predicts up (1) with confidence > threshold.
    Short if breakout below Donchian low and model predicts down (0) with confidence > threshold.
    Positions are held until the next signal (ffill).
    """
    out = df.copy()
    out["Pred"] = preds
    out["Conf"] = np.abs(conf)

    out["Donchian_High"] = out["High"].rolling(window=donchian_period).max()
    out["Donchian_Low"]  = out["Low"].rolling(window=donchian_period).min()

    out["Long_Signal"]  = (out["Close"] > out["Donchian_High"].shift(1)) & (out["Pred"] == 1) & (out["Conf"] > conf_th)
    out["Short_Signal"] = (out["Close"] < out["Donchian_Low"].shift(1))  & (out["Pred"] == 0) & (out["Conf"] > conf_th)

    out["Position"] = 0
    out.loc[out["Long_Signal"], "Position"] = 1
    out.loc[out["Short_Signal"], "Position"] = -1
    out["Position"] = out["Position"].replace(0, np.nan).ffill().fillna(0)

    out["Market_Return"]   = out["Close"].pct_change()
    out["Strategy_Return"] = out["Position"].shift(1) * out["Market_Return"]
    out = out.dropna(subset=["Market_Return", "Strategy_Return"])
    return out

def monte_carlo_pvalue(df, preds, conf, n_permutations=N_PERMUTATIONS):
    """Permutation test on profit factor by shuffling predictions only."""
    # actual
    strat_df = donchian_breakout_strategy(df, preds, conf)
    actual_rets = strat_df["Strategy_Return"].dropna()
    if len(actual_rets) == 0:
        return 0.0, np.array([0.0]), 1.0
    actual_pf = profit_factor(actual_rets)

    rnd_pfs = np.zeros(n_permutations, dtype=float)
    for i in range(n_permutations):
        shuf_preds = np.random.permutation(preds)
        try:
            rnd_df = donchian_breakout_strategy(df, shuf_preds, conf)
            rnd_rets = rnd_df["Strategy_Return"].dropna()
            rnd_pfs[i] = profit_factor(rnd_rets) if len(rnd_rets) else 0.0
        except Exception:
            rnd_pfs[i] = 0.0

    p_val = (rnd_pfs >= actual_pf).mean()
    return actual_pf, rnd_pfs, p_val

# -------------------------------
# Main loop
# -------------------------------
all_rows = []

files = sorted(glob(os.path.join(INPUT_FOLDER, "*_features.csv")))
if not files:
    print(f"No files found in: {INPUT_FOLDER}")

for file_path in files:
    stock_name = os.path.basename(file_path).replace("_features.csv", "")
    print(f"\n=== Processing {stock_name} ===")

    try:
        df = pd.read_csv(file_path, parse_dates=["Date"], infer_datetime_format=True)
    except Exception:
        df = pd.read_csv(file_path)  # fallback if Date not parseable
    df.dropna(inplace=True)

    # binary target
    if "Close" not in df.columns or "High" not in df.columns or "Low" not in df.columns:
        print(f"  ⚠️  Missing required OHLC columns for {stock_name}; skipping.")
        continue

    df["Direction"] = (df["Close"].shift(-1) > df["Close"]).astype(int)
    df = df.dropna()

    # features
    drop_cols = [
        "Date", "Close", "Target", "Direction",
        "High", "Low", "Open", "High_lag1", "Low_lag1", "Open_lag1"
    ]
    X = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")
    y = df["Direction"]

    if len(X) < 50:
        print(f"  ⚠️  Not enough rows for {stock_name}; skipping.")
        continue

    # split (time-ordered)
    split_idx = int(len(X) * (1 - TEST_SPLIT))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    test_df = df.iloc[split_idx:].copy()

    # scale + SVM
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    svm = SVC(kernel="rbf", C=1.0, gamma="scale", probability=True, random_state=42)
    svm.fit(X_train_s, y_train)

    y_pred  = svm.predict(X_test_s)
    y_proba = svm.predict_proba(X_test_s)[:, 1]
    # confidence as distance-from-0.5 probability (symmetric)
    conf = np.abs(y_proba - 0.5) * 2.0

    # classification metrics
    acc = accuracy_score(y_test, y_pred)
    try:
        auc = roc_auc_score(y_test, y_proba)
    except ValueError:
        auc = np.nan

    print(f"  SVM Accuracy: {acc:.4f}, AUC: {auc:.4f}" if not np.isnan(auc) else f"  SVM Accuracy: {acc:.4f}")

    # backtest with Donchian + predictions
    try:
        strat_df = donchian_breakout_strategy(test_df, y_pred, conf)
    except Exception as e:
        print(f"  ❌ Strategy error for {stock_name}: {e}")
        continue

    strat_rets  = strat_df["Strategy_Return"].dropna()
    market_rets = strat_df["Market_Return"].dropna()

    if len(strat_rets) == 0:
        print(f"  ⚠️  No strategy returns for {stock_name}; skipping.")
        continue

    # performance
    total_strategy_return = (1 + strat_rets).prod() - 1
    total_market_return   = (1 + market_rets).prod() - 1
    win_rate = (strat_rets > 0).mean()

    # trades: position changes / 2
    trades = strat_df["Position"].diff().abs().fillna(0).sum() / 2

    # risk metrics
    sr = sharpe_ratio(strat_rets, rf_annual=RISK_FREE_ANNUAL, periods=TRADING_DAYS)
    sortino = sortino_ratio(strat_rets, rf_annual=RISK_FREE_ANNUAL, periods=TRADING_DAYS)

    # profit factor + permutation p-value
    pf_actual, rnd_pfs, p_val = monte_carlo_pvalue(test_df, y_pred, conf, n_permutations=N_PERMUTATIONS)

    row = {
        "stock": stock_name,
        "test_samples": len(test_df),
        "accuracy": acc,
        "auc": auc,
        "total_strategy_return": total_strategy_return,
        "total_market_return": total_market_return,
        "excess_return": total_strategy_return - total_market_return,
        "profit_factor": pf_actual,
        "p_value": p_val,
        "sharpe": sr,
        "sortino": sortino,
        "win_rate": win_rate,
        "num_trades": trades,
        "test_start": test_df["Date"].min() if "Date" in test_df.columns else np.nan,
        "test_end": test_df["Date"].max() if "Date" in test_df.columns else np.nan,
    }
    all_rows.append(row)

    # per-stock CSV
    out_stock_csv = os.path.join(RESULTS_FOLDER, f"{stock_name}_strat_timeseries.csv")
    cols_to_save = ["Date", "Close", "Position", "Market_Return", "Strategy_Return",
                    "Donchian_High", "Donchian_Low", "Pred", "Conf"]
    cols_to_save = [c for c in cols_to_save if c in strat_df.columns]
    strat_df.to_csv(out_stock_csv, index=False)
    print(f"  ✓ {stock_name}: PF={pf_actual:.3f}, Sharpe={sr:.2f}, Sortino={sortino:.2f}, p={p_val:.3f}")
    print(f"    Saved time series to {out_stock_csv}")

# -------------------------------
# Save master summary
# -------------------------------
if all_rows:
    results_df = pd.DataFrame(all_rows)
    # rank: primary by p_value asc, secondary by sharpe desc
    results_df = results_df.sort_values(by=["p_value", "sharpe"], ascending=[True, False])
    summary_csv = os.path.join(RESULTS_FOLDER, "svm_donchian_evaluation_summary.csv")
    results_df.to_csv(summary_csv, index=False)
    print(f"\nSummary saved to: {summary_csv}")

    # quick console summary
    print("\nTop 5 by lowest p-value:")
    print(results_df[["stock", "p_value", "profit_factor", "sharpe", "sortino", "total_strategy_return"]].head(5).to_string(index=False))

    print("\nTop 5 by Sharpe:")
    print(results_df.sort_values("sharpe", ascending=False)[["stock", "sharpe", "p_value", "profit_factor", "total_strategy_return"]].head(5).to_string(index=False))
else:
    print("\nNo successful evaluations to summarize.")



=== Processing AAPL_daily ===
  SVM Accuracy: 0.4664, AUC: 0.4872
  ✓ AAPL_daily: PF=0.854, Sharpe=-0.85, Sortino=-1.16, p=0.676
    Saved time series to backtest_results_svm_donchian\AAPL_daily_strat_timeseries.csv

=== Processing GE_daily ===
  SVM Accuracy: 0.4966, AUC: 0.5004
  ✓ GE_daily: PF=1.051, Sharpe=0.26, Sortino=0.35, p=0.748
    Saved time series to backtest_results_svm_donchian\GE_daily_strat_timeseries.csv

=== Processing IBM_daily ===
  SVM Accuracy: 0.5161, AUC: 0.5021
  ✓ IBM_daily: PF=0.987, Sharpe=-0.07, Sortino=-0.09, p=0.358
    Saved time series to backtest_results_svm_donchian\IBM_daily_strat_timeseries.csv

=== Processing JNJ_daily ===
  SVM Accuracy: 0.5147, AUC: 0.4946
  ✓ JNJ_daily: PF=1.123, Sharpe=0.62, Sortino=0.81, p=0.226
    Saved time series to backtest_results_svm_donchian\JNJ_daily_strat_timeseries.csv

=== Processing MSFT_daily ===
  SVM Accuracy: 0.5278, AUC: 0.5048
  ✓ MSFT_daily: PF=1.207, Sharpe=1.03, Sortino=1.42, p=0.682
    Saved time serie