Cell 1 (imports + DB connection)

In [1]:
from pathlib import Path

import sqlite3

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Locate the SQLite file relative to the parent of the current working
# directory (assumes the kernel is started one level below the project
# root, e.g. in a notebooks/ folder -- TODO confirm).
project_root = Path.cwd().parent
db_path = project_root / "data" / "Synthetic Dataset" / "product_analytics.db"

# sqlite3.connect() creates a new, empty database when the file is missing,
# so a wrong path would only surface later as a "no such table" error on the
# first query. Fail fast with a clear message instead.
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

conn = sqlite3.connect(db_path)

Cell 2 (base time series)

In [2]:
query = """ 

SELECT

    DATE(order_ts) AS day,

    SUM(net_revenue) AS revenue

FROM orders

GROUP BY day

ORDER BY day;

"""

# Daily revenue series, one row per calendar day present in `orders`.
ts = pd.read_sql_query(query, conn)
ts["day"] = pd.to_datetime(ts["day"])
ts = ts.set_index("day")

# Day-over-day percentage change in revenue.
ts["returns"] = ts["revenue"].pct_change()

# pct_change() yields +/-inf when the previous day's revenue is 0, and
# dropna() does not treat inf as missing. Map infinities to NaN first so
# they cannot leak into the mean/std and compounded-return calculations
# performed on this column later.
ts["returns"] = ts["returns"].replace([np.inf, -np.inf], np.nan)

# Drop rows with missing values (always the first row, which has no prior
# day to compare against). Reassign rather than mutate in place.
ts = ts.dropna()

ts.head()

Unnamed: 0_level_0,revenue,returns
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-02,1456.57,0.024152
2024-01-03,1459.21,0.001812
2024-01-04,2009.31,0.376985
2024-01-05,1429.87,-0.288378
2024-01-06,1428.31,-0.001091


Cell 3 (backtest function)

In [3]:
def backtest_ma(ts, short_w, long_w):
    """Backtest a moving-average crossover rule on a revenue series.

    The rule is "long" (position 1) on a given row when, on the previous
    row, the short rolling mean of ``revenue`` was above the long rolling
    mean; otherwise it is flat (position 0).

    Parameters
    ----------
    ts : pandas.DataFrame
        Must contain a ``revenue`` column and a ``returns`` column.
        The input frame is not modified.
    short_w : int
        Window length of the short rolling mean.
    long_w : int
        Window length of the long rolling mean.

    Returns
    -------
    tuple
        ``(sharpe, total_return)`` where ``sharpe`` is the mean of the
        strategy returns divided by their standard deviation (per row,
        not annualised) and ``total_return`` is the compounded strategy
        return over the evaluated rows.
    """
    revenue = ts["revenue"]

    # Rolling means; rows inside the warm-up of either window are dropped.
    frame = ts.assign(
        ma_short=revenue.rolling(short_w).mean(),
        ma_long=revenue.rolling(long_w).mean(),
    ).dropna()

    # 1 while the short average sits above the long one, else 0.
    frame = frame.assign(
        signal=(frame["ma_short"] > frame["ma_long"]).astype(int)
    )

    # Trade on the previous row's signal so a row's own revenue never
    # decides that row's position; the first row has no prior signal.
    frame = frame.assign(position=frame["signal"].shift(1)).dropna()

    strategy_returns = frame["position"] * frame["returns"]

    sharpe = strategy_returns.mean() / strategy_returns.std()
    total_return = (1 + strategy_returns).prod() - 1

    return sharpe, total_return

Cell 4 (define walk forward windows)

   - Train: 60% of data
   - Test: next 20%
   - Roll forward

In [4]:
# Walk-forward window sizes in rows: 60% of the series for training and
# the following 20% for testing (both truncated to whole rows).
n = ts.shape[0]
train_size, test_size = int(n * 0.6), int(n * 0.2)

(train_size, test_size)

(437, 145)

Cell 5 (first walk forward split)

In [5]:
# First walk-forward fold: the leading `train_size` rows for training and
# the `test_size` rows immediately after them for testing.
train, test = (
    ts.iloc[:train_size],
    ts.iloc[train_size:train_size + test_size],
)

# Date span of each window: (train start, train end, test start, test end).
tuple(
    bound
    for window in (train, test)
    for bound in (window.index.min(), window.index.max())
)

(Timestamp('2024-01-02 00:00:00'),
 Timestamp('2025-03-13 00:00:00'),
 Timestamp('2025-03-14 00:00:00'),
 Timestamp('2025-08-05 00:00:00'))

Cell 6 (optimize parameters on "train" only)

 - Using the same grid, but only on training data.
 - This is the only place optimization is allowed.

In [6]:
# Candidate window lengths for the moving-average grid search.
short_windows = range(3, 10)
long_windows = range(10, 30)

# Score every (short, long) pair with short < long on the training window
# only; each entry is (short, long, training Sharpe).
results = [
    (short_w, long_w, backtest_ma(train, short_w, long_w)[0])
    for short_w in short_windows
    for long_w in long_windows
    if short_w < long_w
]

opt = pd.DataFrame(results, columns=["short", "long", "sharpe"])

# Row with the highest training Sharpe.
best = opt.sort_values(by="sharpe", ascending=False).iloc[0]

best

short      8.000000
long      10.000000
sharpe     0.122522
Name: 100, dtype: float64

Cell 7 (test optimized parameters out of sample data)

In [7]:
# Evaluate the parameters chosen on the training window against the held-out
# test window. `best` holds floats, so cast back to int for the window sizes.
test_sharpe, test_return = backtest_ma(
    test,
    short_w=int(best["short"]),
    long_w=int(best["long"]),
)

(test_sharpe, test_return)

(np.float64(0.12207776178799536), np.float64(-0.9110570355993084))

This number is the truth; everything before it was a hypothesis.

Cell 8 (compare train vs test performance)

In [8]:
# Re-run the selected parameters on the training window so in-sample and
# out-of-sample metrics can be shown side by side.
train_sharpe, train_return = backtest_ma(
    train,
    short_w=int(best["short"]),
    long_w=int(best["long"]),
)

pd.Series(
    [train_sharpe, test_sharpe, train_return, test_return],
    index=["train_sharpe", "test_sharpe", "train_return", "test_return"],
)

train_sharpe    0.122522
test_sharpe     0.122078
train_return   -0.944626
test_return    -0.911057
dtype: float64

Big drop from train --> test = overfitting

Similar magnitude = robustness

Sign flip = strategy failure

Cell 9 (rolling walk forward) 
  
  - Repeat the process by rolling forward.

In [9]:
# Advance the window by one test block per fold so test windows never overlap.
step = test_size

walk_results = []

# Start offsets of every fold whose train + test window fits in the series.
# The "+ 1" keeps the final fold when its window ends exactly on the last
# row; range()'s exclusive stop would otherwise drop it.
fold_starts = range(0, n - train_size - test_size + 1, step)

for start in fold_starts:
    # Fold-local names: the `train` / `test` / `best` / `test_sharpe` /
    # `test_return` variables from the single-split cells are left untouched,
    # so re-running those cells after this one still reports the first split.
    fold_train_end = start + train_size
    fold_train = ts.iloc[start:fold_train_end]
    fold_test = ts.iloc[fold_train_end:fold_train_end + test_size]

    # optimize: grid search on this fold's training window only
    fold_grid = pd.DataFrame(
        [
            (short_w, long_w, backtest_ma(fold_train, short_w, long_w)[0])
            for short_w in short_windows
            for long_w in long_windows
            if short_w < long_w
        ],
        columns=["short", "long", "sharpe"],
    )
    fold_best = fold_grid.sort_values("sharpe", ascending=False).iloc[0]

    # test: apply the chosen parameters to the unseen window
    fold_sharpe, fold_return = backtest_ma(
        fold_test,
        int(fold_best["short"]),
        int(fold_best["long"]),
    )

    walk_results.append({
        "train_end": fold_train.index.max(),
        "test_start": fold_test.index.min(),
        "test_sharpe": fold_sharpe,
        "test_return": fold_return,
    })

# One row per fold with its out-of-sample metrics.
wf = pd.DataFrame(walk_results)

wf

Unnamed: 0,train_end,test_start,test_sharpe,test_return
0,2025-03-13,2025-03-14,0.122078,-0.911057
1,2025-08-05,2025-08-06,0.168095,2.952576


## Day 8 – Walk-Forward Validation

- Model parameters were optimized on historical training windows only.
- Performance was evaluated on strictly out-of-sample data.
- Walk-forward testing revealed how strategy performance evolves over time.
- Significant degradation from train to test indicates overfitting risk.
- Walk-forward validation provides a realistic assessment of deployability.

This methodology is essential for credible quantitative research.