# Walk Forward: A Realistic Approach to Backtesting

In [1]:
???

Object `?` not found.


![](<src/10_Table_Validation Methods.png>)

## Load the data

In [2]:
import pandas as pd

df = pd.read_excel('data/Microsoft_LinkedIn_Processed.xlsx', parse_dates=['Date'], index_col=0)
df = df.drop(columns='change_tomorrow_direction')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-08,56.325228,56.582507,55.902560,56.058762,21220800,1.549143
2016-12-09,56.214968,56.959234,56.169027,56.940857,27349400,0.321692
2016-12-12,56.803028,57.244073,56.711145,57.124622,20198100,1.286112
2016-12-13,57.427836,58.273172,57.188938,57.868881,35718900,-0.478622
2016-12-14,57.887258,58.300739,57.455399,57.593227,30352700,-0.159789
...,...,...,...,...,...,...
2023-03-09,255.820007,259.559998,251.580002,252.320007,26653400,-1.500467
2023-03-10,251.080002,252.789993,247.600006,248.589996,28321800,2.099087
2023-03-13,247.399994,257.910004,245.729996,253.919998,33339700,2.634307
2023-03-14,256.750000,261.070007,255.860001,260.790009,33620300,1.751806


## Walk Forward Validation

### How `TimeSeriesSplit` works

In [3]:
from sklearn.model_selection import TimeSeriesSplit
ts = TimeSeriesSplit(test_size=200) #200 dni 


In [4]:
list_df_train = []
list_df_test = []

for index_train, index_test in ts.split(X=df):
    list_df_train.append(df.iloc[index_train])
    list_df_test.append(df.iloc[index_test])

list_df_test[0]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-03-27,113.129993,113.446697,110.865089,112.064720,22733400,0.136831
2019-03-28,112.707721,112.842079,111.450503,112.218269,18334800,0.856386
2019-03-29,113.312351,113.552277,112.247078,113.187592,25399800,0.907406
2019-04-01,114.156889,114.310445,113.341141,114.224068,22789100,0.142634
2019-04-02,114.262456,114.665538,113.744214,114.387222,18142300,0.650166
...,...,...,...,...,...,...
2020-01-03,153.531397,155.112085,153.279252,153.822311,21116200,0.257813
2020-01-06,152.328893,154.287800,151.776127,154.219910,20813700,-0.920158
2020-01-07,154.501158,154.840563,152.561651,152.813782,21634100,1.567839
2020-01-08,154.122900,155.936349,153.172546,155.247818,27746500,1.233897


## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [5]:
y = df.change_tomorrow
X = df[['Open','High','Low','Close','Volume']]

In [6]:
list_df_train = []
list_df_test = []

for index_train, index_test in ts.split(X=df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]

### Simulate one computation of the ML model

- Compute the model
- Calculate predictions on the test set
- Evaluate how good the model is

In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)
model_dt.fit(X_train, y_train)

y_pred = model_dt.predict(X_test)
error_mse = mean_squared_error(y_test, y_pred)
error_mse

5.308276780150754

### Add the procedure inside the for loop

In [8]:
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)

error_mse_list = []

for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]

    model_dt.fit(X_train, y_train)
    y_pred = model_dt.predict(X_test)
    error_mse = mean_squared_error(y_test, y_pred)
    error_mse_list.append(error_mse)

In [9]:
error_mse_list

import numpy as np
np.mean(error_mse_list) ## error v dolarech 

np.float64(13.38250771248552)

## Anchored Walk Forward evaluation in backtesting

![](<src/10_Table_Validation Methods.png>)

### Create a new strategy

In [10]:
from backtesting import Backtest, Strategy



In [11]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5

    n_train = 600
    coef_retrain = 200

    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        X_train = self.data.iloc[:self.n_train, :-1]
        y_train = self.data.iloc[:self.n_train, -1]
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

In [12]:
class WalkForwardAnchored(Regression):
    def next(self):
        
        # we don't take any action and move on to the following day
        if len(self.data) < self.n_train:
            return
        
        # we retrain the model each x days
        if len(self.data) % self.coef_retrain == 0:
            X_train = self.data.df.iloc[:, :-1]
            y_train = self.data.df.iloc[:, -1]

            self.model.fit(X_train, y_train)

            super().next()
        else:
            super().next()

In [13]:
bt = Backtest(df, WalkForwardAnchored, cash=10000, commission=.002, exclusive_orders=True)

In [14]:
import multiprocessing as mp
mp.set_start_method('skopt')

ValueError: cannot find context for 'skopt'

In [None]:
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  cov_matrix = np.cov(equity_log_returns, market_log_returns)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


AttributeError: Column 'iloc' not in data

## Unanchored Walk Forward

### Create a library of strategies

`strategies.py`

### Create the unanchored walk forward class

![](<src/10_Table_Validation Methods.png>)

### Import the strategy and perform the backtest

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import strategies

In [None]:
strategies.WalkForwardUnanchored

In [None]:
bt_unanchored = Backtest(df, strategies.WalkForwardUnanchored, cash=10000, commission=.002, exclusive_orders=True)

stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

### Interpret the strategies' performance

- Both anchored and unanchored backtesting

## Practice to master the knowledge

Work on the challenge with another dataset:

1. Learn the <a>mental models</a> to solve the challenge faster.
2. Complete the <a href="10C_Walk Forward Regression.ipynb">notebook</a>.