<img src="http://certificate.tpq.io/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# EPAT Session 2

**Executive Program in Algorithmic Trading**

**_Vectorized Backtesting_**

Prof. Dr. Yves J. Hilpisch | The Python Quants GmbH | http://tpq.io

<a href="https://home.tpq.io/certificates/pyalgo" target="_blank"><img src="https://hilpisch.com/pyalgo_cover_shadow.png" width="300px" align="left"></a>

## Basic Imports

In [None]:
import math
import numpy as np
import pandas as pd
from pylab import plt
import cufflinks
np.set_printoptions(suppress=True)
cufflinks.set_config_file(offline=True)
plt.style.use('seaborn')
pd.set_option('mode.chained_assignment', None)
%config InlineBackend.figure_format = 'svg'

## Reading Financial Data

In [None]:
# Data source; switch the commented line to use the intraday data set instead.
url = 'http://hilpisch.com/pyalgo_eikon_eod_data.csv'  # EOD data
# url = 'http://hilpisch.com/aiif_eikon_id_data.csv'  # intraday data

In [None]:
# The first CSV column becomes the date-parsed index; rows containing any
# missing value are dropped.
raw = pd.read_csv(url, index_col=0, parse_dates=True).dropna()

In [None]:
raw.info()

In [None]:
# The first row has no predecessor, so its NaN return is dropped.
rets = np.log(raw / raw.shift(1)).dropna()  # log returns

In [None]:
rets.head().round(4)

## Regime Detection

... based on **unsupervised learning** (= only features, no labels).

Typical regimes are:

* low volatility, positive trend
* low volatility, negative trend
* high volatility, positive trend
* high volatility, negative trend

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Four clusters, one per regime listed above. A fixed random_state makes the
# cluster labels (and therefore the plot colours further down) reproducible
# across runs; n_init is given explicitly because its default value differs
# between scikit-learn versions.
model = KMeans(n_clusters=4, n_init=10, random_state=100)  # 1. step

### Features

In [None]:
symbol = '.SPX'  # column of rets to analyse

In [None]:
f = ['mom', 'vol']  # names of the feature columns used for clustering

In [None]:
data = pd.DataFrame(rets[symbol])  # log returns of the selected symbol

In [None]:
window = 20  # rolling window length, in rows

In [None]:
data['mom'] = data[symbol].rolling(window).mean()  # rolling mean of the log returns

In [None]:
data['vol'] = data[symbol].rolling(window).std()  # rolling standard deviation of the log returns

In [None]:
data.dropna(inplace=True)  # drop the leading rows where the rolling window is incomplete

In [None]:
# Standardise both features (zero mean, unit standard deviation) so that
# neither dominates the distance computation in the clustering.
data[f] = (data[f] - data[f].mean()) / data[f].std()

In [None]:
data[f].mean().round(6)  # sanity check: approximately 0 after standardisation

In [None]:
data[f].std().round(6)  # sanity check: 1 after standardisation

In [None]:
data[f].plot.scatter(x='vol', y='mom');

### Clustering

Clustering = unsupervised learning.

In [None]:
model.fit(data[f])  # 2. step

In [None]:
r = model.predict(data[f])  # 3. step -- one cluster label per row of data

In [None]:
r

In [None]:
# Same scatter plot as before, with each point coloured by its cluster label.
data[f].plot.scatter(x='vol', y='mom', c=r, cmap='coolwarm');

In [None]:
#plt.scatter?

In [None]:
# Plot the price series coloured by cluster label. Prices are selected by
# label (data.index) rather than by a positional offset, so they stay aligned
# with the labels in r no matter how many rows were dropped while building
# the features.
plt.scatter(x=data.index, y=raw.loc[data.index, symbol],
            marker='.', c=r, cmap='coolwarm');

## Advanced Trading Strategy

### Data Preprocessing

This is where research plays the major role: coming up with valuable features for predicting future market movements ("**hard work**"):

In [None]:
symbol = 'EUR='  # column of raw to trade

In [None]:
data = pd.DataFrame(raw[symbol])  # price series of the selected symbol

In [None]:
data['r'] = np.log(data[symbol] / data[symbol].shift(1))  # log returns

In [None]:
# Direction label: the sign of the log return.
# NOTE(review): np.sign yields 0 for a zero return, so 'd' can take the three
# values -1, 0 and 1 -- confirm that a "flat" class is intended.
data['d'] = np.sign(data['r'])

In [None]:
data.dropna(inplace=True)  # drop rows with NaN (the first row has no previous price)

In [None]:
data['d'] = data['d'].astype(int)  # integer class labels

In [None]:
data.head()

In [None]:
lags = 5
cols = [f'lag_{lag}' for lag in range(1, lags + 1)]  # feature column names
# Column lag_k holds the log return from k rows earlier.
for lag, col in enumerate(cols, start=1):
    # data[col] = data['d'].shift(lag)
    data[col] = data['r'].shift(lag)

In [None]:
data.head(8)  # the leading rows still contain NaN in the lag columns

In [None]:
data.dropna(inplace=True)  # drop the leading rows whose lag columns are incomplete

In [None]:
# Only relevant when the lag columns hold directions instead of returns.
# data[cols] = data[cols].astype(int)

In [None]:
data.head()

In [None]:
# Standardise every column with full-sample mean and standard deviation;
# only data_[cols] is used as model input in the cells below.
data_ = (data - data.mean()) / data.std()

In [None]:
data_.head()

In [None]:
# 2 ** lags  # number of patterns

### Model Fitting (In-Sample)

... based on **supervised learning** (= features + labels).

In [None]:
from sklearn.naive_bayes import GaussianNB  # simple baseline algorithm
from sklearn.neural_network import MLPClassifier  # deep neural network
from sklearn.metrics import accuracy_score

In [None]:
# Each of the next three cells rebinds `model`; when the cells are run in
# order, the last assignment is the one used for fitting.
model = GaussianNB()  # 1. step

In [None]:
model = MLPClassifier()  # 1. step

In [None]:
# hidden_layer_sizes=[32] requests a single hidden layer of 32 units;
# shuffle=False and a fixed random_state are passed for reproducibility.
model = MLPClassifier(hidden_layer_sizes=[32], shuffle=False,
                     random_state=100, max_iter=1000)  # 1. step

In [None]:
# Features are the standardised lag columns; labels are the unscaled directions.
model.fit(data_[cols], data['d'])  # 2. step

In [None]:
# Predictions on the very rows used for fitting, i.e. in-sample.
pred = model.predict(data_[cols])  # 3. step

In [None]:
pred

In [None]:
accuracy_score(data['d'], pred)  # in-sample accuracy

In [None]:
model.predict_proba(data_[cols])  # per-row class probabilities

In [None]:
model.predict_proba(data_[cols]).sum(axis=1)  # sanity check: each row sums to 1

### Vectorized Backtesting (In-Sample)

In [None]:
data['p'] = pred  # predicted direction, used as the position

In [None]:
data['s'] = data['r'] * data['p']  # strategy log return = market log return * position

In [None]:
data

In [None]:
# Gross performance of the market ('r') and the strategy ('s'):
# exponential of the summed log returns.
data[['r', 's']].sum().apply(np.exp)

In [None]:
# Gross performance over time.
data[['r', 's']].cumsum().apply(np.exp).plot();

### Train-Test Split

In [None]:
split = int(len(data) * 0.8)

In [None]:
train = data.iloc[:split].copy()

In [None]:
mu, std = train[cols].mean(), train[cols].std()

In [None]:
train[cols] = (train[cols] - mu) / std

In [None]:
test = data.iloc[split:].copy()

In [None]:
test[cols] = (test[cols] - mu) / std

### Training (In-Sample)

In [None]:
model.fit(train[cols], train['d'])  # fit on the training rows only

In [None]:
accuracy_score(train['d'], model.predict(train[cols]))  # accuracy on the training rows

### Testing (Out-of-Sample)

In [None]:
pred = model.predict(test[cols])  # predictions for the held-out rows

In [None]:
accuracy_score(test['d'], pred)  # out-of-sample accuracy

In [None]:
test['p'] = pred  # predicted direction, used as the position

In [None]:
test['s'] = test['r'] * test['p']  # strategy log return = market log return * position

In [None]:
# Out-of-sample gross performance of the market ('r') and the strategy ('s').
test[['r', 's']].sum().apply(np.exp)

In [None]:
# Out-of-sample gross performance over time.
test[['r', 's']].cumsum().apply(np.exp).plot();

## Risk Factors Data File

In [None]:
# The first CSV column becomes the date-parsed index.
factors = pd.read_csv('http://hilpisch.com/aiif_eikon_eod_factors.csv',
                      index_col=0, parse_dates=True)

In [None]:
factors.info()

In [None]:
# NOTE(review): normalize() is not a built-in pandas DataFrame method;
# presumably it is attached by the cufflinks import at the top of the
# notebook -- confirm before running without cufflinks.
factors.normalize().plot();

<img src="http://certificate.tpq.io/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>