This repository has been archived by the owner on Sep 26, 2019. It is now read-only.

simulation tests: kurtosis & skewness
filangelos committed May 24, 2018
1 parent bd91437 commit c74c4fa
Showing 8 changed files with 188 additions and 231 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -148,4 +148,7 @@ www
/assets/

# cached data
/db/
/db/

# NumPy objects
*.npy
222 changes: 17 additions & 205 deletions log/week_8.ipynb

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions qtrader/envs/data_loader.py
@@ -55,7 +55,8 @@ def _csv(cls,
"""
df = pd.read_csv(root, index_col='Date',
parse_dates=True).sort_index(ascending=True)
return df[tickers]
union = [ticker for ticker in tickers if ticker in df.columns]
return df[union]

@classmethod
def Returns(cls,
@@ -78,10 +79,7 @@ def Returns(cls,
Table of Returns of Adjusted Close prices for `tickers`
"""
if isinstance(csv, str):
df = pd.DataFrame.from_dict(
{ticker: cls._csv(csv, ticker)
for ticker in tickers}).loc[start_date:end_date]
return df
return cls._csv(csv, tickers).loc[start_date:end_date]
else:
return cls.Prices(tickers,
start_date,
@@ -109,7 +107,7 @@ def Prices(cls,
Table of Adjusted Close prices for `tickers`
"""
if isinstance(csv, str):
df = cls._csv(csv, tickers).loc[start_date:end_date]
return cls._csv(csv, tickers).loc[start_date:end_date]
else:
# tmp dictionary of pandas.Series
data = {}
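
For context, a minimal usage sketch of the updated loader: the CSV path mirrors scripts/db.py below and the ticker list is hypothetical. The new union filter in _csv keeps only the tickers that actually appear as columns of the cached CSV, so unknown symbols are silently dropped.

from qtrader.envs.data_loader import Finance

# hypothetical ticker list; 'NOT_A_TICKER' is not a column of the cached CSV
tickers = ['AAPL', 'MSFT', 'NOT_A_TICKER']
returns = Finance.Returns(tickers, csv='db/returns.csv')
# only the tickers found in the CSV survive the union filter
print(returns.columns.tolist())
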
145 changes: 127 additions & 18 deletions qtrader/simulation/tests/moments.py
@@ -1,5 +1,6 @@
from qtrader.utils.numpy import eps
from qtrader.simulation.tests.base import Test
from qtrader.utils import uuid

import numpy as np

@@ -11,46 +12,64 @@ class Moments(Test):

@classmethod
def run(cls, df_1, df_2, tolerance=0.1, render=False):
# generate UUID for render
cls.UUID = uuid()
# first order moment test
first_order = cls._first_order_test(
df_1, df_2, tolerance, render)
# second order moment test
second_order = cls._second_order_test(
df_1, df_2, tolerance, render)
# third order moment test
third_order = cls._third_order_test(
df_1, df_2, tolerance, render)
# forth order moment test
forth_order = cls._forth_order_test(
df_1, df_2, tolerance, render)
# hypothesis test: accept if all tests passed
hypothesis = first_order[0] and second_order[0]
hypothesis = first_order[0] and second_order[0] and third_order[0] and forth_order[0]
# summary of tests and scores
info = {'first': first_order, 'second': second_order}
info = {'first': first_order, 'second': second_order,
'third': third_order, 'forth': forth_order}
return hypothesis, info
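
For reference, a minimal sketch of how the extended test might be invoked on two families of returns. The DataFrames below are hypothetical stand-ins with matching columns, which the assertions in each sub-test require; note that the mean and skewness scores are relative deviations, so they can be large whenever the underlying moment is close to zero, and the hypothesis is not guaranteed to hold even for identically distributed samples.

import numpy as np
import pandas as pd

from qtrader.simulation.tests.moments import Moments

rng = np.random.RandomState(0)
columns = ['A', 'B', 'C']
# two hypothetical families of daily returns sharing the same columns
df_1 = pd.DataFrame(rng.normal(1e-3, 1e-2, size=(1000, 3)), columns=columns)
df_2 = pd.DataFrame(rng.normal(1e-3, 1e-2, size=(1000, 3)), columns=columns)

hypothesis, info = Moments.run(df_1, df_2, tolerance=0.1, render=False)
# hypothesis is True only if all four moment tests pass;
# info maps 'first', 'second', 'third', 'forth' to (passed, score) pairs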

@classmethod
def _first_order_test(cls, df_1, df_2, tolerance, render):
# check dimensions consistency
assert (df_1.columns == df_2.columns).all()
# column-wise mean values
mu_1 = np.mean(df_1, axis=0)
mu_2 = np.mean(df_2, axis=0)
# relative deviation of mean values
score = np.abs(mu_1 / (mu_2+eps) - 1)
# plot distributions (kde)
# plot distributions
if render:
# random selection of assets to render
I = np.sort(np.random.choice(
df_1.shape[1], min(3, df_1.shape[1]), replace=False))
# initialize figure & axes
_, ax = plt.subplots()
# iterate over columns
for m in range(df_1.shape[1]):
fig, axes = plt.subplots(ncols=len(I), figsize=(6.4 * len(I), 4.8))
for i, m in enumerate(I):
# distribution plot of family 1
sns.distplot(df_1[:, m], label='df_1::%d' % m, ax=ax)
# iterate over columns
for m in range(df_2.shape[1]):
sns.distplot(df_1.iloc[:, m], label='df_1::%s' %
df_1.columns[m], color='g', norm_hist=True, ax=axes[i])
# distribution plot of family 2
sns.distplot(df_2[:, m], label='df_2::%d' % m, ax=ax)
# plot settings
ax.legend(ncol=2)
ax.set_title('Distributions')
ax.set_ylabel('Frequency')
sns.distplot(df_2.iloc[:, m], label='df_2::%s' %
df_1.columns[m], color='r', norm_hist=True, ax=axes[i])
for i, ax in enumerate(axes):
# plot settings
ax.legend()
ax.set_title('%s: Distributions' % df_1.columns[I[i]])
ax.set_ylabel('Frequency')
fig.savefig('assets/tmp/first_order_%s.pdf' % cls.UUID)
# fig.show()
# threshold score
return (score < tolerance).all(), score
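
As a worked example of the first-order score with hypothetical column means (eps stands in for qtrader.utils.numpy.eps):

import numpy as np

eps = np.finfo(float).eps  # stand-in for qtrader.utils.numpy.eps

mu_1 = np.array([0.0010, 0.0005])  # hypothetical column means, family 1
mu_2 = np.array([0.0011, 0.0004])  # hypothetical column means, family 2
score = np.abs(mu_1 / (mu_2 + eps) - 1)
# score is approximately [0.091, 0.25]: the first column is within a 0.1
# tolerance, the second is not, so (score < tolerance).all() is False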

@classmethod
def _second_order_test(cls, df_1, df_2, tolerance, render):
# check dimensions consistency
assert (df_1.columns == df_2.columns).all()
# covariance matrix of family 1
cov_1 = np.cov(df_1.T)
# frobenius norm of family 1
@@ -60,21 +79,111 @@ def _second_order_test(cls, df_1, df_2, tolerance, render):
# frobenius norm of family 2
fro_2 = np.linalg.norm(cov_2, ord='fro')
# relative deviation of covariances
# normalised by frobenius norms
# normalized by frobenius norms
score = np.linalg.norm(cov_1 - cov_2, ord='fro') / \
(np.sqrt(fro_1) * np.sqrt(fro_2) + eps)
# plot covariance matrices
if render:
# random selection of assets to render
I = np.sort(np.random.choice(
df_1.shape[1], min(3, df_1.shape[1]), replace=False))
# fetch sub-matrices
sub_cov_1 = cov_1[np.ix_(I, I)]
sub_cov_2 = cov_2[np.ix_(I, I)]
# initialize figure & axes
fig, axes = plt.subplots(ncols=3, figsize=(19.2, 4.8))
# family 1
sns.heatmap(cov_1, ax=axes[0])
sns.heatmap(sub_cov_1, xticklabels=I, yticklabels=I, ax=axes[0])
axes[0].set_title('Covariance Matrix: Series 1')
# family 2
sns.heatmap(cov_2, ax=axes[1])
sns.heatmap(sub_cov_2, xticklabels=I, yticklabels=I, ax=axes[1])
axes[1].set_title('Covariance Matrix: Series 2')
# absolute difference
sns.heatmap(np.abs(cov_1 - cov_2), ax=axes[2])
sns.heatmap(np.abs(sub_cov_1 - sub_cov_2),
xticklabels=I, yticklabels=I, ax=axes[2])
axes[2].set_title('Covariance Matrix: Absolute Difference')
fig.savefig('assets/tmp/second_order_%s.pdf' % cls.UUID)
# fig.show()
# threshold score
return score < tolerance, score
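
A standalone sketch of the second-order score on synthetic data. Dividing the Frobenius norm of the covariance difference by the geometric mean of the two Frobenius norms makes the score invariant (up to the eps guard) to a common rescaling of both families.

import numpy as np

eps = np.finfo(float).eps  # stand-in for qtrader.utils.numpy.eps
rng = np.random.RandomState(0)

# two hypothetical families of returns with similar covariance structure
X_1 = rng.multivariate_normal([0, 0], [[1.0, 0.3], [0.3, 1.0]], size=5000)
X_2 = rng.multivariate_normal([0, 0], [[1.0, 0.3], [0.3, 1.0]], size=5000)

cov_1, cov_2 = np.cov(X_1.T), np.cov(X_2.T)
fro_1, fro_2 = np.linalg.norm(cov_1, ord='fro'), np.linalg.norm(cov_2, ord='fro')
score = np.linalg.norm(cov_1 - cov_2, ord='fro') / \
    (np.sqrt(fro_1) * np.sqrt(fro_2) + eps)
# score is a single scalar; the test passes when score < tolerance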

@classmethod
def _third_order_test(cls, df_1, df_2, tolerance, render):
# check dimensions consistency
assert (df_1.columns == df_2.columns).all()
# skewness of family 1
skew_1 = df_1.skew()
# skewness of family 2
skew_2 = df_2.skew()
# relative deviation of skewness
score = np.abs(skew_1 / (skew_2+eps) - 1)
# plot mean-median deviations
if render:
# random selection of assets to render
I = np.sort(np.random.choice(
df_1.shape[1], min(3, df_1.shape[1]), replace=False))
# initialize figure & axes
fig, axes = plt.subplots(ncols=len(I), figsize=(6.4 * len(I), 4.8))
for i, m in enumerate(I):
# distribution plot of family 1
sns.distplot(df_1.iloc[:, m], label='df_1::%s' %
df_1.columns[m], color='g', hist=False, norm_hist=True, ax=axes[i])
# vertical line for mean of family 1
axes[i].vlines(df_1.iloc[:, m].mean(), 0, 1e20, label='df_1::mean::%s' %
df_1.columns[m], color='g', linestyles='-')
# vertical line for median of family 1
axes[i].vlines(df_1.iloc[:, m].median(), 0, 1e20, label='df_1::median::%s' %
df_1.columns[m], color='g', linestyles='-.')
for i, m in enumerate(I):
# distribution plot of family 2
sns.distplot(df_2.iloc[:, m], label='df_2::%s' %
df_1.columns[m], color='r', hist=False, norm_hist=True, ax=axes[i])
# vertical line for mean of family 2
axes[i].vlines(df_2.iloc[:, m].mean(), 0, 1e20, label='df_2::mean::%s' %
df_2.columns[m], color='r', linestyles='-')
# vertical line for median of family 2
axes[i].vlines(df_2.iloc[:, m].median(), 0, 1e20, label='df_2::median::%s' %
df_2.columns[m], color='r', linestyles='-.')
for i, ax in enumerate(axes):
# plot settings
ax.legend()
ax.set_title('%s: Distributions' % df_1.columns[I[i]])
ax.set_ylabel('Frequency')
fig.savefig('assets/tmp/third_order_%s.pdf' % cls.UUID)
# fig.show()
# threshold score
return (score < tolerance).all(), score
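
DataFrame.skew() returns the bias-adjusted sample skewness per column, so the third-order score applies the same relative-deviation pattern to skewness. A short sketch with hypothetical data where the two families clearly differ:

import numpy as np
import pandas as pd

eps = np.finfo(float).eps  # stand-in for qtrader.utils.numpy.eps
rng = np.random.RandomState(1)

# hypothetical families: a right-skewed lognormal sample vs. a symmetric Gaussian one
df_1 = pd.DataFrame({'A': rng.lognormal(size=2000)})
df_2 = pd.DataFrame({'A': rng.normal(size=2000)})

score = np.abs(df_1.skew() / (df_2.skew() + eps) - 1)
# the lognormal sample is strongly skewed while the Gaussian one is not,
# so the score far exceeds any reasonable tolerance and the test fails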

@classmethod
def _forth_order_test(cls, df_1, df_2, tolerance, render):
# check dimensions consistency
assert (df_1.columns == df_2.columns).all()
# kurtosis of family 1
kurt_1 = df_1.kurt()
# kurtosis of family 2
kurt_2 = df_2.kurt()
# relative deviation of kurtosis
score = np.abs(kurt_1 / (kurt_2+eps) - 1)
# plot distributions with log-scaled y-axis
if render:
# random selection of assets to render
I = np.sort(np.random.choice(
df_1.shape[1], min(3, df_1.shape[1]), replace=False))
# initialize figure & axes
fig, axes = plt.subplots(ncols=len(I), figsize=(6.4 * len(I), 4.8))
for i, m in enumerate(I):
# distribution plot of family 1
sns.distplot(df_1.iloc[:, m], label='df_1::%s' %
df_1.columns[m], color='g', hist_kws={'log': True}, norm_hist=True, ax=axes[i])
# distribution plot of family 2
sns.distplot(df_2.iloc[:, m], label='df_2::%s' %
df_2.columns[m], color='r', hist_kws={'log': True}, norm_hist=True, ax=axes[i])
for i, ax in enumerate(axes):
# plot settings
ax.legend()
ax.set_title('%s: Distributions' % df_1.columns[I[i]])
ax.set_ylabel('Frequency')
fig.savefig('assets/tmp/forth_order_%s.pdf' % cls.UUID)
# fig.show()
# threshold score
return (score < tolerance).all(), score
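
DataFrame.kurt() returns excess kurtosis (Fisher's definition, roughly zero for a Gaussian sample), which is what the fourth-order score compares. A short sketch with hypothetical heavy-tailed data:

import numpy as np
import pandas as pd

eps = np.finfo(float).eps  # stand-in for qtrader.utils.numpy.eps
rng = np.random.RandomState(2)

# hypothetical families: heavy-tailed Student-t returns vs. Gaussian returns
df_1 = pd.DataFrame({'A': rng.standard_t(5, size=5000)})
df_2 = pd.DataFrame({'A': rng.normal(size=5000)})

score = np.abs(df_1.kurt() / (df_2.kurt() + eps) - 1)
# the t(5) sample has large excess kurtosis while the Gaussian one is near zero,
# so the score is large and the fourth-order test rejects the match
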
1 change: 1 addition & 0 deletions qtrader/utils/__init__.py
@@ -5,3 +5,4 @@
from qtrader.utils.preprocessor import rolling1d
from qtrader.utils.preprocessor import rolling2d
from qtrader.utils.preprocessor import softmax
from qtrader.utils.uuid import uuid
5 changes: 5 additions & 0 deletions qtrader/utils/uuid.py
@@ -0,0 +1,5 @@
import uuid as _uuid


def uuid():
return _uuid.uuid4().hex
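
The helper simply wraps uuid.uuid4(); each call returns a fresh 32-character hexadecimal string that the moment tests use to tag the figures they save, e.g.:

from qtrader.utils import uuid

tag = uuid()       # fresh 32-character hex string from uuid4()
print(len(tag))    # 32
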
3 changes: 2 additions & 1 deletion requirements.txt
@@ -13,4 +13,5 @@ rise
flask
h5py==2.8.0rc1
pydot
bs4
bs4
pyyaml
28 changes: 28 additions & 0 deletions scripts/db.py
@@ -0,0 +1,28 @@
import qtrader

import os

import pandas as pd

# fetch data
sp500, prices, returns = qtrader.envs.data_loader.Finance.SP500(
return_prices_returns=True)
# create the 'db' folder if it does not exist
if not os.path.exists('db'):
os.makedirs('db')
# store data
sp500.to_csv('db/sp500.csv')
prices.to_csv('db/prices.csv')
returns.to_csv('db/returns.csv')

# remove from scope
del sp500
del prices
del returns

# read data
sp500 = pd.read_csv('db/sp500.csv', index_col=0, header=0)
prices = qtrader.envs.data_loader.Finance.Prices(
sp500.index.tolist(), csv='db/prices.csv')
returns = qtrader.envs.data_loader.Finance.Returns(
sp500.index.tolist(), csv='db/returns.csv')
