In [1]:
from IPython.core.display import display
from __future__ import division
import numpy as np
import pandas as pd
from datautils.data_utils import get_more_data, merge_trades_and_quotes
import datautils.features as features

import statsmodels.api as sm
from sklearn import cross_validation, svm, preprocessing

In [5]:
data = merge_trades_and_quotes(get_more_data('XLE', 2012, 1, 5, days=10, bar_width='None'))
display(data[0].head(5))

Unnamed: 0,SYM,DATE_TIME,ASK_PRICE,ASK_SIZE,BID_PRICE,BID_SIZE,SIZE,PRICE
0,XLE,2012-01-05 09:30:00.000,70.8065,15,70.76,5,,
1,XLE,2012-01-05 09:30:00.259,70.8065,15,70.76,5,130.0,70.74
2,XLE,2012-01-05 09:30:00.261,70.8065,15,70.76,5,130.0,70.74
3,XLE,2012-01-05 09:30:00.267,70.8065,15,70.76,5,100.0,70.81
4,XLE,2012-01-05 09:30:00.271,70.8065,15,70.76,5,100.0,70.87


## Add Features

In [6]:
hls = [10, 40, 100]
vpin_window = pd.Timedelta(seconds=10)
feature_names = []
for day in data:
    features.add_future_log_returns(day, label_hls=hls)
    feature_names.extend(features.add_ema(day, halflives=hls))
    feature_names.extend(features.add_dema(day, halflives=hls))
    feature_names.extend(features.add_log_return_ema(day, halflives=hls))
    feature_names.extend(features.add_price_diff(day))
    feature_names.extend(features.add_size_diff(day))
    feature_names.extend(features.add_vpin_time(day, vpin_window))
    
day = data[0]
day = day.fillna(0)
display(day)

In [10]:
features.standardize_features(day, feature_names)
display(feature_names)

['EMA_10',
 'EMA_40',
 'EMA_100',
 'dEMA_10',
 'dEMA_40',
 'dEMA_100',
 'log_returns_10-',
 'log_returns_40-',
 'log_returns_100-',
 'log_returns_std_10-',
 'log_returns_std_40-',
 'log_returns_std_100-',
 'price_diff',
 'size_diff',
 'VPIN_TIME',
 'EMA_10',
 'EMA_40',
 'EMA_100',
 'dEMA_10',
 'dEMA_40',
 'dEMA_100',
 'log_returns_10-',
 'log_returns_40-',
 'log_returns_100-',
 'log_returns_std_10-',
 'log_returns_std_40-',
 'log_returns_std_100-',
 'price_diff',
 'size_diff',
 'VPIN_TIME',
 'EMA_10',
 'EMA_40',
 'EMA_100',
 'dEMA_10',
 'dEMA_40',
 'dEMA_100',
 'log_returns_10-',
 'log_returns_40-',
 'log_returns_100-',
 'log_returns_std_10-',
 'log_returns_std_40-',
 'log_returns_std_100-',
 'price_diff',
 'size_diff',
 'VPIN_TIME']

In [9]:
for feature in feature_names:
    print feature, "min", min(day[feature]), "max", max(day[feature])

EMA_10 min 70.0115949179 max 70.9081294146
EMA_40 min 70.0282165729 max 70.9017293638
EMA_100 min 70.0451092993 max 70.8923651018
dEMA_10 min -0.00493683294108 max 0.00378741252347
dEMA_40 min -0.00136081668663 max 0.00145226141295
dEMA_100 min -0.000750181072149 max 0.000815743842153
log_returns_10- min -7.03873382999e-05 max 5.39212274303e-05
log_returns_40- min -1.92720940793e-05 max 2.05530466644e-05
log_returns_100- min -1.06296828324e-05 max 1.16006213169e-05
log_returns_std_10- min 0.0 max 0.000223224845301
log_returns_std_40- min 0.0 max 0.000123947760689
log_returns_std_100- min 0.0 max 0.000102077284405
price_diff min 0.00999999999971 max 0.0465
size_diff min -248.0 max 160.0
VPIN_TIME min -1.0 max 1.0
EMA_10 min 70.0115949179 max 70.9081294146
EMA_40 min 70.0282165729 max 70.9017293638
EMA_100 min 70.0451092993 max 70.8923651018
dEMA_10 min -0.00493683294108 max 0.00378741252347
dEMA_40 min -0.00136081668663 max 0.00145226141295
dEMA_100 min -0.000750181072149 max 0.00081574