In [1]:
from __future__ import division

import numpy as np
import pandas as pd
import statsmodels.api as sm
from IPython.core.display import display
from sklearn import cross_validation, svm, preprocessing

import datautils.features as features
from datautils.data_utils import get_more_data, merge_trades_and_quotes

In [2]:
import time

# Time the load: the captured run below took roughly 82 seconds.
start = time.time()
data = merge_trades_and_quotes(
    get_more_data('XLE', 2012, 1, 5, days=10, bar_width='second')
)
elapsed = time.time() - start
print(elapsed)

# Preview the first ten rows of the first element of `data`.
display(data[0].head(10))

82.4728548527


Unnamed: 0,SYM,DATE_TIME,ASK_PRICE,ASK_SIZE,BID_PRICE,BID_SIZE,SIZE,PRICE
0,XLE,2012-01-05 09:30:00.000,70.8065,15,70.76,5,,
1,XLE,2012-01-05 09:30:00.259,70.8065,15,70.76,5,130.0,70.74
2,XLE,2012-01-05 09:30:00.261,70.8065,15,70.76,5,130.0,70.74
3,XLE,2012-01-05 09:30:00.267,70.8065,15,70.76,5,100.0,70.81
4,XLE,2012-01-05 09:30:00.271,70.8065,15,70.76,5,100.0,70.87
5,XLE,2012-01-05 09:30:00.271,70.8065,15,70.76,5,1000.0,70.87
6,XLE,2012-01-05 09:30:00.273,70.8065,15,70.76,5,137.0,70.8
7,XLE,2012-01-05 09:30:00.273,70.8065,15,70.76,5,237.0,70.8
8,XLE,2012-01-05 09:30:00.291,70.8065,15,70.76,5,474.0,70.87
9,XLE,2012-01-05 09:30:00.305,70.8065,15,70.76,5,429.0,70.79


## Add Features

In [3]:
# Half-lives passed to every feature helper and to the future-return labels.
hls = [10, 40, 100]
vpin_window = pd.Timedelta(seconds=30)

feature_names = []
# `fillna` returns a NEW frame, and the feature helpers add their columns to
# the frame they are given.  Keep a reference to each processed frame so the
# added columns are not thrown away when the loop variable is rebound.
featured_days = []
for day in data:
    day = day.fillna(0)
    features.add_future_log_returns(day, label_hls=hls)
    feature_names.extend(features.add_ema(day, halflives=hls))
    feature_names.extend(features.add_dema(day, halflives=hls))
    feature_names.extend(features.add_log_return_ema(day, halflives=hls))
    feature_names.extend(features.add_price_diff(day))
    feature_names.extend(features.add_size_diff(day))
    feature_names.extend(features.add_vpin_time(day, vpin_window))
    featured_days.append(day)

# Every day contributes the same names; drop the repeats while keeping the
# first-seen order so the feature order is stable between kernel restarts.
unique_feature_names = []
for name in feature_names:
    if name not in unique_feature_names:
        unique_feature_names.append(name)
feature_names = unique_feature_names

# Concatenate the processed frames (not the untouched originals in `data`).
df = pd.concat(featured_days)


['EMA_100',
 'size_diff',
 'dEMA_100',
 'EMA_40',
 'log_returns_std_100-',
 'dEMA_10',
 'log_returns_100-',
 'log_returns_std_40-',
 'log_returns_10-',
 'dEMA_40',
 'VPIN_TIME',
 'log_returns_40-',
 'EMA_10',
 'price_diff',
 'log_returns_std_10-']

Unnamed: 0,SYM,DATE_TIME,ASK_PRICE,ASK_SIZE,BID_PRICE,BID_SIZE,SIZE,PRICE,price,log_returns,...,dEMA_100,log_returns_10-,log_returns_std_10-,log_returns_40-,log_returns_std_40-,log_returns_100-,log_returns_std_100-,price_diff,size_diff,VPIN_TIME
0,XLE,2012-01-05 09:30:00.000,70.8065,15,70.76,5,,,70.794875,0,...,0.014407,0.004685,-1.417041,0.010094,-2.677782,0.018086,-3.645322,14.874171,-0.170972,
1,XLE,2012-01-05 09:30:00.259,70.8065,15,70.76,5,130.0,70.74,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,-2.218529
2,XLE,2012-01-05 09:30:00.261,70.8065,15,70.76,5,130.0,70.74,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,-2.218529
3,XLE,2012-01-05 09:30:00.267,70.8065,15,70.76,5,100.0,70.81,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,-0.991708
4,XLE,2012-01-05 09:30:00.271,70.8065,15,70.76,5,100.0,70.87,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,-0.298288
5,XLE,2012-01-05 09:30:00.271,70.8065,15,70.76,5,1000.0,70.87,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,1.411515
6,XLE,2012-01-05 09:30:00.273,70.8065,15,70.76,5,137.0,70.8,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,1.478987
7,XLE,2012-01-05 09:30:00.273,70.8065,15,70.76,5,237.0,70.8,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,1.571905
8,XLE,2012-01-05 09:30:00.291,70.8065,15,70.76,5,474.0,70.87,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,1.700493
9,XLE,2012-01-05 09:30:00.305,70.8065,15,70.76,5,429.0,70.79,70.794875,0,...,0.014407,0.004685,,0.010094,,0.018086,,14.874171,-0.170972,1.778476


In [13]:
# Treat missing feature values as zero before scaling.
df[feature_names] = df[feature_names].fillna(0)

# Standardise every feature column to zero mean / unit standard deviation.
feature_mean = df[feature_names].mean()
feature_std = df[feature_names].std()
# A constant column has a standard deviation of 0; dividing by it would turn
# the whole column into NaN.  Dividing by 1 instead leaves it at all zeros.
feature_std = feature_std.replace(0, 1)
df[feature_names] = (df[feature_names] - feature_mean) / feature_std

display(feature_names)
# Show a preview rather than the whole frame.
display(df.head(10))

# Report the post-scaling range of each feature (vectorised min/max).
feature_min = df[feature_names].min()
feature_max = df[feature_names].max()
for feature in feature_names:
    print("%s min %s max %s" % (feature, feature_min[feature], feature_max[feature]))

['EMA_100',
 'size_diff',
 'dEMA_100',
 'EMA_40',
 'log_returns_std_100-',
 'dEMA_10',
 'log_returns_100-',
 'log_returns_std_40-',
 'log_returns_10-',
 'dEMA_40',
 'VPIN_TIME',
 'log_returns_40-',
 'EMA_10',
 'price_diff',
 'log_returns_std_10-']

Unnamed: 0,SYM,DATE_TIME,ASK_PRICE,ASK_SIZE,BID_PRICE,BID_SIZE,SIZE,PRICE,price,log_returns,...,dEMA_100,log_returns_10-,log_returns_std_10-,log_returns_40-,log_returns_std_40-,log_returns_100-,log_returns_std_100-,price_diff,size_diff,VPIN_TIME
0,XLE,2012-01-05 09:30:00.000,70.8065,15,70.7600,5,,,70.794875,0.000000,...,0.014407,0.004685,-1.417304e+00,0.010094,-2.679854e+00,0.018086,-3.652439e+00,14.874171,-0.170972,-1.407396e-17
1,XLE,2012-01-05 09:30:00.259,70.8065,15,70.7600,5,130,70.74,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,-2.218550e+00
2,XLE,2012-01-05 09:30:00.261,70.8065,15,70.7600,5,130,70.74,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,-2.218550e+00
3,XLE,2012-01-05 09:30:00.267,70.8065,15,70.7600,5,100,70.81,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,-9.917181e-01
4,XLE,2012-01-05 09:30:00.271,70.8065,15,70.7600,5,100,70.87,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,-2.982911e-01
5,XLE,2012-01-05 09:30:00.271,70.8065,15,70.7600,5,1000,70.87,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,1.411529e+00
6,XLE,2012-01-05 09:30:00.273,70.8065,15,70.7600,5,137,70.80,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,1.479001e+00
7,XLE,2012-01-05 09:30:00.273,70.8065,15,70.7600,5,237,70.80,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,1.571920e+00
8,XLE,2012-01-05 09:30:00.291,70.8065,15,70.7600,5,474,70.87,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,1.700509e+00
9,XLE,2012-01-05 09:30:00.305,70.8065,15,70.7600,5,429,70.79,70.794875,0.000000,...,0.014407,0.004685,3.944994e-17,0.010094,4.854798e-18,0.018086,-4.812636e-18,14.874171,-0.170972,1.778494e+00


EMA_100 min -2.71876246182 max 2.42231138405
size_diff min -4.46082469888 max 6.26380828922
dEMA_100 min -9.09697737032 max 4.51205077753
EMA_40 min -2.7966855997 max 2.41348433736
log_returns_std_100- min -3.65243862651 max 7.74033721834
dEMA_10 min -8.73306245156 max 7.43538310951
log_returns_100- min -9.03697651798 max 4.50010222976
log_returns_std_40- min -2.67985429614 max 7.76767802897
log_returns_10- min -8.73657657081 max 7.43826081724
dEMA_40 min -7.25968743449 max 4.94828345791
VPIN_TIME min -2.21855044723 max 2.1980461618
log_returns_40- min -7.25276014423 max 4.93433208186
EMA_10 min -2.87901009832 max 2.41523945261
price_diff min -0.269163582087 max 21.858082939
log_returns_std_10- min -1.4173041233 max 8.65399333125


In [14]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return float(numerator) / float(denominator)


def _fold_weights(clf, feature_columns):
    """Return the fitted coefficients as a features-by-class DataFrame, or None if `clf` has no `coef_`."""
    try:
        coef = clf.coef_
    except (AttributeError, ValueError):
        # Estimators without linear coefficients (e.g. a non-linear kernel).
        return None
    coef = np.atleast_2d(np.asarray(coef))
    classes = getattr(clf, 'classes_', None)
    if classes is not None and len(classes) == coef.shape[0]:
        column_names = [str(c) for c in classes]
    elif coef.shape[0] == 3:
        # Original behaviour: three rows are taken to be the classes -1, 0, 1.
        column_names = ['-1', '0', '1']
    else:
        column_names = [str(i) for i in range(coef.shape[0])]
    return pd.DataFrame(coef.T, index=feature_columns, columns=column_names)


# NOTE: this definition shadows the `cross_validation` module imported from
# sklearn at the top of the notebook.
def cross_validation(data, clf, feature_columns, ycol='label', label='label', K=5,
                     fit_method=lambda cl, X, y: cl.fit(X, y),
                     predict_method=lambda cl, X: cl.predict(X)):
    """
    Run K-fold cross validation of `clf` over contiguous, unshuffled row blocks.

    :param data: DataFrame holding the feature columns and the label column(s)
    :param clf: estimator passed to `fit_method` / `predict_method`
    :param feature_columns: list of column names used as model inputs
    :param ycol: column the model is trained on
    :param label: column the predictions are scored against
    :param K: number of folds (at least 2)
    :param fit_method: callable (clf, X, y) that fits the estimator
    :param predict_method: callable (clf, X) that returns predictions
    :return: (results, mean_weights).  `results` is a DataFrame indexed by
        run (1..K plus 'Total', the mean over folds).  `mean_weights` is the
        fold-averaged coefficient table, or None when the estimator exposes
        no `coef_`.
    :raises ValueError: if K is smaller than 2

    Metrics per fold:
    acc  fraction of test rows predicted exactly
    fpr  share of non-zero predictions that are wrong
    fnr  share of zero predictions that are wrong
    bpr  share of non-zero true labels predicted with the opposite sign
    gpr  share of non-zero true labels predicted exactly
    -1, 0, 1  number of predictions of each class
    A metric whose denominator is zero is reported as NaN.

    define false positive as predicting incorrect 1 or -1 value
    examples:
    true    predicted
    0,-1       1
    0,1       -1

    counter-examples:
    0,1,-1      0
    1           1
    -1          -1
    """
    if K < 2:
        raise ValueError("K must be at least 2 to leave rows for training")

    # Split by row position; block sizes match np.array_split on the frame.
    row_chunks = np.array_split(np.arange(len(data)), K)
    partitions = [data.iloc[rows] for rows in row_chunks]

    metric_names = ['acc', 'fpr', 'fnr', 'bpr', 'gpr', '-1', '0', '1']
    results = {'run': list(range(1, K + 1)) + ['Total']}
    for name in metric_names:
        results[name] = [None] * (K + 1)

    weights = []

    for k in range(K):
        training_data = pd.concat(partitions[:k] + partitions[(k + 1):])
        testing_data = partitions[k]
        train_x, train_y = training_data[feature_columns], training_data[ycol]
        test_x = testing_data[feature_columns]
        # Plain arrays keep every comparison below positional.
        test_y = testing_data[label].values

        fit_method(clf, train_x, train_y)
        pred_y = np.asarray(predict_method(clf, test_x))

        wrong = pred_y != test_y
        signal_pred = pred_y != 0
        signal_true = test_y != 0

        results['acc'][k] = _safe_ratio(np.sum(pred_y == test_y), np.size(test_y))
        results['fpr'][k] = _safe_ratio(np.sum(wrong & signal_pred), np.sum(signal_pred))
        results['fnr'][k] = _safe_ratio(np.sum(wrong & ~signal_pred), np.sum(~signal_pred))
        results['bpr'][k] = _safe_ratio(np.sum((pred_y * test_y) == -1), np.sum(signal_true))
        results['gpr'][k] = _safe_ratio(np.sum((pred_y == test_y) & signal_true),
                                        np.sum(signal_true))
        results['-1'][k] = np.sum(pred_y == -1)
        results['0'][k] = np.sum(pred_y == 0)
        results['1'][k] = np.sum(pred_y == 1)

        fold_weights = _fold_weights(clf, feature_columns)
        if fold_weights is not None:
            weights.append(fold_weights)

    # The final 'Total' row is the plain mean of the K per-fold values.
    for name in metric_names:
        results[name][K] = np.mean(results[name][:K])

    mean_weights = sum(weights) / len(weights) if weights else None

    return pd.DataFrame(results).set_index('run'), mean_weights

def clf_output(cv_results, y, K):
    """
    Print a plain-text report for one cross-validation run.

    :param cv_results: pair whose first item is the per-fold results table and
        whose second item is the weights table; each is printed as-is
    :param y: array-like of labels; the share of -1, 0 and 1 is reported
    :param K: number of folds, used to report the training size per fold
    :return: None
    """
    # float() keeps the ratios fractional even without `__future__.division`.
    total = float(len(y))
    class_shares = np.array([len(y[y == cls]) for cls in (-1, 0, 1)]) / total

    print("""
                                 Results
==============================================================================
    """)
    print(pd.DataFrame({'%': class_shares}, index=[-1, 0, 1]))
    # Each fold trains on the other K-1 folds.
    print(pd.DataFrame({'values': [(total / K) * (K - 1)]}, index=['Training Size / Fold']))
    print(cv_results[0])
    print(cv_results[1])
    print("==============================================================================")

## Linear SVM, 3-class, 5-fold CV, no class weights

In [16]:
# Future log returns within +/- thresh are labelled 0 (no move).
thresh = 0.000005/2
display(thresh)
hl = 100
K = 5

# Label column derived from the future log return at half-life `hl`.
pred_col = 'log_returns_%d+' % hl
df['label'] = 0
# `.loc` replaces the deprecated `.ix` indexer for boolean-mask assignment.
df.loc[df[pred_col] > thresh, 'label'] = 1
df.loc[df[pred_col] < -thresh, 'label'] = -1

2.5e-06

In [39]:
# NOTE(review): the section heading says no class weights, but the estimator
# is built with class_weight='auto' — confirm which one is intended.
clf = svm.LinearSVC(class_weight='auto')
# Use the shared K so the fold count matches what clf_output is given.
cv_results = cross_validation(df, clf, feature_names, label='label', K=K)


KeyboardInterrupt: 

In [20]:
# Pull the labels out as a plain array and print the report for the
# (results, weights) pair returned by cross_validation.
y = df['label'].values
clf_output(cv_results, y, K)


                                 Results
    
           %
-1  0.152689
 0  0.702226
 1  0.145085
                        values
Training Size / Fold  122670.4
           -1        0        1       acc       bpr       fnr       fpr  \
run                                                                       
1       727.0  14124.0  15817.0  0.424775  0.202285  0.352804  0.765111   
2      1302.0  28495.0    871.0  0.669786  0.022086  0.307387  0.629544   
3       962.0  28639.0   1067.0  0.645363  0.033658  0.332903  0.661410   
4      2316.0  27201.0   1150.0  0.689308  0.048999  0.265909  0.662147   
5       912.0  29088.0    667.0  0.797535  0.029918  0.168764  0.823306   
Total  1243.8  25509.4   3914.4  0.645353  0.067389  0.285553  0.708304   

            gpr  
run              
1      0.349523  
2      0.082311  
3      0.064952  
4      0.132511  
5      0.052169  
Total  0.136293  
                            -1         0         1
EMA_100              -0.207149  0.447237 -0

In [None]:
# Same cross-validation as above, with an RBF-kernel SVC.
clf = svm.SVC(class_weight='auto', kernel='rbf', gamma='auto')
# Use the shared K so the fold count matches what clf_output is given.
cv_results = cross_validation(df, clf, feature_names, label='label', K=K)
