In [48]:
import pandas as pd
from datetime import datetime

import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

fname = 'data.csv'

def init_data(fname):
    data = pd.read_csv('data.csv')
    data.xprice = (data.xprice - data.xprice.min()) #/ data.xprice.std() 
    data.yprice = (data.yprice - data.yprice.min()) #/ data.yprice.std() 
    data['timestamp'] = data['timestamp'] // 1000
    data.timestamp = data.timestamp.apply(lambda stamp: datetime.fromtimestamp(stamp))
    data.timestamp = data.timestamp - pd.Timedelta(hours=1) # for flexibility
    data['weekday'] = data.timestamp.dt.weekday
    data['day'] = (data.timestamp.dt.date - data.timestamp.dt.date.min()).apply(lambda x: int(x.days))
    day_close_time = data.day.map(data.groupby('day').timestamp.max())
    data['periods_before_closing'] = (day_close_time - data.timestamp).apply(lambda x: x.seconds // 10) 
    return data
    
def time_split(data, valid_ratio, test_ratio):
    n_valid = max(1, int(data.shape[0] * valid_ratio))
    n_test = max(1, int(data.shape[0] * test_ratio))
    n_train = data.shape[0] - n_valid - n_test
    
    train = data.iloc[:n_train].reset_index(drop=True).copy()
    valid = data.iloc[n_train:-n_test].reset_index(drop=True).copy()
    test = data.iloc[-n_test:].reset_index(drop=True).copy()
    merged_test = valid.append(test).reset_index(drop=True)
    print('Data shapes: ', train.shape, valid.shape, test.shape)
    return train, valid, test, merged_test

In [36]:
def add_diffs(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_diff_{}'.format(column, lag)
        df.loc[:, colname] = df[column].diff(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_shifts(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_lag_{}'.format(column, lag)
        df.loc[:, colname] = df[column].shift(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_norm(df, column):
    colname = '{}_norm'.format(column)
    df.loc[:, colname] = (data[column] - data[column].mean()) / data[column].std()
    return colname

def add_rolling_mean(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ma_{}'.format(column, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_curstom_rolling_operation(df, column, agg_function, function_name, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_{}_{}'.format(column, function_name, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).agg(agg_function)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def rsiFunc(prices, n=14):
    deltas = np.diff(prices)
    seed = deltas[:n+1]
    up = seed[seed>=0].sum()/n
    down = -seed[seed<0].sum()/n
    rs = up/down
    rsi = np.zeros_like(prices)
    rsi[:n] = 100. - 100./(1.+rs)

    for i in range(n, len(prices)):
        delta = deltas[i-1] # cause the diff is 1 shorter

        if delta>0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up*(n-1) + upval)/n
        down = (down*(n-1) + downval)/n

        rs = up/down
        rsi[i] = 100. - 100./(1.+rs)

    return rsi

def add_rsi(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_rsi_{}'.format(column, window_size)
        df.loc[:, colname] = rsiFunc(df[column].values, window_size)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def add_ewma(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ewma_{}'.format(column, window_size)
        df.loc[:, colname] = pd.Series.ewm(df[column], span=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns 


def add_hand_feats(df):
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_opening'] = (df.day.map(open_price_per_day) - df.yprice)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_opening'] = (df.day.map(open_price_per_day) - df.xprice)
    new_columns = ['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
    print(new_columns)
    return new_columns


In [37]:
data = init_data(fname)
data.head()

Unnamed: 0,timestamp,xprice,yprice,returns,weekday,day,periods_before_closing
0,2013-01-03 20:05:00,12.3,22.075,0.3125,3,0,1409
1,2013-01-03 20:05:10,12.35,22.125,0.275,3,0,1408
2,2013-01-03 20:05:20,12.325,22.1625,0.25,3,0,1407
3,2013-01-03 20:05:30,12.3,22.1875,0.2375,3,0,1406
4,2013-01-03 20:05:40,12.3,22.1875,0.325,3,0,1405


- 6 - 1min
- 60 - 10min
- 360 - 1hour
- 1410 - 1workday (~ 4 hours per day)
- 7050 - 1workweek (5 days per week)
- 28200 - 1 workmonth (~ 4 weeks per month)

In [72]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

standart_calendar_lags = [6, 60, 360, 1410, 7050, 14100, 28200, 42300]
qazy_calendar_lags = [6, 60, 360, 720, 1410, 2820, 7050, 14100, 28200, 42300]
day_lags = 1410 * np.arange(1, 75)
valid_ratio = 0.05
test_ratio = 0.05

In [73]:
data = init_data(fname)
usecols = ['xprice', 'yprice']

yma_cols = add_curstom_rolling_operation(data, 'yprice', np.mean, 'ma', qazy_calendar_lags)    
usecols.extend(yma_cols)
xma_cols = add_curstom_rolling_operation(data, 'xprice', np.mean, 'ma', qazy_calendar_lags)
usecols.extend(xma_cols)

yma_cols = add_curstom_rolling_operation(data, 'yprice', np.max, 'max', qazy_calendar_lags)    
usecols.extend(yma_cols)
xma_cols = add_curstom_rolling_operation(data, 'xprice', np.max, 'max', qazy_calendar_lags)
usecols.extend(xma_cols)

yma_cols = add_curstom_rolling_operation(data, 'yprice', np.max, 'min', qazy_calendar_lags)    
usecols.extend(yma_cols)
xma_cols = add_curstom_rolling_operation(data, 'xprice', np.max, 'min', qazy_calendar_lags)
usecols.extend(xma_cols)

yma_cols = add_curstom_rolling_operation(data, 'yprice', np.median, 'median', qazy_calendar_lags)    
usecols.extend(yma_cols)
xma_cols = add_curstom_rolling_operation(data, 'xprice', np.median, 'median', qazy_calendar_lags)
usecols.extend(xma_cols)


yma_cols = add_ewma(data, 'yprice', qazy_calendar_lags)    
usecols.extend(yma_cols)
xma_cols = add_ewma(data, 'xprice', qazy_calendar_lags)
usecols.extend(xma_cols)

yma_cols = add_shifts(data, 'yprice', [1,4,6])
usecols.extend(yma_cols)
xma_cols = add_shifts(data, 'xprice', [1,4,6])
usecols.extend(xma_cols)

#[6, 60, 600, 6000]
yma_cols = add_curstom_rolling_operation(data, 'yprice', 'skew', 'skew', [6, 60, 600, 6000, 14100]) 
for col in yma_cols:
    data[col].fillna(0, inplace=True)
usecols.extend(yma_cols)
xma_cols = add_curstom_rolling_operation(data, 'xprice', 'skew', 'skew', [6, 60, 600, 6000, 14100])
for col in xma_cols:
    data[col].fillna(0, inplace=True)
usecols.extend(xma_cols)


train, valid, test, merged_test = time_split(data, valid_ratio, test_ratio)
train.dropna(inplace=True)

trivial_solution = np.ones_like(test.returns.values) * test.returns.mean()

print('Zero Prediction MSE: \t {:.5}'.format(np.mean(np.square(test.returns.values))))
print('Mean Prediction MSE: \t {:.5}'.format(mean_squared_error(test.returns, trivial_solution)))
print('Mean Prediction R2: \t {:.5}'.format(r2_score(test.returns, trivial_solution)))

['yprice_ma_6', 'yprice_ma_60', 'yprice_ma_360', 'yprice_ma_720', 'yprice_ma_1410', 'yprice_ma_2820', 'yprice_ma_7050', 'yprice_ma_14100', 'yprice_ma_28200', 'yprice_ma_42300']
['xprice_ma_6', 'xprice_ma_60', 'xprice_ma_360', 'xprice_ma_720', 'xprice_ma_1410', 'xprice_ma_2820', 'xprice_ma_7050', 'xprice_ma_14100', 'xprice_ma_28200', 'xprice_ma_42300']
['yprice_max_6', 'yprice_max_60', 'yprice_max_360', 'yprice_max_720', 'yprice_max_1410', 'yprice_max_2820', 'yprice_max_7050', 'yprice_max_14100', 'yprice_max_28200', 'yprice_max_42300']
['xprice_max_6', 'xprice_max_60', 'xprice_max_360', 'xprice_max_720', 'xprice_max_1410', 'xprice_max_2820', 'xprice_max_7050', 'xprice_max_14100', 'xprice_max_28200', 'xprice_max_42300']
['yprice_min_6', 'yprice_min_60', 'yprice_min_360', 'yprice_min_720', 'yprice_min_1410', 'yprice_min_2820', 'yprice_min_7050', 'yprice_min_14100', 'yprice_min_28200', 'yprice_min_42300']
['xprice_min_6', 'xprice_min_60', 'xprice_min_360', 'xprice_min_720', 'xprice_min_141

In [74]:
from lightgbm import LGBMRegressor
model = LGBMRegressor(
    n_estimators=2000, learning_rate=0.0005,
    max_depth=5, num_leaves=15, min_child_samples=200,
    colsample_bytree=0.9, subsample=0.7, subsample_freq=1,
    reg_alpha=10, reg_lambda=80,
)

print(len(usecols))
eval_set = ([(valid[usecols], valid.returns)])

model.fit(
    train[usecols], train.returns, 
    eval_set=eval_set, verbose=200, early_stopping_rounds=150, 
)

y_predicted = model.predict(valid[usecols])
y_predicted[valid.periods_before_closing == 0] = 0
print('\nMSE: \t {:.5}'.format(mean_squared_error(y_predicted, valid.returns)))
print('R2: \t {:.5}'.format(r2_score(valid.returns, y_predicted)))

118
Training until validation scores don't improve for 150 rounds
[200]	valid_0's l2: 0.0170038
[400]	valid_0's l2: 0.0169953
[600]	valid_0's l2: 0.0169914
[800]	valid_0's l2: 0.0169846
[1000]	valid_0's l2: 0.0169819
[1200]	valid_0's l2: 0.01698
[1400]	valid_0's l2: 0.0169778
[1600]	valid_0's l2: 0.0169777
[1800]	valid_0's l2: 0.0169769
Early stopping, best iteration is:
[1658]	valid_0's l2: 0.0169763

MSE: 	 0.016976
R2: 	 0.00025086


In [75]:
from lightgbm import LGBMRegressor
model = LGBMRegressor(
    n_estimators=2000, learning_rate=0.0005,
    max_depth=4, num_leaves=15, min_child_samples=200,
    colsample_bytree=0.9, subsample=0.7, subsample_freq=1,
    reg_alpha=10, reg_lambda=80,
)

print(len(usecols))
eval_set = ([(valid[usecols], valid.returns)])

model.fit(
    train[usecols], train.returns, 
    eval_set=eval_set, verbose=200, early_stopping_rounds=150, 
)

y_predicted = model.predict(valid[usecols])
y_predicted[valid.periods_before_closing == 0] = 0
print('\nMSE: \t {:.5}'.format(mean_squared_error(y_predicted, valid.returns)))
print('R2: \t {:.5}'.format(r2_score(valid.returns, y_predicted)))

118
Training until validation scores don't improve for 150 rounds
[200]	valid_0's l2: 0.0170015
[400]	valid_0's l2: 0.0169931
[600]	valid_0's l2: 0.0169877
[800]	valid_0's l2: 0.0169826
[1000]	valid_0's l2: 0.0169791
[1200]	valid_0's l2: 0.0169766
[1400]	valid_0's l2: 0.0169721
[1600]	valid_0's l2: 0.0169688
[1800]	valid_0's l2: 0.0169633
[2000]	valid_0's l2: 0.016959
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l2: 0.016959

MSE: 	 0.016959
R2: 	 0.0012702


In [78]:
model = LGBMRegressor(
    n_estimators=2000, learning_rate=0.0005,
    max_depth=4, num_leaves=15, min_child_samples=200,
    colsample_bytree=0.9, subsample=0.7, subsample_freq=1,
    reg_alpha=10, reg_lambda=80,
)

model.fit(train.append(valid)[usecols], train.append(valid).returns)
y_predicted = model.predict(test[usecols])
y_predicted[test.periods_before_closing == 0] = 0

print('\nMSE: \t {:.5}'.format(mean_squared_error(y_predicted, test.returns)))
print('R2: \t {:.5}'.format(r2_score(test.returns, y_predicted)))


MSE: 	 0.013336
R2: 	 -0.023742


In [59]:
selected_importances = model.feature_importances_
pd.DataFrame(list(zip(usecols, selected_importances))).sort_values(1)

Unnamed: 0,0,1
41,xprice_max_42300,0
49,yprice_min_14100,0
113,xprice_skew_6,0
108,yprice_skew_6,0
31,yprice_max_42300,0
61,xprice_min_42300,0
50,yprice_min_28200,1
51,yprice_min_42300,1
52,xprice_min_6,1
59,xprice_min_14100,1


In [60]:
filtered_cols = [col for col, imp in zip(usecols, selected_importances) if imp > 0]
print(len(filtered_cols))

from lightgbm import LGBMRegressor
model = LGBMRegressor(
    n_estimators=2000, learning_rate=0.0005,
    max_depth=5, num_leaves=15, min_child_samples=200,
    colsample_bytree=0.9, subsample=0.7, subsample_freq=1,
    reg_alpha=0, reg_lambda=80,
)

eval_set = ([(test[filtered_cols], test.returns)])
model.fit(
    train[filtered_cols], train.returns, 
    eval_set=eval_set, verbose=100, early_stopping_rounds=150, 
)

y_predicted = model.predict(test[filtered_cols])

print('\nMSE: \t {:.5}'.format(mean_squared_error(y_predicted, test.returns)))
print('R2: \t {:.5}'.format(r2_score(test.returns, y_predicted)))

112
Training until validation scores don't improve for 150 rounds
[100]	valid_0's l2: 0.0186376
[200]	valid_0's l2: 0.0185916
[300]	valid_0's l2: 0.0185516
[400]	valid_0's l2: 0.0185174
[500]	valid_0's l2: 0.0184929
[600]	valid_0's l2: 0.0184775
[700]	valid_0's l2: 0.0184747
[800]	valid_0's l2: 0.0184806
Early stopping, best iteration is:
[729]	valid_0's l2: 0.0184746

MSE: 	 0.018475
R2: 	 0.0087371
