## feature engineering: 
- smoothness of oscillator curves
- smoothness of price curves
- smoothness of MACD curves
- price percentage change compared to day #0
- daily price movement (percentage change %)
- oscillator absolute values change compared to day #40 
- MACD values normalized

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import normalize
import time

In [44]:
from xgboost import XGBClassifier

In [50]:
file_path = '../data/buy/'

timeframe = '3days'
if timeframe == '3days':
    bar_cnt = 41
elif timeframe == '9days':
    bar_cnt = 118

In [35]:
def calc_smoothness(data, type): 
    smoothness = []
    for i in range(data.shape[0]):
        row = np.array(data.iloc[i, :])
        # normalize row data, so that we could get comparable smoothness data 
        row = (row - min(row))/(max(row) - min(row)) 
        diff1 = row[1:] - row[:-1]
        diff2 = diff1[1:] - diff1[:-1]
        smth = sum(map(lambda x: x**2, diff2))
        smoothness.append(smth)
    return pd.Series(smoothness, name=type + '_smth', index=data.index)

In [23]:
def calc_derivative(data, name):
    derivatives = []
    for i in range(data.shape[0]):
        i_derivative = list(np.gradient(pd.Series(data.iloc[i,:]))/pd.Series(data.iloc[i,:]))
        derivatives.append(i_derivative)
    
    names = []
    for i in range(data.shape[1]):
        names.append(name + "_drv"+ str(i)) 
    
    return pd.DataFrame(derivatives, columns=names, index=data.index)

In [24]:
def calc_prc_chg(data, name):
    prc_chg = []
    for i in range(data.shape[0]):
        i_chg = list(data.iloc[i,:]/data.iloc[i, 0])
        prc_chg.append(i_chg)
    
    names = []
    for i in range(data.shape[1]):
        names.append(name + "_chg"+ str(i)) 
    
    return pd.DataFrame(prc_chg, columns=names, index=data.index)

In [25]:
def calc_level(data, name):
    
    level = []
    for i in range(data.shape[0]):
        max_num = max(data.iloc[i, :])
        min_num = min(data.iloc[i, :])
        level.append((data.iloc[i, -1] - min_num)/(max_num - min_num))
        
    return pd.Series(level, name = name + '_lev', index = data.index)

In [26]:
def normalize_data(data): 
    for i in range(data.shape[0]): 
        row = data.iloc[i, :] 
        data.iloc[i, :] = (row - min(row))/(max(row) - min(row)) 
    return data

In [32]:
def load_data():
    data = pd.DataFrame()
    
    for filename in os.listdir(file_path):
        if filename.endswith(".txt"): 
            tmp_df = pd.read_csv(os.path.join(file_path, filename), delimiter= '\s+', header = None)
            tmp_df = tmp_df.astype(float)
            tmp_df = tmp_df.iloc[20:,:]
            # column 180 data is wrong, so replace it with mean value of column 179 and 181
            # tmp_df[180] = (tmp_df[179] + tmp_df[181])/2    
            data = pd.concat([data, tmp_df])
    
    # remove duplicates 
    data = data.drop_duplicates()
    
    # remove extreme oscillator values 
    data = data[(data[0] > -7) & (data[0] < 7)] 
    
    # remove positive oscillator values
    # data = data[data[0] < 0]
    
    # reset index 
    data = data.reset_index(drop=True)
    
    # set column names 
    osc_names = ['osc' + str(i) for i in range(bar_cnt)]
    prc_names = ['prc' + str(i) for i in range(bar_cnt)]
    macd_names = ['macd' + str(i) for i in range(bar_cnt)]
    data.columns = osc_names + prc_names + macd_names + ['rtn']
    
    # preprocess oscillator data 
    osc_data = data.iloc[:, :bar_cnt].reset_index(drop=True)
    osc_data = osc_data[osc_data.columns[::-1]]
    osc_smth = calc_smoothness(osc_data, 'osc')   # add oscillator smoothness feature 
    osc_lev = calc_level(osc_data, 'osc')   # calculate relative oscillator level 
    # benchmark oscillator values on the oldest one 
    for i in range(osc_data.shape[0]):
        osc_data.iloc[i, :] = osc_data.iloc[i, :] - osc_data.iloc[i, 0] 
    
    # preprocess price data 
    prc_data = data.iloc[:, bar_cnt:bar_cnt*2].reset_index(drop=True)
    prc_data = prc_data[prc_data.columns[::-1]]
    prc_smth = calc_smoothness(prc_data, 'prc')  # add price smoothness feature
    prc_lev = calc_level(prc_data, 'prc')   # calculate relative price level 
    prc_drv = calc_derivative(prc_data, 'prc')   # calculate price derivatives 
    prc_chg = calc_prc_chg(prc_data, 'prc')
    
    # preprocess MACD data 
    macd_data = data.iloc[:, bar_cnt*2:bar_cnt*3].reset_index(drop=True)
    macd_data = macd_data[macd_data.columns[::-1]]
    macd_smth = calc_smoothness(macd_data, 'macd')   # add macd smoothness feature
    macd_lev = calc_level(macd_data, 'macd')    # calculate relative macd level 
    macd_data = normalize_data(macd_data)    # normalize macd data 
    
    rtn = data['rtn'].reset_index(drop=True)
    label = pd.Series(map(lambda x: 1 if x>0 else 0, rtn), name = 'label')
    
    data = pd.concat([osc_data, osc_smth, osc_lev,
                      prc_drv, prc_chg, prc_smth, prc_lev, 
                      macd_data, macd_smth, macd_lev, 
                      rtn, label], axis=1)
    data = data.reset_index(drop=True)
    
    return data

In [33]:
class StockData():
    def __init__(self, data):
        
        # balance the data set 
        pos_data = data[data['label'] == 1]
        neg_data = data[data['label'] == 0]
        if pos_data.shape[0] > neg_data.shape[0]:
            pos_data = pos_data.sample(neg_data.shape[0])
        else:
            neg_data = neg_data.sample(pos_data.shape[0])
        data = pd.concat([pos_data, neg_data], axis = 0)
        data = data.reset_index(drop = True) 
        
        X = data.iloc[:, :-1]
        y = data['label']
        
        X_train_valid_w_rtn, X_test_w_rtn, y_train_valid, y_test = train_test_split(X, y, test_size=0.25) 
        # train_valid set
        self.train_valid_rtn = X_train_valid_w_rtn['rtn']
        self.X_train_valid = X_train_valid_w_rtn.drop(['rtn'], axis = 1)
        self.y_train_valid = y_train_valid
        
        X_train_w_rtn, X_valid_w_rtn, y_train, y_valid = train_test_split(X_train_valid_w_rtn, y_train_valid, test_size=0.25) 
        # training set 
        self.train_rtn = X_train_w_rtn['rtn'] 
        self.X_train = X_train_w_rtn.drop(['rtn'], axis = 1)
        self.y_train = y_train
        # validation set
        self.valid_rtn = X_valid_w_rtn['rtn']
        self.X_valid = X_valid_w_rtn.drop(['rtn'], axis = 1) 
        self.y_valid = y_valid
        # test set 
        self.test_rtn = X_test_w_rtn['rtn']
        self.X_test = X_test_w_rtn.drop(['rtn'], axis = 1)        
        self.y_test = y_test 

In [36]:
data = load_data()
data.shape

(100987, 172)

In [37]:
data.to_csv(file_path + 'data_XGB1.csv')

In [51]:
data = pd.read_csv(file_path + 'data_XGB1.csv', header=0, index_col=0)
data.shape

(100987, 172)

In [52]:
data = data.sample(12000)
data.shape

(12000, 172)

In [53]:
data.head()

Unnamed: 0,osc40,osc39,osc38,osc37,osc36,osc35,osc34,osc33,osc32,osc31,...,macd5,macd4,macd3,macd2,macd1,macd0,macd_smth,macd_lev,rtn,label
27921,0.0,0.2,0.46,0.8,1.2,1.59,2.0,2.13,2.23,2.34,...,0.905501,0.991537,1.0,0.911142,0.819464,0.897038,0.29309,0.897038,1.36,1
36720,0.0,0.16,0.44,0.53,0.73,0.77,0.93,1.01,1.13,1.07,...,0.415547,0.422265,0.389635,0.359885,0.307102,0.366603,0.298496,0.366603,-1.41,0
32576,0.0,0.17,0.29,0.41,0.31,0.33,0.32,0.25,0.03,-0.34,...,0.4,0.3,0.3,0.2,0.2,0.3,0.42,0.3,0.0,0
16850,0.0,-0.16,-0.39,-0.65,-0.85,-0.92,-1.04,-1.04,-1.06,-0.97,...,0.352941,0.25,0.485294,0.602941,0.647059,0.617647,0.420631,0.617647,-0.3,0
25459,0.0,0.01,-0.02,-0.06,-0.12,-0.05,-0.01,0.03,0.07,0.06,...,0.710526,0.763158,0.842105,0.815789,0.736842,0.684211,0.257618,0.684211,0.17,1


In [54]:
stock_data = StockData(data)

print(stock_data.X_train.shape)
print(stock_data.X_valid.shape)
print(stock_data.X_test.shape)

(5486, 170)
(1829, 170)
(2439, 170)


In [None]:
depths = [4*i for i in range(2, 21)]
estimators = [10*i for i in range(5, 21)] 
results = []
for d in depths:
    row = [d]
    for e in estimators:
        start = time.time() 
        clf = XGBClassifier(max_depth=d, n_estimators=e)
        clf.fit(stock_data.X_train, stock_data.y_train) 
        valid_score = np.round(clf.score(stock_data.X_valid, stock_data.y_valid), 4)
        row.append(valid_score)
        end = time.time()
        print("max_depth = {}, n_estimators = {}, validation score = {}, run time = {}".format(d, e, valid_score, start-end))
    results.append(row)

max_depth = 8, n_estimators = 50, validation score = 0.491, run time = -14.678264856338501
max_depth = 8, n_estimators = 60, validation score = 0.4926, run time = -16.947750329971313
max_depth = 8, n_estimators = 70, validation score = 0.4877, run time = -18.795708417892456
max_depth = 8, n_estimators = 80, validation score = 0.503, run time = -18.398114681243896
max_depth = 8, n_estimators = 90, validation score = 0.5036, run time = -26.051838636398315
max_depth = 8, n_estimators = 100, validation score = 0.5118, run time = -25.49998927116394
max_depth = 8, n_estimators = 110, validation score = 0.5107, run time = -32.863261222839355
max_depth = 8, n_estimators = 120, validation score = 0.509, run time = -32.711719274520874
max_depth = 8, n_estimators = 130, validation score = 0.5036, run time = -69.05991077423096
max_depth = 8, n_estimators = 140, validation score = 0.503, run time = -44.18846774101257
max_depth = 8, n_estimators = 150, validation score = 0.5014, run time = -48.15799

max_depth = 28, n_estimators = 140, validation score = 0.5249, run time = -84.6555106639862
max_depth = 28, n_estimators = 150, validation score = 0.5216, run time = -89.16725873947144
max_depth = 28, n_estimators = 160, validation score = 0.5227, run time = -93.6120297908783
max_depth = 28, n_estimators = 170, validation score = 0.5227, run time = -97.32328343391418
max_depth = 28, n_estimators = 180, validation score = 0.5221, run time = -104.222829580307
max_depth = 28, n_estimators = 190, validation score = 0.5205, run time = -105.11792945861816
max_depth = 28, n_estimators = 200, validation score = 0.5189, run time = -109.04696941375732
max_depth = 32, n_estimators = 50, validation score = 0.4992, run time = -36.92799782752991
max_depth = 32, n_estimators = 60, validation score = 0.5107, run time = -44.168410539627075
max_depth = 32, n_estimators = 70, validation score = 0.5101, run time = -48.74819588661194
max_depth = 32, n_estimators = 80, validation score = 0.5128, run time = 

max_depth = 52, n_estimators = 70, validation score = 0.5063, run time = -22.03700065612793
max_depth = 52, n_estimators = 80, validation score = 0.5074, run time = -24.737998485565186
max_depth = 52, n_estimators = 90, validation score = 0.5128, run time = -26.908003568649292
max_depth = 52, n_estimators = 100, validation score = 0.503, run time = -27.376036643981934
max_depth = 52, n_estimators = 110, validation score = 0.515, run time = -27.512962341308594
max_depth = 52, n_estimators = 120, validation score = 0.5123, run time = -34.09699845314026
max_depth = 52, n_estimators = 130, validation score = 0.5134, run time = -35.117997884750366
max_depth = 52, n_estimators = 140, validation score = 0.5238, run time = -33.81100344657898
max_depth = 52, n_estimators = 150, validation score = 0.5232, run time = -37.53400278091431
max_depth = 52, n_estimators = 160, validation score = 0.5194, run time = -41.74099540710449
max_depth = 52, n_estimators = 170, validation score = 0.5243, run tim

max_depth = 72, n_estimators = 160, validation score = 0.5194, run time = -40.8600013256073
max_depth = 72, n_estimators = 170, validation score = 0.5243, run time = -37.838001012802124
max_depth = 72, n_estimators = 180, validation score = 0.5194, run time = -39.542999505996704
max_depth = 72, n_estimators = 190, validation score = 0.5205, run time = -44.330000162124634
max_depth = 72, n_estimators = 200, validation score = 0.5232, run time = -46.72899889945984
max_depth = 76, n_estimators = 50, validation score = 0.5025, run time = -14.35799527168274
max_depth = 76, n_estimators = 60, validation score = 0.5046, run time = -18.40499997138977
max_depth = 76, n_estimators = 70, validation score = 0.5063, run time = -20.959999561309814
max_depth = 76, n_estimators = 80, validation score = 0.5074, run time = -23.284998178482056
max_depth = 76, n_estimators = 90, validation score = 0.5128, run time = -22.32699990272522
max_depth = 76, n_estimators = 100, validation score = 0.503, run time 

In [None]:
col_names = ['est=' + str(i) for i in estimators]
col_names.insert(0, 'depth')
results = pd.DataFrame(results, columns=col_names)

In [45]:
clf = XGBClassifier() 
estimators = [10*i for i in range(5, 21)] 
depths = [4*i for i in range(2, 21)]
parameters = {'n_estimators':estimators, 'max_depth': depths} 
gs = GridSearchCV(clf, parameters, cv=5)
gs.fit(stock_data.X_train_valid, stock_data.y_train_valid)
print(gs.best_params_)
print(gs.best_score_)
print(gs.score(stock_data.X_test, stock_data.y_test))

KeyboardInterrupt: 