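## Feature engineering
- smoothness of oscillator curves (standard deviation of day-to-day changes)
- smoothness of price curves
- smoothness of MACD curves
- daily price movement (fractional change: price gradient divided by price)
- oscillator value changes relative to day #40 (the oldest day in the window)
- MACD values min-max normalized within each sample
- signal's oscillator level (position of the latest value within the window's min-max range)
- signal's price level
- signal's MACD level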

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
import spectrum
from spectrum import Periodogram, data_cosine

In [2]:
def calc_smoothness(data, name):
    """Measure how jagged each row of ``data`` is.

    For every row, the differences between consecutive columns are taken and
    their population standard deviation (``np.std`` default, ``ddof=0``) is
    recorded; a smaller value means a smoother curve.

    Parameters
    ----------
    data : pd.DataFrame
        One curve per row, columns ordered along the curve.
    name : str
        Prefix for the resulting Series name.

    Returns
    -------
    pd.Series
        Named ``name + '_smth'`` and indexed like ``data``.
    """
    roughness = [
        # np.diff yields the column-to-column steps of a single curve.
        np.std(np.diff(data.iloc[pos, :].to_numpy()))
        for pos in range(data.shape[0])
    ]
    return pd.Series(roughness, name=name + '_smth', index=data.index)

In [3]:
def calc_derivative(data, name):
    """Compute the relative slope of every row of ``data``.

    Each row is differentiated with ``np.gradient`` and the result is divided
    element-wise by the row itself, giving the change per step as a fraction
    of the value at that step.

    Parameters
    ----------
    data : pd.DataFrame
        One curve per row, columns ordered along the curve.
    name : str
        Prefix for the output column names.

    Returns
    -------
    pd.DataFrame
        Same shape and index as ``data``; columns are ``name + '_d0'``,
        ``name + '_d1'``, ... in positional order.
    """
    n_rows, n_cols = data.shape
    column_labels = [name + "_d" + str(k) for k in range(n_cols)]

    relative_slopes = []
    for pos in range(n_rows):
        curve = data.iloc[pos, :]
        relative_slopes.append(list(np.gradient(curve) / curve))

    return pd.DataFrame(relative_slopes, columns=column_labels, index=data.index)

In [4]:
def calc_level(data, name):
    """Locate each row's last value inside that row's own min-max range.

    The result is ``(last - min) / (max - min)`` per row: 0 when the last
    column holds the row minimum, 1 when it holds the row maximum.
    A row whose values are all equal divides by zero.

    Parameters
    ----------
    data : pd.DataFrame
        One curve per row; the last column is the value being ranked.
    name : str
        Prefix for the resulting Series name.

    Returns
    -------
    pd.Series
        Named ``name + '_lev'`` and indexed like ``data``.
    """
    def _relative_position(pos):
        curve = data.iloc[pos, :]
        lowest, highest = min(curve), max(curve)
        return (data.iloc[pos, -1] - lowest) / (highest - lowest)

    levels = [_relative_position(pos) for pos in range(data.shape[0])]
    return pd.Series(levels, name=name + '_lev', index=data.index)

In [5]:
def normalize_data(data):
    """Min-max scale every row of ``data`` to the range [0, 1], in place.

    Each row is replaced by ``(row - min) / (max - min)`` using that row's own
    extremes. The frame passed in is modified and the same object is returned.
    A row whose values are all equal divides by zero.
    """
    n_rows = data.shape[0]
    for pos in range(n_rows):
        curve = data.iloc[pos, :]
        lowest = min(curve)
        span = max(curve) - lowest
        data.iloc[pos, :] = (curve - lowest) / span
    return data

In [6]:
def load_data(data_dir='../data/buy/'):
    """Build the engineered feature table from the raw ``.txt`` sample files.

    Every ``*.txt`` file in ``data_dir`` is read as a whitespace-delimited,
    header-less numeric table whose rows hold 41 oscillator values, 41 prices,
    41 MACD values and one return (124 columns). The first 20 rows of each
    file are discarded, the files are stacked, exact duplicate rows are
    removed and rows whose first oscillator column lies outside (-7, 7) are
    dropped.

    Each 41-column group is then reversed (so the column with index 40, the
    oldest value, comes first) and turned into features:

    * oscillator: values minus the oldest value, plus smoothness and level;
    * price: relative derivative, plus smoothness and level;
    * MACD: per-row min-max normalised values, plus smoothness and level.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the raw ``.txt`` files.

    Returns
    -------
    pd.DataFrame
        Feature columns followed by ``rtn`` (the raw return) and ``label``
        (1 when ``rtn > 0``, else 0), with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` contains no ``.txt`` file.
    """
    window = 41      # values stored per indicator in each raw row
    skip_rows = 20   # leading rows of every file that are discarded
    osc_bound = 7    # rows with first oscillator column outside (-7, 7) are dropped

    # Collect the per-file frames and concatenate once; concatenating inside
    # the loop copies the accumulated data on every iteration.
    frames = []
    # Sorted so that the row order does not depend on the file system.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".txt"):
            continue
        # Raw string: '\s' is not a valid escape in a normal string literal.
        frame = pd.read_csv(os.path.join(data_dir, filename),
                            delimiter=r'\s+', header=None)
        frames.append(frame.astype(float).iloc[skip_rows:, :])

    if not frames:
        raise FileNotFoundError("no .txt files found in " + repr(data_dir))

    data = pd.concat(frames)

    # remove duplicates
    data = data.drop_duplicates()

    # remove extreme oscillator values
    data = data[(data[0] > -osc_bound) & (data[0] < osc_bound)]

    # reset index
    data = data.reset_index(drop=True)

    # set column names
    osc_names = ['osc' + str(i) for i in range(window)]
    prc_names = ['prc' + str(i) for i in range(window)]
    macd_names = ['macd' + str(i) for i in range(window)]
    data.columns = osc_names + prc_names + macd_names + ['rtn']

    # preprocess oscillator data
    osc_data = data.iloc[:, :window].reset_index(drop=True)
    osc_data = osc_data[osc_data.columns[::-1]]
    osc_smth = calc_smoothness(osc_data, 'osc')   # oscillator smoothness feature
    osc_lev = calc_level(osc_data, 'osc')         # relative oscillator level
    # benchmark oscillator values on the oldest one (first column after the
    # reversal); computed after smoothness/level, which use the raw values
    osc_data = osc_data.sub(osc_data.iloc[:, 0], axis=0)

    # preprocess price data
    prc_data = data.iloc[:, window:2 * window].reset_index(drop=True)
    prc_data = prc_data[prc_data.columns[::-1]]
    prc_smth = calc_smoothness(prc_data, 'prc')   # price smoothness feature
    prc_lev = calc_level(prc_data, 'prc')         # relative price level
    prc_drv = calc_derivative(prc_data, 'prc')    # relative price derivatives

    # preprocess MACD data
    macd_data = data.iloc[:, 2 * window:3 * window].reset_index(drop=True)
    macd_data = macd_data[macd_data.columns[::-1]]
    macd_smth = calc_smoothness(macd_data, 'macd')   # MACD smoothness feature
    macd_lev = calc_level(macd_data, 'macd')         # relative MACD level
    macd_data = normalize_data(macd_data)            # per-row min-max scaling

    rtn = data['rtn'].reset_index(drop=True)
    # A trade counts as a win only when its return is strictly positive.
    label = (rtn > 0).astype('int64').rename('label')

    data = pd.concat([osc_data, osc_smth, osc_lev,
                      prc_drv, prc_smth, prc_lev,
                      macd_data, macd_smth, macd_lev,
                      rtn, label], axis=1)
    data = data.reset_index(drop=True)

    return data

In [7]:
class StockData():
    """Class-balanced train / validation / test split of the feature table.

    The majority class (by ``label``) is randomly down-sampled to the size of
    the minority class. The balanced data is split 75/25 into train+validation
    and test, and the train+validation part is split 75/25 again into train
    and validation. The ``rtn`` column is removed from every feature matrix
    and kept separately so returns can be inspected without leaking into the
    model inputs.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``label`` column (1 / 0) and a ``rtn`` column; every
        other column is treated as a feature.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to the down-sampling and to both splits. The default
        ``None`` keeps the previous unseeded behaviour; pass an integer for
        reproducible splits.

    Attributes
    ----------
    X_train_valid, y_train_valid, train_valid_rtn
        Features, labels and returns of the combined train+validation part.
    X_train, y_train, train_rtn
        Training set.
    X_valid, y_valid, valid_rtn
        Validation set.
    X_test, y_test, test_rtn
        Test set.
    """

    def __init__(self, data, random_state=None):

        # balance the data set
        pos_data = data[data['label'] == 1]
        neg_data = data[data['label'] == 0]
        if pos_data.shape[0] > neg_data.shape[0]:
            pos_data = pos_data.sample(neg_data.shape[0], random_state=random_state)
        else:
            neg_data = neg_data.sample(pos_data.shape[0], random_state=random_state)
        data = pd.concat([pos_data, neg_data], axis=0)
        data = data.reset_index(drop=True)

        # Drop the target by name so it can never end up among the features,
        # whatever its column position is.
        X = data.drop(columns=['label'])
        y = data['label']

        X_train_valid_w_rtn, X_test_w_rtn, y_train_valid, y_test = train_test_split(
            X, y, test_size=0.25, random_state=random_state)
        # train_valid set
        self.train_valid_rtn = X_train_valid_w_rtn['rtn']
        self.X_train_valid = X_train_valid_w_rtn.drop(['rtn'], axis=1)
        self.y_train_valid = y_train_valid

        X_train_w_rtn, X_valid_w_rtn, y_train, y_valid = train_test_split(
            X_train_valid_w_rtn, y_train_valid, test_size=0.25, random_state=random_state)
        # training set
        self.train_rtn = X_train_w_rtn['rtn']
        self.X_train = X_train_w_rtn.drop(['rtn'], axis=1)
        self.y_train = y_train
        # validation set
        self.valid_rtn = X_valid_w_rtn['rtn']
        self.X_valid = X_valid_w_rtn.drop(['rtn'], axis=1)
        self.y_valid = y_valid
        # test set
        self.test_rtn = X_test_w_rtn['rtn']
        self.X_test = X_test_w_rtn.drop(['rtn'], axis=1)
        self.y_test = y_test

In [19]:
class SeahorseBuyModel():
    """Wrapper around an ``SVC`` classifier that predicts winning trades.

    Attributes
    ----------
    clf
        The underlying classifier, built as
        ``SVC(gamma='auto', probability=True)`` so that ``predict_proba`` is
        available to :meth:`evaluate`.
    features
        Column labels of the training frame, recorded by :meth:`fit`.
    """

    def __init__(self):
        self.clf = SVC(gamma='auto', probability=True)
        self.features = []

    def fit(self, X_train, y_train):
        """Fit the classifier and remember the training column labels.

        ``X_train`` must expose ``.columns`` (e.g. a DataFrame).
        """
        self.clf.fit(X_train, y_train)
        self.features = X_train.columns

    def score(self, X, y):
        """Return ``self.clf.score(X, y)``."""
        return self.clf.score(X, y)

    def evaluate(self, X, y, rtn):
        """Print a per-probability-bucket summary of the predictions.

        The second column of ``predict_proba`` is used as the win probability
        and truncated to one decimal to form a ``trade_class`` bucket. For
        each bucket the printed table shows the mean predicted probability,
        the mean of ``y``, the mean of ``rtn``, the number of rows and the
        share of all rows in percent.

        Parameters
        ----------
        X : feature matrix accepted by the classifier
        y : pd.Series of true labels, row-aligned with ``X``
        rtn : pd.Series of returns, row-aligned with ``X``

        Returns
        -------
        None
            The table is printed, not returned.
        """
        probs = np.asarray(self.clf.predict_proba(X))
        win_probs = probs[:, 1]

        # Truncate (not round) to one decimal: 0.58 -> 0.5.
        trade_classes = [int(p * 10) / 10 for p in win_probs]

        # y and rtn are re-indexed positionally so they line up with the
        # freshly created probability Series.
        results = pd.concat([pd.Series(win_probs, name='win_prob'),
                             pd.Series(trade_classes, name='trade_class'),
                             y.reset_index(drop=True),
                             rtn.reset_index(drop=True)],
                            axis=1)

        # Group once and reuse the grouping for every aggregate.
        by_class = results.groupby(['trade_class'])
        trade_counts = by_class['win_prob'].count()
        summary = pd.concat([by_class.mean(),
                             trade_counts,
                             100 * trade_counts / results.shape[0]],
                            axis=1)
        summary.columns = ['predicted win prob', 'actual win rate', 'mean return', '# of trades', '% of all trades']
        print(summary.to_string())
        return

    def feature_importances(self):
        """Return the classifier's feature importances, highest first.

        Returns
        -------
        pd.DataFrame
            Columns ``f_name`` and ``f_importance``.

        Raises
        ------
        AttributeError
            If the classifier has no ``feature_importances_`` attribute. The
            ``SVC`` created in ``__init__`` does not provide one, so this
            method only works when ``clf`` has been replaced by an estimator
            that does (for example a fitted tree ensemble).
        """
        importances = getattr(self.clf, 'feature_importances_', None)
        if importances is None:
            raise AttributeError(
                type(self.clf).__name__
                + " does not expose feature_importances_; use a tree-based "
                  "estimator or sklearn.inspection.permutation_importance instead")
        f = pd.DataFrame({'f_name': list(self.features),
                          'f_importance': list(importances)})
        return f.sort_values(by=['f_importance'], ascending=False)

In [11]:
# Load the feature table from CSV: header=0 takes the column names from the
# first row and index_col=0 uses the first column as the row index.
# NOTE(review): presumably this file is a saved output of load_data() - confirm.
data = pd.read_csv('../data/data_RF3.csv', header=0, index_col=0)

In [20]:
# (rows, columns) of the loaded feature table.
data.shape

(100987, 131)

In [13]:
# Preview the last rows of the table to check the column layout.
data.tail()

Unnamed: 0,osc40,osc39,osc38,osc37,osc36,osc35,osc34,osc33,osc32,osc31,...,macd5,macd4,macd3,macd2,macd1,macd0,macd_smth,macd_lev,rtn,label
100982,0.0,-0.15,-0.32,-0.56,-0.45,-0.38,-0.23,-0.14,-0.02,0.11,...,0.475728,0.42233,0.288026,0.257282,0.242718,0.229773,0.650905,0.229773,-0.33,0
100983,0.0,0.28,0.2,0.18,0.15,0.15,0.29,0.13,-0.07,-0.27,...,0.464853,0.37415,0.24263,0.197279,0.195011,0.21542,0.517516,0.21542,-0.11,0
100984,0.0,0.0,0.14,-0.02,-0.22,-0.42,-0.55,-0.66,-0.82,-0.97,...,0.231183,0.255376,0.303763,0.287634,0.247312,0.303763,0.494327,0.303763,1.04,1
100985,0.0,0.04,0.09,0.06,0.1,0.07,-0.1,-0.29,-0.48,-0.54,...,0.040404,0.161616,0.232323,0.228956,0.228956,0.208754,0.340592,0.208754,-0.86,0
100986,0.0,0.09,0.26,0.41,0.45,0.46,0.42,0.4,0.38,0.29,...,0.082405,0.160356,0.171492,0.256125,0.342984,0.505568,0.326886,0.505568,4.55,1


In [14]:
# Balance the two label classes and split into train / validation / test sets.
# No seed is passed, so the sampled rows and the splits differ between runs.
stock_data = StockData(data)

In [16]:
# Create the model and fit it on the training split only; the validation and
# test splits are kept for the scoring cells below.
model = SeahorseBuyModel()
model.fit(stock_data.X_train, stock_data.y_train)



In [17]:
# Classifier score on the training, validation and test splits, in that order.
print(model.score(stock_data.X_train, stock_data.y_train))
print(model.score(stock_data.X_valid, stock_data.y_valid))
print(model.score(stock_data.X_test, stock_data.y_test))

0.5507115718977397
0.5111726447292163
0.5059886023374868


In [18]:
# Print the per-probability-bucket summary for the validation split.
# NOTE(review): the recorded traceback reports probability=False, but
# SeahorseBuyModel builds SVC(probability=True). Presumably `model` was created
# (In [16]) before the class cell was last re-run (In [19]); re-create and
# re-fit the model after a kernel restart to confirm.
model.evaluate(stock_data.X_valid, stock_data.y_valid, stock_data.valid_rtn)

AttributeError: predict_proba is not available when  probability=False