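## Feature engineering
- smoothness of the oscillator curve (standard deviation of the day-to-day changes)
- smoothness of the price curve
- smoothness of the MACD curve
- daily price movement (relative change: gradient divided by price)
- oscillator values expressed as the change relative to day #40
- MACD values min-max normalized per signal
- the signal's oscillator level (position of the last value within the window's min-max range)
- the signal's price level
- the signal's MACD level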

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import normalize
import spectrum
from spectrum import Periodogram, data_cosine

In [8]:
def calc_smoothness(data, name):
    """Measure how smooth each row of ``data`` is.

    For every row the differences between consecutive columns are taken and
    their (population) standard deviation is reported; a smaller value means
    a smoother curve.

    Parameters
    ----------
    data : pandas.DataFrame
        One curve per row, columns ordered along the curve.
    name : str
        Prefix of the resulting series name.

    Returns
    -------
    pandas.Series
        Named ``name + '_smth'``, indexed like ``data``.
    """
    row_stds = [
        # np.diff yields exactly the consecutive differences of the row
        np.std(np.diff(data.iloc[pos, :].to_numpy(dtype=float)))
        for pos in range(data.shape[0])
    ]
    return pd.Series(row_stds, name=name + '_smth', index=data.index)

In [9]:
def calc_derivative(data, name):
    """Compute the relative rate of change along every row of ``data``.

    Each row's numerical gradient (``np.gradient``) is divided element-wise
    by the row's own values.

    Parameters
    ----------
    data : pandas.DataFrame
        One curve per row, columns ordered along the curve.
    name : str
        Prefix of the generated column names.

    Returns
    -------
    pandas.DataFrame
        Same shape and index as ``data``; columns are named
        ``name + '_d0'``, ``name + '_d1'``, ...
    """
    n_rows, n_cols = data.shape

    rel_changes = []
    for pos in range(n_rows):
        row = data.iloc[pos, :]
        rel_changes.append(list(np.gradient(row) / row))

    col_labels = [name + "_d" + str(j) for j in range(n_cols)]
    return pd.DataFrame(rel_changes, columns=col_labels, index=data.index)

In [10]:
def calc_level(data, name):
    """Locate each row's last value inside that row's min-max range.

    The result for a row is ``(last - min) / (max - min)``: 0 when the last
    column holds the row minimum, 1 when it holds the row maximum.

    Parameters
    ----------
    data : pandas.DataFrame
        One curve per row.
    name : str
        Prefix of the resulting series name.

    Returns
    -------
    pandas.Series
        Named ``name + '_lev'``, indexed like ``data``.
    """
    levels = []
    for pos in range(data.shape[0]):
        row = data.iloc[pos, :]
        lowest, highest = min(row), max(row)
        span = highest - lowest
        levels.append((data.iloc[pos, -1] - lowest) / span)

    return pd.Series(levels, name=name + '_lev', index=data.index)

In [11]:
def normalize_data(data):
    """Min-max scale every row of ``data`` in place.

    Each row is rewritten as ``(row - min(row)) / (max(row) - min(row))``.
    The frame passed in is modified and the very same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        One curve per row; overwritten with the scaled values.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, after scaling.
    """
    n_rows = data.shape[0]
    for pos in range(n_rows):
        values = data.iloc[pos, :]
        lowest = min(values)
        highest = max(values)
        data.iloc[pos, :] = (values - lowest) / (highest - lowest)
    return data

In [12]:
def load_data(data_dir='../data/buy/'):
    """Read the raw buy-signal files and build the model's feature table.

    Every ``.txt`` file in ``data_dir`` is parsed as a whitespace-delimited,
    header-less numeric table. After clean-up each row must have 124 columns:
    41 oscillator values, 41 prices, 41 MACD values and the realised return.

    Processing steps:
      * drop the first 20 rows of every file, de-duplicate, and keep only rows
        whose first oscillator column lies strictly between -7 and 7;
      * reverse the column order of each indicator block (``osc40`` ... ``osc0``);
      * oscillator: smoothness, level, then values benchmarked on ``osc40``;
      * price: smoothness, level and relative derivatives;
      * MACD: smoothness, level, then row-wise min-max normalisation;
      * ``label`` is 1 when ``rtn`` > 0, else 0.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the signal files. Defaults to ``'../data/buy/'``.

    Returns
    -------
    pandas.DataFrame
        One row per signal; feature columns followed by ``rtn`` and ``label``.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` contains no ``.txt`` file.
    """
    n_days = 41      # values per indicator in each signal row
    skip_rows = 20   # NOTE(review): reason for discarding each file's first 20 rows is not visible here - confirm
    osc_limit = 7    # rows with column-0 oscillator outside (-7, 7) are treated as extreme

    # Sorted listing: os.listdir order is arbitrary, which would otherwise make
    # the row order (and the duplicate that survives) machine dependent.
    frames = []
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".txt"):
            continue
        tmp_df = pd.read_csv(os.path.join(data_dir, filename),
                             delimiter=r'\s+', header=None)
        # symbol = filename.split("1")[0]
        # tmp_df = pd.concat([pd.Series([symbol] * tmp_df.shape[0], name = 'symbol'), tmp_df], axis=1)
        frames.append(tmp_df.astype(float).iloc[skip_rows:, :])

    if not frames:
        raise FileNotFoundError("no .txt signal files found in " + repr(data_dir))

    # Single concat instead of growing a frame inside the loop.
    data = pd.concat(frames)

    # remove duplicates
    data = data.drop_duplicates()

    # remove extreme oscillator values
    data = data[(data[0] > -osc_limit) & (data[0] < osc_limit)]

    # remove positive oscillator values
    # data = data[data[0] < 0]

    # reset index
    data = data.reset_index(drop=True)

    # set column names
    osc_names = ['osc' + str(i) for i in range(n_days)]
    prc_names = ['prc' + str(i) for i in range(n_days)]
    macd_names = ['macd' + str(i) for i in range(n_days)]
    data.columns = osc_names + prc_names + macd_names + ['rtn']

    # preprocess oscillator data
    osc_data = data.iloc[:, :n_days].reset_index(drop=True)
    osc_data = osc_data[osc_data.columns[::-1]]
    osc_smth = calc_smoothness(osc_data, 'osc')   # add oscillator smoothness feature
    osc_lev = calc_level(osc_data, 'osc')   # calculate relative oscillator level
    # benchmark oscillator values on oldest one (first column after the reversal)
    osc_data = osc_data.sub(osc_data.iloc[:, 0], axis=0)

    # preprocess price data
    prc_data = data.iloc[:, n_days:2 * n_days].reset_index(drop=True)
    prc_data = prc_data[prc_data.columns[::-1]]
    prc_smth = calc_smoothness(prc_data, 'prc')  # add price smoothness feature
    prc_lev = calc_level(prc_data, 'prc')   # calculate relative price level
    prc_drv = calc_derivative(prc_data, 'prc')   # calculate price derivatives

    # preprocess MACD data
    macd_data = data.iloc[:, 2 * n_days:3 * n_days].reset_index(drop=True)
    # explicit copy: normalize_data writes into the frame it is given
    macd_data = macd_data[macd_data.columns[::-1]].copy()
    macd_smth = calc_smoothness(macd_data, 'macd')   # add macd smoothness feature
    macd_lev = calc_level(macd_data, 'macd')    # calculate relative macd level
    macd_data = normalize_data(macd_data)    # normalize macd data

    rtn = data['rtn'].reset_index(drop=True)
    # 1 for a strictly positive return, 0 otherwise (including NaN)
    label = (rtn > 0).astype('int64').rename('label')

    data = pd.concat([osc_data, osc_smth, osc_lev,
                      prc_drv, prc_smth, prc_lev,
                      macd_data, macd_smth, macd_lev,
                      rtn, label], axis=1)
    data = data.reset_index(drop=True)

    return data

In [13]:
class StockData():
    """Class-balanced train / validation / test splits of the feature table.

    The larger of the two label classes is randomly down-sampled to the size
    of the smaller one. The balanced data is split 75 % / 25 % into
    train+validation and test, and the train+validation part is split again
    75 % / 25 % into train and validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table containing a binary ``label`` column and a ``rtn``
        column; every other column is used as a feature.
    random_state : int or None, optional
        Seed forwarded to the down-sampling and to both splits. ``None``
        (the default) gives a different random split on every call.

    Attributes
    ----------
    X_train_valid, y_train_valid, train_valid_rtn
        Features, labels and returns of the combined train+validation part.
    X_train, y_train, train_rtn
        Training split.
    X_valid, y_valid, valid_rtn
        Validation split.
    X_test, y_test, test_rtn
        Test split.

    Raises
    ------
    ValueError
        If ``data`` has no rows for one of the two label values.
    """
    def __init__(self, data, random_state=None):

        # balance the data set
        pos_data = data[data['label'] == 1]
        neg_data = data[data['label'] == 0]
        if pos_data.shape[0] == 0 or neg_data.shape[0] == 0:
            raise ValueError("data must contain rows for both label 0 and label 1")
        if pos_data.shape[0] > neg_data.shape[0]:
            pos_data = pos_data.sample(neg_data.shape[0], random_state=random_state)
        else:
            neg_data = neg_data.sample(pos_data.shape[0], random_state=random_state)
        data = pd.concat([pos_data, neg_data], axis = 0)
        data = data.reset_index(drop = True)

        # Drop the target by name so it can never leak into the features,
        # whatever its position among the columns.
        X = data.drop(columns=['label'])
        y = data['label']

        # 'rtn' travels with X through the splits so the returns stay aligned
        # with their rows; it is removed from each feature matrix afterwards.
        X_train_valid_w_rtn, X_test_w_rtn, y_train_valid, y_test = train_test_split(
            X, y, test_size=0.25, random_state=random_state)
        # train_valid set
        self.train_valid_rtn = X_train_valid_w_rtn['rtn']
        self.X_train_valid = X_train_valid_w_rtn.drop(['rtn'], axis = 1)
        self.y_train_valid = y_train_valid

        X_train_w_rtn, X_valid_w_rtn, y_train, y_valid = train_test_split(
            X_train_valid_w_rtn, y_train_valid, test_size=0.25, random_state=random_state)
        # training set
        self.train_rtn = X_train_w_rtn['rtn']
        self.X_train = X_train_w_rtn.drop(['rtn'], axis = 1)
        self.y_train = y_train
        # validation set
        self.valid_rtn = X_valid_w_rtn['rtn']
        self.X_valid = X_valid_w_rtn.drop(['rtn'], axis = 1)
        self.y_valid = y_valid
        # test set
        self.test_rtn = X_test_w_rtn['rtn']
        self.X_test = X_test_w_rtn.drop(['rtn'], axis = 1)
        self.y_test = y_test

In [14]:
class SeahorseBuyModel():
    """Random-forest buy-signal classifier with a few reporting helpers."""

    def __init__(self):
        """Create the untrained forest (32 trees, depth capped at 50)."""
        self.clf = RandomForestClassifier(n_estimators = 32, max_depth = 50)
        self.features = []

    def fit(self, X_train, y_train):
        """Train the forest and remember the feature (column) names."""
        self.clf.fit(X_train, y_train)
        self.features = X_train.columns

    def score(self, X, y):
        """Return the classifier's ``score`` on ``(X, y)``."""
        return self.clf.score(X, y)

    def evaluate(self, X, y, rtn):
        """Print a per-probability-bucket summary of the predictions.

        The second column of ``predict_proba`` is taken as the win
        probability and truncated to one decimal to form the bucket
        (``trade_class``). For each bucket the printed table shows the mean
        predicted probability, the mean of ``y``, the mean of ``rtn``, the
        number of rows and their share (in %) of all rows.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Labels for the rows of ``X``.
        rtn : pandas.Series
            Returns for the rows of ``X``.
        """
        win_probs = [class_probs[1] for class_probs in self.clf.predict_proba(X)]
        trade_classes = [int(p * 10)/10 for p in win_probs]

        columns = [
            pd.Series(win_probs, name='win_prob'),
            pd.Series(trade_classes, name='trade_class'),
            y.reset_index(drop=True),
            rtn.reset_index(drop=True),
        ]
        results = pd.concat(columns, axis = 1)

        by_class = results.groupby(['trade_class'])
        trade_counts = by_class.count()['win_prob']
        trade_share = 100 * trade_counts/results.shape[0]

        summary = pd.concat([by_class.mean(), trade_counts, trade_share], axis=1)
        summary.columns = ['predicted win prob', 'actual win rate', 'mean return', '# of trades', '% of all trades']
        # summary = summary.reindex([i/10 for i in range(10)])
        print(summary.to_string())

    def feature_importances(self):
        """Return feature names and importances, most important first."""
        table = pd.DataFrame({
            'f_name': list(self.features),
            'f_importance': list(self.clf.feature_importances_),
        })
        return table.sort_values(by=['f_importance'], ascending=False)

In [15]:
# Build the engineered feature table (features, 'rtn' and 'label') from the raw signal files.
data = load_data()

In [16]:
# Persist the feature table; with pandas' defaults the row index is written as the first, unnamed CSV column.
data.to_csv('../data/data_RF3.csv')

In [17]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,osc40,osc39,osc38,osc37,osc36,osc35,osc34,osc33,osc32,osc31,...,macd5,macd4,macd3,macd2,macd1,macd0,macd_smth,macd_lev,rtn,label
0,0.0,-1.73,-4.29,-6.6,-8.31,-8.75,-9.1,-8.87,-8.76,-8.97,...,0.777778,0.814815,0.842593,0.851852,0.87963,0.87963,0.036516,0.87963,-1.06,0
1,0.0,-1.71,-2.15,-2.5,-2.27,-2.16,-2.37,-1.39,-0.64,-0.15,...,0.833333,0.864583,0.864583,0.854167,0.833333,0.822917,0.037382,0.822917,-0.53,0
2,0.0,-0.35,-0.12,-0.01,-0.22,0.76,1.51,2.0,2.64,3.27,...,0.858696,0.847826,0.826087,0.815217,0.793478,0.793478,0.038006,0.793478,-5.32,0
3,0.0,0.62,1.38,1.97,2.48,2.11,1.15,-0.17,-1.45,-3.21,...,0.464286,0.482143,0.5,0.482143,0.482143,0.464286,0.045233,0.464286,-0.56,0
4,0.0,0.59,1.1,0.73,-0.23,-1.55,-2.83,-4.59,-6.45,-8.49,...,0.5,0.482143,0.482143,0.464286,0.446429,0.446429,0.045249,0.446429,-5.03,0


In [18]:
# Balance the two label classes and create the train / validation / test splits.
stock_data = StockData(data)

In [19]:
# Train the random-forest model on the training split only.
model = SeahorseBuyModel()
model.fit(stock_data.X_train, stock_data.y_train)

In [20]:
# Scores on the train, validation and test splits, in that order.
# A training score far above the other two is a sign of overfitting.
print(model.score(stock_data.X_train, stock_data.y_train))
print(model.score(stock_data.X_valid, stock_data.y_valid))
print(model.score(stock_data.X_test, stock_data.y_test))

0.9999570695687638
0.5113014360229249
0.5097556263884865


In [21]:
# Per-probability-bucket summary on the validation split.
model.evaluate(stock_data.X_valid, stock_data.y_valid, stock_data.valid_rtn)

             predicted win prob  actual win rate  mean return  # of trades  % of all trades
trade_class                                                                                
0.1                    0.174270         0.500000     0.567500           12         0.077275
0.2                    0.266184         0.485106     0.466255          235         1.513298
0.3                    0.354143         0.470401     0.304061         1571        10.116556
0.4                    0.442442         0.482700     0.199489         4740        30.523537
0.5                    0.541388         0.497929     0.259493         7001        45.083392
0.6                    0.644675         0.510441     0.344780         1724        11.101810
0.7                    0.733766         0.542373     0.347331          236         1.519737
0.8                    0.828125         0.800000     0.259000           10         0.064396


In [22]:
# Per-probability-bucket summary on the held-out test split.
model.evaluate(stock_data.X_test, stock_data.y_test, stock_data.test_rtn)

             predicted win prob  actual win rate  mean return  # of trades  % of all trades
trade_class                                                                                
0.1                    0.175000         0.400000    -0.105500           20         0.096590
0.2                    0.266718         0.457249     0.159628          269         1.299140
0.3                    0.354465         0.470035     0.210599         2019         9.750797
0.4                    0.442618         0.488712     0.256358         6423        31.019994
0.5                    0.541738         0.513679     0.310579         9321        45.015937
0.6                    0.645132         0.500000     0.248261         2352        11.359026
0.7                    0.733595         0.505226     0.310348          287         1.386072
0.8                    0.821429         0.642857     0.604286           14         0.067613
0.9                    0.906250         1.000000     0.430000            1      

In [23]:
# Features ranked by the fitted forest's importance, highest first.
model.feature_importances()

Unnamed: 0,f_name,f_importance
84,prc_smth,0.011051
127,macd_smth,0.010694
80,prc_d37,0.010055
71,prc_d28,0.009971
83,prc_d40,0.009946
63,prc_d20,0.009796
64,prc_d21,0.009645
52,prc_d9,0.009643
68,prc_d25,0.009623
67,prc_d24,0.009620
