In [1]:
import pandas as pd
import numpy as np
import requests
import json
import urllib
import httpx
from datetime import datetime

import sys
sys.path.append('../')
from config.tda.config import CONSUMER_KEY, REDIRECT_URI, JSON_PATH
from src.PaperTrader import PaperTrader
from src.TechAnalysis import TechAnalysis

from tda import auth, client
from tda.auth import easy_client
from tda.client import Client

token_path = JSON_PATH
api_key = CONSUMER_KEY
redirect_uri = REDIRECT_URI

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score
from statsmodels.tsa.stattools import adfuller

In [2]:
"""
Login to TDAMERITRADE
"""

try:
    c = auth.client_from_token_file(token_path, api_key)
except:
    from selenium import webdriver
    with webdriver.Chrome('/home/daniel/chromedriver') as driver:
        c = auth.client_from_login_flow(
            driver, api_key, redirect_uri, token_path)

In [3]:
"""
Login/pull data from TDAMERTIRADE
"""

stock_ticker = 'VOO'

c = easy_client(
        api_key=api_key,
        redirect_uri=REDIRECT_URI,
        token_path=JSON_PATH)

resp = c.get_price_history(stock_ticker,
        period_type=Client.PriceHistory.PeriodType.YEAR,
        period=Client.PriceHistory.Period.TWENTY_YEARS,
        frequency_type=Client.PriceHistory.FrequencyType.DAILY,
        frequency=Client.PriceHistory.Frequency.DAILY)
assert resp.status_code == httpx.codes.OK


data = pd.DataFrame(c.get_price_history_every_minute(stock_ticker).json()['candles'])
data['target'] = data['close'].shift(-1)

In [4]:
"""
Time function
"""
def time_processing(df):
    dt = np.array(df['datetime'].values)/1000
    func = lambda x: datetime.fromtimestamp(x)
    funcvec = np.vectorize(func)
    dt = funcvec(dt)
    df['datetime'] = dt
    return df

Weights formula:  
$w_k = -w_{k-1} \frac{d - k + 1}{k}$

Weight converges to zero: 
$w_k \rightarrow 0$

In [5]:
"""
Use Fractional Differencing to create stationary data for predictions
Data offsets by weights_window_size
"""

def get_weights_floored(d, num_k, floor=1e-3):
    r"""Calculate weights ($w$) for each lag ($k$) through
    $w_k = -w_{k-1} \frac{d - k + 1}{k}$ provided weight above a minimum value
    (floor) for the weights to prevent computation of weights for the entire
    time series.

    Args:
        d (int): differencing value.
        num_k (int): number of lags (typically length of timeseries) to calculate w.
        floor (float): minimum value for the weights for computational efficiency.
    """
    w_k = np.array([1])
    k = 1

    while k < num_k:
        w_k_latest = -w_k[-1] * ((d - k + 1)) / k
        if abs(w_k_latest) <= floor:
            break

        w_k = np.append(w_k, w_k_latest)

        k += 1

    w_k = w_k.reshape(-1, 1) 

    return w_k

def frac_diff(df, d, floor=1e-3):
    r"""Fractionally difference time series via CPU.

    Args:
        df (pd.DataFrame): dataframe of raw time series values.
        d (float): differencing value from 0 to 1 where > 1 has no FD.
        floor (float): minimum value of weights, ignoring anything smaller.
    """
    # Get weights window
    weights = get_weights_floored(d=d, num_k=len(df), floor=floor)
    weights_window_size = len(weights)

    # Reverse weights
    weights = weights[::-1]

    # Blank fractionally differenced series to be filled
    df_fd = []

    # Slide window of time series, to calculated fractionally differenced values
    # per window
    for idx in range(weights_window_size, df.shape[0]):
        # Dot product of weights and original values
        # to get fractionally differenced values
        date_idx = df.index[idx]
        df_fd.append(np.dot(weights.T, df.iloc[idx - weights_window_size:idx]).item())

    # Return FD values and weights
    df_fd = pd.DataFrame(df_fd)

    return df_fd, weights

def find_best_d_value(df, alpha):
    """
    Run iterations of fractional differing to find best d_value.
    Do this by running adfuller test. A test of Stationarity. Low p_value = Stationarity
    Returns values, weights, and d_value of best_d_value
    """
    adfs = []
    
    for d in np.linspace(0,1,51):
        df_fd, weights = frac_diff(data['close'], d)
        pvalue = adfuller(df_fd.values)[1]
        adfs.append([d, pvalue])
        if pvalue < alpha:
            break      
    best_d_value = adfs[-1][0]
    return df_fd, weights, best_d_value



In [6]:
"""
Imputes fractional differencing into data
"""
df_fd, weights, best_d_value = find_best_d_value(data['close'], alpha=.05)
data['frac_diff_cost'] = np.nan
data['frac_diff_cost'].iloc[len(weights):] = df_fd[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [7]:
"""
TechAnalysis tutorial
"""
column_name = 'close'
ta = TechAnalysis(data)
# data['ma'] = ta.moving_average(column_name, 5)
# data['rsi'] = ta.rsi(column_name, 10)
# data['macd'] = ta.macd(column_name, 10, 30)
# data['upper'], data['lower'] = ta.bollinger_bands(column_name, 10)
# ta.fib_retracement()

In [8]:
"""
Create mass features
"""

column_name = 'close'
ta = TechAnalysis(data)
price_offset=1.000001

steps = [5, 10, 20, 30, 40, 50]
macds = [[2,10],[5,10],[10,20],[10,30],[20,30]]
bbs_std = [1, 1.5, 2]

for step in steps:
    data[f'ma_{step}'] = ta.moving_average(column_name, step)
    data[f'ewa_{step}'] = ta.moving_average(column_name, step, simple=False)
    data[f'rsi_{step}'] = ta.rsi(column_name, step)
    for std in bbs_std:
        data[f'bb_{step}_{std}_upper'],  data[f'bb_{step}_{std}_lower']= ta.bollinger_bands(column_name, step, std = std)


for macd in macds:
    short, long = macd
    data[f'rsi_{step}'] = ta.macd(column_name, short, long)

data = time_processing(data)
data['target_classifier'] = 0
data['target_classifier'][data['target']>data['close']*price_offset] = 1
data.reset_index(inplace=True, drop=True)
data.drop(['datetime', 'target', 'open', 'high', 'low'], axis=1, inplace=True)
data.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
data

Unnamed: 0,close,volume,frac_diff_cost,ma_5,ewa_5,rsi_5,bb_5_1_upper,bb_5_1_lower,bb_5_1.5_upper,bb_5_1.5_lower,...,ma_50,ewa_50,rsi_50,bb_50_1_upper,bb_50_1_lower,bb_50_1.5_upper,bb_50_1.5_lower,bb_50_2_upper,bb_50_2_lower,target_classifier
64,429.39,8329,85.687128,429.108,429.325484,73.372781,429.350012,428.865988,429.471019,428.744981,...,429.624500,429.325484,0.103917,429.920449,429.328551,430.068424,429.180576,430.216399,429.032601,1
65,429.57,12236,85.614409,429.236,429.488495,83.333333,429.525016,428.946984,429.669523,428.802477,...,429.626500,429.488495,0.104083,429.921721,429.331279,430.069331,429.183669,430.216942,429.036058,1
66,429.82,9947,85.747129,429.396,429.709498,85.714286,429.749737,429.042263,429.926606,428.865394,...,429.630900,429.709498,0.104083,429.927355,429.334445,430.075582,429.186218,430.223809,429.037991,1
67,430.01,32679,85.920633,429.626,429.909833,100.000000,429.911359,429.340641,430.054039,429.197961,...,429.640900,429.909833,0.101750,429.941596,429.340204,430.091944,429.189856,430.242292,429.039508,0
68,429.97,17666,86.000940,429.752,429.949944,94.366197,430.017932,429.486068,430.150898,429.353102,...,429.662500,429.949944,0.100583,429.946481,429.378519,430.088471,429.236529,430.230462,429.094538,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16185,428.42,100,85.229844,428.416,428.416951,60.000000,428.424944,428.407056,428.429416,428.402584,...,429.011282,428.416951,0.176833,429.432111,428.590453,429.642525,428.380039,429.852940,428.169624,0
16186,428.37,100,85.248166,428.404,428.385650,12.500000,428.423494,428.384506,428.433240,428.374760,...,428.992682,428.385650,0.153333,429.420976,428.564388,429.635122,428.350242,429.849269,428.136095,1
16187,428.45,447,85.202050,428.412,428.428550,64.285714,428.440636,428.383364,428.454953,428.369047,...,428.974682,428.428550,0.126500,429.406549,428.542815,429.622483,428.326881,429.838417,428.110947,1
16188,428.53,206,85.303332,428.436,428.496183,77.272727,428.495833,428.376167,428.525750,428.346250,...,428.951882,428.496183,0.110667,429.376320,428.527444,429.588539,428.315225,429.800758,428.103006,1


In [10]:
"""
test/train/ver split
"""


train_size = round(.9*len(data))

train = data[:train_size]
test = data[train_size:]

y_train = train.pop('target_classifier')
X_train = train
# X_train.drop('datetime', axis=1, inplace=True)

y_test = test.pop('target_classifier')
X_test = test


In [11]:
bestfeatures = SelectKBest(score_func=f_regression, k=10)
fit = bestfeatures.fit(X_train,y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
featureScores = pd.concat([dfcolumns, dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))  #print 10 best features

              Specs     Score
23           rsi_20  5.272517
50           rsi_50  4.246009
2    frac_diff_cost  3.553181
10     bb_5_2_upper  2.849324
19    bb_10_2_upper  2.823238
8    bb_5_1.5_upper  2.750887
17  bb_10_1.5_upper  2.724885
6      bb_5_1_upper  2.653967
15    bb_10_1_upper  2.627751
3              ma_5  2.464915


In [12]:
# f_reg_largest = featureScores.nlargest(10,'Score')
# X_train_filtered = X_train[X_train.columns[f_reg_largest.index]]
# X_test_filtered = X_test[X_test.columns[f_reg_largest.index]]

In [13]:
pipe = make_pipeline(SelectKBest(score_func=f_regression, k=5), GradientBoostingClassifier())
tscv = TimeSeriesSplit(n_splits=2)
parameters = {}
clf = GridSearchCV(pipe, parameters, n_jobs=-1, cv=tscv)
clf.fit(X_train,y_train)
clf.best_params_

{}

In [14]:
clf.cv_results_
clf.best_score_

0.5130246020260492

In [15]:
y_pred = clf.predict(X_train)
print(precision_score(y_train, y_pred), accuracy_score(y_train, y_pred))

y_pred = clf.predict(X_test)
print(precision_score(y_test, y_pred), accuracy_score(y_test, y_pred))


0.6185176941909637 0.5830059954517263
0.49209932279909707 0.5235732009925558


In [16]:
"""
Tutorial of PaperTrader Class
"""
# PT = PaperTrader(500)
# key = PT.buy('MSFT', 100, 3)
# print(PT.current_record())
# print(PT.current_free_cash())
# PT.sell(key, 102, 3)
# print(pd.DataFrame(PT.current_record()))
# print(PT.current_free_cash())
        

'\nTutorial of PaperTrader Class\n'

In [17]:
print(X_test.head())
print(y_pred[:10])

          close  volume  frac_diff_cost       ma_5       ewa_5      rsi_5  \
14577  439.5000    2308       87.674688  439.52600  439.514078  25.690430   
14578  439.4500    3602       87.627582  439.52000  439.471359  42.105263   
14579  439.5500    4733       87.582754  439.51800  439.523786  47.619048   
14580  439.6150    6395       87.696201  439.53100  439.584595  62.264151   
14581  439.6626   11972       87.730794  439.55552  439.636598  70.257766   

       bb_5_1_upper  bb_5_1_lower  bb_5_1.5_upper  bb_5_1.5_lower  ...  \
14577    439.560351    439.491649      439.577527      439.474473  ...   
14578    439.565277    439.474723      439.587915      439.452085  ...   
14579    439.561243    439.474757      439.582865      439.453135  ...   
14580    439.592278    439.469722      439.622917      439.439083  ...   
14581    439.641035    439.470005      439.683793      439.427247  ...   

       bb_40_2_lower       ma_50      ewa_50    rsi_50  bb_50_1_upper  \
14577     438.65250

In [18]:
PT_test = PaperTrader(5000)
X_test_temp = X_test['close'].reset_index()
for i, pred in enumerate(y_pred):
    if pred:
        key = PT_test.buy('VOO', X_test_temp['close'][i], 1)
        try:
            PT_test.sell(key, X_test_temp['close'][i+1],1)
        except:
            pass
print(pd.DataFrame(PT_test.current_record()))
print(PT_test.current_free_cash())

     key ticker  buy_price  buy_amount  buy_total_amount             buy_time  \
0      0    VOO     439.45           1            439.45  2022-01-10 01:08:23   
1      1    VOO     439.55           1            439.55  2022-01-10 01:08:23   
2      2    VOO     439.39           1            439.39  2022-01-10 01:08:23   
3      3    VOO     439.46           1            439.46  2022-01-10 01:08:23   
4      4    VOO     439.58           1            439.58  2022-01-10 01:08:23   
..   ...    ...        ...         ...               ...                  ...   
438  438    VOO     428.56           1            428.56  2022-01-10 01:08:23   
439  439    VOO     428.52           1            428.52  2022-01-10 01:08:23   
440  440    VOO     428.49           1            428.49  2022-01-10 01:08:23   
441  441    VOO     428.41           1            428.41  2022-01-10 01:08:23   
442  442    VOO     428.37           1            428.37  2022-01-10 01:08:23   

     sell_price  sell_amoun