# Back Track Research



### Set up notebook (not important)
change directory to root and import stuff

In [5]:
import os
import random

folder_name = "demo"
if folder_name in os.getcwd():
    os.chdir(os.path.abspath(os.pardir))
%pwd

'/Users/Ethan/Developer/Projects/College/大四下/數據分析/期中'

In [6]:
from datasets.docs_dataset import DocsDataset, DbDocsDataset
from datasets.stock_dataset import StockMeta
from preprocess.preprocess_pipeline import PreprocessPipeline
from preprocess.docs_filterer import IDocsFilterer, StockNameFilterer, Word2VecSimilarFilterer
from preprocess.docs_labeler import IDocsLabeler, FutureReturnDocsLabeler
from preprocess.keyword_extractor import IKeywordExtractor, JiebaKeywordExtractor, TFIDFKeywordExtractor, ChiSquareKeywordExtractor
from preprocess.vectorlizer import IVectorlizer, KeywordsTfIdfVectorlizer
from sklearn.linear_model import LogisticRegression
from back_track.back_track import BackTrackConfig, BackTrackResult, BackTrack
from datetime import datetime, timedelta
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [7]:
# set up config
docs_dataset = DbDocsDataset()
stock_meta = StockMeta("./organized_data/stock_metadata.csv")
stock = stock_meta.get_stock_by_name("群創")
clf = make_pipeline(StandardScaler(), SVC(probability=True))
preprocess_pipeline = PreprocessPipeline(
    docs_filterer=Word2VecSimilarFilterer(topn=5, white_noise_ratio=0),
    docs_labeler=FutureReturnDocsLabeler(s=3, threshold=0.1),
    keywords_extractor=JiebaKeywordExtractor(),
    vectorizer=KeywordsTfIdfVectorlizer(count_features=100, pca_components=10)
)

config = BackTrackConfig(
    s=3,
    docs_dataset=docs_dataset,
    start_date=datetime(2019, 4, 1),
    end_date=datetime(2019, 12, 30),
    train_span=timedelta(days=90),
    inference_span=timedelta(days=30),
    take_shot_threshold=0.1,
    preprocess_pipeline=preprocess_pipeline
)

# run backtest
backtrack = BackTrack(config)
result = backtrack.run(stock, clf)

backtest start date: 2019-04-01 00:00:00
backtest end date: 2019-12-30 00:00:00
train span: 90 days, 0:00:00
inference span: 30 days, 0:00:00


train span: 2019-04-01 00:00:00 => 2019-06-30 00:00:00

inference span: 2019-06-30 00:00:00 => 2019-07-30 00:00:00


test date: 2019-07-30 00:00:00: 31it [00:51,  1.65s/it]                        




train span: 2019-05-01 00:00:00 => 2019-07-30 00:00:00

inference span: 2019-07-30 00:00:00 => 2019-08-29 00:00:00


test date: 2019-08-29 00:00:00: 31it [00:44,  1.44s/it]                        




train span: 2019-05-31 00:00:00 => 2019-08-29 00:00:00

inference span: 2019-08-29 00:00:00 => 2019-09-28 00:00:00


test date: 2019-09-27 00:00:00: 31it [00:43,  1.42s/it]                        




train span: 2019-06-30 00:00:00 => 2019-09-28 00:00:00

inference span: 2019-09-28 00:00:00 => 2019-10-28 00:00:00


test date: 2019-10-28 00:00:00: 31it [00:41,  1.33s/it]                        




train span: 2019-07-30 00:00:00 => 2019-10-28 00:00:00

inference span: 2019-10-28 00:00:00 => 2019-11-27 00:00:00


test date: 2019-11-27 00:00:00: 31it [00:49,  1.61s/it]                        




train span: 2019-08-29 00:00:00 => 2019-11-27 00:00:00

inference span: 2019-11-27 00:00:00 => 2019-12-27 00:00:00


test date: 2019-12-27 00:00:00: 31it [00:48,  1.57s/it]                        



train span: 2019-09-28 00:00:00 => 2019-12-27 00:00:00
backtest end





# 回測成果展示

In [1]:
def display_result(result):
    # convdrt result to pd dateframe
    df = result.to_df()
    df["date"] = df["date"].apply(lambda x: x.strftime("%Y-%m-%d"))
    df["train_start_date"] = df["train_start_date"].apply(lambda x: x.strftime("%Y-%m-%d"))
    df["train_end_date"] = df["train_end_date"].apply(lambda x: x.strftime("%Y-%m-%d"))
    df["inference_start_date"] = df["inference_start_date"].apply(lambda x: x.strftime("%Y-%m-%d"))
    df["inference_end_date"] = df["inference_end_date"].apply(lambda x: x.strftime("%Y-%m-%d"))
    df["train_accuracy"] = df["train_accuracy"].apply(lambda x: round(x, 4))

    # display result
    display(df)
    # print result
    print("總出手次數: ", round(result.total_profit, 4))
    print("總準確率: ", round(result.total_profit_rate, 4))

display(result)

2019-07-01
y_pred        day  count_up  count_down 當天實際 當天預測 出手
0      2019-07-01       0.0         8.0    跌    跌  是
1      2019-07-02       0.0        11.0    跌    跌  是
2      2019-07-03       0.0         5.0    漲    跌  是
3      2019-07-04       0.0         4.0    跌    漲  是
4      2019-07-05       0.0         7.0    跌    漲  是
7      2019-07-08       0.0         6.0    漲    漲  是
8      2019-07-09       6.0        17.0    漲    漲  是
9      2019-07-10       0.0         6.0    跌    漲  是
10     2019-07-11       0.0         7.0    漲    漲  是
11     2019-07-12       2.0        17.0    漲    漲  是
14     2019-07-15       0.0        16.0    漲    漲  是
15     2019-07-16       0.0        17.0    漲    跌  是
16     2019-07-17       0.0        38.0    跌    跌  是
17     2019-07-18       0.0        18.0    跌    跌  是
18     2019-07-19       0.0        10.0    跌    跌  是
20     2019-07-22       0.0        10.0    跌    跌  是
21     2019-07-23       0.0         9.0    跌    跌  是
22     2019-07-24       2.0        