# Pipeline Research
When developing a pipeline, I use this notebook to experiment with the data and see what can be done with it.

In [2]:
"""
Preprocessing pipeline
"""
from preprocess.keyword_extractor import DefaultKeywordExtractor
from datasets.docs_dataset import DbDocsDataset
from datasets.stock_dataset import StockMeta
from preprocess.preprocess_pipeline import PreprocessPipeline, PreprocessPipeLineConfig
from preprocess.docs_filterer import DefaultFilterer
from preprocess.docs_labeler import DefaultDocsLabeler
from preprocess.vectorlizer import TFIDFVectorlizer
from preprocess.labeled_docs_filterer import Near0returnFilterer


# Data sources: the document corpus and the stock metadata table.
# NOTE(review): DbDocsDataset is project-local; presumably database-backed (going by
# its name) -- confirm before running this on a machine without DB access.
docs_dataset = DbDocsDataset()
stock_meta = StockMeta(stock_meta_path="./organized_data/stock_metadata.csv")
# The single stock this experiment targets, looked up by its name in the metadata.
stock_name = '台積電'
stock = stock_meta.get_stock_by_name(stock_name)

# One component per preprocessing stage. The per-stage descriptions below are taken
# from the log lines each component prints when run with verbose=True; the recorded
# run executed them in the order they are listed here.
pipeline_config = PreprocessPipeLineConfig(
    # Keeps documents whose title or content contains the stock name.
    docs_filterer=DefaultFilterer(),
    # Labels each document by the s-day future return percentage of the stock (s=3 here).
    docs_labeler=DefaultDocsLabeler(s=3),
    # Filters out documents whose return is close to 0 (within 0.01).
    labeled_docs_filterer=Near0returnFilterer(threshold=0.01),
    # Extracts keywords using jieba; in the recorded run, documents left with empty
    # keywords were removed at this stage.
    keywords_extractor=DefaultKeywordExtractor(),
    # Converts the remaining documents to tf-idf vectors.
    vectorizer=TFIDFVectorlizer()
)

pipeline = PreprocessPipeline(pipeline_config)

# `dataset` is the input to the train/validate cell further down.
# NOTE(review): in the recorded run the filtering progress bar stops at
# 1000/2362610 and 1001 documents remain -- looks like the filterer stops early
# at a document cap rather than scanning the whole corpus; confirm this is intended.
dataset = pipeline.preprocess(docs_dataset, stock, verbose=True)

[DefaultFilterer] filtering documents by whether doc title or content contains the stock name


filtering documents:   0%|          | 1000/2362610 [00:10<7:00:09, 93.68it/s]


left with 1001 documents after filtering
[DefaultDocsLabeler] labeling documents by the s day future return percentage of the stock...


labeling documents: 1026it [00:01, 725.64it/s]


[Near 0 return filterer] filtering out documents with return close to 0 within 0.01


filtering documents: 100%|██████████| 503/503 [00:00<00:00, 502317.84it/s]


left with 503 documents after filtering
[DefaultKeywordExtractor] extract keywords from documents using jieba


extracting keywords: 100%|██████████| 503/503 [00:00<?, ?it/s]


remove 9 docs because of empty keywords
left with 494 docs
[TFIDFVectorlizer] converting 494 docs to vectors using tf-idf
tfidf vectorizer fitting...
tfidf vectorizer fitted.


In [3]:
"""
Train & validate model
"""
from sklearn.svm import SVR
from model_evaluation import train_val_model, display_evaluation_result
from utils.data import random_split_train_val

# Model under test: scikit-learn's support vector regressor, default hyper-parameters.
model = SVR()
# Split the preprocessed `dataset` from the preprocessing cell into train/validation.
# NOTE(review): 0.8 is presumably the training fraction -- confirm against
# utils.data.random_split_train_val. No random seed is set anywhere in the visible
# notebook, so the split (and every metric below) may differ between runs.
train_dataset, val_dataset = random_split_train_val(dataset, 0.8)

# Fit `model` on the training split and evaluate it on the validation split.
# NOTE(review): the saved run of this cell ended with
#   "ValueError: Number of classes, 1, does not match size of target_names, 2.
#    Try specifying the labels parameter"
# The traceback is not kept, so it is unclear which of the two calls below raised.
# The wording matches scikit-learn's classification_report when only one class is
# present in the evaluated labels but two target_names are supplied -- presumably
# the evaluation helper builds a two-class report from the regressor's outputs.
# Verify in model_evaluation and pass an explicit `labels=` there.
result = train_val_model(model, train_dataset, val_dataset)

# Render the evaluation result returned above.
display_evaluation_result(result)

ValueError: Number of classes, 1, does not match size of target_names, 2. Try specifying the labels parameter