# Pipeline Research
When developing a pipeline, I use this notebook to play around with the data and see what I can do with it.

In [None]:
"""
Preprocessing pipeline
"""
from preprocess.keyword_extractor import DefaultKeywordExtractor
from datasets.docs_dataset import DbDocsDataset
from datasets.stock_dataset import StockMeta
from preprocess.preprocess_pipeline import PreprocessPipeline, PreprocessPipeLineConfig
from preprocess.docs_filterer import DefaultFilterer
from preprocess.docs_labeler import DefaultDocsLabeler
from preprocess.vectorlizer import TFIDFVectorlizer
from preprocess.labeled_docs_filterer import Near0returnFilterer


# Resolve the stock under study by its name from the stock metadata CSV.
stock_name = '台積電'
stock_meta = StockMeta(stock_meta_path="./organized_data/stock_metadata.csv")
stock = stock_meta.get_stock_by_name(stock_name)

# Collect every pipeline stage up front; the stage objects are created in the
# same order in which they are handed to the config.
# NOTE(review): the meaning of `s=3` and `threshold=5` is defined by
# DefaultDocsLabeler / Near0returnFilterer, which are not visible here — confirm
# against those classes before tuning.
config_kwargs = dict(
    docs_dataset=DbDocsDataset(),
    stock=stock,
    docs_filterer=DefaultFilterer(),
    docs_labeler=DefaultDocsLabeler(s=3),
    labeled_docs_filterer=Near0returnFilterer(threshold=5),
    keywords_extractor=DefaultKeywordExtractor(),
    vectorizer=TFIDFVectorlizer(),
)
pipeline_config = PreprocessPipeLineConfig(**config_kwargs)

# Build the pipeline from the config and run it with verbose output enabled;
# the result is kept in `dataset` for the cells that follow.
pipeline = PreprocessPipeline(pipeline_config)
dataset = pipeline.preprocess(verbose=True)

In [None]:
"""
Train & validate model
"""
from sklearn.svm import SVR
from model_evaluation import train_val_model, display_evaluation_result
from utils.data import random_split_train_val

# Second positional argument passed to random_split_train_val.
# NOTE(review): presumably the fraction of samples kept for training — confirm
# against utils.data.random_split_train_val. No seed is passed here, so the
# split may differ between runs.
split_ratio = 0.8

# Estimator: scikit-learn's SVR constructed with its default arguments.
model = SVR()

# Partition the preprocessed dataset into training and validation parts.
train_dataset, val_dataset = random_split_train_val(dataset, split_ratio)

# Fit and evaluate the model via the project helper, then show the outcome.
result = train_val_model(model, train_dataset, val_dataset)
display_evaluation_result(result)