# Pipeline Research
When developing a pipeline, I use this notebook to experiment with the data and explore what can be done with it.

In [None]:
from pipeline.keyword_extractor import DefaultKeywordExtractor
from datasets.docs_dataset import DocsDataset, DbDocsDataset
from datasets.stock_dataset import StockMeta
from pipeline.preprocess_pipeline import PreprocessPipeline, PreprocessPipeLineConfig
from pipeline.docs_filterer import DefaultFilterer
from pipeline.docs_labeler import DefaultDocsLabeler
from pipeline.vectorlizer import TFIDFVectorlizer
from pipeline.labeled_docs_filterer import Near0returnFilterer
from utils.data import random_split_train_val

In [None]:
"""
Preprocessing pipeline
"""
# Alternative document source kept for reference (not executed):
# doc_dataset = DocsDataset(documents_csv_path="./organized_data/documents.csv")

# Resolve the target stock by name from the stock metadata file.
stock_name = '台積電'
stock_meta = StockMeta(stock_meta_path="./organized_data/stock_metadata.csv")
stock = stock_meta.get_stock_by_name(stock_name)

# Stage components are gathered in declaration order and then expanded into
# the config as keyword arguments, so each component is still constructed in
# the same sequence and passed under the same keyword as before.
# NOTE(review): the meaning of s=3 and threshold=5 is defined by
# DefaultDocsLabeler / Near0returnFilterer — confirm before tuning.
stage_settings = dict(
    docs_dataset=DbDocsDataset(),
    stock=stock,
    docs_filterer=DefaultFilterer(),
    docs_labeler=DefaultDocsLabeler(s=3),
    labeled_docs_filterer=Near0returnFilterer(threshold=5),
    keywords_extractor=DefaultKeywordExtractor(),
    vectorizer=TFIDFVectorlizer(),
)
pipeline_config = PreprocessPipeLineConfig(**stage_settings)

# Build the pipeline from the config and run it; the result is bound to
# `dataset` for the cells below.
pipeline = PreprocessPipeline(pipeline_config)
dataset = pipeline.preprocess(verbose=True)

In [2]:
"""
Train & validate model
"""

# This cell only performs the split; the model is fitted and evaluated in a
# later cell.
# NOTE(review): 0.8 is presumably the training fraction — confirm against
# utils.data.random_split_train_val.
SPLIT_RATIO = 0.8
train_dataset, val_dataset = random_split_train_val(dataset, SPLIT_RATIO)

Feature names: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Shape of the TF-IDF matrix: (4, 9)
TF-IDF matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [None]:
from sklearn.svm import SVR
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Helper used to score the regressor as if it were a binary classifier.
def convert_to_binary_labels(y_values):
    """Map continuous values to binary class labels.

    Args:
        y_values: Iterable of numeric values.

    Returns:
        A list, in input order, holding 1 for every value strictly greater
        than zero and 0 otherwise (zero itself maps to 0).
    """
    binary_labels = []
    for value in y_values:
        if value > 0:
            binary_labels.append(1)
        else:
            binary_labels.append(0)
    return binary_labels

# Split the pipeline output into two subsets.
# The preprocessing cell binds its result to `dataset`; the name previously
# used here (`labeled_vectors`) is not defined anywhere in this notebook and
# raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): 0.8 is presumably the training fraction — confirm against
# utils.data.random_split_train_val.
train_dataset, val_dataset = random_split_train_val(dataset, 0.8)

# Every dataset item is unpacked as a (features, label) pair.
X_train, y_train = zip(*train_dataset)
X_val, y_val = zip(*val_dataset)

# Fit a support-vector regressor on the continuous labels.
model = SVR()
model.fit(X_train, y_train)

# Predict on both splits so training and validation can be compared.
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

# The regressor is scored as a binary classifier: predictions and true labels
# are both reduced to 1 (value > 0) or 0 (otherwise).
y_train_pred_binary = convert_to_binary_labels(y_train_pred)
y_val_pred_binary = convert_to_binary_labels(y_val_pred)

y_train_binary = convert_to_binary_labels(y_train)
y_val_binary = convert_to_binary_labels(y_val)

# Both classes are listed explicitly so every confusion matrix is 2x2 even
# when a split contains (or the model predicts) only one class. Without this,
# the matrix shape is inferred from the union of true and predicted values
# while the tick labels came from the true values alone, and the two could
# disagree.
CLASS_LABELS = [0, 1]

# Calculate performance metrics
train_acc = accuracy_score(y_train_binary, y_train_pred_binary)
val_acc = accuracy_score(y_val_binary, y_val_pred_binary)

train_report = classification_report(y_train_binary, y_train_pred_binary)
val_report = classification_report(y_val_binary, y_val_pred_binary)

train_conf_matrix = confusion_matrix(y_train_binary, y_train_pred_binary, labels=CLASS_LABELS)
val_conf_matrix = confusion_matrix(y_val_binary, y_val_pred_binary, labels=CLASS_LABELS)

# Display the results
print("Training accuracy:", train_acc)
print("Validation accuracy:", val_acc)

print("\nTraining classification report:")
print(train_report)

print("\nValidation classification report:")
print(val_report)

print("\nTraining confusion matrix:")
print(train_conf_matrix)

print("\nValidation confusion matrix:")
print(val_conf_matrix)

# Plot the two confusion matrices side by side (training left, validation right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

ConfusionMatrixDisplay(train_conf_matrix, display_labels=CLASS_LABELS).plot(ax=ax1, cmap='Blues', xticks_rotation=45)
ax1.set_title("Training Confusion Matrix")

ConfusionMatrixDisplay(val_conf_matrix, display_labels=CLASS_LABELS).plot(ax=ax2, cmap='Blues', xticks_rotation=45)
ax2.set_title("Validation Confusion Matrix")

plt.tight_layout()
plt.show()
