# Load TF-IDF, Doc2Vec, and SBERT vectors

In [1]:
import pandas as pd
import pickle
import numpy as np
from scipy import sparse

# Class labels are read once and reused for every vectorization method.
df = pd.read_csv("../00_data_cleaning/out/20newsgroup_preprocessed.csv")
labels_df = df["target"]


# TF-IDF features.
# NOTE(review): the file carries an .npz extension but is deserialized with
# pickle -- confirm how the vectorization step writes it. Only unpickle files
# you trust: pickle.load can execute arbitrary code.
with open("../01_vectorization/out/tf_idf/X_tfidf.npz", "rb") as tfidf_file:
    X_tfidf = pickle.load(tfidf_file)

# Doc2Vec embeddings stored as a NumPy .npy file.
with open("../01_vectorization/out/doc2vec/d2v_vectors.npy", "rb") as d2v_file:
    X_doc2vec = np.load(d2v_file)

# SBERT embeddings stored as a NumPy .npy file.
with open("../01_vectorization/out/sbert/sbert_vectors.npy", "rb") as sbert_file:
    X_sbert = np.load(sbert_file)


# Report the dimensions of each feature matrix, one line per method.
for _message, _matrix in (
    ("\nTF-IDF matrix shape:", X_tfidf),
    ("\nDoc2Vec matrix shape:", X_doc2vec),
    ("\nSBERT matrix shape:", X_sbert),
):
    print(_message, _matrix.shape)


TF-IDF matrix shape: (18846, 35000)

Doc2Vec matrix shape: (18846, 100)

SBERT matrix shape: (18846, 384)


# Train/validation/test split (scaling still TODO)

In [2]:
from utils import train_val_test_split

# Intended proportions for every feature set: 70% train, 15% validation, 15% test.
# NOTE(review): val_size is assigned here but never handed to
# train_val_test_split below -- confirm the helper derives the validation
# share on its own.
val_size = 0.15
test_size = val_size
shuffle = True
stratify = labels_df
rnd = 42

# Keyword arguments common to all three calls, gathered once so each feature
# set is split with the same settings.
_split_kwargs = dict(
    test_size=test_size,
    random_state=rnd,
    shuffle=shuffle,
    stratify=stratify,
)

# TF-IDF
(
    X_TF_train,
    X_TF_val,
    X_TF_test,
    y_TF_train,
    y_TF_val,
    y_TF_test,
) = train_val_test_split(X_tfidf, labels_df, **_split_kwargs)

# Doc2Vec
(
    X_D2_train,
    X_D2_val,
    X_D2_test,
    y_D2_train,
    y_D2_val,
    y_D2_test,
) = train_val_test_split(X_doc2vec, labels_df, **_split_kwargs)

# SBERT
(
    X_SBERT_train,
    X_SBERT_val,
    X_SBERT_test,
    y_SBERT_train,
    y_SBERT_val,
    y_SBERT_test,
) = train_val_test_split(X_sbert, labels_df, **_split_kwargs)

# TODO: scaling? (e.g. StandardScaler, MinMaxScaler, etc.) -- fit any scaler on
# the training split only so no validation/test information leaks into it.

# SVM

In [3]:
from svm import train_svm

# Fit and evaluate an SVM on the TF-IDF features. The returned object is
# indexed by "test_report" further down to print the test-set report.
print("Training SVM with TF-IDF vectors...")
tfidf_results = train_svm(X_TF_train, X_TF_val, X_TF_test, y_TF_train, y_TF_val, y_TF_test)

# The Doc2Vec and SBERT runs are currently disabled; uncomment to enable them.
#
# print("\nTraining SVM with Doc2Vec vectors...")
# doc2vec_results = train_svm(X_D2_train, X_D2_val, X_D2_test, y_D2_train, y_D2_val, y_D2_test)
#
# print("\nTraining SVM with SBERT vectors...")
# sbert_results = train_svm(
#     X_SBERT_train, X_SBERT_val, X_SBERT_test, y_SBERT_train, y_SBERT_val, y_SBERT_test
# )

# Show the test-set report stored under the "test_report" key.
print("\nTF-IDF Test Report:")
print(tfidf_results["test_report"])

Training SVM with TF-IDF vectors...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

TF-IDF Test Report:
              precision    recall  f1-score   support

           0       0.60      0.57      0.58       120
           1       0.63      0.69      0.66       146
           2       0.66      0.66      0.66       148
           3       0.63      0.64      0.63       147
           4       0.75      0.65      0.69       144
           5       0.84      0.76      0.80       148
           6       0.75      0.67      0.71       146
           7       0.66      0.74      0.70       149
           8       0.48      0.75      0.59       149
           9       0.83      0.83      0.83       149
          10       0.93      0.83      0.88       150
          11       0.88      0.79      0.83       149
          12       0.68      0.72      0.70       148
          13       0.81      0.85      0.83       149
          14       0.74      0.75      0.75       148
          15       

# Plot SVM results


In [4]:
from utils import plot_classification_results

# Per-method lookups handed to the plotting helper. Only TF-IDF is active;
# the Doc2Vec / SBERT entries stay commented out until those SVM runs are enabled.
results_dict = {"TF-IDF": tfidf_results}
# results_dict["Doc2Vec"] = doc2vec_results
# results_dict["SBERT"] = sbert_results

X_test_dict = {"TF-IDF": X_TF_test}
# X_test_dict["Doc2Vec"] = X_D2_test
# X_test_dict["SBERT"] = X_SBERT_test

y_test_dict = {"TF-IDF": y_TF_test}
# y_test_dict["Doc2Vec"] = y_D2_test
# y_test_dict["SBERT"] = y_SBERT_test

# Draw the comparison plot; save_path is optional and names the image file to write.
metrics_df = plot_classification_results(
    results_dict, X_test_dict, y_test_dict, save_path="classification_results.png"
)

# Print the returned metrics table rounded to three decimals.
print("\nDetailed Performance Metrics:")
print(metrics_df.round(3))


Detailed Performance Metrics:
   Accuracy  Macro F1  Method
0      0.72      0.72  TF-IDF


# MLPClassifier

# Decision Tree