In [4]:
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import sparse
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, classification_report, confusion_matrix)
from sklearn.tree import DecisionTreeClassifier

# ------------------- LOAD DATA -------------------
# Root folder of the precomputed feature files, relative to this notebook.
# Spelled once here so relocating the data means editing a single line.
DATA_DIR = '../../data_use'

# TF-IDF features are stored as SciPy sparse matrices (.npz).
X_train_tfidf = sparse.load_npz(f'{DATA_DIR}/tf_idf/X_train_tfidf.npz')
X_test_tfidf = sparse.load_npz(f'{DATA_DIR}/tf_idf/X_test_tfidf.npz')

# LSA and PCA features are stored as NumPy arrays (.npy).
X_train_lsa = np.load(f'{DATA_DIR}/lsa/X_train_lsa.npy')
X_test_lsa = np.load(f'{DATA_DIR}/lsa/X_test_lsa.npy')
X_train_pca = np.load(f'{DATA_DIR}/pca/X_train_pca.npy')
X_test_pca = np.load(f'{DATA_DIR}/pca/X_test_pca.npy')

# The same label vectors (saved next to the TF-IDF files) are used for all
# three representations.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load these files from a trusted source.
# Presumably the labels were saved as an object array (e.g. strings); TODO
# confirm whether allow_pickle is actually required.
y_train = np.load(f'{DATA_DIR}/tf_idf/y_train.npy', allow_pickle=True)
y_test = np.load(f'{DATA_DIR}/tf_idf/y_test.npy', allow_pickle=True)

# Representation name -> (train features, test features).
datasets = {
    'TF-IDF': (X_train_tfidf, X_test_tfidf),
    'LSA': (X_train_lsa, X_test_lsa),
    'PCA': (X_train_pca, X_test_pca)
}

# Fail fast if any feature matrix is out of step with the label vectors, so a
# stale or mismatched artefact is reported here with the representation's name.
for rep_name, (X_tr, X_te) in datasets.items():
    if X_tr.shape[0] != y_train.shape[0]:
        raise ValueError(
            f"{rep_name}: train features have {X_tr.shape[0]} rows "
            f"but y_train has {y_train.shape[0]} labels"
        )
    if X_te.shape[0] != y_test.shape[0]:
        raise ValueError(
            f"{rep_name}: test features have {X_te.shape[0]} rows "
            f"but y_test has {y_test.shape[0]} labels"
        )

In [5]:
# Train one decision tree per feature representation, score it on the test
# split, and persist all results (including the fitted models) to disk.

# Tree hyperparameters, shared by every run so the representations are compared
# under an identical configuration; random_state pins the result.
DT_PARAMS = dict(max_depth=20, min_samples_split=10,
                 min_samples_leaf=5, random_state=42)

results_dt = {}

for name, (X_train, X_test) in datasets.items():
    print(f"\nDataset: {name}")
    
    model = DecisionTreeClassifier(**DT_PARAMS)
    
    # perf_counter() is monotonic and high resolution, so the measured duration
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    t_train = time.perf_counter() - start_train
    
    y_pred = model.predict(X_test)
    
    # 'weighted' averages the per-class scores by class support;
    # zero_division=0 scores an undefined ratio as 0 instead of warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    print(f"Accuracy: {acc*100:.2f}%, F1: {f1:.3f}, Time: {t_train:.2f}s")
    
    # Keep the fitted model next to its metrics so later cells can reuse it.
    results_dt[name] = {
        'model': model,
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
        'cm': confusion_matrix(y_test, y_pred),
        'report': classification_report(y_test, y_pred, zero_division=0)
    }

# Save results
# Create the output folder first: on a fresh checkout it may not exist, and
# open() would then raise FileNotFoundError after all the training is done.
os.makedirs('results', exist_ok=True)
with open('results/dt_results.pkl', 'wb') as f:
    pickle.dump(results_dt, f)
print("\nSaved dt_results.pkl")



Dataset: TF-IDF
Accuracy: 85.59%, F1: 0.857, Time: 0.30s

Dataset: LSA
Accuracy: 94.82%, F1: 0.949, Time: 0.55s

Dataset: PCA
Accuracy: 95.27%, F1: 0.953, Time: 0.57s

Saved dt_results.pkl
