In [1]:
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, classification_report, confusion_matrix)

# ------------------- LOAD DATA -------------------
# Three feature representations are loaded from disk: TF-IDF as a scipy sparse
# matrix (.npz), LSA and PCA as NumPy arrays (.npy).
# NOTE(review): presumably all three were derived from the same train/test
# split, since a single pair of label files is used for all of them -- confirm.
X_train_tfidf = sparse.load_npz('../../data_use/tf_idf/X_train_tfidf.npz')
X_test_tfidf = sparse.load_npz('../../data_use/tf_idf/X_test_tfidf.npz')
X_train_lsa = np.load('../../data_use/lsa/X_train_lsa.npy')
X_test_lsa = np.load('../../data_use/lsa/X_test_lsa.npy')
X_train_pca = np.load('../../data_use/pca/X_train_pca.npy')
X_test_pca = np.load('../../data_use/pca/X_test_pca.npy')

# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code. Only load label files produced by this project.
y_train = np.load('../../data_use/tf_idf/y_train.npy', allow_pickle=True)
y_test = np.load('../../data_use/tf_idf/y_test.npy', allow_pickle=True)

# Display name -> (train features, test features).
datasets = {
    'TF-IDF': (X_train_tfidf, X_test_tfidf),
    'LSA': (X_train_lsa, X_test_lsa),
    'PCA': (X_train_pca, X_test_pca)
}

# The labels stored next to the TF-IDF features are reused for every
# representation, so each matrix must have exactly one row per label.
# Failing here gives a clearer message than a shape error deep inside fit().
for _ds_name, (_X_tr, _X_te) in datasets.items():
    assert _X_tr.shape[0] == y_train.shape[0], (
        f"{_ds_name}: {_X_tr.shape[0]} train rows but {y_train.shape[0]} train labels"
    )
    assert _X_te.shape[0] == y_test.shape[0], (
        f"{_ds_name}: {_X_te.shape[0]} test rows but {y_test.shape[0]} test labels"
    )
    assert _X_tr.shape[1] == _X_te.shape[1], (
        f"{_ds_name}: train/test feature counts differ "
        f"({_X_tr.shape[1]} vs {_X_te.shape[1]})"
    )

In [2]:
# Train one logistic-regression model per feature representation, score it on
# the held-out test set, and collect everything in `results_lr`:
#   results_lr[name] -> {'model', 'acc', 'prec', 'rec', 'f1', 'cm', 'report'}
results_lr = {}

for name, (X_train, X_test) in datasets.items():
    print(f"\nDataset: {name}")
    
    # `multi_class` is intentionally not passed: scikit-learn 1.5 deprecated it
    # (it emits a FutureWarning on every fit). With solver='lbfgs' the default
    # already fits a multinomial model when there are three or more classes.
    # NOTE(review): for a two-class problem the default fits a plain binary
    # logistic regression instead -- confirm the label set if that matters.
    model = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs')
    
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    t_train = time.perf_counter() - start_train
    
    y_pred = model.predict(X_test)
    
    # 'weighted' averages per-class scores by class support; zero_division=0
    # scores an undefined precision/recall as 0 instead of emitting a warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    print(f"Accuracy: {acc*100:.2f}%, F1: {f1:.3f}, Time: {t_train:.2f}s")
    
    results_lr[name] = {
        'model': model,
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
        'cm': confusion_matrix(y_test, y_pred),
        'report': classification_report(y_test, y_pred, zero_division=0)
    }

# Save results
# Create the output directory first so a fresh checkout (no 'results/' yet)
# does not fail with FileNotFoundError after all models have been trained.
os.makedirs('results', exist_ok=True)
with open('results/lr_results.pkl', 'wb') as f:
    pickle.dump(results_lr, f)
print("\nSaved lr_results.pkl")



Dataset: TF-IDF




Accuracy: 99.10%, F1: 0.991, Time: 0.21s

Dataset: LSA
Accuracy: 98.65%, F1: 0.986, Time: 0.07s

Dataset: PCA
Accuracy: 98.42%, F1: 0.984, Time: 0.05s

Saved lr_results.pkl


