In [2]:
pip install numpy pandas scikit-learn joblib flask nltk scipy


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: C:\Users\Deiv\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
# news_tfidf_classifier.py
"""
TF-IDF -> classifier for selected 20Newsgroups categories.
- Uses TfidfVectorizer with built-in stopword removal for speed.
- Uses LogisticRegression (fast & well-suited to sparse text).
- Prints accuracy, precision/recall/f1, classification report.
- Saves pipeline to news_tfidf_lr.joblib
"""

from html import parser
import joblib
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import argparse

def minimal_clean(text):
    # optional mild cleanup (keep it fast)
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # remove long runs of whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_data(categories):
    data = fetch_20newsgroups(categories=categories, subset='all',
                              remove=('headers','footers','quotes'))
    X = [minimal_clean(t) for t in data.data]
    y = data.target
    target_names = data.target_names
    return X, y, target_names

def build_pipeline():
    vect = TfidfVectorizer(ngram_range=(1,2),
                           max_df=0.9,
                           min_df=3,
                           sublinear_tf=True,
                           stop_words='english')  # use built-in stopwords
    clf = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, random_state=42)
    pipe = Pipeline([
        ("tfidf", vect),
        ("clf", clf)
    ])
    return pipe

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--test_size", type=float, default=0.2)
    args, _ = parser.parse_known_args()

    categories = ['comp.graphics', 'misc.forsale', 'rec.sport.baseball',
                  'soc.religion.christian', 'talk.politics.guns']

    print("Loading data...")
    X, y, target_names = load_data(categories)
    print(f"samples: {len(X)}, classes: {len(target_names)}")

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=args.test_size,
                                                        stratify=y,
                                                        random_state=42)

    pipe = build_pipeline()
    print("Training pipeline...")
    pipe.fit(X_train, y_train)

    print("Predicting test set...")
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision (weighted): {prec:.4f}")
    print(f"Recall (weighted): {rec:.4f}")
    print(f"F1 (weighted): {f1:.4f}")
    print("\nClassification report:\n")
    print(classification_report(y_test, y_pred, target_names=target_names, zero_division=0))

    # Save pipeline
    model_path = "news_tfidf_lr.joblib"
    joblib.dump(pipe, model_path)
    print(f"Saved pipeline to {model_path}")

if __name__ == "__main__":
    main()


Loading data...
samples: 4849, classes: 5
Training pipeline...
Predicting test set...
Accuracy: 0.9124
Precision (weighted): 0.9167
Recall (weighted): 0.9124
F1 (weighted): 0.9125

Classification report:

                        precision    recall  f1-score   support

         comp.graphics       0.91      0.91      0.91       195
          misc.forsale       0.96      0.88      0.92       195
    rec.sport.baseball       0.84      0.99      0.91       199
soc.religion.christian       0.96      0.87      0.92       199
    talk.politics.guns       0.92      0.91      0.91       182

              accuracy                           0.91       970
             macro avg       0.92      0.91      0.91       970
          weighted avg       0.92      0.91      0.91       970

Saved pipeline to news_tfidf_lr.joblib


In [1]:
import joblib
from sklearn.datasets import fetch_20newsgroups

# Load saved model
model = joblib.load("news_tfidf_lr.joblib")

# Same categories as training
categories = ['comp.graphics', 'misc.forsale', 'rec.sport.baseball',
              'soc.religion.christian', 'talk.politics.guns']
target_names = fetch_20newsgroups(categories=categories, subset='train').target_names

# Try some custom texts
examples = [
    "rtx 4050 laptops are good",
    "I bought a laptop on sale last week.",
    "The baseball team won their match yesterday.",
    "God is love and faith is important in life.",
    "The debate on gun laws is heating up again in politics.",
    "He bought a new computer with a powerful GPU."
]

for text in examples:
    pred = model.predict([text])[0]
    print(f"Text: {text}\n → Predicted category: {target_names[pred]}\n")


Text: rtx 4050 laptops are good
 → Predicted category: rec.sport.baseball

Text: I bought a laptop on sale last week.
 → Predicted category: misc.forsale

Text: The baseball team won their match yesterday.
 → Predicted category: rec.sport.baseball

Text: God is love and faith is important in life.
 → Predicted category: soc.religion.christian

Text: The debate on gun laws is heating up again in politics.
 → Predicted category: talk.politics.guns

Text: He bought a new computer with a powerful GPU.
 → Predicted category: misc.forsale



In [2]:
text = "3050 is good"
pred = model.predict([text])[0]
print("Predicted category:", target_names[pred])


Predicted category: rec.sport.baseball


In [4]:
# hybripyd_tfidf_lsa.
"""
Hybrid TF-IDF + LSA -> Logistic Regression classifier.

Saves model components to news_tfidf_lsa_lr.joblib.

Usage:
    python hybrid_tfidf_lsa.py --n_components 100 --test_size 0.2
"""
import argparse
import re
import joblib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, classification_report
)
from sklearn.preprocessing import StandardScaler
from scipy import sparse
import numpy as np
import os

def minimal_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_data(categories):
    data = fetch_20newsgroups(categories=categories, subset='all', remove=('headers','footers','quotes'))
    X = [minimal_clean(t) for t in data.data]
    y = data.target
    target_names = data.target_names
    return X, y, target_names

def build_tfidf(train_texts, ngram_range=(1,2), max_df=0.9, min_df=3):
    vect = TfidfVectorizer(ngram_range=ngram_range,
                           max_df=max_df, min_df=min_df,
                           sublinear_tf=True, stop_words='english')
    X_tfidf = vect.fit_transform(train_texts)
    return vect, X_tfidf

def combine_sparse_dense(X_sparse, X_dense):
    """Concatenate sparse matrix with dense array (converted to sparse)."""
    return sparse.hstack([X_sparse, sparse.csr_matrix(X_dense)], format='csr')

def evaluate(y_true, y_pred, target_names):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision (weighted): {prec:.4f}")
    print(f"Recall (weighted): {rec:.4f}")
    print(f"F1 (weighted): {f1:.4f}\n")
    print("Classification report:\n")
    print(classification_report(y_true, y_pred, target_names=target_names, zero_division=0))
    return acc, prec, rec, f1

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_components", type=int, default=100, help="TruncatedSVD components (LSA dims)")
    parser.add_argument("--test_size", type=float, default=0.2)
    parser.add_argument("--compare_baseline", action="store_true",
                        help="If set, attempt to load news_tfidf_lr.joblib and compare performance on the same test set")
    # safe for notebooks
    args, _ = parser.parse_known_args()

    categories = ['comp.graphics', 'misc.forsale', 'rec.sport.baseball',
                  'soc.religion.christian', 'talk.politics.guns']

    print("Loading data...")
    X, y, target_names = load_data(categories)
    print(f"samples: {len(X)}, classes: {len(target_names)}")

    # split raw texts (we'll fit tfidf on train only)
    X_train_texts, X_test_texts, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, stratify=y, random_state=42
    )

    # TF-IDF
    print("Building TF-IDF...")
    tfidf_vect, X_train_tfidf = build_tfidf(X_train_texts)
    X_test_tfidf = tfidf_vect.transform(X_test_texts)
    print("TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)

    # LSA via TruncatedSVD on train TF-IDF
    print(f"Running TruncatedSVD (n_components={args.n_components})...")
    svd = TruncatedSVD(n_components=args.n_components, random_state=42)
    X_train_lsa = svd.fit_transform(X_train_tfidf)
    X_test_lsa = svd.transform(X_test_tfidf)
    print("LSA shapes:", X_train_lsa.shape, X_test_lsa.shape)

    # Scale dense chunk (important when concatenating)
    scaler = StandardScaler()
    X_train_lsa_scaled = scaler.fit_transform(X_train_lsa)
    X_test_lsa_scaled = scaler.transform(X_test_lsa)

    # Concatenate sparse TF-IDF + dense LSA (converted to sparse)
    X_train_comb = combine_sparse_dense(X_train_tfidf, X_train_lsa_scaled)
    X_test_comb  = combine_sparse_dense(X_test_tfidf, X_test_lsa_scaled)
    print("Combined shapes:", X_train_comb.shape, X_test_comb.shape)

    # Train classifier (LogisticRegression)
    clf = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, random_state=42)
    print("Training classifier on combined features...")
    clf.fit(X_train_comb, y_train)

    # Predict + evaluate
    print("Predicting test set...")
    y_pred = clf.predict(X_test_comb)
    evaluate(y_test, y_pred, target_names)

    # Save the needed components so we can load later for inference
    model_bundle = {
        "tfidf_vect": tfidf_vect,
        "svd": svd,
        "scaler": scaler,
        "clf": clf,
        "categories": categories
    }
    out_path = "news_tfidf_lsa_lr.joblib"
    joblib.dump(model_bundle, out_path)
    print(f"Saved hybrid model bundle to {out_path}")

    # Optional: compare with baseline model if user asked and baseline exists
    if args.compare_baseline:
        baseline_path = "news_tfidf_lr.joblib"
        if os.path.exists(baseline_path):
            print("\nLoading baseline pipeline and comparing on the same test set...")
            baseline = joblib.load(baseline_path)
            # baseline expects raw text input (pipeline), so pass X_test_texts
            base_pred = baseline.predict(X_test_texts)
            print("Baseline metrics:")
            evaluate(y_test, base_pred, target_names)
            print("Hybrid model metrics (repeating for convenience):")
            evaluate(y_test, y_pred, target_names)
        else:
            print(f"Baseline file {baseline_path} not found. Skip baseline comparison.")

if __name__ == "__main__":
    main()


Loading data...
samples: 4849, classes: 5
Building TF-IDF...
TF-IDF shapes: (3879, 26918) (970, 26918)
Running TruncatedSVD (n_components=100)...
LSA shapes: (3879, 100) (970, 100)
Combined shapes: (3879, 27018) (970, 27018)
Training classifier on combined features...
Predicting test set...
Accuracy: 0.9010
Precision (weighted): 0.9041
Recall (weighted): 0.9010
F1 (weighted): 0.9010

Classification report:

                        precision    recall  f1-score   support

         comp.graphics       0.90      0.88      0.89       195
          misc.forsale       0.92      0.89      0.90       195
    rec.sport.baseball       0.84      0.98      0.90       199
soc.religion.christian       0.95      0.87      0.91       199
    talk.politics.guns       0.91      0.88      0.90       182

              accuracy                           0.90       970
             macro avg       0.90      0.90      0.90       970
          weighted avg       0.90      0.90      0.90       970

Saved hybr

In [2]:
# compare_models.py
"""
Train & compare:
 - Baseline: TF-IDF -> LogisticRegression
 - Hybrid:  TF-IDF -> TruncatedSVD -> concat -> LogisticRegression
Runs hybrid for multiple n_components and prints a comparison table.
"""
import argparse
import re
import joblib
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from scipy import sparse

def minimal_clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_data(categories):
    data = fetch_20newsgroups(categories=categories, subset='all', remove=('headers','footers','quotes'))
    X = [minimal_clean(t) for t in data.data]
    y = data.target
    target_names = data.target_names
    return X, y, target_names

def evaluate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    return acc, prec, rec, f1

def baseline_pipeline_train_eval(X_train, X_test, y_train, y_test):
    vect = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=3, sublinear_tf=True, stop_words='english')
    clf = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, random_state=42)
    # Fit
    X_train_tfidf = vect.fit_transform(X_train)
    X_test_tfidf = vect.transform(X_test)
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    metrics = evaluate_metrics(y_test, y_pred)
    bundle = {"vect": vect, "clf": clf}
    return metrics, bundle

def hybrid_train_eval(X_train, X_test, y_train, y_test, n_components):
    # TF-IDF on train
    vect = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=3, sublinear_tf=True, stop_words='english')
    X_train_tfidf = vect.fit_transform(X_train)
    X_test_tfidf = vect.transform(X_test)

    # SVD
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_train_lsa = svd.fit_transform(X_train_tfidf)
    X_test_lsa = svd.transform(X_test_tfidf)

    # scale dense chunk
    scaler = StandardScaler()
    X_train_lsa_s = scaler.fit_transform(X_train_lsa)
    X_test_lsa_s = scaler.transform(X_test_lsa)

    # combine
    X_train_comb = sparse.hstack([X_train_tfidf, sparse.csr_matrix(X_train_lsa_s)], format='csr')
    X_test_comb  = sparse.hstack([X_test_tfidf,  sparse.csr_matrix(X_test_lsa_s)], format='csr')

    clf = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, random_state=42)
    clf.fit(X_train_comb, y_train)
    y_pred = clf.predict(X_test_comb)
    metrics = evaluate_metrics(y_test, y_pred)
    bundle = {"vect": vect, "svd": svd, "scaler": scaler, "clf": clf}
    return metrics, bundle

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--test_size", type=float, default=0.2)
    parser.add_argument("--n_components", type=int, nargs="+", default=[50,100,200],
                        help="List of n_components to try for TruncatedSVD (e.g. --n_components 50 100 200)")
    parser.add_argument("--save_best", action="store_true", help="Save best baseline and best hybrid bundles")
    args, _ = parser.parse_known_args()

    categories = ['comp.graphics', 'misc.forsale', 'rec.sport.baseball',
                  'soc.religion.christian', 'talk.politics.guns']

    print("Loading data...")
    X, y, target_names = load_data(categories)
    print(f"Samples: {len(X)}, classes: {len(target_names)}")

    # Single split used for fair comparison
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_size, stratify=y, random_state=42)

    results = []
    print("\n--- Baseline (TF-IDF -> LR) ---")
    base_metrics, base_bundle = baseline_pipeline_train_eval(X_train, X_test, y_train, y_test)
    print(f"Baseline metrics (acc, prec, rec, f1): {base_metrics}")
    results.append({
        "model": "baseline_tfidf_lr",
        "n_components": None,
        "accuracy": base_metrics[0],
        "precision": base_metrics[1],
        "recall": base_metrics[2],
        "f1": base_metrics[3]
    })

    best_hybrid = None
    best_hybrid_score = -1.0
    hybrid_bundles = {}

    for nc in args.n_components:
        print(f"\n--- Hybrid (n_components={nc}) ---")
        metrics, bundle = hybrid_train_eval(X_train, X_test, y_train, y_test, n_components=nc)
        print(f"Hybrid (n={nc}) metrics (acc, prec, rec, f1): {metrics}")
        results.append({
            "model": "hybrid_tfidf_lsa_lr",
            "n_components": nc,
            "accuracy": metrics[0],
            "precision": metrics[1],
            "recall": metrics[2],
            "f1": metrics[3]
        })
        hybrid_bundles[nc] = bundle
        # use accuracy (or f1) to pick best hybrid
        if metrics[0] > best_hybrid_score:
            best_hybrid_score = metrics[0]
            best_hybrid = (nc, bundle, metrics)

    # Show results as DataFrame sorted by accuracy
    df = pd.DataFrame(results)
    df = df.sort_values(by="accuracy", ascending=False).reset_index(drop=True)
    print("\n\n=== Comparison table (sorted by accuracy) ===")
    print(df.to_string(index=False, float_format="{:.4f}".format))

    # Optionally save best models
    if args.save_best:
        print("\nSaving best models...")
        joblib.dump(base_bundle, "best_baseline_news_tfidf_lr.joblib")
        if best_hybrid is not None:
            nc_best, bundle_best, metrics_best = best_hybrid
            joblib.dump(bundle_best, f"best_hybrid_news_tfidf_lsa_lr_n{nc_best}.joblib")
            print(f"Saved baseline and best hybrid (n_components={nc_best})")
        else:
            print("No hybrid models found to save.")

if __name__ == "__main__":
    main()


Loading data...
Samples: 4849, classes: 5

--- Baseline (TF-IDF -> LR) ---
Baseline metrics (acc, prec, rec, f1): (0.9123711340206185, 0.9167256087225616, 0.9123711340206185, 0.9125281263649558)

--- Hybrid (n_components=50) ---
Hybrid (n=50) metrics (acc, prec, rec, f1): (0.9072164948453608, 0.9101398589692358, 0.9072164948453608, 0.9073412024506444)

--- Hybrid (n_components=100) ---
Hybrid (n=100) metrics (acc, prec, rec, f1): (0.9010309278350516, 0.9040882038651065, 0.9010309278350516, 0.9010129511678747)

--- Hybrid (n_components=200) ---
Hybrid (n=200) metrics (acc, prec, rec, f1): (0.9010309278350516, 0.903825314219625, 0.9010309278350516, 0.9011147864027895)


=== Comparison table (sorted by accuracy) ===
              model  n_components  accuracy  precision  recall     f1
  baseline_tfidf_lr           NaN    0.9124     0.9167  0.9124 0.9125
hybrid_tfidf_lsa_lr       50.0000    0.9072     0.9101  0.9072 0.9073
hybrid_tfidf_lsa_lr      100.0000    0.9010     0.9041  0.9010 0.90

In [3]:
import joblib

# Load pipeline
pipeline = joblib.load("news_tfidf_lr.joblib")

# See the steps inside the pipeline
print(pipeline)
print(pipeline.named_steps)  # dictionary of steps


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                                 stop_words='english', sublinear_tf=True)),
                ('clf',
                 LogisticRegression(max_iter=2000, n_jobs=-1, random_state=42,
                                    solver='saga'))])
{'tfidf': TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2), stop_words='english',
                sublinear_tf=True), 'clf': LogisticRegression(max_iter=2000, n_jobs=-1, random_state=42, solver='saga')}


In [5]:
sample_text = ["i play football"]
prediction = pipeline.predict(sample_text)
print("Predicted category:", prediction)


Predicted category: [2]


In [4]:
from sklearn.datasets import fetch_20newsgroups

# Load dataset target names
categories = ['comp.graphics', 'misc.forsale', 'rec.sport.baseball',
              'soc.religion.christian', 'talk.politics.guns']

sample_text = ["i play football"]
prediction = pipeline.predict(sample_text)
print("Predicted category:", categories[prediction[0]])


Predicted category: soc.religion.christian


In [7]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Choose categories (added more tech ones for better GPU detection)
categories = [
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'sci.electronics',
    'rec.sport.baseball',
    'soc.religion.christian',
    'talk.politics.guns'
]

# Load training data
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

# Build pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

# Train
pipeline.fit(newsgroups_train.data, newsgroups_train.target)

# Sample text to test
sample_text = ["messi plays football very well"]

# Prediction
prediction = pipeline.predict(sample_text)[0]
print("Predicted category:", categories[prediction])

# Show probabilities for better understanding
probs = pipeline.predict_proba(sample_text)[0]
print("\nCategory probabilities:")
for cat, prob in zip(categories, probs):
    print(f"{cat}: {prob:.4f}")


Predicted category: sci.electronics

Category probabilities:
comp.graphics: 0.1845
comp.sys.ibm.pc.hardware: 0.1313
comp.sys.mac.hardware: 0.1268
sci.electronics: 0.2023
rec.sport.baseball: 0.1649
soc.religion.christian: 0.1074
talk.politics.guns: 0.0827


In [None]:
import joblib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Categories
categories = [
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'sci.electronics',
    'rec.sport.baseball',
    'soc.religion.christian',
    'talk.politics.guns'
]

# Load dataset
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)

# Build pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

# Train
pipeline.fit(newsgroups_train.data, newsgroups_train.target)

# ✅ Save the trained model
joblib.dump((pipeline, categories), "news_classifier.joblib")

print("Model saved as news_classifier.joblib")


Model saved as news_classifier.joblib


In [7]:
import joblib

# Load model + categories
pipeline, categories = joblib.load("news_classifier.joblib")

# Test
sample_text = ["graphic card prices increased recently"]
prediction = pipeline.predict(sample_text)[0]

print("Predicted category:", categories[prediction])

# Show probabilities
probs = pipeline.predict_proba(sample_text)[0]
print("\nCategory probabilities:")
for cat, prob in zip(categories, probs):
    print(f"{cat}: {prob:.4f}")


Predicted category: comp.sys.ibm.pc.hardware

Category probabilities:
comp.graphics: 0.1836
comp.sys.ibm.pc.hardware: 0.3434
comp.sys.mac.hardware: 0.1596
sci.electronics: 0.0650
rec.sport.baseball: 0.1062
soc.religion.christian: 0.0693
talk.politics.guns: 0.0730


In [8]:
"""
Script to show what's inside the .joblib files for professor demonstration
"""
import joblib
import os

def examine_joblib_file(filepath):
    """Examine contents of a joblib file"""
    if not os.path.exists(filepath):
        print(f"File {filepath} not found")
        return
    
    print(f"\n=== Examining {filepath} ===")
    
    try:
        # Load the saved model bundle
        bundle = joblib.load(filepath)
        
        print(f"Type: {type(bundle)}")
        
        if isinstance(bundle, dict):
            print("Contents:")
            for key, value in bundle.items():
                print(f"  {key}: {type(value)}")
                
                # Show more details for specific components
                if key == 'vect':  # TF-IDF Vectorizer
                    print(f"    - Vocabulary size: {len(value.vocabulary_)} words")
                    print(f"    - N-gram range: {value.ngram_range}")
                    print(f"    - Max features: {value.max_features}")
                    
                elif key == 'svd':  # TruncatedSVD (LSA)
                    print(f"    - Components: {value.n_components}")
                    print(f"    - Explained variance ratio: {value.explained_variance_ratio_[:5]}...")
                    
                elif key == 'clf':  # Classifier
                    print(f"    - Classes: {len(value.classes_)}")
                    print(f"    - Algorithm: {value.solver}")
                    
                elif key == 'scaler':  # StandardScaler
                    print(f"    - Features scaled: {len(value.mean_)}")
        
        print(f"File size: {os.path.getsize(filepath)} bytes")
        
    except Exception as e:
        print(f"Error loading {filepath}: {e}")

# Check if the files exist and examine them
files_to_check = [
    'news_tfidf_lr.joblib',
    'news_tfidf_lsa_lr.joblib',
    'best_baseline_news_tfidf_lr.joblib',
    'best_hybrid_news_tfidf_lsa_lr_n50.joblib',
    'best_hybrid_news_tfidf_lsa_lr_n100.joblib',
    'best_hybrid_news_tfidf_lsa_lr_n200.joblib'
]

print("=== JOBLIB FILES ANALYSIS ===")
print("These files contain trained machine learning models and preprocessors")
print("They represent the 'processed dataset' in machine learning pipeline")

for filepath in files_to_check:
    examine_joblib_file(filepath)

print("\n=== SUMMARY ===")
print("The .joblib files you see in the project folder contain:")
print("1. Trained models that can make predictions on new text")
print("2. Fitted preprocessors (TF-IDF vectorizers) that convert text to numbers")
print("3. All the learned parameters from training on the 20 Newsgroups dataset")
print("4. These are the 'final products' of the machine learning pipeline")

=== JOBLIB FILES ANALYSIS ===
These files contain trained machine learning models and preprocessors
They represent the 'processed dataset' in machine learning pipeline

=== Examining news_tfidf_lr.joblib ===
Type: <class 'sklearn.pipeline.Pipeline'>
File size: 1707044 bytes

=== Examining news_tfidf_lsa_lr.joblib ===
Type: <class 'dict'>
Contents:
  tfidf_vect: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
  svd: <class 'sklearn.decomposition._truncated_svd.TruncatedSVD'>
    - Components: 100
    - Explained variance ratio: [0.00291051 0.00488337 0.00362288 0.00329944 0.00275583]...
  scaler: <class 'sklearn.preprocessing._data.StandardScaler'>
    - Features scaled: 100
  clf: <class 'sklearn.linear_model._logistic.LogisticRegression'>
    - Classes: 5
    - Algorithm: saga
  categories: <class 'list'>
File size: 23251148 bytes
File best_baseline_news_tfidf_lr.joblib not found
File best_hybrid_news_tfidf_lsa_lr_n50.joblib not found
File best_hybrid_news_tfidf_lsa_lr_n100.