# Project 2: NB Classifier

### Course: CS 5420

### Author: Cooper Wooley

In [1]:
import atexit
import math
import os
import random
import shutil
import tarfile
import tempfile
from collections import Counter, defaultdict

### Helper Functions for Managing Dataset

In [2]:
def extract_dataset(zip_path):
    """Extract a gzip-compressed tar archive into a fresh temporary directory.

    The temporary directory is registered for removal at normal interpreter
    exit, so callers do not have to clean up explicitly.

    Args:
        zip_path: Path to the archive. Despite the parameter name, the file
            is opened as a gzip-compressed tar (``r:gz``), not a zip file.

    Returns:
        The path of the archive's single top-level directory when it has
        exactly one; otherwise the temporary directory itself, which is then
        treated as the data root.

    Raises:
        tarfile.TarError: If the archive cannot be read.
        OSError: If the archive or the temporary directory is inaccessible.
    """
    # Create a temporary directory to extract into
    temp_dir = tempfile.mkdtemp(prefix="dataset_")

    # Register the cleanup handler *before* extracting so a failed or
    # partial extraction does not leave the temporary directory behind.
    atexit.register(lambda: cleanup_dataset(temp_dir))

    # Extract contents
    with tarfile.open(zip_path, 'r:gz') as tar_ref:
        if hasattr(tarfile, "data_filter"):
            # Reject absolute paths, ".." traversal and special files when
            # the running Python supports extraction filters.
            tar_ref.extractall(temp_dir, filter="data")
        else:
            tar_ref.extractall(temp_dir)

    # Find the first subdirectory inside extracted directory
    contents = [os.path.join(temp_dir, d) for d in os.listdir(temp_dir)]
    subdirs = [d for d in contents if os.path.isdir(d)]

    if len(subdirs) == 1:
        data_root = subdirs[0]
    else:
        data_root = temp_dir  # fallback if already data root

    return data_root

def cleanup_dataset(directory):
    """Recursively delete ``directory`` and report it; do nothing if absent.

    Args:
        directory: Path of the directory tree to remove.
    """
    if not os.path.exists(directory):
        # Nothing to remove (already cleaned up, or never created).
        return

    shutil.rmtree(directory)
    print(f"Cleaned up dataset directory: {directory}")

# Example usage
# if __name__ == "__main__":
#     zip_path = "path/to/your_dataset.tar.gz"

#     # Step 1: Extract Data
#     extracted_path = extract_dataset(zip_path)
#     print(f"Dataset extracted to: {extracted_path}")

#     # Step 2: load data, train models, etc.

#     # Step 3: Cleanup Data
#     # cleanup_dataset(extracted_path)


### Split Data

In [3]:
# Gzip-compressed tar archive of the dataset; the relative path is resolved
# against the current working directory.
tar_path = "20_newsgroups.tar.gz"
# Unpack the archive into a temporary directory and keep the resulting data
# root for the cells below.
extracted_path = extract_dataset(tar_path)
print(f"Dataset extracted to : {extracted_path}")

def split_dataset(base_dir, train_ratio=0.5, seed=42):
    """Split a directory-per-class dataset into train and test file lists.

    Every immediate subdirectory of ``base_dir`` is treated as one class and
    its name is used as the label. The regular files of each class are
    shuffled and the first ``int(n * train_ratio)`` go to the training set,
    the remainder to the test set.

    Args:
        base_dir: Directory containing one subdirectory per class.
        train_ratio: Fraction of each class's files assigned to training.
        seed: Seed for the module-level ``random`` generator (which this
            function reseeds), making the split reproducible.

    Returns:
        A tuple ``(train_files, test_files, train_labels, test_labels)``
        where ``train_labels[i]`` is the class of ``train_files[i]`` and
        ``test_labels[i]`` is the class of ``test_files[i]``.
    """
    random.seed(seed)

    train_files = []
    test_files = []
    train_labels = []
    test_labels = []

    # os.listdir() order is unspecified; sort so that the same seed always
    # yields the same split regardless of filesystem.
    for d in sorted(os.listdir(base_dir)):
        d_path = os.path.join(base_dir, d)
        if not os.path.isdir(d_path):
            continue

        files = sorted(
            os.path.join(d_path, f)
            for f in os.listdir(d_path)
            if os.path.isfile(os.path.join(d_path, f))
        )

        random.shuffle(files)
        split_index = int(len(files) * train_ratio)

        train_part = files[:split_index]
        test_part = files[split_index:]

        train_files.extend(train_part)
        test_files.extend(test_part)
        # One label per file, sized from each partition so labels and files
        # always stay aligned (the two partitions generally differ in size).
        train_labels.extend([d] * len(train_part))
        test_labels.extend([d] * len(test_part))

    return train_files, test_files, train_labels, test_labels  # train_X, test_X, train_Y, test_Y

Dataset extracted to : /tmp/dataset_o01hu76p/20_newsgroups


## NB Classifier

### Training

In [None]:
def train_naive_bayes(train_files, train_labels):
    """Estimate multinomial Naive Bayes parameters from labelled text files.

    Each file is read as text, lower-cased and split on whitespace. Word
    likelihoods use add-one (Laplace) smoothing over the full vocabulary, so
    every vocabulary word has a strictly positive probability in every class.

    Args:
        train_files: Paths of the training documents.
        train_labels: Class label of each document, aligned with
            ``train_files``.

    Returns:
        A tuple ``(priors, likelihoods, vocab)``:

        * ``priors`` maps each class to P(class), the fraction of training
          documents carrying that label.
        * ``likelihoods`` maps each class to a dict giving P(word | class)
          for every word in ``vocab``.
        * ``vocab`` is the set of all distinct words seen in training.

        With no training documents the result is ``({}, {}, set())``.
    """
    vocab = set()
    word_counts = defaultdict(Counter)  # class: word: count
    doc_counts = Counter()  # class: number of training documents

    for path, label in zip(train_files, train_labels):
        with open(path, 'r', errors='ignore') as f:
            # Tokenise into words; iterating the raw string would count
            # individual characters instead.
            words = f.read().lower().split()
        vocab.update(words)
        word_counts[label].update(words)
        doc_counts[label] += 1

    # Compute P(Y)
    total_docs = sum(doc_counts.values())
    priors = {cls: n / total_docs for cls, n in doc_counts.items()}

    # Compute P(X|Y) with add-one smoothing
    likelihoods = {}
    vocab_size = len(vocab)

    for cls in doc_counts:
        counts = word_counts[cls]
        denominator = sum(counts.values()) + vocab_size
        # Counter returns 0 for words this class never produced.
        likelihoods[cls] = {
            word: (counts[word] + 1) / denominator for word in vocab
        }

    return priors, likelihoods, vocab

### Predicting

In [None]:
def predict(text, priors, likelihoods, vocab):
    """Return the most probable class for ``text`` under Naive Bayes.

    The text is lower-cased and split on whitespace; words outside ``vocab``
    are ignored. Scores are accumulated in log space to avoid floating-point
    underflow on long documents.

    Args:
        text: Raw document text.
        priors: Mapping of class -> P(class); values must be positive.
        likelihoods: Mapping of class -> mapping of word -> P(word | class).
            Must contain a positive entry for every word in ``vocab``.
        vocab: Collection of known words (a set gives O(1) membership tests).

    Returns:
        The class with the highest posterior score, or ``None`` when
        ``priors`` is empty or ``None`` (no trained classes). Ties go to the
        class that comes first in ``priors``.

    Raises:
        KeyError: If a vocabulary word has no likelihood entry for a class.
        ValueError: If a prior or likelihood is not strictly positive.
    """
    if not priors:
        return None

    # Collapse repeated words so each distinct word costs one log().
    word_freq = Counter(w for w in text.lower().split() if w in vocab)

    best_label = None
    best_score = -math.inf
    for cls, prior in priors.items():
        word_probs = likelihoods[cls]
        score = math.log(prior)
        for word, freq in word_freq.items():
            score += freq * math.log(word_probs[word])
        if score > best_score:
            best_label, best_score = cls, score

    return best_label

### Evaluation

In [None]:
def evaluate(test_files, test_labels, priors, likelihoods, vocab):
    """Compute classification accuracy over a set of labelled test files.

    Each file is read as text and classified with ``predict``; the prediction
    is compared with the corresponding entry of ``test_labels``.

    Args:
        test_files: Paths of the test documents.
        test_labels: True class of each document, aligned with ``test_files``.
        priors: Class priors, passed through to ``predict``.
        likelihoods: Per-class word likelihoods, passed through to ``predict``.
        vocab: Known vocabulary, passed through to ``predict``.

    Returns:
        The fraction of documents classified correctly as a float in
        ``[0.0, 1.0]``; ``0.0`` when there are no test documents.
    """
    total = 0
    correct = 0

    for path, label in zip(test_files, test_labels):
        with open(path, 'r', errors='ignore') as f:
            text = f.read()
        if predict(text, priors, likelihoods, vocab) == label:
            correct += 1
        total += 1

    # Avoid ZeroDivisionError on an empty test set.
    return correct / total if total else 0.0

In [None]:
# Split the extracted dataset into train/test files and labels using
# split_dataset's default train_ratio and seed.
train_files, test_files, train_labels, test_labels = split_dataset(extracted_path)

# Fit the Naive Bayes parameters on the training portion only.
priors, likelihoods, vocab = train_naive_bayes(train_files, train_labels)

# Score the held-out files. NOTE: the ``:.2f`` format below requires
# evaluate() to return a number; a ``None`` result raises TypeError.
accuracy = evaluate(test_files, test_labels, priors, likelihoods, vocab)
print(f"Accuracy: {accuracy:.2f}")

In [5]:
# Remove the extracted data root now instead of waiting for interpreter exit.
# When extract_dataset returned a subdirectory of its temporary directory,
# the enclosing temporary directory is left to the atexit handler that
# extract_dataset registered.
cleanup_dataset(extracted_path)

Cleaned up dataset directory: /tmp/dataset_o01hu76p/20_newsgroups
