In [3]:
import os
import re
from collections import defaultdict
from math import log
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# English stop-word list from NLTK, stored as a set so that the
# `word not in stop_words` filter in preprocess() is a hash lookup.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally (e.g. via nltk.download) — confirm for fresh kernels.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase the text, delete underscores (the pieces on
    either side are joined), replace every character that is neither a word
    character nor whitespace with a space, split on whitespace, drop tokens
    found in the module-level ``stop_words`` set, and Porter-stem the rest.

    Args:
        text: The raw document text. Any falsy value (``None``, ``""``)
            short-circuits to an empty result.

    Returns:
        A list of stemmed token strings, in document order.
    """
    if not text:
        return []

    # Underscores count as word characters for `\w`, so they are removed
    # explicitly before the punctuation pass.
    normalized = re.sub(r'[^\w\s]', ' ', text.lower().replace('_', ''))

    stemmer = PorterStemmer()
    return [
        stemmer.stem(term)
        for term in normalized.split()
        if term not in stop_words
    ]

def load_training_data(file_path):
    """Read the class -> document-id assignments used for training.

    Each non-blank line of the file holds whitespace-separated integers: the
    first is the class label, the remaining ones are document ids belonging
    to that class. A label that appears on several lines accumulates the ids
    of all of them.

    Args:
        file_path: Path of the training-assignment text file.

    Returns:
        A ``defaultdict(list)`` mapping each integer class label to the list
        of integer document ids, in file order.

    Raises:
        ValueError: If a field on a non-blank line is not an integer.
        OSError: If the file cannot be opened.
    """
    training_data = defaultdict(list)
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            # Blank / whitespace-only lines (e.g. a trailing newline at the
            # end of the file) carry no label; skip them instead of failing
            # on the missing first field.
            if not fields:
                continue
            class_label, *doc_ids = map(int, fields)
            training_data[class_label].extend(doc_ids)
    return training_data

def read_files(file_indices, folder):
    """Load and preprocess the documents listed in ``file_indices``.

    For every ``(class_label, indices)`` entry, each index ``idx`` is read
    from ``<folder>/<idx>.txt`` and passed through ``preprocess``.

    Args:
        file_indices: Mapping from a class label to an iterable of document
            indices.
        folder: Directory that contains the ``<idx>.txt`` files.

    Returns:
        A tuple ``(data, labels, file_ids)`` of three parallel lists: the
        token list of each document, its class label, and its index.
    """
    # Flatten the mapping into (label, id) pairs, keeping mapping order.
    targets = [
        (class_label, idx)
        for class_label, indices in file_indices.items()
        for idx in indices
    ]

    data, labels, file_ids = [], [], []
    for class_label, idx in targets:
        with open(os.path.join(folder, f"{idx}.txt"), 'r') as handle:
            raw_text = handle.read()
        data.append(preprocess(raw_text))
        labels.append(class_label)
        file_ids.append(idx)
    return data, labels, file_ids

def compute_chi2(data, labels, vocab):
    """Score every vocabulary word with a chi-square test against the classes.

    The statistic is computed on a document-level contingency table with two
    rows per word (documents that contain the word / documents that do not)
    and one column per class. All margins are document counts, so observed
    and expected values are expressed in the same unit and the expected
    values of each row sum to that row's total.

    Args:
        data: List of token lists, one per document.
        labels: Class label of each document, parallel to ``data``.
        vocab: Collection of words to score.

    Returns:
        A dict mapping each word in ``vocab`` to its chi-square score
        (``0`` when the word carries no class information or when there are
        no documents).
    """
    vocab_lookup = set(vocab)
    # doc_freq[word][label] = number of documents of `label` containing `word`
    doc_freq = defaultdict(lambda: defaultdict(int))
    class_count = defaultdict(int)

    for tokens, label in zip(data, labels):
        class_count[label] += 1
        # The intersection de-duplicates tokens, so a word is counted once
        # per document no matter how often it is repeated inside it.
        for word in vocab_lookup.intersection(tokens):
            doc_freq[word][label] += 1

    total_docs = sum(class_count.values())

    chi2_stats = {}
    for word in vocab:
        observed = doc_freq.get(word, {})
        row_sum = sum(observed.values())
        chi2 = 0.0
        for label, docs_in_class in class_count.items():
            expected_present = row_sum * docs_in_class / total_docs
            expected_absent = docs_in_class - expected_present
            # Present and absent cells of one class deviate from their
            # expectation by the same amount (opposite sign), so the squared
            # deviation is shared between the two cells.
            deviation_sq = (observed.get(label, 0) - expected_present) ** 2
            if expected_present > 0:
                chi2 += deviation_sq / expected_present
            if expected_absent > 0:
                chi2 += deviation_sq / expected_absent
        chi2_stats[word] = chi2
    return chi2_stats

def select_features_with_chi2(data, labels, top_k=1000):
    """Pick the ``top_k`` words with the highest chi-square score.

    The vocabulary is every distinct token in ``data``; each word is scored
    by ``compute_chi2`` and the best ``top_k`` are kept.

    Args:
        data: List of token lists, one per document.
        labels: Class label of each document, parallel to ``data``.
        top_k: Maximum number of words to return.

    Returns:
        A list of at most ``top_k`` words, ordered by descending score;
        words with equal scores are ordered alphabetically.
    """
    vocab = {word for tokens in data for word in tokens}
    chi2_stats = compute_chi2(data, labels, vocab)
    # Ties are broken on the word itself: iteration order of a set of strings
    # changes between interpreter runs, so relying on it would make the
    # selected feature set differ from one kernel restart to the next.
    ranked = sorted(chi2_stats.items(), key=lambda item: (-item[1], item[0]))
    return [word for word, _ in ranked[:top_k]]

def train_multinomial_nb(data, labels, vocab):
    """Estimate multinomial Naive Bayes parameters with add-one smoothing.

    Args:
        data: List of token lists, one per training document.
        labels: Class label of each document, parallel to ``data``.
        vocab: Collection of words used as features; tokens outside it are
            ignored.

    Returns:
        A tuple ``(class_probs, class_cond_probs)`` where

        * ``class_probs[label]`` is the log prior
          ``log(docs in label / len(labels))``;
        * ``class_cond_probs[label][word]`` is the log likelihood
          ``log((count(word, label) + 1) / (tokens in label + len(vocab)))``
          for every word in ``vocab``.
    """
    # Set lookup keeps the per-token membership test constant-time even when
    # the caller hands over the vocabulary as a list.
    vocab_lookup = set(vocab)
    vocab_size = len(vocab)
    total_docs = len(labels)

    class_word_counts = defaultdict(lambda: defaultdict(int))
    class_doc_counts = defaultdict(int)

    for tokens, label in zip(data, labels):
        class_doc_counts[label] += 1
        word_counts = class_word_counts[label]
        for word in tokens:
            if word in vocab_lookup:
                word_counts[word] += 1

    class_probs = {}
    class_cond_probs = {}
    for label, doc_count in class_doc_counts.items():
        word_counts = class_word_counts[label]
        # Add-one smoothing: one pseudo-count per vocabulary word goes into
        # the denominator, matching the +1 in each numerator below.
        total_word_count = sum(word_counts.values()) + vocab_size
        class_probs[label] = log(doc_count / total_docs)
        class_cond_probs[label] = {
            word: log((word_counts.get(word, 0) + 1) / total_word_count)
            for word in vocab
        }
    return class_probs, class_cond_probs

def predict(tokens, vocab, class_probs, class_cond_probs):
    """Return the class with the highest Naive Bayes log score for a document.

    The score of a class is its log prior plus the log likelihood of every
    token that belongs to ``vocab`` (repeated tokens count each time).

    Args:
        tokens: Token list of the document to classify.
        vocab: Collection of feature words; other tokens are ignored.
        class_probs: Mapping ``label -> log prior``.
        class_cond_probs: Mapping ``label -> {word: log likelihood}``.

    Returns:
        The label with the maximum score; on a tie, the label that comes
        first in ``class_probs``.

    Raises:
        ValueError: If ``class_probs`` is empty.
    """
    # Filter once, against a set, instead of re-testing every token against
    # the vocabulary for each class.
    vocab_lookup = set(vocab)
    known_tokens = [word for word in tokens if word in vocab_lookup]

    scores = {}
    for label, log_prior in class_probs.items():
        word_log_probs = class_cond_probs[label]
        score = log_prior
        for word in known_tokens:
            # A vocabulary word missing from this class's table adds nothing.
            score += word_log_probs.get(word, 0)
        scores[label] = score
    return max(scores, key=scores.get)



In [4]:
# --- Configuration -----------------------------------------------------------
training_file = "./training.txt"
data_folder = "./data"
top_k = 500
# Document ids are assumed to run 1..NUM_DOCUMENTS (files 1.txt … 1095.txt in
# data_folder) — TODO confirm against the data folder.
NUM_DOCUMENTS = 1095

# --- Training ----------------------------------------------------------------
training_data = load_training_data(training_file)

train_data, train_labels, _ = read_files(training_data, data_folder)

selected_features = select_features_with_chi2(train_data, train_labels, top_k)
# Set copy of the feature list so the per-token filters below are hash lookups
# rather than linear scans of a 500-element list.
selected_feature_set = set(selected_features)

train_data_selected = [
    [word for word in tokens if word in selected_feature_set]
    for tokens in train_data
]

class_probs, class_cond_probs = train_multinomial_nb(train_data_selected, train_labels, selected_features)

# --- Prediction --------------------------------------------------------------
# Every document id that is not listed in the training file is a test document.
test_indices = set(range(1, NUM_DOCUMENTS + 1)) - {idx for indices in training_data.values() for idx in indices}
# Read the test documents in ascending id order so the rows of the output CSV
# come out in a stable, sorted order on every run.
test_data, test_labels, test_file_ids = read_files({None: sorted(test_indices)}, data_folder)

predictions = []
for tokens, file_id in zip(test_data, test_file_ids):
    selected_tokens = [word for word in tokens if word in selected_feature_set]
    predicted_label = predict(selected_tokens, selected_features, class_probs, class_cond_probs)
    predictions.append((file_id, predicted_label))

df = pd.DataFrame(predictions, columns=["Id", "Value"])
df.to_csv("predictions.csv", index=False)

print("Predictions have been saved to 'predictions.csv'.")

Predictions have been saved to 'predictions.csv'.
