In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch both splits of 20 Newsgroups. The same `remove` tuple is passed to each
# call so train and test are stripped of headers, footers and quoted replies alike.
STRIPPED_PARTS = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=STRIPPED_PARTS)
newsgroups_test = fetch_20newsgroups(subset='test', remove=STRIPPED_PARTS)

# Raw documents (X_*) and their integer newsgroup labels (y_*).
X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF representation. The vocabulary and IDF weights are learned
# from the training documents only; the test documents are merely projected
# onto that fitted vocabulary.
MAX_FEATURES = 5000  # vocabulary size cap passed to the vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Multinomial Naive Bayes baseline: fit on the training TF-IDF matrix
# (fit returns the estimator itself), then label the held-out documents.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_classifier.predict(X_test_tfidf)

# Weighted F1: per-class F1 averaged with weights proportional to class support.
f1_score_nb = f1_score(y_true=y_test, y_pred=y_pred_nb, average='weighted')
print("F-score (Naive Bayes):", f1_score_nb)


F-score (Naive Bayes): 0.6393534482841904


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-Nearest Neighbour baseline.
#
# Use cosine distance instead of the default Euclidean (minkowski, p=2) metric.
# TfidfVectorizer L2-normalises rows by default, so every non-empty document is a
# unit vector, while a document with no in-vocabulary terms (e.g. emptied by
# stripping headers/footers/quotes) is the all-zero vector. Under Euclidean
# distance the zero vector is at distance 1 from every test document, which is
# closer than any real document whose cosine similarity is below 0.5; empty
# training documents therefore crowd out genuine neighbours and the vote becomes
# close to random. With cosine distance a zero vector has distance 1 and any
# document sharing at least one term has distance < 1, so real neighbours win.
# NOTE(review): any F-score output recorded before this change is stale; re-run
# the cell to refresh it.
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn_classifier.fit(X_train_tfidf, y_train)
y_pred_knn = knn_classifier.predict(X_test_tfidf)

f1_score_knn = f1_score(y_test, y_pred_knn, average='weighted')
print("F-score (k-Nearest Neighbor):", f1_score_knn)


F-score (k-Nearest Neighbor): 0.054375449537716364


In [None]:
import numpy as np

class RocchioClassifier:
    """Nearest-centroid (Rocchio) classifier for dense or SciPy-sparse feature rows.

    Each class is represented by the L2-normalised sum of its training rows.
    A sample is assigned to the class whose centroid is nearest in Euclidean
    distance. Classes are identified by integer indices ``0 .. num_classes - 1``.

    NOTE: a later cell in this notebook defines a class with the same name; on a
    top-to-bottom run that later definition is the one bound to the name.
    """

    def __init__(self, num_classes):
        """Remember the number of classes; centroids are built by ``train``."""
        self.num_classes = num_classes
        self.class_centroids = []

    def train(self, X_train, y_train):
        """Compute one unit-length centroid per class.

        Args:
            X_train: 2-D feature matrix (NumPy array or SciPy sparse matrix)
                with one row per sample. Only ``.shape``, integer-array row
                indexing and ``.sum(axis=0)`` are used, so ``len()`` (which
                SciPy sparse matrices do not support) is never called.
            y_train: sequence of integer class indices, one per row.

        Raises:
            ValueError: if the number of labels differs from the number of rows.
            IndexError: if a label lies outside ``[0, num_classes)``.
        """
        labels = np.asarray(y_train)
        n_samples = X_train.shape[0]
        if labels.shape[0] != n_samples:
            raise ValueError(
                "X_train has %d rows but y_train has %d labels"
                % (n_samples, labels.shape[0])
            )
        if n_samples and (labels.min() < 0 or labels.max() >= self.num_classes):
            raise IndexError("class label outside the range [0, num_classes)")

        centroids = []
        for class_index in range(self.num_classes):
            member_rows = np.flatnonzero(labels == class_index)
            # Summing the selected rows yields a (1, n) matrix for sparse input
            # and a 1-D array for dense input; flatten both to a float vector.
            total = np.asarray(X_train[member_rows].sum(axis=0), dtype=float).ravel()
            norm = np.linalg.norm(total)
            # A class without any (non-zero) training rows keeps a zero centroid
            # instead of becoming NaN through 0/0; predict() skips such classes.
            if norm > 0:
                total /= norm
            centroids.append(total)
        self.class_centroids = centroids

    def predict(self, X_test):
        """Return the nearest-centroid class index for every row of ``X_test``.

        Args:
            X_test: 2-D feature matrix (NumPy array, SciPy sparse matrix, or a
                nested sequence convertible to a 2-D array).

        Returns:
            list[int]: one class index per row. ``-1`` is returned for a row
            when there is no usable centroid to compare against.

        Raises:
            IndexError: if fewer centroids than ``num_classes`` are available
                (for example when ``train`` has not been called).
        """
        if not hasattr(X_test, "shape"):
            X_test = np.asarray(X_test, dtype=float)
        n_samples = X_test.shape[0]
        if n_samples == 0:
            return []
        if self.num_classes == 0:
            return [-1] * n_samples
        if len(self.class_centroids) < self.num_classes:
            raise IndexError("fewer centroids than classes; call train() first")

        centroids = np.vstack([
            np.asarray(centroid, dtype=float).ravel()
            for centroid in self.class_centroids[:self.num_classes]
        ])
        sq_norms = np.einsum('ij,ij->i', centroids, centroids)
        # Zero (empty class) or NaN centroids must never be selected.
        usable = sq_norms > 0
        if not usable.any():
            return [-1] * n_samples

        # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2 and ||x||^2 is constant per
        # row, so ranking by ||c||^2 - 2 x.c selects the same nearest centroid.
        scores = sq_norms - 2.0 * np.asarray(X_test @ centroids.T)
        scores[:, ~usable] = np.inf
        # argmin returns the first minimum, i.e. the lowest class index on ties.
        return scores.argmin(axis=1).tolist()




In [None]:
import numpy as np

class RocchioClassifier:
    """Nearest-centroid (Rocchio) classifier for dense or SciPy-sparse feature rows.

    Each class is represented by the L2-normalised sum of its training rows.
    A sample is assigned to the class whose centroid is nearest in Euclidean
    distance. Classes are identified by integer indices ``0 .. num_classes - 1``.
    """

    def __init__(self, num_classes):
        """Remember the number of classes; centroids are built by ``train``."""
        self.num_classes = num_classes
        self.class_centroids = []

    def train(self, X_train, y_train):
        """Compute one unit-length centroid per class.

        Args:
            X_train: 2-D feature matrix (NumPy array or SciPy sparse matrix)
                with one row per sample.
            y_train: sequence of integer class indices, one per row.

        Raises:
            ValueError: if the number of labels differs from the number of rows.
            IndexError: if a label lies outside ``[0, num_classes)``.
        """
        labels = np.asarray(y_train)
        n_samples = X_train.shape[0]  # shape[0], because sparse matrices have no len()
        if labels.shape[0] != n_samples:
            raise ValueError(
                "X_train has %d rows but y_train has %d labels"
                % (n_samples, labels.shape[0])
            )
        if n_samples and (labels.min() < 0 or labels.max() >= self.num_classes):
            raise IndexError("class label outside the range [0, num_classes)")

        centroids = []
        for class_index in range(self.num_classes):
            member_rows = np.flatnonzero(labels == class_index)
            # One sum per class keeps every centroid a 1-D float vector; adding
            # sparse rows one at a time would turn it into a dense (1, n) matrix
            # that is re-allocated for every document.
            total = np.asarray(X_train[member_rows].sum(axis=0), dtype=float).ravel()
            norm = np.linalg.norm(total)
            # A class without any (non-zero) training rows keeps a zero centroid
            # instead of becoming NaN through 0/0; predict() skips such classes.
            if norm > 0:
                total /= norm
            centroids.append(total)
        self.class_centroids = centroids

    def predict(self, X_test):
        """Return the nearest-centroid class index for every row of ``X_test``.

        Args:
            X_test: 2-D feature matrix (NumPy array, SciPy sparse matrix, or a
                nested sequence convertible to a 2-D array).

        Returns:
            list[int]: one class index per row. ``-1`` is returned for a row
            when there is no usable centroid to compare against.

        Raises:
            IndexError: if fewer centroids than ``num_classes`` are available
                (for example when ``train`` has not been called).
        """
        if not hasattr(X_test, "shape"):
            X_test = np.asarray(X_test, dtype=float)
        n_samples = X_test.shape[0]
        if n_samples == 0:
            return []
        if self.num_classes == 0:
            return [-1] * n_samples
        if len(self.class_centroids) < self.num_classes:
            raise IndexError("fewer centroids than classes; call train() first")

        centroids = np.vstack([
            np.asarray(centroid, dtype=float).ravel()
            for centroid in self.class_centroids[:self.num_classes]
        ])
        sq_norms = np.einsum('ij,ij->i', centroids, centroids)
        # Zero (empty class) or NaN centroids must never be selected.
        usable = sq_norms > 0
        if not usable.any():
            return [-1] * n_samples

        # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2 and ||x||^2 is constant per
        # row, so ranking by ||c||^2 - 2 x.c selects the same nearest centroid.
        scores = sq_norms - 2.0 * np.asarray(X_test @ centroids.T)
        scores[:, ~usable] = np.inf
        # argmin returns the first minimum, i.e. the lowest class index on ties.
        return scores.argmin(axis=1).tolist()


# Example usage
from sklearn.metrics import f1_score

# One centroid per distinct label seen in the training targets.
num_classes = np.unique(y_train).size
rocchio_classifier = RocchioClassifier(num_classes)
rocchio_classifier.train(X_train_tfidf, y_train)

# Label the held-out documents with their nearest centroid.
y_pred_rocchio = rocchio_classifier.predict(X_test_tfidf)

# Weighted F1, the same metric used for the other classifiers in this notebook.
f1_score_rocchio = f1_score(y_true=y_test, y_pred=y_pred_rocchio, average='weighted')
print("F-score (Rocchio):", f1_score_rocchio)



F-score (Rocchio): 0.6212454826874126


In [None]:
# Side-by-side weighted F-scores of the three classifiers, printed in a fixed order.
for caption, score in (
    ("F-score (Naive Bayes):", f1_score_nb),
    ("F-score (Rocchio):", f1_score_rocchio),
    ("F-score (k-Nearest Neighbor):", f1_score_knn),
):
    print(caption, score)

F-score (Naive Bayes): 0.6393534482841904
F-score (Rocchio): 0.6212454826874126
F-score (k-Nearest Neighbor): 0.054375449537716364


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# For each classifier: build the confusion matrix (rows = true label,
# columns = predicted label) and the per-class precision/recall/F1 report,
# then print both before moving on to the next model.

# --- Naive Bayes ---
confusion_matrix_nb = confusion_matrix(y_test, y_pred_nb)
classification_report_nb = classification_report(y_test, y_pred_nb)
print("Confusion Matrix (Naive Bayes):\n", confusion_matrix_nb)
print("Classification Report (Naive Bayes):\n", classification_report_nb)

# --- Rocchio ---
confusion_matrix_rocchio = confusion_matrix(y_test, y_pred_rocchio)
classification_report_rocchio = classification_report(y_test, y_pred_rocchio)
print("Confusion Matrix (Rocchio):\n", confusion_matrix_rocchio)
print("Classification Report (Rocchio):\n", classification_report_rocchio)

# --- k-Nearest Neighbor ---
confusion_matrix_knn = confusion_matrix(y_test, y_pred_knn)
classification_report_knn = classification_report(y_test, y_pred_knn)
print("Confusion Matrix (k-Nearest Neighbor):\n", confusion_matrix_knn)
print("Classification Report (k-Nearest Neighbor):\n", classification_report_knn)


Confusion Matrix (Naive Bayes):
 [[100   4   2   0   1   2   0   5   5   5  14   4   3   9  14 121  11  13
    4   2]
 [  1 254  27  11   9  33   7   2   6   2   8   7   3   4   7   7   0   1
    0   0]
 [  1  27 243  40   9  25   3   2   3   1  16   4   3   5   6   4   0   0
    1   1]
 [  0   9  37 260  32   4  13   3   0   1   8   3  20   0   1   0   1   0
    0   0]
 [  0   7  21  45 238   4  17   7   3   1  15   4  13   2   4   3   1   0
    0   0]
 [  0  43  25   6   3 289   5   2   2   2   9   1   1   0   2   3   2   0
    0   0]
 [  0   1   3  27  12   1 304   9   1   4   9   1   7   0   5   3   2   0
    1   0]
 [  3   1   1   1   4   1  12 269  24   3  31   2  19   5   6   5   4   2
    3   0]
 [  2   3   0   2   3   1   7  31 281  13  17   2  10   3   3   7   6   2
    5   0]
 [  1   7   1   1   0   4   4   2   1 306  45   0   1   3   0  12   5   3
    1   0]
 [  1   2   1   0   1   0   0   0   2  15 357   2   0   1   1   8   6   2
    0   0]
 [  0   9   7   2   5   5   6   