In [1]:
import string
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Fetch the stop-word corpus only when it is not already installed, so that
# re-running this cell does not require network access every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Toy corpus: one (headline, category) pair per row, two rows per category.
data = pd.DataFrame(
    [
        ("The economy is growing rapidly.", "Business"),
        ("The president gave a powerful speech.", "Politics"),
        ("The football team won their last match.", "Sports"),
        ("Stocks are doing well today.", "Business"),
        ("The senator addressed the nation.", "Politics"),
        ("The footballer scored a hat trick in the match.", "Sports"),
    ],
    columns=['text', 'label'],
)


# Translation table that deletes every character in string.punctuation.
# Built once at import time instead of filtering character by character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _default_stop_words():
    """Load NLTK's English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _default_stemmer():
    """Create the shared PorterStemmer instance on first use."""
    return PorterStemmer()


def preprocess_text(text, stop_words=None, stemmer=None):
    """Normalise a piece of text for bag-of-words style vectorisation.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, stop words are dropped and
    each remaining word is stemmed.

    Args:
        text: The raw string to clean.
        stop_words: Optional container of lowercase words to discard. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached rather than re-read on every call).
        stemmer: Optional object exposing ``stem(word) -> str``. When
            omitted, a shared ``PorterStemmer`` instance is used.

    Returns:
        The surviving stems joined by single spaces; an empty string when
        nothing survives.
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    if stemmer is None:
        stemmer = _default_stemmer()

    # NOTE(review): punctuation is stripped before the stop-word filter, so a
    # contraction such as "don't" is compared as "dont" and may not match an
    # entry in the stop-word list. This ordering is kept unchanged on purpose.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    return ' '.join(
        stemmer.stem(word) for word in cleaned.split() if word not in stop_words
    )

# Clean every document, then convert the cleaned text into TF-IDF features.
data['processed_text'] = data['text'].map(preprocess_text)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # keep at most 5000 terms
tfidf_matrix = tfidf_vectorizer.fit_transform(data['processed_text'])
X = tfidf_matrix.toarray()
y = data['label']

# Fit a multinomial Naive Bayes model; fit() returns the estimator itself.
nb_classifier = MultinomialNB().fit(X, y)

# NOTE: predictions are made on the same rows the model was fitted on, so the
# scores below describe training fit rather than performance on unseen text.
y_pred = nb_classifier.predict(X)

# Precision and recall share the same averaging options.
weighted_options = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, **weighted_options)
recall = recall_score(y, y_pred, **weighted_options)
conf_matrix = confusion_matrix(y, y_pred)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(metric_label, metric_value)


[nltk_data] Downloading package stopwords to /home/jupyter-
[nltk_data]     ra2312701010029/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
Confusion Matrix:
 [[2 0 0]
 [0 2 0]
 [0 0 2]]
