In [None]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

In [8]:
# Fetch the 20 Newsgroups corpus, restricted to the five topics listed below
from sklearn.datasets import fetch_20newsgroups

categories = [
    'rec.sport.baseball',
    'sci.med',
    'comp.graphics',
    'talk.politics.mideast',
    'soc.religion.christian',
]
newsgroups = fetch_20newsgroups(categories=categories, subset='all')
X, Y = newsgroups.data, newsgroups.target  # post texts and their class labels



In [9]:
# 70/15/15 split: hold out 30% of the data first, then cut that holdout in half
# to obtain the validation and test partitions (same seed for both splits).
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.5, random_state=42
)

# Report how many samples ended up in each partition
for _label, _part in [
    ("Shape of X_train:", X_train),
    ("Shape of X_val:", X_val),
    ("Shape of X_test:", X_test),
    ("Shape of Y_train:", Y_train),
    ("Shape of Y_val:", Y_val),
    ("Shape of Y_test:", Y_test),
]:
    print(_label, len(_part))

Shape of X_train: 3425
Shape of X_val: 734
Shape of X_test: 735
Shape of Y_train: 3425
Shape of Y_val: 734
Shape of Y_test: 735


In [10]:
# Bag-of-words step: learn the vocabulary on the training texts only (English
# stop words dropped) and turn each training document into sparse term counts.
vectorizer = CountVectorizer(stop_words='english')
X_train_counts = vectorizer.fit_transform(X_train)

# TF-IDF step: re-weight the raw counts so that terms appearing in many
# documents contribute less; the weights are fitted on the training counts only.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_counts)

# Validation data reuses the already-fitted vectorizer and TF-IDF weights
X_val_counts = vectorizer.transform(X_val)
X_val_tfidf = tfidf.transform(X_val_counts)




In [11]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features
# (fit() returns the estimator itself), then label the validation set.
model = MultinomialNB().fit(X_train_tfidf, Y_train)
Y_val_pred = model.predict(X_val_tfidf)

In [12]:
# Share of validation documents whose predicted label equals the true label
acc = accuracy_score(y_true=Y_val, y_pred=Y_val_pred)
print("Validation Accuracy:", acc)

Validation Accuracy: 0.9809264305177112


In [13]:
# Final check on the held-out test set: run it through the already-fitted
# vectorizer and TF-IDF transformer (no refitting), then score the predictions.
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)
Y_test_pred = model.predict(X_test_tfidf)
test_acc = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.9700680272108844


In [15]:
# Show the first few test articles next to their true and predicted categories.
#
# Labels are decoded through `newsgroups.target_names`, not the hand-written
# `categories` list: fetch_20newsgroups re-orders the requested categories
# alphabetically and encodes `target` as indices into `target_names`, so
# indexing `categories` (which is in a different order) prints the wrong
# newsgroup name for both the actual and the predicted label.
label_names = newsgroups.target_names
n_preview = min(10, len(X_test))  # do not run past the end of a small test split
for i in range(n_preview):
    print(f"News Article:\n{X_test[i]}\n")
    print(f"Actual Category: {label_names[Y_test[i]]}")
    print(f"Predicted Category: {label_names[Y_test_pred[i]]}")
    print("-"*80)


News Article:
From: noye@midway.uchicago.edu (vera shanti noyes)
Subject: Re: Why do people become atheists?
Reply-To: noye@midway.uchicago.edu
Organization: University of Chicago
Lines: 108

In article <May.11.02.36.32.1993.28071@athos.rutgers.edu> mayne@nu.cs.fsu.edu writes:
>In article <May.9.05.40.51.1993.27526@athos.rutgers.edu> noye@midway.uchicago.edu writes:
[my stuff about dealing with defferences deleted]

>This is not at all comparable. Christianity is the main stream in
>western culture. You are trivializing the experiences of others.

i am sorry; i did not mean to.  i think i understand how your
experiences were much worse than the small bit of ridicule i have had
to put up with.  i guess i didn't really understand before; now i do.

>I remember what it was like being "different" as a Christian. We
>were told all the time that we were different, and in fact that
>only members of the our church were really Christians (though others
>who believed in God weren't as bad as ath