In [1]:
import numpy as np
import nltk
nltk.download('stopwords')
from sklearn.datasets import fetch_20newsgroups
from nltk.stem.snowball import SnowballStemmer
# Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,precision_score, recall_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/divyajyoti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [4]:
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [5]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

In [6]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [7]:
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [8]:
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),('clf',MultinomialNB(fit_prior=False))])

In [9]:
text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)
print(text_mnb_stemmed)

Pipeline(memory=None,
         steps=[('vect',
                 StemmedCountVectorizer(analyzer='word', binary=False,
                                        decode_error='strict',
                                        dtype=<class 'numpy.int64'>,
                                        encoding='utf-8', input='content',
                                        lowercase=True, max_df=1.0,
                                        max_features=None, min_df=1,
                                        ngram_range=(1, 1), preprocessor=None,
                                        stop_words='english',
                                        strip_accents=None,
                                        token_pattern='(?u)\\b\\w\\w+\\b',
                                        tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
     

In [10]:
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

In [11]:
np.mean(predicted_mnb_stemmed == twenty_test.target)

0.8167817312798725

In [12]:
print(precision_score(predicted_mnb_stemmed,twenty_test.target, average="macro"))

0.803538189701977


In [13]:
print(recall_score(predicted_mnb_stemmed,twenty_test.target, average="macro"))

0.8335586735879094
