In [6]:
import numpy as np
import nltk
nltk.download('stopwords')
from sklearn.datasets import fetch_20newsgroups
from nltk.stem.snowball import SnowballStemmer
# Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,precision_score, recall_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vishvanatarajan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

In [8]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [9]:
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [10]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

In [11]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [12]:
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

In [13]:
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),('clf',MultinomialNB(fit_prior=False))])

In [15]:
text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

In [16]:
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

In [17]:
np.mean(predicted_mnb_stemmed == twenty_test.target)

0.8167817312798725

In [18]:
print(precision_score(predicted_mnb_stemmed,twenty_test.target, average="macro"))

0.803538189701977


In [19]:
print(recall_score(predicted_mnb_stemmed,twenty_test.target, average="macro"))

0.8335586735879094
