In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score, KFold

from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from scipy.stats import sem

from nltk.tokenize import word_tokenize

import warnings
warnings.simplefilter("ignore")

import pandas as pd



In [2]:
news = fetch_20newsgroups(subset='all')

In [3]:
type(news.data)

list

In [4]:
news.target

array([10,  3, 17, ...,  3,  1,  7])

In [5]:
print(news.target_names)
print('\n')
type(news.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']




list

In [6]:
news.data[0]

"From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [7]:
news.target_names[0]

'alt.atheism'

In [3]:
Split_perc = 0.75
split_size = int(len(news.data)*Split_perc)

In [4]:
x_train = news.data[:split_size]
x_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]

In [20]:
print('x_test : ',len(x_test))
print('x_train : ', len(x_train))
print('y_test : ',len(y_test))
print('y_train : ', len(y_train))

x_test :  4712
x_train :  14134
y_test :  4712
y_train :  14134


In [5]:
clf_1 = Pipeline([('vect', CountVectorizer()),
                 ('clf', MultinomialNB())])

In [6]:
clf_2 = Pipeline([('vect', HashingVectorizer(non_negative=True)),
                 ('clf', MultinomialNB())])

In [7]:
clf_3 = Pipeline([('vect', TfidfVectorizer()),
                 ('clf', MultinomialNB())])

In [8]:
def eval_cross_val(clfs, x,y,K):
    
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    
    scores = cross_val_score(clf,x,y,cv=cv)
    print(scores)
    print(np.mean(scores))

In [9]:
clfs=[clf_1, clf_2, clf_3]
for clf in clfs:
    eval_cross_val(clf, news.data, news.target, 5)

[0.85782493 0.85725657 0.84664367 0.85911382 0.8458477 ]
0.853337340146793
[0.75543767 0.77659857 0.77049615 0.78508888 0.76200584]
0.7699254211904598
[0.84482759 0.85990979 0.84558238 0.85990979 0.84213319]
0.8504725482840962


In [10]:
stops = set(stopwords.words('english'))

In [29]:
clf_5 = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words=stops
    )), 
    ('clf', MultinomialNB())
                 ])

In [30]:
eval_cross_val(clf, news.data, news.target, 5)

[0.84482759 0.85990979 0.84558238 0.85990979 0.84213319]
0.8504725482840962


In [51]:
def get_data():
    vectorizer = CountVectorizer()

    categories = ['alt.atheism', 'talk.religion.misc',
                  'comp.graphics', 'sci.space']

    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          categories=categories, shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target

    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         categories=categories, shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target

    return X_train, y_train, X_test, y_test 

In [52]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
prediction = mnb.predict(X_test)
accuracy_score(y_test,prediction)

0.8987435328898744

In [45]:
print(confusion_matrix(y_test,prediction))

[[281   3   5  30]
 [  4 373  10   2]
 [  8  14 372   0]
 [ 48   4   9 190]]


In [46]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [47]:
print(classification_report(y_test,prediction))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85       319
          1       0.95      0.96      0.95       389
          2       0.94      0.94      0.94       394
          3       0.86      0.76      0.80       251

avg / total       0.90      0.90      0.90      1353



In [68]:
def get_data2():
    vectorizer = TfidfVectorizer('english', stop_words=stops)

#     categories = ['alt.atheism', 'talk.religion.misc',
#                   'comp.graphics', 'sci.space']

    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train',
                                           shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target

    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test',
                                          shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target

    return X_train, y_train, X_test, y_test

In [74]:
mnb = MultinomialNB(.01)
mnb.fit(X_train, y_train)
prediction = mnb.predict(X_test)
accuracy_score(y_test,prediction)

0.893569844789357

In [89]:

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Convert data to tf-idf

vectorizer = TfidfVectorizer(stop_words=stops)
train_data = vectorizer.fit_transform(newsgroups_train.data)
test_data = vectorizer.transform(newsgroups_test.data)
train_data = train_data.todense()
test_data = test_data.todense()
train_labels = newsgroups_train.target
test_labels = newsgroups_test.target


In [None]:
mnb = MultinomialNB(.01)
mnb.fit(train_data, train_labels)
prediction = mnb.predict(test_data)
accuracy_score(test_labels,prediction)