In [1]:
## Import required Packages and Modules
import sklearn
import sklearn.datasets as skd
from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer as TT
from sklearn.naive_bayes import MultinomialNB

In [2]:
## Helper function that maps each numeric class label to a human-readable class name

# Human-readable class names, indexed by the integer class label (0-19).
_CLASS_NAMES = (
    'Atheism',
    'Computer Graphics',
    'MS Windows',
    'IBM Hardware',
    'Mac Hardware',
    'Windows X',
    'For Sale',
    'Automobile',
    'Motorcycle',
    'Baseball',
    'Hockey',
    'Cryptography',
    'Electronics',
    'Medical',
    'Space',
    'Politics Guns',
    'Politics Mideast',
    'Politics Misc',
    'Christian',
    'Religion Misc',
)


def naming(n):
    """Return the human-readable name for class label ``n``.

    Parameters
    ----------
    n : int-like
        Class label in the range 0-19. Any value that compares equal to one
        of those integers is accepted, e.g. a plain ``int``, a NumPy integer
        or a one-element NumPy array.

    Returns
    -------
    str
        The matching class name, or ``'No Class found'`` when ``n`` does not
        equal any known label.
    """
    # Compare with ``==`` rather than indexing or hashing: unhashable but
    # comparable inputs (such as one-element NumPy arrays) then work too, and
    # negative numbers never wrap around to the end of the table.
    for label, class_name in enumerate(_CLASS_NAMES):
        if n == label:
            return class_name
    return 'No Class found'

In [3]:
## Choose the newsgroup categories to load, then read the training and test
## corpora from disk (one sub-folder per category under each directory).
## NOTE(review): the standard 20 Newsgroups distribution names the Christian
## group 'soc.religion.christian'; 'talk.religion.christian' presumably matches
## the local folder name -- confirm against the data directory.

TRAIN_DIR = r'/home/suraj/Documents/text/20_newsgroups'
TEST_DIR = r'/home/suraj/Documents/text/mini_newsgroups'

categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'alt.atheism',
    'talk.religion.christian',
    'sci.med',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.space',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Both corpora are read with the same category filter and text encoding.
load_options = dict(categories=categories, encoding='ISO-8859-1')
news_train = skd.load_files(TRAIN_DIR, **load_options)
news_test = skd.load_files(TEST_DIR, **load_options)

In [4]:
## Sanity check: number of documents loaded for training and for testing
print(len(news_train.data))
print(len(news_test.data))

20000
2000


In [5]:
## One class label per training document
news_train.target.shape

(20000,)

In [6]:
## Create a bag-of-words vectoriser (scikit-learn's CountVectorizer with its
## default settings)

count_vect = CountVectorizer()

## Learn the vocabulary from the training documents and convert them into a
## sparse document-term matrix of raw token counts
## (one row per document, one column per vocabulary term)

x_train_tf= count_vect.fit_transform(news_train.data)
x_train_tf.shape

(20000, 209403)

In [7]:
## Show the vectoriser's configuration
count_vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
## Inspect the sparse count matrix (dimensions, dtype, number of stored entries)
x_train_tf

<20000x209403 sparse matrix of type '<class 'numpy.int64'>'
	with 3982224 stored elements in Compressed Sparse Row format>

In [9]:
## Re-weight the raw token counts with TF-IDF (TT is TfidfTransformer).
## The transformer is fitted on the training counts; the resulting matrix has
## the same dimensions as the count matrix.
tfidf_transformer = TT()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_tf)
x_train_tfidf.shape

(20000, 209403)

In [10]:
## Create a Multinomial Naive Bayes classifier and fit it on the
## TF-IDF training features and their class labels
clf = MultinomialNB().fit(x_train_tfidf, news_train.target)

In [11]:
## Show the fitted classifier's hyper-parameters
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
## Transform the test data with the vectoriser and TF-IDF transformer that were
## fitted on the training data (transform only, no re-fitting), so the test
## matrix has the same columns as the training matrix

x_test_tf= count_vect.transform(news_test.data)
x_test_tfidf = tfidf_transformer.transform(x_test_tf)
x_test_tfidf.shape

(2000, 209403)

In [16]:
## Predict a class label for every test document

predicted=clf.predict(x_test_tfidf)
print(predicted)

[ 4 11 11 ... 16  8  6]


In [17]:
## Print the overall accuracy, then precision / recall / f1-score / support for
## each class, and finally display the confusion matrix as the cell's output
## (rows: true class, columns: predicted class)

print('Accuracy achieved is ' + str(metrics.accuracy_score(news_test.target, predicted)))
print(metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names))
metrics.confusion_matrix(news_test.target, predicted)

Accuracy achieved is 0.943
                          precision    recall  f1-score   support

             alt.atheism       0.82      0.87      0.84       100
           comp.graphics       0.96      0.94      0.95       100
 comp.os.ms-windows.misc       0.94      0.97      0.96       100
comp.sys.ibm.pc.hardware       0.93      0.97      0.95       100
   comp.sys.mac.hardware       0.99      0.96      0.97       100
          comp.windows.x       0.98      0.93      0.95       100
            misc.forsale       0.99      0.95      0.97       100
               rec.autos       1.00      0.98      0.99       100
         rec.motorcycles       0.98      1.00      0.99       100
      rec.sport.baseball       0.98      0.98      0.98       100
        rec.sport.hockey       0.98      0.99      0.99       100
               sci.crypt       0.96      1.00      0.98       100
         sci.electronics       0.97      0.95      0.96       100
                 sci.med       1.00      0.97   

array([[ 87,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   2,   1,  10],
       [  0,  94,   2,   0,   0,   2,   0,   0,   0,   0,   0,   0,   2,
          0,   0,   0,   0,   0,   0,   0],
       [  0,   0,  97,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0],
       [  1,   0,   2,  97,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,   2,  96,   0,   1,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0],
       [  0,   1,   1,   0,   0,  93,   0,   0,   1,   0,   0,   3,   0,
          0,   0,   1,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   0,  95,   0,   0,   1,   1,   0,   0,
          0,   0,   1,   0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  98,   0,   0,   0,   0,   1,
          0,   0,   1,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0, 100,   0,   0,  

In [18]:
## Read one piece of free text from the user; it is classified in the cells below
## NOTE: input() blocks until text is entered, so this cell pauses an unattended "Run All"

test_sample = input("Enter data for which you want to check: ")

Enter data for which you want to check: hockey is a game


In [20]:
## Wrap the raw string in a list, because the vectoriser expects an iterable of
## documents rather than a single string.
## The isinstance guard keeps this cell idempotent: re-running it does not nest
## the sample inside a second list.

if isinstance(test_sample, str):
    test_sample = [test_sample]

In [21]:
## Vectorise the sample with the already-fitted count vectoriser and TF-IDF
## transformer (transform only, no re-fitting)

test_sample_tf= count_vect.transform(test_sample)
test_sample_tfidf = tfidf_transformer.transform(test_sample_tf)
test_sample_tfidf.shape

(1, 209403)

In [22]:
## Predict the class of the sample using the classifier trained earlier
## (the result is an array with one label, since there is one sample)

predict_sample = clf.predict(test_sample_tfidf)

In [23]:
## Echo the text that was classified
test_sample[0]

'hockey is a game'

In [24]:
## Raw prediction: the numeric class label for the sample
print(predict_sample)

[10]


In [25]:
## Print the human-readable name of the predicted class.
## predict() returns an array with one label per sample, so pass the single
## label itself (not the whole array) to the naming helper.

print(naming(predict_sample[0]))

Hockey
