# Classifying text data


## Get example dataset

This is the 20 Newsgroups dataset, restricted here to four of its categories.

In [6]:
from sklearn.datasets import fetch_20newsgroups

# Load the predefined train and test splits, restricted to four newsgroups
# so the example stays small.
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train', categories=categories)
twenty_test = fetch_20newsgroups(subset='test', categories=categories)

# Quick look at what was loaded: the available fields, the integer class
# labels, and the raw text of the first training document.
print('Keys: {}'.format(twenty_train.keys()))
print('Targets: {}'.format(twenty_train.target))
print('First document: {}'.format(twenty_train.data[0]))

Keys: dict_keys(['DESCR', 'target', 'target_names', 'description', 'data', 'filenames'])
Targets: [1 1 3 ..., 2 2 2]
First document: From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



## Count the words in each input document



In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents and build the sparse
# document-term matrix of raw token counts in one step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

print("Number of documents and words: {}".format(X_train_counts.shape))
print("{}".format(X_train_counts))

# vocabulary_ maps each token to its column index. It has one entry per
# column of the matrix above, so printing it whole floods the cell output;
# show only a small sample instead.
vocabulary_sample = dict(list(count_vect.vocabulary_.items())[:10])
print('{}'.format(vocabulary_sample))

Number of documents and words: (2257, 35788)
  (0, 14887)	1
  (0, 29022)	1
  (0, 8696)	4
  (0, 4017)	2
  (0, 33256)	2
  (0, 21661)	3
  (0, 9031)	3
  (0, 31077)	1
  (0, 9805)	2
  (0, 17366)	1
  (0, 32493)	4
  (0, 16916)	2
  (0, 19780)	2
  (0, 17302)	2
  (0, 23122)	1
  (0, 25663)	1
  (0, 16881)	1
  (0, 16082)	1
  (0, 23915)	1
  (0, 32142)	5
  (0, 33597)	2
  (0, 20253)	1
  (0, 587)	1
  (0, 12051)	1
  (0, 5201)	1
  :	:
  (2256, 13740)	1
  (2256, 14662)	1
  (2256, 20201)	1
  (2256, 12443)	6
  (2256, 30325)	3
  (2256, 4610)	1
  (2256, 33844)	1
  (2256, 17354)	1
  (2256, 26998)	1
  (2256, 20277)	1
  (2256, 20695)	1
  (2256, 20702)	1
  (2256, 9649)	1
  (2256, 9086)	1
  (2256, 26254)	1
  (2256, 17133)	2
  (2256, 4490)	1
  (2256, 13720)	1
  (2256, 5016)	1
  (2256, 9632)	1
  (2256, 11824)	1
  (2256, 29993)	1
  (2256, 1298)	1
  (2256, 2375)	1
  (2256, 3921)	1


## Convert the word counts into TF-IDF weighted term frequencies

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the training counts and re-weight those same
# counts in a single call. The fitted transformer is kept so that new
# documents can later be weighted the same way.
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Same matrix shape as the raw counts; only the stored values change.
print(
    "Number of documents and words: {}".format(X_train_tfidf.shape)
)
print(
    "{}".format(X_train_tfidf)
)

Number of documents and words: (2257, 35788)
  (0, 230)	0.13487105543
  (0, 12541)	0.13487105543
  (0, 3166)	0.13487105543
  (0, 14085)	0.0666645213786
  (0, 20459)	0.109605855078
  (0, 35416)	0.13487105543
  (0, 3062)	0.107836029574
  (0, 2326)	0.246455407094
  (0, 177)	0.256120262391
  (0, 31915)	0.0863191513116
  (0, 33572)	0.093130075546
  (0, 9338)	0.0496718454933
  (0, 26175)	0.0849746094347
  (0, 4378)	0.068661128808
  (0, 17556)	0.0189454643496
  (0, 32135)	0.0491023738045
  (0, 15837)	0.0541740417987
  (0, 9932)	0.063505656472
  (0, 32270)	0.0238711427382
  (0, 18474)	0.0199648817519
  (0, 27836)	0.0689905081067
  (0, 5195)	0.0310951485922
  (0, 12833)	0.125601499991
  (0, 25337)	0.0493588338398
  (0, 25361)	0.119479381457
  :	:
  (2256, 6430)	0.0482527867471
  (2256, 24052)	0.0381881683598
  (2256, 22270)	0.0464220456686
  (2256, 35638)	0.0532405739102
  (2256, 32233)	0.0341379218213
  (2256, 35157)	0.0281624509901
  (2256, 4938)	0.0320317964628
  (2256, 34923)	0.031529300665

## Train a naive Bayes classifier on these document features and the known newsgroup labels

In [9]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes model on the TF-IDF features, using the
# integer newsgroup ids in twenty_train.target as labels.
nb_classifier = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

print("Class counts: {}".format(nb_classifier.class_count_))
# class_log_prior_ holds log-probabilities (hence the negative values), so
# label the output accordingly rather than as plain priors.
print("Log priors: {}".format(nb_classifier.class_log_prior_))

Class counts: [ 480.  584.  594.  599.]
Priors: [-1.54800567 -1.35189079 -1.33491246 -1.32653018]


## Try your new classifier on some example data

In [10]:
# Two hand-written sentences, pushed through the already-fitted vectorizer
# and TF-IDF transformer (transform only, no re-fitting) so their columns
# line up with the training matrix.
docs_test = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Predict a class index per sentence and report the matching newsgroup name.
predicted = nb_classifier.predict(X_new_tfidf)
for text, label_idx in zip(docs_test, predicted):
    newsgroup = twenty_train.target_names[label_idx]
    print('{} => {}'.format(text, newsgroup))

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics
