# Topic modeling

## imports

In [1]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import joblib

In [2]:
train_data = datasets.fetch_20newsgroups(subset='train')
test_data = datasets.fetch_20newsgroups(subset='test')

In [9]:
print(test_data.data[2])

From: mathew <mathew@mantis.co.uk>
Subject: Re: STRONG & weak Atheism
Organization: Mantis Consultants, Cambridge. UK.
X-Newsreader: rusnews v1.02
Lines: 9

acooper@mac.cc.macalstr.edu (Turin Turambar, ME Department of Utter Misery) writes:
> Did that FAQ ever got modified to re-define strong atheists as not those who
> assert the nonexistence of God, but as those who assert that they BELIEVE in 
> the nonexistence of God?

In a word, yes.


mathew



In [6]:
test_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
test_data.target_names[test_data.target[2]]

'alt.atheism'

In [12]:
print(test_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

Classes                     20
Samples total            18846
Dimensionality               1
Features                  text

In [None]:
test_data.target

In [None]:
test_data.target_names

## Model training

In [11]:
vectorizer = TfidfVectorizer(min_df=100)

vectorized_train_data = vectorizer.fit_transform(train_data.data)
vectorized_test_data = vectorizer.transform(test_data.data)

In [41]:
vectorizer.vocabulary_

{'from': 934,
 'umd': 2254,
 'edu': 754,
 'where': 2364,
 'my': 1455,
 'thing': 2171,
 'subject': 2089,
 'what': 2360,
 'car': 429,
 'is': 1156,
 'this': 2177,
 'nntp': 1490,
 'posting': 1677,
 'host': 1076,
 'organization': 1560,
 'university': 2266,
 'of': 1523,
 'maryland': 1344,
 'college': 512,
 'park': 1593,
 'lines': 1275,
 '15': 21,
 'was': 2337,
 'wondering': 2396,
 'if': 1099,
 'anyone': 229,
 'out': 1572,
 'there': 2167,
 'could': 587,
 'me': 1358,
 'on': 1538,
 'saw': 1894,
 'the': 2161,
 'other': 1567,
 'day': 631,
 'it': 1166,
 'door': 718,
 'sports': 2040,
 'looked': 1297,
 'to': 2198,
 'be': 322,
 'late': 1230,
 'early': 742,
 'called': 417,
 'were': 2356,
 'really': 1783,
 'small': 1994,
 'in': 1115,
 'addition': 161,
 'front': 935,
 'separate': 1935,
 'rest': 1846,
 'body': 371,
 'all': 187,
 'know': 1213,
 'can': 423,
 'model': 1413,
 'name': 1458,
 'engine': 777,
 'specs': 2032,
 'years': 2419,
 'production': 1716,
 'made': 1319,
 'history': 1063,
 'or': 1556,
 'wha

In [13]:
model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

In [14]:
model.fit(vectorized_train_data, train_data.target)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
train_preds = model.predict(vectorized_train_data)
test_preds = model.predict(vectorized_test_data)

In [16]:
accuracy_score(train_preds, train_data.target)

0.9999116139296447

In [17]:
accuracy_score(test_preds, test_data.target)

0.6990175252257037

In [35]:
test_obj = 'I like to play baseball. Hockey is winter sport, a lot of people like it as well'

In [36]:
vectorized_test_obj = vectorizer.transform([test_obj])

In [37]:
print(vectorized_test_obj)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13 stored elements and shape (1, 2430)>
  Coords	Values
  (0, 268)	0.14855472302993877
  (0, 313)	0.38955525023966275
  (0, 1065)	0.3995253682023586
  (0, 1156)	0.10591423756917533
  (0, 1166)	0.1105549281529538
  (0, 1270)	0.3567593321839033
  (0, 1305)	0.28841070453530554
  (0, 1523)	0.0942810397606895
  (0, 1613)	0.2069585413256459
  (0, 1646)	0.3412733730345204
  (0, 2039)	0.456130725666751
  (0, 2198)	0.09210834242515185
  (0, 2354)	0.2189339053673978


In [38]:
model.predict(vectorized_test_obj)[0]

np.int64(10)

In [39]:
train_data.target_names[model.predict(vectorized_test_obj)[0]]

'rec.sport.hockey'

## Dump vectorizer and model

In [42]:
with open('news_vectorizer_dump_tfidf.bin', 'wb') as output_file:
    joblib.dump(vectorizer, output_file)

In [43]:
with open('news_model_dump_rf.bin', 'wb') as output_file:
    joblib.dump(model, output_file)