In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Restrict the corpus to these four "sci.*" newsgroups; both the train and
# the test split below are fetched with this same list.
selected_categories = [
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
]

In [6]:
# Settings common to both splits, written once so the train and test calls
# cannot silently drift apart (same cache directory, categories and seed).
shared_fetch_options = dict(
    data_home="newsgroup_data",
    categories=selected_categories,
    shuffle=True,
    random_state=1,
)

# Identical loader call for each split; only `subset` differs.
newsgroup_posts_train = fetch_20newsgroups(subset="train", **shared_fetch_options)
newsgroup_posts_test = fetch_20newsgroups(subset="test", **shared_fetch_options)

In [7]:
# Check what kind of container the loader returned before using its attributes.
type(newsgroup_posts_train)

sklearn.utils.Bunch

In [9]:
# Print the description text bundled with the dataset; print() renders its
# line breaks, whereas a bare expression would show the escaped repr.
print(newsgroup_posts_train.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [11]:
# Show the raw text of one training post (index 6) to see what an
# unprocessed sample looks like, headers included.
print(newsgroup_posts_train.data[6])

From: pmetzger@snark.shearson.com (Perry E. Metzger)
Subject: Do we need the clipper for cheap security?
Organization: Partnership for an America Free Drug
Lines: 53

amanda@intercon.com (Amanda Walker) writes:
>> The answer seems obvious to me, they wouldn't.  There is other hardware 
>> out there not compromised.  DES as an example (triple DES as a better 
>> one.) 
>
>So, where can I buy a DES-encrypted cellular phone?  How much does it cost?
>Personally, Cylink stuff is out of my budget for personal use :)...

If the Clipper chip can do cheap crypto for the masses, obviously one
could do the same thing WITHOUT building in back doors.

Indeed, even without special engineering, you can construct a good
system right now. A standard codec chip, a chip to do vocoding, a DES
chip, a V32bis integrated modem module, and a small processor to do
glue work, are all you need to have a secure phone. You can dump one
or more of the above if you have a fast processor. With integration,
you could 

In [12]:
# Integer label stored for the same post (index 6) that was printed above.
newsgroup_posts_train.target[6]

0

In [13]:
# Category names for the loaded posts; an integer label in `.target` is an
# index into this list.
# Fixed: this cell previously read `newgroup_posts_train` (missing "s"), a
# name no visible cell defines, so it raised NameError on a fresh kernel.
newsgroup_posts_train.target_names

['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']

In [14]:
# Label of a second training post (index 10), for comparison with post 6.
newsgroup_posts_train.target[10]

1

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Token-count vectorizer left at its default settings (no arguments passed).
count_vect = CountVectorizer()

In [17]:
# Learn the vocabulary from the training posts only; the test split is
# deliberately not part of fitting.
count_vect.fit(newsgroup_posts_train.data)

CountVectorizer()

In [18]:
# Array of the terms the fitted vectorizer learned.
count_vect.get_feature_names_out()

array(['00', '000', '0000', ..., 'ête', 'íålittin', 'ýé'], dtype=object)

In [19]:
# Vocabulary size, which is also the number of feature columns produced by
# count_vect.transform().
len(count_vect.get_feature_names_out())

38683

In [20]:
# Peek at an arbitrary 50-term slice of the vocabulary to get a feel for
# the kind of tokens that were extracted.
count_vect.get_feature_names_out()[10000:10050]

array(['cellar', 'cellphone', 'cells', 'cellsat', 'cellular', 'cellulars',
       'celluloid', 'celp', 'celsius', 'cement', 'cen', 'censoring',
       'censorship', 'censure', 'census', 'cent', 'centaur', 'centauri',
       'centaurs', 'centennial', 'center', 'centered', 'centerline',
       'centerpiece', 'centers', 'centigrade', 'centimeter',
       'centimeters', 'central', 'centralia', 'centralised', 'centralism',
       'centralization', 'centralize', 'centralized', 'centrally',
       'centre', 'centres', 'centrifuge', 'centronic', 'cents', 'centure',
       'centuries', 'century', 'ceo', 'cepek', 'cephalopods', 'cept',
       'ceramic', 'cereal'], dtype=object)

In [21]:
# Mapping from each learned term to its integer column index.
# NOTE(review): this displays the entire mapping (tens of thousands of
# entries); consider showing only a small sample to keep the output short.
count_vect.vocabulary_

{'from': 16874,
 'myers': 24949,
 'cs': 12139,
 'scarolina': 31323,
 'edu': 14486,
 'daniel': 12461,
 'subject': 33688,
 're': 29468,
 'is': 20559,
 'msg': 24737,
 'sensitivity': 31723,
 'superstition': 33952,
 'organization': 26440,
 'usc': 36540,
 'department': 12983,
 'of': 26126,
 'computer': 11168,
 'science': 31420,
 'lines': 22467,
 '39': 3170,
 'frequently': 16834,
 'late': 21996,
 'have': 18389,
 'been': 8093,
 'reacting': 29479,
 'to': 35157,
 'something': 32692,
 'added': 5849,
 'restaurant': 30292,
 'foods': 16578,
 'what': 37759,
 'happens': 18290,
 'that': 34796,
 'the': 34802,
 'inside': 20111,
 'my': 24940,
 'throat': 34979,
 'starts': 33253,
 'feel': 16076,
 'puffy': 28922,
 'like': 22414,
 'cold': 10827,
 'and': 6641,
 'also': 6422,
 'at': 7375,
 'times': 35085,
 'mouth': 24652,
 'especially': 15285,
 'tongue': 35230,
 'lips': 22499,
 'situations': 32294,
 'around': 7133,
 'these': 34874,
 'symptoms': 34218,
 'almost': 6396,
 'always': 6466,
 'involve': 20469,
 'resta

In [22]:
# Encode every training post as a row of token counts using the vocabulary
# fitted above (one row per post, one column per vocabulary term).
# Fixed: the argument previously read `newgroup_posts_train` (missing "s"),
# a name no visible cell defines, so this cell raised NameError on a fresh
# kernel and only worked through leftover kernel state.
X_train_count = count_vect.transform(newsgroup_posts_train.data)

In [23]:
# Sanity check: (number of training posts, vocabulary size).
X_train_count.shape

(2373, 38683)

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
# use_idf=False switches off inverse-document-frequency reweighting, so the
# transformer produces term-frequency features only.
tf_transformt = TfidfTransformer(use_idf=False)

In [26]:
# Fit the transformer on the training count matrix only.
tf_transformt.fit(X_train_count)

TfidfTransformer(use_idf=False)

In [27]:
# Convert the raw training counts into term-frequency features.
X_train_tf = tf_transformt.transform(X_train_count)

In [28]:
# The transform rescales values only, so the shape should match
# X_train_count.shape.
X_train_tf.shape

(2373, 38683)

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Random forest with default hyperparameters; random_state is pinned so
# repeated runs build the same forest.
tf_random_forest_clf = RandomForestClassifier(random_state=1)

In [32]:
# Train the classifier on the term-frequency features and training labels.
tf_random_forest_clf.fit(X_train_tf, newsgroup_posts_train.target)

RandomForestClassifier(random_state=1)

In [33]:
# Reuse the vectorizer fitted on the training posts (transform only, no
# refit) so the test columns line up with the training columns.
X_test_counts = count_vect.transform(newsgroup_posts_test.data)

In [34]:
# Apply the same already-fitted transformer to the test counts.
X_test_tf = tf_transformt.transform(X_test_counts)

In [35]:
# Mean accuracy of the trained forest on the held-out test split.
# Fixed: the labels previously came from `newgroup_posts_test` (missing
# "s"), a name no visible cell defines, so this cell raised NameError on a
# fresh kernel; it now uses the test split loaded at the top of the notebook.
tf_random_forest_clf.score(X_test_tf, newsgroup_posts_test.target)

0.8511716276124129