In [13]:
from pathlib import Path

import joblib
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

## Load data

In [2]:
# Fetch both predefined splits of the 20 newsgroups corpus in one pass.
train_data, test_data = (
    datasets.fetch_20newsgroups(subset=split_name)
    for split_name in ("train", "test")
)

In [3]:
# Check which container type the loader returned.
type(train_data)

sklearn.utils._bunch.Bunch

In [5]:
# List the fields available on the loaded dataset object.
train_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [6]:
# Print the dataset description that ships with the loaded data.
print(train_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

Classes                     20
Samples total            18846
Dimensionality               1
Features                  text

In [10]:
# Show the raw text of the test document at index 1.
print(test_data.data[1])

From: Rick Miller <rick@ee.uwm.edu>
Subject: X-Face?
Organization: Just me.
Lines: 17
Distribution: world
NNTP-Posting-Host: 129.89.2.33
Summary: Go ahead... swamp me.  <EEP!>

I'm not familiar at all with the format of these "X-Face:" thingies, but
after seeing them in some folks' headers, I've *got* to *see* them (and
maybe make one of my own)!

I've got "dpg-view" on my Linux box (which displays "uncompressed X-Faces")
and I've managed to compile [un]compface too... but now that I'm *looking*
for them, I can't seem to find any X-Face:'s in anyones news headers!  :-(

Could you, would you, please send me your "X-Face:" header?

I *know* I'll probably get a little swamped, but I can handle it.

	...I hope.

Rick Miller  <rick@ee.uwm.edu> | <ricxjo@discus.mil.wi.us>   Ricxjo Muelisto
Send a postcard, get one back! | Enposxtigu bildkarton kaj vi ricevos alion!
          RICK MILLER // 16203 WOODS // MUSKEGO, WIS. 53150 // USA



In [11]:
# Integer class label of the test document at index 1.
print(test_data.target[1])

5


In [12]:
# Translate the integer label of test document 1 into its newsgroup name.
sample_label = test_data.target[1]
print(test_data.target_names[sample_label])

comp.windows.x


## Modeling

In [14]:
# TF-IDF features; min_df=100 drops terms whose document frequency is below 100
# (per the TfidfVectorizer docs), which keeps the vocabulary small.
vectorizer = TfidfVectorizer(min_df=100)

In [36]:
# Random forest with a fixed seed, trees capped at depth 20, and n_jobs=-1
# for parallel execution.
model = RandomForestClassifier(
    max_depth=20,
    random_state=111,
    n_jobs=-1,
)

In [17]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer for the test texts so that the test split is only transformed.
vectorized_train_data = vectorizer.fit_transform(train_data.data)
vectorized_test_data = vectorizer.transform(test_data.data)

In [37]:
# Train the classifier on the TF-IDF features of the training split.
model.fit(X=vectorized_train_data, y=train_data.target)

In [38]:
# Predict labels for both splits with the fitted model.
train_pred, test_pred = (
    model.predict(feature_matrix)
    for feature_matrix in (vectorized_train_data, vectorized_test_data)
)

In [39]:
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
# Accuracy itself is symmetric, but keeping the documented order avoids wrong
# results if these calls are later swapped for an asymmetric metric.
print(f"train accuracy score: {accuracy_score(train_data.target, train_pred)}")
print(f"test accuracy score: {accuracy_score(test_data.target, test_pred)}")

train accuracy score: 0.8739614636733251
test accuracy score: 0.6548061603823686


In [40]:
# Free-form sample question used below to sanity-check the trained classifier.
my_text = """
How do I install docker to my Windows server? It there any installation guide avaliable? I'm not advanced Windows user yet.
"""

In [42]:
# Vectorize the sample question with the fitted vectorizer, then classify it.
my_text_features = vectorizer.transform([my_text])
model.predict(my_text_features)

array([2])

In [43]:
# Map the predicted integer label for the sample question to its newsgroup name.
predicted_label = model.predict(vectorizer.transform([my_text]))[0]
test_data.target_names[predicted_label]

'comp.os.ms-windows.misc'

## Dump assets

In [46]:
# Create the output directory in pure Python: unlike `!mkdir demo`, this does
# not fail when the notebook is re-run and the directory already exists, and
# it does not depend on the shell available on the host.
Path("demo").mkdir(parents=True, exist_ok=True)

In [48]:
# Persist every fitted asset needed for inference. The classifier alone is not
# enough: new text must be transformed by the same fitted TfidfVectorizer, so
# the vectorizer is saved next to the model.
assets = {
    "demo/model.pkl": model,
    "demo/vectorizer.pkl": vectorizer,
}
for asset_path, asset in assets.items():
    joblib.dump(asset, asset_path)

## Load assets

In [49]:
# Reload the classifier from disk to confirm the dump round-trips.
# NOTE(review): joblib files are pickle-based; only load files you trust.
loaded_model = joblib.load("demo/model.pkl")

In [50]:
# Classify the sample question with the reloaded model and show the class name.
loaded_label = loaded_model.predict(vectorizer.transform([my_text]))[0]
test_data.target_names[loaded_label]

'comp.os.ms-windows.misc'