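# Text Classification

Train a bag-of-words random forest on the 20 newsgroups dataset, check its accuracy on the train and test splits, and save the fitted model and vectorizer with joblib.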

In [2]:
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import joblib

## Load data

In [3]:
# Load the training split of the 20 newsgroups corpus.
# The raw posts keep their header lines (From/Subject/Lines are visible in the
# sample document printed further down).
# NOTE(review): the `remove=('headers', 'footers', 'quotes')` option may give a more
# realistic accuracy estimate — confirm whether keeping the metadata is intended.
train_data = datasets.fetch_20newsgroups(subset='train')

In [4]:
# Load the test split, used only for evaluation and never for fitting.
test_data = datasets.fetch_20newsgroups(subset='test')

In [5]:
# List the fields exposed by the dict-like dataset object.
train_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [6]:
# Print the dataset's bundled description text.
print(train_data['DESCR'])

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [8]:
# Show one raw training document (index 123 appears to be an arbitrary sample).
print(train_data.data[123])

From: JEK@cu.nih.gov
Subject: John 3:16 paraphrased
Lines: 25

At the end of a recent (Mon 19 Apr 1993) post, Alastair Thomson
offers the following "paraphrase" of John 3:16:

   "God loved the world so much, that he gave us His Son,
   to die in our place, so that we may have eternal life."

The "to die in our place" bothers me, since it inserts into the
verse a doctrine not found in the original. Moreover, I suspect that
the poster intends to affirm, not merely substitution, but forensic
(or penal) substitution.  I maintain that the Scriptures in speaking
of the Atonement teach a doctrine of Substitution, but not one of
Forensic Substitution.

Those interested in pursuing the matter are invited to send for my
essays on Genesis, either 4 thru 7 (on this question) or 1 through 7
(with lead-in).  The n'th essay can be obtained by sending to
LISTSERV@ASUACAD.BITNET or to LISTSERV@ASUVM.INRE.ASU.EDU the
message
   GET GEN0n RUFF

 Yours,
 James Kiefer

 "Any theologian worth his salt can 

In [9]:
# Translate the sample document's integer target into its newsgroup name.
print(train_data.target_names[train_data.target[123]])

soc.religion.christian


In [10]:
# Integer class id of the same document; it is the index into target_names.
train_data.target[123]

15

## Modeling

In [45]:
# Bag-of-words vectorizer with default settings.
# NOTE(review): the name is a misspelling of "vectorizer"; it is referenced under this
# spelling by the transform, predict and dump cells, so rename them all together.
vectroizer = CountVectorizer()

In [46]:
# Fit the vectorizer on the training documents only, then encode both splits with
# that same fitted vectorizer; the test documents are transformed, never fitted on.
processed_train = vectroizer.fit_transform(train_data.data)
processed_test = vectroizer.transform(test_data.data)

In [47]:
# Show the sparse encoding of document 123; each printed entry is
# (row, column index) followed by the stored count.
print(processed_train[123])

  (0, 56979)	1
  (0, 50527)	1
  (0, 85354)	1
  (0, 111322)	1
  (0, 114731)	1
  (0, 89362)	7
  (0, 76032)	1
  (0, 80638)	1
  (0, 89860)	3
  (0, 114455)	14
  (0, 68766)	1
  (0, 115475)	7
  (0, 32311)	1
  (0, 66608)	7
  (0, 37565)	2
  (0, 90252)	3
  (0, 62221)	1
  (0, 35983)	1
  (0, 56283)	1
  (0, 106030)	1
  (0, 81263)	1
  (0, 124616)	1
  (0, 29241)	1
  (0, 108558)	2
  (0, 58293)	1
  :	:
  (0, 34530)	1
  (0, 67502)	1
  (0, 121230)	1
  (0, 48405)	2
  (0, 83720)	1
  (0, 67722)	1
  (0, 26526)	1
  (0, 111449)	4
  (0, 56341)	2
  (0, 92833)	1
  (0, 105459)	1
  (0, 30175)	1
  (0, 97099)	1
  (0, 52553)	1
  (0, 52551)	1
  (0, 76164)	2
  (0, 30024)	1
  (0, 67466)	1
  (0, 30013)	1
  (0, 58569)	1
  (0, 103446)	1
  (0, 72622)	1
  (0, 114535)	1
  (0, 104455)	1
  (0, 41228)	1


In [48]:
# Random forest configuration; the fixed random_state keeps repeated fits comparable.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    n_jobs=-1,  # -1 asks scikit-learn to use all available cores
    random_state=111,
)

In [49]:
# Train the forest on the vectorized training documents and their integer class ids.
model.fit(processed_train, train_data.target)

RandomForestClassifier(max_depth=50, n_estimators=200, n_jobs=-1,
                       random_state=111)

In [50]:
# Predict on both splits so train and test accuracy can be compared.
train_preds = model.predict(processed_train)
test_preds = model.predict(processed_test)

In [51]:
# Accuracy on the training split, i.e. on documents the model was fitted on.
accuracy_score(train_data.target, train_preds)

0.9870956337281245

In [52]:
# Accuracy on the test split; compare with the training figure to gauge overfitting.
accuracy_score(test_data.target, test_preds)

0.7712426978226234

In [55]:
# Hand-written sample text used to sanity-check the fitted model.
# The text is assembled from adjacent string literals; a backslash continuation
# inside a single literal would embed each continued line's indentation in the value.
# The wording (including the "athlets" spelling) is kept as-is so the tokens fed to
# the vectorizer do not change.
test_obj = (
    'I like sport. '
    'Everybody recently saw Olympic Games and supported athlets around the World. '
    'Hockey was amazing'
)

In [59]:
# Predict the class id for the sample; the text is wrapped in a list so it is
# passed to the vectorizer as a one-document collection.
model.predict(vectroizer.transform([test_obj]))

array([10])

In [62]:
# Take the single predicted class id and look up its newsgroup name.
label = model.predict(vectroizer.transform([test_obj]))[0]
train_data.target_names[label]

'rec.sport.hockey'

In [63]:
# Print every newsgroup name; the list position is the integer class id.
print(train_data.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


## Save the model and vectorizer

In [53]:
# Persist the fitted classifier to disk with joblib.
with open('text_classification_model.pkl', 'wb') as model_file:
    joblib.dump(model, model_file)

In [54]:
# Persist the fitted vectorizer too; new text must be encoded with this same
# object before it is passed to the saved classifier.
with open('text_vectorizer.pkl', 'wb') as vectorizer_file:
    joblib.dump(vectroizer, vectorizer_file)

In [64]:
# List the working directory to confirm that both .pkl files were written.
! ls .

newsgropus_modeling.ipynb     text_classification_model.pkl
newsgroups_classifier.py      text_vectorizer.pkl
newsgroups_demo_app.py


In [44]:
%%bash
# List the contents of the parent directory.
cd ..
ls .

insurance_premium_sample_visualization.ipynb
ip_raw_data_profiling_report.html
ml_4_project_preliminarly_phase_emeli_dral.pdf
ml_5_data_sample_analysis_emeli_dral.pdf
ml_6_data_reproducable_experiments_emeli_dral.pdf
streamlit_app


In [None]:
! pip install streamlit
! streamlit hello