# Text Classification

In [3]:
#!pip install streamlit

In [9]:
from sklearn import datasets # for news data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier # fast wo set parameters and wo overfitting
from sklearn.linear_model import LogisticRegression # good for text
from sklearn.metrics import accuracy_score

import joblib # save model

## Data load

In [10]:
train_data = datasets.fetch_20newsgroups(subset='train')
test_data = datasets.fetch_20newsgroups(subset='test')

In [11]:
train_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [12]:
print(train_data['DESCR']) # description of data

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [17]:
print(train_data.data[1]) # 1 example of news
print('______')
print('class_news:', train_data.target_names[1]) # 1 example of target news

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>

______
class_news: comp.graphics


## Modeling

In [18]:
vectorizer = TfidfVectorizer()

In [21]:
processed_train = vectorizer.fit_transform(train_data.data)
processed_test = vectorizer.transform(test_data.data)

In [23]:
print(processed_train[1]) # vector words in news 1: [id : weight]

  (0, 28447)	0.0827399210253713
  (0, 62224)	0.06014114887968464
  (0, 48568)	0.05212944077716303
  (0, 31951)	0.0682268643545607
  (0, 73213)	0.06558492204963992
  (0, 86752)	0.0615011307981042
  (0, 108558)	0.028697612584528263
  (0, 45311)	0.056926607000293034
  (0, 117211)	0.040240996177741725
  (0, 86977)	0.05165102986373082
  (0, 111694)	0.11297527240874912
  (0, 124332)	0.02986287972296624
  (0, 101143)	0.08955513661810549
  (0, 52522)	0.05928183026939788
  (0, 29241)	0.02339528802285112
  (0, 55921)	0.09570690696010355
  (0, 20003)	0.0712646074899511
  (0, 57269)	0.09495748052852195
  (0, 47721)	0.06316047569344484
  (0, 55923)	0.075876012972072
  (0, 92942)	0.06355704213403876
  (0, 119719)	0.08790617636347903
  (0, 64153)	0.0794400362156068
  (0, 107590)	0.11297527240874912
  (0, 62501)	0.08155291589656467
  :	:
  (0, 37960)	0.1781283956161874
  (0, 60993)	0.2286641809225034
  (0, 128420)	0.03179454838164461
  (0, 114428)	0.04095433847953403
  (0, 94362)	0.08242288872628986
 

In [24]:
model = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42)

In [25]:
model.fit(processed_train, train_data.target)

RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42)

In [26]:
train_preds = model.predict(processed_train)
test_preds = model.predict(processed_test)

In [27]:
print('train quality:', accuracy_score(train_data.target, train_preds))
print('test quality:', accuracy_score(test_data.target, test_preds))

train quality: 0.9999116139296447
test quality: 0.7912904938927243


In [29]:
model_lr = LogisticRegression(random_state=42)

In [30]:
model_lr.fit(processed_train, train_data.target)

LogisticRegression(random_state=42)

In [31]:
train_preds_lr = model_lr.predict(processed_train)
test_preds_lr = model_lr.predict(processed_test)

In [32]:
print('train quality:', accuracy_score(train_data.target, train_preds_lr))
print('test quality:', accuracy_score(test_data.target, test_preds_lr))

train quality: 0.9761357610040657
test quality: 0.8274030801911842


## Testing

In [36]:
test_obj = "Steadman, 28, was favourite for gold five years ago but was beaten \
    by American Grace Norman after making an error in the swim section of the race. \
    The former Strictly Come Dancing contestant turned the tables in style, \
    beating Norman by 41 seconds with GB team-mate Claire Cashmore in third. \
    George Peasgood took silver in the men's PTS5 race behind Martin Schultz."

In [42]:
label = model_lr.predict(vectorizer.transform([test_obj]))[0]
train_data.target_names[label]

'rec.sport.hockey'

## Dump model

In [34]:
with open('text_vectorizer.pkl', 'wb') as output:
    joblib.dump(vectorizer, output)
with open('text_classif_model.pkl', 'wb') as output:
    joblib.dump(model_lr, output)