# News classification

## Imports

In [50]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import pickle

## Data loading & processing

In [51]:
train_data = datasets.fetch_20newsgroups(subset='train')

In [52]:
test_data = datasets.fetch_20newsgroups(subset='test')

In [18]:
vectorizer = CountVectorizer(min_df=100)

vectorized_train_data = vectorizer.fit_transform(train_data.data)
vectorized_test_data = vectorizer.transform(test_data.data)

In [32]:
test_data.target

array([ 7,  5,  0, ...,  9,  6, 15])

In [33]:
test_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Model training

In [53]:
model = RandomForestClassifier(n_estimators=100)
model.fit(vectorized_train_data, train_data.target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [54]:
train_preds = model.predict(vectorized_train_data)
test_preds = model.predict(vectorized_test_data)

In [55]:
accuracy_score(train_data.target, train_preds)

0.99991161392964467

In [56]:
accuracy_score(test_data.target, test_preds)

0.70286776420605412

In [39]:
test_obj = 'I heard that python programming language now is very popular technology for scientific projects'
vectorized_test_obj = vectorizer.transform([test_obj])

In [40]:
 model.predict(vectorized_test_obj)[0]

4

In [59]:
label =  model.predict(vectorized_test_obj)[0]
test_data.target_names[label]

'comp.graphics'

## Dump vectorizer & model

In [31]:
with open('news_vectorizer_dump.pkl', 'w') as output_file:
    pickle.dump(vectorizer, output_file)

with open('news_model_dump.pkl', 'w') as output_file:
    pickle.dump(model, output_file)