# Topic modeling: classifying 20 Newsgroups posts by topic

## Imports

In [None]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import joblib

In [None]:
# Load the predefined 'train' and 'test' splits of the 20 Newsgroups dataset.
# The tuple is evaluated left to right, so the train split is fetched first.
train_data, test_data = (
    datasets.fetch_20newsgroups(subset='train'),
    datasets.fetch_20newsgroups(subset='test'),
)

In [None]:
# Print the test-split entry at index 10 to see what a single document looks like.
print(test_data.data[10])

In [None]:
# Look up the class name for the test entry at index 10:
# target[10] is used as an index into target_names.
test_data.target_names[test_data.target[10]]

In [None]:
# Print the DESCR attribute carried by the dataset object.
print(test_data.DESCR)

In [None]:
# Display the targets of the test split (used elsewhere as indices into target_names).
test_data.target

In [None]:
# Display the class names; a target value selects its name from this sequence.
test_data.target_names

## Model training

In [None]:
# TF-IDF features; min_df=100 is passed so that rarely occurring terms are left
# out of the vocabulary.
vectorizer = TfidfVectorizer(min_df=100)

# Fit on the training documents only, then reuse the fitted vectorizer for the
# test documents. The tuple is evaluated left to right, so fit_transform runs
# before transform.
vectorized_train_data, vectorized_test_data = (
    vectorizer.fit_transform(train_data.data),
    vectorizer.transform(test_data.data),
)

In [None]:
# Random forest with 100 trees; n_jobs=-1 asks scikit-learn to use all
# available CPU cores. A fixed random_state makes training reproducible, so the
# reported accuracies and the dumped model are the same on every run.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [None]:
# Train the classifier on the vectorized training documents and their targets.
model.fit(vectorized_train_data, train_data.target)

In [None]:
# Predict classes for both feature matrices, training split first.
train_preds, test_preds = (
    model.predict(features)
    for features in (vectorized_train_data, vectorized_test_data)
)

In [None]:
# Accuracy on the training split. Arguments follow the documented
# (y_true, y_pred) order: ground-truth targets first, predictions second.
accuracy_score(train_data.target, train_preds)

In [None]:
# Accuracy on the held-out test split. Arguments follow the documented
# (y_true, y_pred) order: ground-truth targets first, predictions second.
accuracy_score(test_data.target, test_preds)

In [None]:
# Hand-written sample text used to try the trained pipeline on unseen input.
test_obj = 'I like to play hockey. Hockey is winter sport, a lot of people like it as well'

In [None]:
# Vectorize the sample with the vectorizer fitted on the training data.
# The text is wrapped in a list because transform is given a collection of documents.
vectorized_test_obj = vectorizer.transform([test_obj])

In [None]:
# Print the vectorized sample for inspection.
print(vectorized_test_obj)

In [None]:
# Predicted target for the sample; [0] takes the only element, since one document was passed.
model.predict(vectorized_test_obj)[0]

In [None]:
# Translate the predicted target into its class name via target_names.
train_data.target_names[model.predict(vectorized_test_obj)[0]]

## Dump vectorizer and model

In [None]:
# Serialize the fitted vectorizer to disk with joblib. The file is opened in
# binary write mode and closed automatically when the with-block exits.
# NOTE: joblib dumps are pickle-based; only load them back from trusted sources.
with open('news_vectorizer_dump_tfidf.pkl', 'wb') as output_file:
    joblib.dump(vectorizer, output_file)

In [None]:
# Serialize the trained classifier to disk with joblib. The file is opened in
# binary write mode and closed automatically when the with-block exits.
# NOTE: joblib dumps are pickle-based; only load them back from trusted sources.
with open('news_model_dump_rf.pkl', 'wb') as output_file:
    joblib.dump(model, output_file)