In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# split data into training and test sets
train_emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'], subset = 'train', shuffle = True, random_state = 108)
test_emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'], subset = 'test', shuffle = True, random_state = 108)

# create a CountVectorizer and teach it the vocabulary of the training set
counter = CountVectorizer()
counter.fit(test_emails.data + train_emails.data)
print("Vocabulary: ", counter.vocabulary_)

# transform the strings into counts of the trained words
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)
print("Train Counts: ", train_counts.toarray())
print("Train Counts: ", train_counts.toarray())

# create and train the model
classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target)



Train Counts:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Train Counts:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Score:  0.9723618090452262


In [7]:
# predict
test_example = test_emails.data[0]
print("Target: ", test_emails.target[0])
test_example_count = counter.transform([test_example])
print("Prediction: ", classifier.predict(test_example_count))
print("Probability: ", classifier.predict_proba(test_example_count))

# score
score = classifier.score(test_counts, test_emails.target)
print("Score: ", score)

Target:  1
Prediction:  [1]
Probability:  [[1.59847867e-08 9.99999984e-01]]
Score:  0.9723618090452262
