In [1]:
# Import libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Import email dataset
emails = fetch_20newsgroups()

In [3]:
# View the different email categories
emails.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

We’re interested in seeing how effective our Naive Bayes classifier is at telling the difference between a baseball email and a hockey email.

In [4]:
# Adding new 'categories' parameter to the data for the baseball and hockey categories
emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'])


In [6]:
# View an email
emails.data[5]

'From: mmb@lamar.ColoState.EDU (Michael Burger)\nSubject: More TV Info\nDistribution: na\nNntp-Posting-Host: lamar.acns.colostate.edu\nOrganization: Colorado State University, Fort Collins, CO  80523\nLines: 36\n\nUnited States Coverage:\nSunday April 18\n  N.J./N.Y.I. at Pittsburgh - 1:00 EDT to Eastern Time Zone\n  ABC - Gary Thorne and Bill Clement\n\n  St. Louis at Chicago - 12:00 CDT and 11:00 MDT - to Central/Mountain Zones\n  ABC - Mike Emerick and Jim Schoenfeld\n\n  Los Angeles at Calgary - 12:00 PDT and 11:00 ADT - to Pacific/Alaskan Zones\n  ABC - Al Michaels and John Davidson\n\nTuesday, April 20\n  N.J./N.Y.I. at Pittsburgh - 7:30 EDT Nationwide\n  ESPN - Gary Thorne and Bill Clement\n\nThursday, April 22 and Saturday April 24\n  To Be Announced - 7:30 EDT Nationwide\n  ESPN - To Be Announced\n\n\nCanadian Coverage:\n\nSunday, April 18\n  Buffalo at Boston - 7:30 EDT Nationwide\n  TSN - ???\n\nTuesday, April 20\n  N.J.D./N.Y. at Pittsburgh - 7:30 EDT Nationwide\n  TSN - ??

In [8]:
# View a label
emails.target_names[emails.target[5]]

'rec.sport.hockey'

In [10]:
# Split data into training and test sets
train_emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'],
                                  subset='train', shuffle=True, random_state=108)

test_emails = fetch_20newsgroups(categories = ['rec.sport.baseball', 'rec.sport.hockey'],
                                  subset='test', shuffle=True, random_state=108)

In [12]:
# Create CounterVectorizer and fit it to the possible words that can exist in the emails
counter = CountVectorizer()

counter.fit(test_emails.data + train_emails.data)

In [13]:
# Make 2 lists of the counts of the words in the training and test sets
train_counts = counter.transform(train_emails.data)

test_counts = counter.transform(test_emails.data)

In [15]:
# Initiate the Naive Bayes Classifier and fit it to the training data
classifier = MultinomialNB()

classifier.fit(train_counts, train_emails.target)

In [17]:
# Test the classifier based on accuracy
accuracy = classifier.score(test_counts, test_emails.target)
accuracy

0.9723618090452262

In [19]:
# Split data into training and test sets for new categories
train_emails = fetch_20newsgroups(categories = ['comp.sys.ibm.pc.hardware','rec.sport.hockey'],
                                  subset='train', shuffle=True, random_state=108)

test_emails = fetch_20newsgroups(categories = ['comp.sys.ibm.pc.hardware','rec.sport.hockey'],
                                  subset='test', shuffle=True, random_state=108)

In [20]:
## Run previous steps tp check its accuracy for the new categories

# Create CounterVectorizer and fit it to the possible words that can exist in the emails
counter = CountVectorizer()

counter.fit(test_emails.data + train_emails.data)

# Make 2 lists of the counts of the words in the training and test sets
train_counts = counter.transform(train_emails.data)

test_counts = counter.transform(test_emails.data)

# Initiate the Naive Bayes Classifier and fit it to the training data
classifier = MultinomialNB()

classifier.fit(train_counts, train_emails.target)

# Test the classifier based on accuracy
accuracy = classifier.score(test_counts, test_emails.target)
accuracy

0.9974715549936789