In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

The naive Bayes classifier is based on Bayes' theorem:
$$P(A|B) = \frac{P(B|A)P(A)}{P(B)}$$
The model makes the "naive" assumption that each word occurs independently of the others, given the class. Bigram or trigram models relax this assumption.

In [2]:
# Toy corpus: each entry is one single-word "document" (whole sentences work the same way).
# labels[i] is the class of train[i].
train = ["Hello", "how", "are", "friend", "evil", "bad", "Amazing"]
labels = [1, 1, 1, 1, 0, 0, 1]

# Unseen text to classify later
review = "Amazing friend"

# Learn the vocabulary from the training corpus (fit returns the fitted vectorizer)
counter = CountVectorizer().fit(train)

# Encode the training corpus against the learned vocabulary
training_counts = counter.transform(train)

# Show the learned token -> column index mapping
print(counter.vocabulary_)

# Encode the review with the same vocabulary and show its count vector
review_counts = counter.transform([review])
print(review_counts.toarray())

{'hello': 5, 'how': 6, 'are': 1, 'friend': 4, 'evil': 3, 'bad': 2, 'amazing': 0}
[[1 0 0 0 1 0 0]]


Data cleaning tips:
 - Remove punctuation from the training set
 - Lowercase every word in the training set (`CountVectorizer` does this by default, as the vocabulary printed above shows)
 - Use a bigram or trigram model, which makes the independence assumption more reasonable

In [3]:
# Build the multinomial naive Bayes model and fit it on the training counts
# and their labels in one step (fit returns the fitted estimator).
classifier = MultinomialNB().fit(training_counts, labels)

# Hard class decision for the review
predicted_label = classifier.predict(review_counts)
print(predicted_label)

# Per-class probabilities for the review; columns follow classifier.classes_
class_probabilities = classifier.predict_proba(review_counts)
print(class_probabilities)

[1]
[[0.1509434 0.8490566]]
