# NLP Modeling

In [None]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import string
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, ConfusionMatrixDisplay

# Notice that these vectorizers are from `sklearn` and not `nltk`!
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,\
HashingVectorizer

import nltk
nltk.download('wordnet')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

## Learning Goals

- normalize a lexicon with stemming and lemmatization
- run feature engineering algorithms for NLP
    - bag-of-Words
    - vectorization
- explain the use of Bayesian Reasoning for building NLP models
- describe Laplace Smoothing
- use `sklearn` and `nltk` to build NLP models

In [None]:
corpus = pd.read_csv('data/satire_nosatire.csv')
sample_document = corpus.iloc[1].body

pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenizer = RegexpTokenizer(pattern)
sample_doc = tokenizer.tokenize(sample_document)
sample_doc = [token.lower() for token in sample_doc]
sw = stopwords.words('english')
sample_doc = [token for token in sample_doc if token not in sw]

sample_doc

len(set(sample_doc))

In [None]:
corpus

## Stemming and Lemmatizing

### Stemming

Most of the semantic meaning of a word is held in the root, which is usually the beginning of a word.  Conjugations and plurality do not change the semantic meaning. "eat", "eats", and "eating" all have essentially the same meaning. The rest is grammatical variation for the sake of marking things like tense or person or number.   

Stemmers consolidate similar words by chopping off the ends of the words.

![stemmer](images/stemmer.png)

There are different stemmers available.  The two we will use here are the **Porter** and **Snowball** stemmers.  A main difference between the two is how aggressively each one stems, Porter being the less aggressive.

In [None]:
p_stemmer = nltk.stem.PorterStemmer()
s_stemmer = nltk.stem.SnowballStemmer(language='english')

In [None]:
sample_doc[0]

In [None]:
p_stemmer.stem(sample_doc[0])

In [None]:
s_stemmer.stem(sample_doc[0])

In [None]:
for word in sample_doc:
    p_word = p_stemmer.stem(word)
    s_word = s_stemmer.stem(word)
    
    if p_word != s_word:
        print(word, p_word, s_word)

In [None]:
sample_doc = [p_stemmer.stem(token) for token in sample_doc]

In [None]:
fdist = FreqDist(sample_doc)
plt.figure(figsize=(10, 10))
fdist.plot(30);

In [None]:
print(f'Stemming slightly reduced our token count: {len(set(sample_doc))} unique tokens')

### Lemmatizing

Lemmatizing is a bit more sophisticated than the stem choppers. Lemmatizing uses part-of-speech tagging to determine how to transform a word.

- Unlike Stemming, Lemmatization reduces the inflected words, properly ensuring that the root word belongs to the language. It can handle words such as "mouse", whose plural "mice" the stemmers would not lump together with the original. 

- In Lemmatization, the root word is called the "lemma". 

- A lemma (plural lemmas or lemmata) is the canonical form, dictionary form, or citation form of a set of words.

![lemmer](images/lemmer.png)

In [None]:
lemmatizer = WordNetLemmatizer()

print(f'"Mice" becomes: {lemmatizer.lemmatize("mice")}')
print(f'"Media" becomes: {lemmatizer.lemmatize(sample_doc[76])}')

# However, look at the output below:
    
sentence = "He saw the trees get sawed down"
lemmed_sentence = [lemmatizer.lemmatize(token) for token in sentence.split(' ')]
lemmed_sentence

Lemmatizers depend, for their full functionality, on POS tagging, and **the default tag is 'noun'**.

With a little bit of work, we can POS tag our text.

In [None]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenizer = RegexpTokenizer(pattern)
sample_doc = tokenizer.tokenize(sample_document)
sample_doc = [token.lower() for token in sample_doc]
sample_doc = [token for token in sample_doc if token not in sw]
corpus.loc[1].body

In [None]:
nltk.help.upenn_tagset()

In [None]:
# Use nltk's pos_tag to tag our words
# Does a pretty good job, but does make some mistakes

sample_doc_tagged = pos_tag(sample_doc)
sample_doc_tagged

In [None]:
# Then transform the tags into the tags of our lemmatizers

def get_wordnet_pos(treebank_tag):
    '''
    Map a Penn Treebank POS tag (as produced by nltk's pos_tag) to the
    WordNet POS constant that the lemmatizer expects.

    Tags are matched on their leading letter; anything that is not an
    adjective, verb, noun or adverb falls back to noun.
    '''
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
sample_doc_tagged

In [None]:
sample_doc_tagged = [(token[0], get_wordnet_pos(token[1])) for
                     token in sample_doc_tagged]

sample_doc_tagged

In [None]:
sample_doc_lemmed = [lemmatizer.lemmatize(token[0], token[1]) for
                     token in sample_doc_tagged]

sample_doc[:20]

In [None]:
sample_doc_lemmed[:20]

In [None]:
print(f'There are {len(set(sample_doc_lemmed))} unique lemmas.')

In [None]:
fdist = FreqDist(sample_doc_lemmed)
plt.figure(figsize=(10, 10))
fdist.plot(30);

In [None]:
sample_doc_lemmed

## Feature Engineering for NLP

The machine learning algorithms we have encountered so far represent features as the variables that take on different value for each observation. For example, we represent individuals with distinct education levels, incomes, and such. However, in NLP, features are represented in a very different way. In order to pass text data to machine learning algorithms and perform classification, we need to represent the features in a sensible way. One such method is called **Bag-of-words (BoW)**.

A bag-of-words model, or BoW for short, is a way of extracting features from text for use in modeling. A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:

- A vocabulary of known words.
- A measure of the presence of known words.

It is called a “bag” of words **because any information about the order or structure of words in the document is discarded**. The model is only concerned with whether known words occur in the document, not with **where** they may occur in the document. The intuition behind BoW is that a document is similar to another if they have similar contents. The Bag of Words method can be represented as a **Document Term Matrix**, in which each column is a unique vocabulary n-gram and each observation is a document. Consider, for example, the following **corpus** of documents:

- Document 1: "I love dogs."
- Document 2: "I love cats."
- Document 3: "I love all animals."
- Document 4: "I hate dogs."

This corpus can be represented as:

$\downarrow$Doc\|Word$\rightarrow$|I|love|dogs|cats|all|animals|hate
-|-|-|-|-|-|-|-
Document_1|1|1|1|0|0|0|0
Document_2|1|1|0|1|0|0|0
Document_3|1|1|0|0|1|1|0
Document_4|1|0|1|0|0|0|1

## Vectorization

In order to get these tokens from our documents, we're going to use tools called "vectorizers".

The most straightforward vectorizer in `sklearn.feature_extraction.text` is the `CountVectorizer`, which will simply count the number of each word type in each document.

### `CountVectorizer`

In [None]:
sample_doc_lemmed

In [None]:
[" ".join(sample_doc_lemmed)]

In [None]:
# Build a document-term matrix for the single lemmatized sample document.

# CountVectorizer converts a collection of text documents to a matrix of
# token counts. It expects an iterable of strings, so the token list is
# re-joined into one string and wrapped in a one-element list.
vec = CountVectorizer()
X = vec.fit_transform([" ".join(sample_doc_lemmed)])

# `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is the supported accessor.
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df.head()

In [None]:
vec.vocabulary_

That is not very exciting for one document. The idea is to make a document term matrix for all of the words in our corpus.

In [None]:
corpus

In [None]:
# Count tokens for two documents of the corpus, using the same token
# pattern and stop-word list as the manual tokenization earlier on.
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=sw)
X = vec.fit_transform(corpus.body[1:3])

# `get_feature_names()` was removed in scikit-learn 1.2.
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df.head()

In [None]:
# Can change the 'token length': ngram_range=(1, 2) keeps unigrams and bigrams.
# The range is passed as a tuple, which is what scikit-learn's parameter
# validation expects.
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=sw,
                      ngram_range=(1, 2))
X = vec.fit_transform(corpus.body[0:2])

# `get_feature_names()` was removed in scikit-learn 1.2.
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df.head()

Our document term matrix gets bigger and bigger, with more and more zeros, becoming sparser and sparser.

In [None]:
# Unigram + bigram counts; ngram_range is a tuple, as scikit-learn's
# parameter validation expects.
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=sw,
                      ngram_range=(1, 2))
# Now fit to the entire corpus
X = vec.fit_transform(corpus.body)

# `get_feature_names()` was removed in scikit-learn 1.2.
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df.head()

In [None]:
df['aaaaaaah'].sum()

In [None]:
df

We can set upper and lower limits to the word frequency:

In [None]:
corpus.body

In [None]:
# Keep only n-grams that appear in at least 2 and at most 25 documents
# (integer min_df / max_df are absolute document counts).
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)",
                      stop_words=sw, ngram_range=(1, 2),
                      min_df=2, max_df=25)
X = vec.fit_transform(corpus.body)

# `get_feature_names()` was removed in scikit-learn 1.2.
df_cv = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df_cv

In [None]:
# Cap the vocabulary at the 200 most frequent n-grams across the corpus.
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)",
                      stop_words=sw, ngram_range=(1, 2), max_features=200)
X = vec.fit_transform(corpus.body)

# `get_feature_names()` was removed in scikit-learn 1.2.
df_cv = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df_cv

### `TfidfVectorizer`

There are many schemas for determining the values of each entry in a document term matrix, and one of the most common uses the TF-IDF algorithm -- "Term Frequency-Inverse Document Frequency". Essentially, tf-idf *normalizes* the raw count of the document term matrix. And it represents how important a word is in the given document. 

> The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

- TF (Term Frequency)
Term frequency is the frequency of the word in the document divided by the total words in the document.

- IDF (inverse document frequency)
Inverse document frequency is a measure of how much information the word provides, i.e., if it's common or rare across all documents. It is generally calculated as the logarithmically scaled inverse fraction of the documents that contain the word (obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient):

$$idf(w) = log (\frac{number\ of\ documents}{num\ of\ documents\ containing\ w})$$

tf-idf is the product of term frequency and inverse document frequency, or tf * idf. 

In [None]:
# Tf-idf weighted document-term matrix for the whole corpus.
tf_vec = TfidfVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=sw)
X = tf_vec.fit_transform(corpus.body)

# `get_feature_names()` was removed in scikit-learn 1.2.
df = pd.DataFrame(X.toarray(), columns=tf_vec.get_feature_names_out())
df.head()

In [None]:
corpus.iloc[313].body

In [None]:
df.iloc[313].sort_values(ascending=False)[:10]

Let's compare the tfidf to the count vectorizer output for one document.

In [None]:
# Raw counts with the same tokenization as the tf-idf vectorizer, so the
# two matrices can be compared document by document.
vec = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", stop_words=sw)
X = vec.fit_transform(corpus.body)

# `get_feature_names()` was removed in scikit-learn 1.2.
df_cv = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
df_cv

In [None]:
df_cv.iloc[313].sort_values(ascending=False)[:10]

The tf-idf weighting lessened the importance of some of the more common words, including "also", a word which could arguably have been included in the stopword list.

It also assigns "nerds" more weight than "power".

In [None]:
# Count the documents in which each word occurs, using the count matrix
# alone rather than masking it with a condition built from another frame.
nerds_doc_count = int((df_cv['nerds'] != 0).sum())
power_doc_count = int((df_cv['power'] != 0).sum())
print(f'"Nerds" only shows up in document 313: {nerds_doc_count} document.')
print(f'"Power" shows up in {power_doc_count} documents!')

All the words are stored in a `.vocabulary_` attribute:

In [None]:
tf_vec.vocabulary_

### `HashingVectorizer`

There is also a hashing vectorizer, which maps each word of the corpus to a column index with a hash function instead of storing a vocabulary.

In [None]:
# Hash every token into a fixed-size feature space; no vocabulary is learned.
# NOTE(review): scikit-learn documents HashingVectorizer's default as
# n_features=2**20, so `X.toarray()` below would materialise a dense
# (n_documents x 1,048,576) array -- confirm this fits in memory, or keep X
# sparse / pass a smaller n_features.
hvec = HashingVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)",
                         stop_words=sw)
X = hvec.fit_transform(corpus.body)

# No column names are attached: the frame gets default integer column labels.
df_cv = pd.DataFrame(X.toarray())
df_cv

Some rules of thumb about these vectorizers:

**Tf-Idf**: Probably the most commonly used. Useful when the goal is to distinguish the **content** of documents from others in the corpus.

**Count**: Useful when the words themselves matter. If the goal is instead about identifying authors by their words, then the fact that some word appears in many documents of the corpus may be important.

**Hashing**: The advantage here is speed and low memory usage. The disadvantage is that you lose the identities of the words being tokenized. Useful for very large datasets where the ultimate model may be a bit of a black box.

## Exercise

Create a document term matrix of the 1000-document corpus. The vocabulary should have no stopwords, numbers, or punctuation, and it should be lemmatized. Use a `TfidfVectorizer`.



<details>
    <summary>Answer</summary>
    <code># Tokenizing
tokenized_docs = [tokenizer.tokenize(doc) for doc in corpus['body']]
lower_docs = [[token.lower() for token in doc] for doc in tokenized_docs]
sw_docs = [[token for token in doc if token not in sw] for doc in lower_docs]
# Initial tagging
docs_tagged = [pos_tag(doc) for doc in sw_docs]
# Tag with Wordnet tags
wordnet_docs_tagged = [[(token[0], get_wordnet_pos(token[1]))
             for token in doc] for doc in docs_tagged]
# Lemmatize
docs_lemmed = [[lemmatizer.lemmatize(token[0], token[1]) for token in doc]\
               for doc in wordnet_docs_tagged]
# Use the tf-idf vectorizer to create the matrix
X = tf_vec.fit_transform([' '.join(doc) for doc in docs_lemmed])
df = pd.DataFrame(X.toarray(), columns=tf_vec.get_feature_names_out())</code>
</details>

## Naive Bayes and NLP Modeling

Before returning to our satire / no-satire example, let's consider an example with a smaller but similar scope.

Suppose we are using an API to gather articles from a news website and grabbing phrases from two different types of articles:  **music** and **politics**.

But we have a problem. Only some of our articles have an indication of their category (music or politics). Is there a way we can use Machine Learning to help us label our data **quickly**?

-------------------------------
### Here are our articles
#### Music Articles:

* 'the song was popular'
* 'band leaders disagreed on sound'
* 'played for a sold out arena stadium'

#### Politics Articles

* 'world leaders met last week'
* 'the election was close'
* 'the officials agreed on a compromise'
--------------------------------------------------------
Let's try and predict one example phrase:

* "world leaders agreed to fund the stadium"

How can we make a model that labels this for us rather than having to go through by hand?

In [None]:
music = ['the song was popular',
         'band leaders disagreed on sound',
         'played for a sold out arena stadium']

politics = ['world leaders met last week',
            'the election was close',
            'the officials agreed on a compromise']

test_statement = 'world leaders agreed to fund the stadium'

In [None]:
#labels : {'music', 'politics'}
#features: words
test_statement_2 = 'officials met at the arena'

### Bayes's Theorem Again

Let's revisit Bayes's Theorem. Remember, the idea is to calculate the probability of the correct application of a class label (c) given some data (x). To do so, we calculate the **likelihood** (the distribution of our data within a given class) and the **prior** probability of each class (the probability of seeing the class in the population). We will generally ignore the denominator of the right side of the equation because it is the same for every class.

<img src ="images/naive_bayes_icon.png">

### Another way of looking at it
<img src = "images/another_one.png">

## So, in the context of our problem......



$\large P(politics | phrase) = \frac{P(phrase|politics)P(politics)}{P(phrase)}$

$\large P(politics) = \frac{ \# politics}{\# all\ articles} $

*where phrase is our test statement*

<img src = "images/solving_theta.png" width="400">

### How should we calculate $P(politics)$?

This is essentially the distribution of the probability of either type of article. We have three of each type of article, therefore, we assume that there is an equal probability of either article

In [None]:
p_politics = len(politics) / (len(politics) + len(music))
p_music = len(music) / (len(politics) + len(music))

In [None]:
p_politics

In [None]:
p_music

### How should we calculate $P(phrase | politics)$?

We'll break the phrase down into individual words.

$\large P(phrase | politics) = \prod_{i=1}^{d} P(word_{i} | politics) $

**This is where the naivety of Naive Bayes comes in in this context. We assume that the predictive relevances of words are mutually independent.**

In practice, of course, this sounds rather unrealistic. But it greatly simplifies the Bayesian calculation.

$\large P(word_{i} | politics) = \frac{\#\ of\ word_{i}\ in\ politics\ articles} {\#\ of\ total\ words\ in\ politics\ articles} $

## Laplace Smoothing

In practice, the calculation of probabilities is often adjusted slightly to avoid zeroes.

$\large P(word_{i} | politics) = \frac{\#\ of\ word_{i}\ in\ politics\ articles \bf{+ \alpha}} {\#\ of\ total\ words\ in\ politics\ articles \bf{+ \alpha d}} $

$\large P(word_{i} | music) = \frac{\#\ of\ word_{i}\ in\ music\ articles \bf{+ \alpha}} {\#\ of\ total\ words\ in\ music\ articles \bf{+ \alpha d}}$

This correction process is called Laplace smoothing:

* d : number of features (in this instance total number of vocabulary words)
* $\alpha$ can be any number greater than 0 (it is usually 1)


#### Now let's find this calculation

In [None]:
def vocab_maker(category):
    """
    Build the vocabulary of one article category.

    parameters: category is a list containing all the articles
    (plain strings) of a given category.

    returns the set of unique whitespace-separated words that
    occur anywhere in those articles.

    """
    # A set comprehension de-duplicates the words as it collects them.
    return {token for article in category for token in article.split()}

In [None]:
voc_music = vocab_maker(music)
voc_pol = vocab_maker(politics)

In [None]:
# These are all the unique words in the music category
voc_music

In [None]:
# These are all the unique words in the politics category
voc_pol

In [None]:
# The union of the two sets gives us the unique words across both article groups
voc_all = voc_music.union(voc_pol)
voc_all

In [None]:
# d in the Laplace-smoothing formula: number of distinct words over all articles.
total_vocab_count = len(voc_all)
# The likelihood denominator uses the TOTAL number of words in each category
# (repeats included), not the number of unique words, so count the tokens.
total_music_count = sum(len(art.split()) for art in music)
total_politics_count = sum(len(art.split()) for art in politics)

Let's remind ourselves of the goal, to see the posterior probability of the class politics given our phrase. 

> P(politics | "world leaders agreed to fund the stadium")

In [None]:
def find_number_words_in_category(phrase, category):

    """This function will help us calculate our likelihoods
    by constructing a dictionary of counts of how many times
    each word in our test phrase appears in articles of a
    given category.

    phrase: the test statement, a string of whitespace-separated words.
    category: a list of the raw documents (strings) of one category.

    returns a defaultdict(int) with one key per distinct phrase word, in
    order of first appearance, mapped to that word's number of occurrences
    in the category (0 when it never occurs). A word repeated in the phrase
    is counted once, and every phrase word gets a key even when the
    category is empty.
    """

    # Tally every word of the category once instead of rescanning the whole
    # category for each phrase word.
    cat_counts = defaultdict(int)
    for art_word in ' '.join(category).split():
        cat_counts[art_word] += 1

    word_count = defaultdict(int)
    for word in phrase.split():
        # .get() avoids inserting phrase words into cat_counts as a side effect
        word_count[word] = cat_counts.get(word, 0)
    return word_count

In [None]:
test_music_word_count = find_number_words_in_category(test_statement, music)

Let's remind ourselves of our music articles:

In [None]:
music

In [None]:
test_music_word_count

In [None]:
test_politic_word_count = find_number_words_in_category(test_statement, politics)

Let's remind ourselves of our politics articles:

In [None]:
politics

In [None]:
test_politic_word_count

In [None]:
def find_likelihood_with_smooth(category_count, test_category_count, alpha,
                                vocab_count=None):
    """Return the Laplace-smoothed likelihood P(phrase | category).

    category_count: total number of words in the category's articles.
    test_category_count: dict mapping each phrase word to its number of
        occurrences in the category.
    alpha: smoothing constant added to every word count.
    vocab_count: number of distinct words across all categories (d in the
        smoothing formula). Defaults to the notebook-level
        `total_vocab_count` when omitted.
    """
    if vocab_count is None:
        vocab_count = total_vocab_count

    # The numerator is the product of all the smoothed counts (count + alpha),
    # so that a word unseen in the category does not zero out the probability.
    # Because we're being officially "naïve", we simply multiply them together.
    # (np.prod replaces np.product, which was removed in NumPy 2.0.)
    counts = np.array(list(test_category_count.values()))
    num = np.prod(counts + alpha)

    # The denominator is the same for each word
    # (total category count + alpha * vocabulary size), so we raise it to
    # the number of words in the test phrase.
    denom = (category_count + vocab_count * alpha) ** len(test_category_count)

    return num / denom

In [None]:
likelihood_m = find_likelihood_with_smooth(total_music_count, test_music_word_count, 1)

In [None]:
likelihood_p = find_likelihood_with_smooth(total_politics_count, test_politic_word_count, 1)

In [None]:
print(likelihood_m)
print(likelihood_p)

$ P(politics | article) \propto P(politics) \times \prod_{i=1}^{d} P(word_{i} | politics) $

#### Determining the winner of our model:

<img src = "images/solvingforyhat.png" width= "400">

In [None]:
# p(politics|article)  > p(music|article)
likelihood_p * p_politics  > likelihood_m * p_music

Many times, the probabilities we end up with are exceedingly small, so multiplying them together risks floating-point underflow. We can avoid this by working with logarithms instead. This takes advantage of the nice mathematical feature that the log of a product of factors is equal to the sum of the logs of the individual factors, i.e.:

$log(xy) = log(x) + log(y)$

$\large log(P(politics | article)) = log(P(politics)) + \sum_{i=1}^{d}log( P(word_{i} | politics)) $





Good Resource: https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html

## Back to Satire

In [None]:
# Recall our corpus
corpus.head()

Like always, we will perform a train test split...

In [None]:
X = corpus.body
y = corpus.target

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.25)

... and preprocess the training set.

In [None]:
# Bring in stopwords

sw = stopwords.words('english')

In [None]:
def get_wordnet_pos(treebank_tag):
    '''
    Translate an nltk (Penn Treebank) POS tag into the matching WordNet
    POS constant, keyed on the tag's leading letter. Tags that match none
    of the four known prefixes are treated as nouns.
    '''
    by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # First prefix that matches wins; fall back to NOUN otherwise.
    return next(
        (wn_pos for prefix, wn_pos in by_prefix.items()
         if treebank_tag.startswith(prefix)),
        wordnet.NOUN,
    )

In [None]:
lemmatizer = WordNetLemmatizer() 

In [None]:
def doc_preparer(doc, stop_words=sw):
    '''
    Normalise one raw document into a single string of space-separated lemmas.

    :param doc: a document from the satire corpus
    :param stop_words: collection of lowercase words to drop; defaults to
            the notebook-level `sw` list
    :return: a document string with words which have been
            lemmatized,
            parsed for stopwords,
            made lowercase,
            and stripped of punctuation and numbers.
    '''

    # NOTE(review): this pattern matches the typographic apostrophe (U+2019),
    # whereas the token patterns earlier in the notebook use the ASCII "'".
    # Presumably deliberate for this corpus -- confirm.
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    tokens = regex_token.tokenize(doc)
    lowered_t = [word.lower() for word in tokens]

    # Honour the caller-supplied stop words instead of always reading the
    # global `sw`; a set makes each membership test O(1).
    stop_set = set(stop_words or ())
    no_sw_t = [word for word in lowered_t if word not in stop_set]

    # POS-tag the surviving tokens so the lemmatizer gets the right word class.
    tokens_tagged = pos_tag(no_sw_t)
    tokens_lemm = [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                   for word, tag in tokens_tagged]
    return ' '.join(tokens_lemm)

In [None]:
token_docs = [doc_preparer(doc, sw) for doc in X_train]

In [None]:
token_docs

In [None]:
corpus['processed'] = corpus['body'].apply(doc_preparer)
corpus.head()

For demonstration purposes, we will **limit our count vectorizer to 5 words** (the top 5 words by frequency).

In [None]:
# Secondary train-test split to build our best model
X_t, X_val, y_t, y_val = train_test_split(token_docs, y_train,
                                          test_size=0.25, random_state=42)

cv = CountVectorizer(max_features=5)

X_t_vec = cv.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

In [None]:
X_t_vec

In [None]:
# We then transform the validation set. (Do not refit the vectorizer!)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

### Multinomial Naive Bayes

Now let's fit the Multinomial Naive Bayes Classifier on our training data.

In [None]:
mnb = MultinomialNB()

mnb.fit(X_t_vec, y_t)

In [None]:
#What should our priors for each class be?

prior_0 = y_t.value_counts()[0]/len(y_t)
prior_1 = y_t.value_counts()[1]/len(y_t)
print(prior_0, prior_1)

In [None]:
y_t.value_counts(normalize=True)

The classifier has a `.class_log_prior_` attribute (once fitted).

In [None]:
mnb.class_log_prior_

And it simply contains the logged values of the priors:

In [None]:
np.exp(mnb.class_log_prior_[0])

In [None]:
np.log(prior_0)

Generate model predictions and get an accuracy score!

In [None]:
mnb.score(X_t_vec, y_t)

In [None]:
y_hat = mnb.predict(X_val_vec)
accuracy_score(y_val, y_hat)

Let's consider the scenario that we would like to isolate satirical news on Facebook so we can flag it. We do not want to flag real news by mistake. In other words, we want to minimize false positives.

In [None]:
# `plot_confusion_matrix` was removed in scikit-learn 1.2. Keep the name
# available on newer versions by delegating to its replacement, so cells
# that still call it continue to run.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot a confusion matrix via ConfusionMatrixDisplay.from_estimator."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true,
                                                     **kwargs)

In [None]:
# Confusion matrix on the validation set. ConfusionMatrixDisplay (imported at
# the top of the notebook) replaces the removed `plot_confusion_matrix`.
ConfusionMatrixDisplay.from_estimator(mnb, X_val_vec, y_val);

In [None]:
precision_score(y_val, y_hat)

That's pretty good for a five word vocabulary.

Let's see what happens when we don't restrict our vocabulary.

In [None]:
cv = CountVectorizer()
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)


X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

In [None]:
# Refit Multinomial Naive Bayes on the unrestricted vocabulary.
mnb = MultinomialNB()

mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)
# Confusion matrix on the TRAINING data. ConfusionMatrixDisplay replaces
# `plot_confusion_matrix`, which was removed in scikit-learn 1.2.
ConfusionMatrixDisplay.from_estimator(mnb, X_t_vec, y_t);

In [None]:
precision_score(y_t, mnb.predict(X_t_vec))

Wow! Look how well that performed. 

In [None]:
precision_score(y_val, y_hat)

In [None]:
len(cv.vocabulary_)

Let's see whether or not we can maintain that level of precision with fewer words.

In [None]:
cv = CountVectorizer(min_df=0.05, max_df=0.95)
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

mnb = MultinomialNB()

mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)

precision_score(y_val, y_hat)

In [None]:
len(cv.vocabulary_)

In [None]:
# Now let's see what happens with TF-IDF

In [None]:
tfidf = TfidfVectorizer()
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

mnb = MultinomialNB()

mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)

precision_score(y_val, y_hat)

In [None]:
precision_score(y_t, mnb.predict(X_t_vec))

TFIDF does not necessarily perform better than CV. It is just a tool in our toolbelt that's often worth trying out.

In [None]:
len(tfidf.vocabulary_)

In [None]:
tfidf = TfidfVectorizer(min_df=0.05, max_df=0.95)
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

mnb = MultinomialNB()

mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)

precision_score(y_val, y_hat)

In [None]:
precision_score(y_t, mnb.predict(X_t_vec))

In [None]:
len(tfidf.vocabulary_)

Let's compare MNB to Random Forest:

In [None]:
# Shallow random forest for comparison with Multinomial Naive Bayes.
# random_state is pinned, like the train/test splits above, so the reported
# precision is reproducible from run to run.
rf = RandomForestClassifier(n_estimators=1000, max_features=5, max_depth=5,
                            random_state=42)
rf.fit(X_t_vec, y_t)
y_hat = rf.predict(X_val_vec)
precision_score(y_val, y_hat)

In [None]:
precision_score(y_t, rf.predict(X_t_vec))

The two perform comparably, but MNB is lightweight as far as computational power and speed. For real-time predictions, we may choose MNB over Random Forest because the classifications can be performed quickly.

## You can feed the doc_preparer function directly to the vectorizer

#### Specify the preprocessor argument in the vectorizer

In [None]:
X_train

In [None]:
X_t, X_val, y_t, y_val = train_test_split(X_train, y_train,
                                          test_size=0.25, random_state=42)

In [None]:
tfidf = TfidfVectorizer(min_df=0.05, max_df=0.95, 
                preprocessor=doc_preparer)
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

mnb = MultinomialNB()

mnb.fit(X_t_vec, y_t)
y_hat = mnb.predict(X_val_vec)

precision_score(y_val, y_hat)