In [31]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('SMSSpamCollection', sep = '\t', names = ['label', 'message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Cleaning
> Spam == 1, Ham == 0

In [4]:
df['label'] = df['label'].map({'spam': 1, 'ham': 0})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


> NOTE: The entire dataframe is our corpus, each row is considered its own document

## Check Class Balance

We want to get a sense for our "baseline accuracy" or the distribution of spam versus ham.  It's called the baseline because if we were to simply calculate our 1's (spam) with brute force (label everything as spam) we would be right X amount of the time. It's obviously easy to improve upon that brute force model, but we also have an idea of how accurate we would be with a shitty model.

In [6]:
df['label'].value_counts(normalize = True) # normalize converts counts to percentages

0    0.865937
1    0.134063
Name: label, dtype: float64

> That's pretty unbalanced... we've got a lotta ham in here.

## Selecting Target & Features, TTS

In [7]:
X = df['message'] # note: one bracket for series to pass into CountVectorizer
y = df['label']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

> NOTE: notice the `stratify` parameter above. This retains the class distribution of our target variable that we checked above (~87/13 split ham/spam)

## Pipeline

Now we're ready to have our data go through a pipeline like so:
- Step 1: Pass X through the Count Vectorizer
- Step 2: Feed data into our model
    - NOTE: We are using a `MultinomialNB` model because when we tokenize our data with `CountVectorizer` we will have far more than two binary classes.


**NOTE About Pipelines**  
Pipelines are lists of tuples; each stage in the pipeline (above) is its own separate tuple. The first param of the tuple must be a string value. By convention, it makes your life easier if the string value is the exact same name as the variable you have.

In [9]:
cv = CountVectorizer()
model = MultinomialNB()

In [10]:
pipe = Pipeline([
    ('cv', cv), # step 1 above
    ('model', model) # step 2 above
])

## Grid Search
At this point, we would normally cross-validate to check the performance of our model. `GridSearchCV` obviously has that built-in, so we get it all in one step. We're going to pass in two parameters to `GridSearchCV` to optimize our model:  
1. Our `pipeline`
2. Our `param_grid` that we wish to optimize  

**NOTE:** For purposes of this tutorial, we will spend more time tuning the parameters of `CountVectorizer`

In [11]:
gs = GridSearchCV(pipe, param_grid = {})
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
gs.best_score_

0.9825317061497966

> NOTE: The score above represents our 3-fold cross-validation score from the `GridSearchCV`. That is the default number of folds but we can change it with the `cv` argument.

In [13]:
gs.score(X_test, y_test)

0.9863603732950467

> Check this out:

In [14]:
X_train

4747           Orh i tot u say she now still dun believe.
5295    Alex says he's not ok with you not being ok wi...
5568                 Will ü b going to esplanade fr home?
4654     Lol yes. But it will add some spice to your day.
1133                  Good morning princess! How are you?
944     And also I've sorta blown him off a couple tim...
357     Congratulations ur awarded 500 of CD vouchers ...
2290    Had your mobile 11mths ? Update for FREE to Or...
3964        If you ask her or she say any please message.
3734                Old Orchard near univ. How about you?
1200    NEFT Transaction with reference number  &lt;#&...
394     Yes i think so. I am in office but my lap is i...
4136    No need to say anything to me. I know i am an ...
1375    500 New Mobiles from 2004, MUST GO! Txt: NOKIA...
4494    LOL .. *grins* .. I'm not babe, but thanks for...
713                 08714712388 between 10am-7pm Cost 10p
5561    Get me out of this dump heap. My mom decided t...
2644    Hi! Yo

> What's so cool about this? Well, you'll notice that our `X_train` data is "unmodified" from our original TTS. THIS is the beauty of `Pipeline`.  Here's what's happening:
- We instantiated a `CountVectorizer` and a `MultinomialNB` model above
- We put them into a `Pipeline` to serve as a "de facto" model that we can pass into `GridSearchCV`
- We called the grid search on our pipeline and fit it to our training data set
- Pipeline took care of the rest, and now we don't have to refit or scale or anything else, we can simply tune our parameters in `GridSearchCV` to optimize our model.

### Note About Workflows
So at this point our workflow has been set up above. To illustrate the beauty of `Pipeline` I'm going to leave it as is, and just restart the grid search below. In a normal environment, you would just start playing with the grid search `param_grid` but I don't want to mess up the workbook flow.

In [28]:
params = {
    # NOTE: we need to tell GridSearchCV which model's params we want to tune; that's what the cv__ is for
    'cv__stop_words': [None, 'english'], # GridSearchCV will determine which is better
    'cv__max_features': [3000, 4000],
    'cv__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid = params)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('model', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cv__stop_words': [None, 'english'], 'cv__max_features': [3000, 4000], 'cv__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
# check best params
gs.best_params_

{'cv__max_features': 3000, 'cv__ngram_range': (1, 2), 'cv__stop_words': None}

In [30]:
gs.best_score_

0.9844460397224216

## Questions
- How can I interpret the results of this? For example, can I inspect most popular words in each document? How?
- Lemmatizing... Do I take care of that before even splitting up into TTS?
- Can you only run one model at a time in `Pipeline`?

## One more thingy
The `binary` param in `CountVectorizer` simply states whether a word appeared in the document (`1`) or it didn't (`0`); it doesn't count the number of individual words. Riley points out in his video that this would be a good use-case for the `BernoulliNB` model instead of `MultinomialNB`. We're going to pull that into the pipeline and see how it performs as well.

In [32]:
cv = CountVectorizer(binary = True)
model = BernoulliNB()
pipe = Pipeline([
    ('cv', cv),
    ('model', model)
])

In [33]:
params = {
    # NOTE: we need to tell GridSearchCV which model's params we want to tune; that's what the cv__ is for
    'cv__stop_words': [None, 'english'], # GridSearchCV will determine which is better
    'cv__max_features': [3000, 4000],
    'cv__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid = params)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=No...cabulary=None)), ('model', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'cv__stop_words': [None, 'english'], 'cv__max_features': [3000, 4000], 'cv__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
gs.best_params_

{'cv__max_features': 3000, 'cv__ngram_range': (1, 2), 'cv__stop_words': None}

In [35]:
gs.best_score_

0.9808566642737497

In [36]:
gs.score(X_test, y_test)

0.9777458722182341

> 'E' for Effort, but it's actually worse than the original pipeline with `MultinomialNB`