# Naive Bayes Spam Classifier
* This is a small project on applying Naive Bayes to spam detection.

## Data Loading

In [1]:
# One-off download of the dataset into data/ (kept commented out; uncomment to fetch
# the CSV before running the loading cell that reads 'data/spam.csv').
#!wget https://lazyprogrammer.me/course_files/spam.csv -P data/

In [2]:
import pandas as pd

# Read the CSV as ISO-8859-1 (Latin-1) and keep only its first two columns,
# which hold the label (`v1`) and the message text (`v2`).
df = pd.read_csv(
    'data/spam.csv',
    usecols=[0, 1],
    encoding='iso-8859-1',
)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# The classes are clearly imbalanced (see the proportions displayed by this cell),
# so the model is later judged with a dedicated quality metric such as F-beta.
label_share = df['v1'].value_counts(normalize=True)
label_share

v1
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

## Data Treatment
* Applying the usual ML text preprocessing (train/test split, tokenization, lemmatization, ...).

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

# Features are the raw message texts; the target is 1 for 'spam' and 0 otherwise.
X = df['v2']
y = np.where(df['v1'] == 'spam', 1, 0)

# Stratified split (default test size) with a fixed seed, so both splits keep the
# same class proportions and the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet, stopwords
from typing import List

class LemmaTokenizer:
    '''
        Callable tokenizer meant for the `tokenizer` argument of
        `sklearn.feature_extraction.text.TfidfVectorizer`. A document is split into
        tokens, every token is POS-tagged, and the tag selects the WordNet
        part of speech used to lemmatize that token.
    '''
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        '''
            Maps a Treebank POS tag onto the corresponding WordNet POS constant.

            Parameter
            ---------
            `treebank_tag`: str
                The Treebank tag to convert.

            Returns
            -------
            `wordnet.ADJ`, `wordnet.VERB` or `wordnet.ADV` for tags starting with
            'J', 'V' or 'R' respectively; `wordnet.NOUN` for any other tag.
        '''
        prefix_to_pos = (
            ('J', wordnet.ADJ),
            ('V', wordnet.VERB),
            ('R', wordnet.ADV),
        )
        for prefix, wn_pos in prefix_to_pos:
            if treebank_tag.startswith(prefix):
                return wn_pos
        # Nouns and every unrecognised tag are treated as nouns.
        return wordnet.NOUN

    def __call__(self, doc)->List[str]:
        '''
            Tokenizes `doc`, POS-tags the tokens and returns their lemmas in order.
        '''
        lemmas = []
        for word, tag in pos_tag(word_tokenize(doc)):
            lemmas.append(self.wnl.lemmatize(word, pos=self.get_wordnet_pos(tag)))
        return lemmas

In [6]:
# English stop-word list from NLTK. It is handed to the TfidfVectorizer further down
# so that these very common words are left out of the (otherwise larger) TF-IDF matrix.
stop_words = stopwords.words('english')

In [7]:
# Build the transformation pipeline: TF-IDF computed over lemmatized tokens.
from sklearn.pipeline import Pipeline

tf_idf = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    # The default `token_pattern` regex is not used once a custom tokenizer is given;
    # passing None states that explicitly and avoids scikit-learn's UserWarning
    # about the unused parameter. The extracted features are unchanged.
    token_pattern=None,
    stop_words=stop_words,
    strip_accents='ascii',
)

# Single-step pipeline wrapping the vectorizer.
pipe = Pipeline([('tf_idf', tf_idf)])

In [8]:
# Vectorize the documents: the pipeline is fitted on the training texts only and then
# reused, unchanged, on the test texts. The sparse results are converted to dense arrays.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell presumably
# requires re-running the split cell first to restore the raw texts.
train_tfidf = pipe.fit_transform(X_train)
test_tfidf = pipe.transform(X_test)

X_train = train_tfidf.toarray()
X_test = test_tfidf.toarray()



## Naive Bayes Modeling
* With our data in numerical form, we can now build a Gaussian Naive Bayes classifier.

In [9]:
from sklearn.naive_bayes import GaussianNB

# Instantiate the classifier with its default settings, then fit it on the
# vectorized training data; `fit` hands back the fitted estimator.
gnb = GaussianNB()
gnb = gnb.fit(X_train, y_train)

In [10]:
# In spam detection, false positives (legitimate mail sent to the spam box) are
# considered somewhat more harmful than false negatives, because the user may miss
# an important email.
#
# The model is therefore scored with F-beta at beta=0.75; a beta below 1 gives
# precision more weight than recall.
from sklearn.metrics import fbeta_score

y_pred = gnb.predict(X_test)
fbeta_score(y_test, y_pred, beta=0.75)

0.6307253341342544