# Spam Detector App

The data is from https://www.kaggle.com/uciml/sms-spam-collection-dataset. It consists of text messages labeled "ham" (not spam) or "spam".

## Setup

### Import packages

In [1]:
import pandas as pd
import re

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate

### Some light EDA

In [2]:
# read the raw CSV and rename the label/text columns (originally 'v1'/'v2')
# note: 'L1' is Python's codec alias for latin-1

df = (
    pd.read_csv('spam.csv', encoding='L1')
    .rename(columns={'v1': 'class', 'v2': 'text'})
)

In [3]:
# preview the first 5 rows of the loaded frame to check the columns and labels

df.head()

Unnamed: 0,class,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# list the distinct label values present in the 'class' column

df['class'].unique()

array(['ham', 'spam'], dtype=object)

In [5]:
# count the messages per label to see how balanced the two classes are

df['class'].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

### Prepare the data for scikit-learn

In [6]:
# features are the raw message texts; labels are 0 for 'ham' and 1 for anything else
X = df['text'].values
y = [int(label != 'ham') for label in df['class'].values]

## Cross-Validation

### Cross validate Naive Bayes

In [7]:
# build the model: binary bag-of-words counts (English stop words removed)
# feeding a multinomial Naive Bayes classifier

naive_bayes = make_pipeline(
    CountVectorizer(binary=True, stop_words='english'),
    MultinomialNB(),
)

In [8]:
# 3-fold cross validation of the pipeline, scored on accuracy, precision and recall

naive_bayes_cv = cross_validate(
    estimator=naive_bayes,
    X=X,
    y=y,
    scoring=('accuracy', 'precision', 'recall'),
    cv=3,
)

print(naive_bayes_cv)

{'fit_time': array([0.09353232, 0.07993698, 0.05258703]), 'score_time': array([0.07555771, 0.07133007, 0.06686592]), 'test_accuracy': array([0.98977395, 0.98492192, 0.98384491]), 'test_precision': array([0.97131148, 0.98253275, 0.96202532]), 'test_recall': array([0.95180723, 0.90361446, 0.91566265])}


### Make a function for cross validation

In [9]:
# helper to simplify cross validation for this particular problem

def cross_validation_report(model, cv=3, scoring=('accuracy', 'precision', 'recall')):
    """Cross-validate ``model`` on the notebook-level ``X``/``y`` and print averages.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline passed straight to ``cross_validate``.
    cv : int, default 3
        Value forwarded as ``cross_validate``'s ``cv`` argument.
    scoring : tuple of str, default ('accuracy', 'precision', 'recall')
        Metric names forwarded as ``cross_validate``'s ``scoring`` argument.

    Returns
    -------
    None
        The averages are only printed, one ``avg_<key>: <mean>`` line per
        entry of the ``cross_validate`` result (timings and test scores).
    """
    raw_cv_report = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=scoring,
    )

    # each entry holds one value per fold; report the mean across folds
    for key, fold_values in raw_cv_report.items():
        print(f'avg_{key}: {fold_values.mean()}')

In [10]:
# print the fold-averaged timings and scores for the Naive Bayes pipeline
cross_validation_report(naive_bayes)

avg_fit_time: 0.05158837636311849
avg_score_time: 0.06886800130208333
avg_test_accuracy: 0.9861802595673157
avg_test_precision: 0.9719565143190785
avg_test_recall: 0.9236947791164658
