# Spam Detector App

The data is from https://www.kaggle.com/uciml/sms-spam-collection-dataset. It consists of text messages labeled "ham" (not spam) or "spam".

## Setup

### Import packages

In [1]:
import pandas as pd
import re

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate

from joblib import dump

### Some light EDA

In [2]:
# load the raw SMS data and give the label/text columns descriptive names

df = (
    pd.read_csv('../resources/spam.csv', encoding='L1')
    .rename(columns={'v1': 'class', 'v2': 'text'})
)

In [3]:
# preview the first 5 rows (the default for head()) to sanity-check the load and the renamed columns

df.head()

Unnamed: 0,class,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# list the distinct labels present in the 'class' column

df['class'].unique()

array(['ham', 'spam'], dtype=object)

In [5]:
# count the rows per label to see how imbalanced the two classes are

df['class'].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

### Prepare the data for scikit-learn

In [6]:
# features are the raw message strings; targets are encoded as 0 for 'ham'
# and 1 for every other label (i.e. 'spam')
X = df['text'].values
y = [int(label != 'ham') for label in df['class'].values]

## Cross-Validation

### Cross validate Naive Bayes

In [7]:
# baseline model: binary word-presence features (English stop words removed)
# feeding a Multinomial Naive Bayes classifier

naive_bayes = make_pipeline(
    CountVectorizer(binary=True, stop_words='english'), MultinomialNB()
)

In [8]:
# 3-fold cross-validation of the Naive Bayes pipeline, scored on accuracy,
# precision and recall; the raw per-fold result dict is printed as-is

naive_bayes_cv = cross_validate(
    naive_bayes, X, y, cv=3, scoring=('accuracy', 'precision', 'recall')
)

print(naive_bayes_cv)

{'fit_time': array([0.08257985, 0.08208585, 0.05234504]), 'score_time': array([0.06981301, 0.0664351 , 0.06772995]), 'test_accuracy': array([0.98977395, 0.98492192, 0.98384491]), 'test_precision': array([0.97131148, 0.98253275, 0.96202532]), 'test_recall': array([0.95180723, 0.90361446, 0.91566265])}


### Make a function for cross validation

In [9]:
# make a function to simplify cross validation for this particular problem

def cross_validation_report(model, cv=3, scoring=('accuracy', 'precision', 'recall')):
    """Cross-validate ``model`` on the notebook's ``X``/``y`` and print the mean of each metric.

    Parameters
    ----------
    model
        Estimator or pipeline passed straight through to ``cross_validate``.
    cv : default 3
        Forwarded to ``cross_validate`` as its ``cv`` argument.
    scoring : default ('accuracy', 'precision', 'recall')
        Forwarded to ``cross_validate`` as its ``scoring`` argument.

    Returns
    -------
    None
        Results are only printed: one ``avg_<key>: <mean>`` line for every key
        in the dict returned by ``cross_validate`` (timing keys included).

    Notes
    -----
    Reads the module-level ``X`` and ``y`` defined earlier in the notebook.
    """
    raw_cv_report = cross_validate(model, X, y, cv=cv, scoring=scoring)

    # each value is an array with one entry per fold; report its mean
    for key, fold_values in raw_cv_report.items():
        print(f'avg_{key}: {fold_values.mean()}')

In [10]:
# print the fold-averaged cross-validation metrics for the Naive Bayes pipeline
cross_validation_report(naive_bayes)

avg_fit_time: 0.053866942723592125
avg_score_time: 0.07056403160095215
avg_test_accuracy: 0.9861802595673157
avg_test_precision: 0.9719565143190785
avg_test_recall: 0.9236947791164658


## Introspection

### Cross validate logistic regression

In [11]:
# word-count features (raw counts, English stop words removed) + logistic regression
wc_lr = make_pipeline(CountVectorizer(stop_words='english'), LogisticRegression())

# print the fold-averaged metrics for this pipeline
cross_validation_report(wc_lr)



avg_fit_time: 0.07392223676045735
avg_score_time: 0.071684996287028
avg_test_accuracy: 0.9781046473365936
avg_test_precision: 0.9952977825021407
avg_test_recall: 0.8406961178045514


### Fit logistic regression

In [12]:
# fit on the full dataset so the learned vocabulary and coefficients can be inspected;
# the trailing semicolon suppresses the cell's output
wc_lr.fit(X, y);

### Inspect vocabulary and feature importance

In [13]:
# look at the vocab: the fitted vectorizer (first pipeline step) exposes a
# dict from each term to an integer index

vocab = wc_lr[0].vocabulary_

# display the dictionary as a Series to limit output
pd.Series(vocab).head(10)

jurong       4224
point        5741
crazy        2271
available    1271
bugis        1703
great        3534
world        8227
la           4349
buffet       1701
cine         1994
dtype: int64

In [14]:
# look at the coef: the weights learned by the final pipeline step
# (the output below shows a 2-D array with a single row)

coef = wc_lr[-1].coef_
coef

array([[ 0.5207137 ,  0.58063767, -0.00719109, ...,  0.09480863,
        -0.12598943, -0.00422916]])

In [15]:
# create a dataframe of features ordered by coef magnitude; sorting the vocab
# by its index values lines each term up with its position in the flattened coef
(
    pd.DataFrame({
        'feature': pd.Series(vocab).sort_values().index,
        'coef': coef.flatten(),
    })
    .set_index('feature')
    .assign(abs_coef=lambda frame: frame['coef'].abs())
    .sort_values('abs_coef', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,coef,abs_coef
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
uk,2.243867,2.243867
service,1.983691,1.983691
txt,1.976917,1.976917
claim,1.970454,1.970454
new,1.908175,1.908175
mobile,1.854016,1.854016
150p,1.817447,1.817447
50,1.758578,1.758578
message,1.727548,1.727548
won,1.631451,1.631451


### Feature Importance with a function

In [16]:
# make it a function
def get_importance(cv_lr_pipeline):
    """Rank the features of a fitted vectorizer + linear-model pipeline by |coefficient|.

    Parameters
    ----------
    cv_lr_pipeline
        Indexable pipeline whose first step exposes ``vocabulary_`` (term -> index)
        and whose last step exposes ``coef_``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``feature`` with columns ``coef`` and ``abs_coef``, sorted by
        ``abs_coef`` in descending order.
    """
    term_to_index = cv_lr_pipeline[0].vocabulary_
    weights = cv_lr_pipeline[-1].coef_

    # ordering the terms by their index lines them up with the flattened weights
    ordered_terms = pd.Series(term_to_index).sort_values().index

    importance = pd.DataFrame({'feature': ordered_terms, 'coef': weights.flatten()})
    importance = importance.set_index('feature')
    importance['abs_coef'] = importance['coef'].abs()

    return importance.sort_values('abs_coef', ascending=False)

In [17]:
# show the 10 terms with the largest absolute coefficients in the word-count model
get_importance(wc_lr).head(10)

Unnamed: 0_level_0,coef,abs_coef
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
uk,2.243867,2.243867
service,1.983691,1.983691
txt,1.976917,1.976917
claim,1.970454,1.970454
new,1.908175,1.908175
mobile,1.854016,1.854016
150p,1.817447,1.817447
50,1.758578,1.758578
message,1.727548,1.727548
won,1.631451,1.631451


## Remove non-alpha and go character-level

### Character-level logistic regression

In [18]:
# Character n-gram logistic regression.
# stop_words and strip_accents are intentionally not set: CountVectorizer only
# applies stop_words when analyzer='word', and a custom preprocessor replaces
# the built-in strip_accents/lowercase stage, so neither would have any effect.
# NOTE: a lambda preprocessor generally cannot be pickled; swap in a named
# function before trying to save this pipeline with joblib.dump.
char_clean_lr = make_pipeline(
    CountVectorizer(
        # drop everything except ASCII letters and spaces, then lowercase
        preprocessor=lambda s: re.sub(r'[^a-zA-Z ]', '', s).lower(),
        # 3- to 5-character n-grams built inside word boundaries
        ngram_range=(3,5),
        analyzer='char_wb',
    ),
    LogisticRegression()
)

cross_validation_report(char_clean_lr)



avg_fit_time: 0.582451343536377
avg_score_time: 0.5834526220957438
avg_test_accuracy: 0.9840270205985595
avg_test_precision: 0.9882065637637684
avg_test_recall: 0.891566265060241


In [19]:
# fit on the full dataset, then show the 10 character n-grams with the
# largest absolute coefficients
char_clean_lr.fit(X, y)
get_importance(char_clean_lr).head(10)

Unnamed: 0_level_0,coef,abs_coef
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
i,-0.795189,0.795189
fp,0.673032,0.673032
tfp,0.673032,0.673032
tfp,0.673032,0.673032
tfp,0.673027,0.673027
tf,0.673027,0.673027
tfp,0.673027,0.673027
xt,0.595023,0.595023
mob,0.56944,0.56944
p,0.521665,0.521665


### Character-level naive bayes

In [20]:
# Character n-gram Naive Bayes.
# stop_words and strip_accents are intentionally not set: CountVectorizer only
# applies stop_words when analyzer='word', and a custom preprocessor replaces
# the built-in strip_accents/lowercase stage, so neither would have any effect.
# NOTE: a lambda preprocessor generally cannot be pickled; swap in a named
# function before trying to save this pipeline with joblib.dump.
char_clean_naive_bayes = make_pipeline(
    CountVectorizer(
        # drop everything except ASCII letters and spaces, then lowercase
        preprocessor=lambda s: re.sub(r'[^a-zA-Z ]', '', s).lower(),
        # 3- to 5-character n-grams built inside word boundaries
        ngram_range=(3,5),
        analyzer='char_wb'
    ),
    MultinomialNB()
)

cross_validation_report(char_clean_naive_bayes)

avg_fit_time: 0.42515865961710614
avg_score_time: 0.5718952020009359
avg_test_accuracy: 0.9781050337757481
avg_test_precision: 0.9025337289584113
avg_test_recall: 0.9384203480589024


## Retrain Selected Model

In [21]:
# refit the word-level Naive Bayes pipeline on all of the data;
# the trailing semicolon suppresses the cell's output
naive_bayes.fit(X, y);

## Save the model

In [22]:
# persist the fitted pipeline to 'spam_detector.joblib' (path is relative to the working directory)
# NOTE: joblib files are pickle-based, so only load them from sources you trust
dump(naive_bayes, 'spam_detector.joblib') 

['spam_detector.joblib']