# Parallel Training of Spam Detector

The data is from https://www.kaggle.com/uciml/sms-spam-collection-dataset. It consists of text messages labeled "ham" (not spam) or "spam".

## Setup

### Import packages

In [1]:
import pandas as pd
import re

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from resc import ParallelHashingVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate

from joblib import dump

### Some light EDA

In [2]:
# load the data

# Read the raw CSV ('L1' is Python's alias for the Latin-1 codec) and give the
# two columns used below readable names: v1 -> class, v2 -> text.
df = (
    pd.read_csv('spam.csv', encoding='L1')
    .rename(columns={'v1': 'class', 'v2': 'text'})
)

In [3]:
# see the first 5 observations from the data

df.head()

Unnamed: 0,class,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# see the unique classes

df['class'].unique()

array(['ham', 'spam'], dtype=object)

In [5]:
# see the class balance

df['class'].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

### Prepare the data for scikit-learn

In [6]:
# Features: the raw message strings.
X = df['text'].values
# Target: 0 for 'ham', 1 for every other label.
y = [int(label != 'ham') for label in df['class']]

In [7]:
# make a function to simplify cross-validation for this particular problem

def cross_validation_report(model, cv=3, scoring=('accuracy', 'precision', 'recall')):
    """Cross-validate ``model`` and print the mean of every reported metric.

    The model is evaluated on the notebook-level ``X`` (message texts) and
    ``y`` (0/1 labels), so those must be defined before this is called.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline handed to ``cross_validate``.
    cv : int or cross-validation splitter, default 3
        Forwarded unchanged to ``cross_validate``.
    scoring : str or sequence of str, default ('accuracy', 'precision', 'recall')
        Metric name(s), forwarded unchanged to ``cross_validate``.

    Returns
    -------
    None
        Results are only printed, one ``avg_<key>: <mean>`` line for each key
        of the dict returned by ``cross_validate``.
    """
    raw_cv_report = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=scoring,
    )

    # Each value is an array with one entry per fold; report its mean.
    for key, per_fold_values in raw_cv_report.items():
        print(f'avg_{key}: {per_fold_values.mean()}')

In [18]:
# Baseline: counts of character n-grams (3 to 5 characters, 'char_wb'
# analyzer) fed into a LogisticRegression with default settings.
model_1 = make_pipeline(
    CountVectorizer(analyzer='char_wb', ngram_range=(3, 5)),
    LogisticRegression(),
)

cross_validation_report(model_1)



avg_fit_time: 1.861017147699992
avg_score_time: 1.417300780614217
avg_test_accuracy: 0.984924428925048
avg_test_precision: 0.9940205692418081
avg_test_recall: 0.892904953145917


In [21]:
# Same CountVectorizer features as model_1; the classifier now uses the
# 'saga' solver with n_jobs=4.
model_2 = make_pipeline(
    CountVectorizer(analyzer='char_wb', ngram_range=(3, 5)),
    LogisticRegression(solver='saga', n_jobs=4),
)

cross_validation_report(model_2)

avg_fit_time: 4.776790301005046
avg_score_time: 1.1906009515126545
avg_test_accuracy: 0.9833094996984809
avg_test_precision: 0.9823136488794747
avg_test_recall: 0.891566265060241


In [22]:
# Replace CountVectorizer with HashingVectorizer (same n-gram settings);
# the classifier is configured exactly as in model_2.
model_3 = make_pipeline(
    HashingVectorizer(analyzer='char_wb', ngram_range=(3, 5)),
    LogisticRegression(solver='saga', n_jobs=4),
)

cross_validation_report(model_3)

avg_fit_time: 6.621983448664348
avg_score_time: 0.9682006041208903
avg_test_accuracy: 0.9691314335598061
avg_test_precision: 0.9931446791550419
avg_test_recall: 0.7751004016064257


In [23]:
# Use ParallelHashingVectorizer (imported from `resc`) with n_jobs=8 and the
# same n-gram settings as model_3.
# NOTE(review): LogisticRegression also goes from n_jobs=4 (model_3) to
# n_jobs=8 here, so the two pipelines differ in more than the vectorizer.
model_4 = make_pipeline(
    ParallelHashingVectorizer(analyzer='char_wb', ngram_range=(3, 5), n_jobs=8),
    LogisticRegression(solver='saga', n_jobs=8),
)

cross_validation_report(model_4)

avg_fit_time: 1.4733986059824626
avg_score_time: 0.1358009179433187
avg_test_accuracy: 0.9646445851469406
avg_test_precision: 0.97759527739499
avg_test_recall: 0.7536813922356091


In [24]:
# Ratio of the avg_fit_time values printed above: model_3 (HashingVectorizer)
# divided by model_4 (ParallelHashingVectorizer). Both numbers are copied by
# hand from those cell outputs, so they go stale if the cells are re-run.
fit_time_hashing = 6.621983448664348
fit_time_parallel_hashing = 1.4733986059824626

fit_time_hashing / fit_time_parallel_hashing

4.4943597895213205