# Building a simple spam classifier with different models and parameters

This notebook explores efficiency of different models and NLP techniques in spam classification problem

In [1]:
# import libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
%pylab inline

Populating the interactive namespace from numpy and matplotlib


Import dataset of labeled smap/nonspam SMS:

In [2]:
data = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None, names=['class', 'text'])
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
data.describe()

Unnamed: 0,class,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


## Model building 

In [10]:
texts = data['text']
classes = data['class'].apply(lambda name: 0 if name == 'ham' else 1)

In [16]:
clf_pipeline = Pipeline(
            [("vectorizer", TfidfVectorizer()),
            ("transformer", TfidfTransformer()),
            ("classifier", LogisticRegression(random_state=2))]
        )

In [17]:
score = np.mean(cross_val_score(clf_pipeline, texts, classes, cv=10, scoring='f1'))
print(score)

0.8408629879743227


In [18]:
# yet another pipeline

In [23]:
clf_pipeline = Pipeline(
            [("vectorizer", CountVectorizer()),
            ("classifier", LogisticRegression(random_state=2))]
        )

In [41]:
score = np.mean(cross_val_score(clf_pipeline, texts, classes, cv=10, scoring='accuracy'))
print(score)

0.9829504254359875


In [25]:
clf_pipeline.fit(texts, classes)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...nalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [45]:
messages = ['SO MANY CASH COME AND BUY IT VIAGRA 100$ free WINNER!!!! !!!']
clf_pipeline.predict(messages)

array([0])

In [37]:
data[data['class'] == 'spam']

Unnamed: 0,class,text
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
12,spam,URGENT! You have won a 1 week FREE membership ...
15,spam,"XXXMobileMovieClub: To use your credit, click ..."
19,spam,England v Macedonia - dont miss the goals/team...
34,spam,Thanks for your subscription to Ringtone UK yo...
42,spam,07732584351 - Rodger Burns - MSG = We tried to...


In [72]:
params = [(2, 2), (3, 3), (1, 3)]
for param in params:
    vectorizer = CountVectorizer(ngram_range=param)
    vectorizer.fit(texts)
    X = vectorizer.transform(texts)
    clf = MultinomialNB()
    print np.mean(cross_val_score(clf, X, classes, cv=10, scoring='f1'))

0.6455015177985443
0.37871948524573595
0.8884859656061002
