# Spam Classifiers

In [1]:
import pandas as pd
import numpy as np

# Read the raw SMS corpus from disk.
spam_data = pd.read_csv('spam.csv')

# Binary-encode the label: 1 where the label equals 'spam', 0 for every other value.
spam_labels = spam_data['target'] == 'spam'
spam_data['target'] = np.where(spam_labels, 1, 0)

spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [2]:
from sklearn.model_selection import train_test_split


# Partition message texts and their labels into train and test sets (random_state fixed at 0).
messages, labels = spam_data['text'], spam_data['target']
X_train, X_test, y_train, y_test = train_test_split(messages, labels, random_state=0)

In [3]:
# Share of messages labelled spam (target == 1), expressed as a percentage

spam_rows = spam_data[spam_data['target'] == 1]
len(spam_rows) / len(spam_data) * 100

13.406317300789663

In [15]:
# Longest Token in the vocabulary

# Imported in this cell because it runs before the Model 1 cell, which is the only
# other place CountVectorizer is imported; without it a fresh kernel raises NameError.
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer().fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old accessor on older releases.
get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
features = np.array(get_names())

# np.argmax returns the first maximum, so ties resolve to the earliest vocabulary entry.
token_lengths = [len(token) for token in features]
print('Longest Token in the Vocabulary is {}'.format(features[np.argmax(token_lengths)]))

Longest Token in the Vocabulary is com1win150ppmx3age16subscription


## Model 1

Fit and transform the training data `X_train` using a Count Vectorizer with default parameters.

Next, fit a multinomial Naive Bayes classifier model with smoothing `alpha=0.1`. Report the area under the curve (AUC) score using the transformed test data.

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

# Learn the bag-of-words vocabulary from the training messages and vectorize them.
vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_train)

# Multinomial Naive Bayes with smoothing alpha=0.1.
model = MultinomialNB(alpha=0.1).fit(X_train_vec, y_train)

# NOTE(review): roc_auc_score receives hard class labels from predict() rather than
# probability scores, so the reported figure reflects a single decision threshold.
predictions = model.predict(vec.transform(X_test))
auc_value = roc_auc_score(y_test, predictions)
print('AUC score of Multinomial Naive Bayes model is {}'.format(auc_value))

AUC score of Multinomial Naive Bayes model is 0.9720812182741116


In [6]:
# Exploratory data analysis using TF-IDF Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer().fit(X_train)

X_train_vec = vec.transform(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old accessor on older releases.
get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
features = np.array(get_names())

# Largest TF-IDF weight each term reaches in any training document. Computed once and
# reused below instead of re-densifying the column maxima for every lookup.
max_tfidf = X_train_vec.max(0).toarray()[0]
sorted_tfidf = max_tfidf.argsort()

# 20 lowest-scoring terms. Terms tied at the minimum value are listed alphabetically;
# the remaining terms keep their argsort order.
small_index = features[sorted_tfidf[:20]]
small_value = max_tfidf[sorted_tfidf[:20]]
lowest = small_value.min()
small_final_index = np.concatenate((np.sort(small_index[small_value == lowest]),
                                    small_index[small_value != lowest]))

# 20 highest-scoring terms, in descending order. Terms tied at the maximum value are
# listed alphabetically; the remaining terms keep their argsort order.
large_index = features[sorted_tfidf[:-21:-1]]
large_value = max_tfidf[sorted_tfidf[:-21:-1]]
highest = large_value.max()
large_final_index = np.concatenate((np.sort(large_index[large_value == highest]),
                                    large_index[large_value != highest]))

small = pd.Series(small_value, index=small_final_index)
large = pd.Series(large_value, index=large_final_index)

print('Features with smallest TF-IDF')
print(small)
print('\n')
print('Features with largest TF-IDF')
print(large)

Features with smallest TF-IDF
aaniye          0.074475
athletic        0.074475
chef            0.074475
companion       0.074475
courageous      0.074475
dependable      0.074475
determined      0.074475
exterminator    0.074475
healer          0.074475
listener        0.074475
organizer       0.074475
pest            0.074475
psychiatrist    0.074475
psychologist    0.074475
pudunga         0.074475
stylist         0.074475
sympathetic     0.074475
venaam          0.074475
diwali          0.091250
mornings        0.091250
dtype: float64


Features with largest TF-IDF
146tf150p    1.000000
645          1.000000
anything     1.000000
anytime      1.000000
beerage      1.000000
done         1.000000
er           1.000000
havent       1.000000
home         1.000000
lei          1.000000
nite         1.000000
ok           1.000000
okie         1.000000
thank        1.000000
thanx        1.000000
too          1.000000
where        1.000000
yup          1.000000
tick         0.980166
blank 

## Model 2

Fit and transform the training data `X_train` using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **3**.

Then fit a multinomial Naive Bayes classifier model with smoothing `alpha=0.1` and compute the area under the curve (AUC) score using the transformed test data.

In [18]:
# TF-IDF vocabulary limited to terms with a document frequency of at least 3.
vec = TfidfVectorizer(min_df=3)
X_train_vec = vec.fit_transform(X_train)

# Multinomial Naive Bayes with smoothing alpha=0.1.
model = MultinomialNB(alpha=0.1).fit(X_train_vec, y_train)

# NOTE(review): roc_auc_score receives hard class labels from predict() rather than
# probability scores, so the reported figure reflects a single decision threshold.
predictions = model.predict(vec.transform(X_test))
auc_value = roc_auc_score(y_test, predictions)

print('AUC score of Multinomial Naive Bayes model is {}'.format(auc_value))

AUC score of Multinomial Naive Bayes model is 0.9416243654822335


In [19]:
# Message length as a candidate extra feature: compare average character counts per class

length_spam = [len(message) for message in spam_data['text'][spam_data.target == 1]]
length_not_spam = [len(message) for message in spam_data['text'][spam_data.target == 0]]

avg_spam_length = np.mean(length_spam)
avg_not_spam_length = np.mean(length_not_spam)

print('Average length of spam text = {}'.format(avg_spam_length))
print('Average length of non-spam text = {}'.format(avg_not_spam_length))

Average length of spam text = 138.8661311914324
Average length of non-spam text = 71.02362694300518


In [20]:
# Function to add additional features

def add_feature(X, feature_to_add):
    """
    Append extra feature column(s) to the right of a sparse feature matrix.

    feature_to_add is either one sequence holding a value per row of X, or a
    list of such sequences; every sequence becomes one new column.
    The combined matrix is returned in CSR format.
    """
    from scipy.sparse import csr_matrix, hstack

    # csr_matrix lays each feature out as a row, so transpose to get one column per feature.
    extra_columns = csr_matrix(feature_to_add).transpose()
    return hstack([X, extra_columns], format='csr')

## Model 3

Fit and transform the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **5**.

Using this document-term matrix and an additional feature, **the length of document (number of characters)**, fit a Support Vector Classification model with regularization `C=10000`. Then compute the area under the curve (AUC) score using the transformed test data.

In [21]:
from sklearn.svm import SVC

# TF-IDF vocabulary limited to terms with a document frequency of at least 5.
vec = TfidfVectorizer(min_df=5)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Extend both document-term matrices with the character length of each message.
X_train_trans = add_feature(X_train_vec, X_train.str.len())
X_test_trans = add_feature(X_test_vec, X_test.str.len())

clf = SVC(C=10000, gamma='auto').fit(X_train_trans, y_train)

# NOTE(review): roc_auc_score receives hard class labels from predict() rather than
# decision scores, so the reported figure reflects a single decision threshold.
y_predicted = clf.predict(X_test_trans)
auc_value = roc_auc_score(y_test, y_predicted)

print('AUC score of SVM model is {}'.format(auc_value))

AUC score of SVM model is 0.9581366823421557


In [22]:
# Use average number of digits per text as additional feature

import re

# For every message, collect the list of ASCII digit characters it contains, split by class.
digit_pattern = re.compile("[0-9]")
spam = [digit_pattern.findall(text) for text in spam_data['text'][spam_data.target == 1]]
non_spam = [digit_pattern.findall(text) for text in spam_data['text'][spam_data.target == 0]]

# The number of matches per message is its digit count.
spam_digit_counts = [len(digits) for digits in spam]
non_spam_digit_counts = [len(digits) for digits in non_spam]

print('Average # of digits in spam text = {}'.format(np.mean(spam_digit_counts)))
print('Average # of digits in non-spam text = {}'.format(np.mean(non_spam_digit_counts)))

Average # of digits in spam text = 15.759036144578314
Average # of digits in non-spam text = 0.2992746113989637


## Model 4

Fit and transform the training data `X_train` using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **5** and using **word n-grams from n=1 to n=3** (unigrams, bigrams, and trigrams).

Using this document-term matrix and the following additional features:
* the length of document (number of characters)
* **number of digits per document**

fit a Logistic Regression model with regularization `C=100`. Then compute the area under the curve (AUC) score using the transformed test data.

In [24]:
from sklearn.linear_model import LogisticRegression

def char_and_digit_counts(texts):
    """
    Build the two hand-crafted features used by Model 4.

    texts is a pandas Series of message strings. Returns a two-element list:
    the character count of each message, and the number of its characters
    for which str.isdigit() is true.
    """
    digit_counts = texts.apply(lambda text: sum(ch.isdigit() for ch in text))
    return [texts.str.len(), digit_counts]


# ngram_range is given as a tuple: scikit-learn 1.2+ validates the parameter type
# and rejects a list.
vec = TfidfVectorizer(min_df=5, ngram_range=(1, 3))

# The same helper builds the extra columns for train and test, so the two matrices
# cannot drift apart in column order.
X_train_vec = vec.fit_transform(X_train)
X_train_trans = add_feature(X_train_vec, char_and_digit_counts(X_train))

X_test_vec = vec.transform(X_test)
X_test_trans = add_feature(X_test_vec, char_and_digit_counts(X_test))

clf = LogisticRegression(C=100, solver='liblinear').fit(X_train_trans, y_train)

# NOTE(review): roc_auc_score receives hard class labels from predict() rather than
# probability scores, so the reported figure reflects a single decision threshold.
y_predicted = clf.predict(X_test_trans)

print('AUC score of Logistic Regression model is {}'.format(roc_auc_score(y_test, y_predicted)))

AUC score of Logistic Regression model is 0.9678709064054463


In [25]:
# Non-word characters (regex \W) per message as a candidate extra feature

# findall returns the list of matches for each message; its length is the count.
non_word_matches = spam_data['text'].str.findall(r'(\W)')
spam_data['alpha_num'] = non_word_matches.str.len()

spam_counts = spam_data['alpha_num'][spam_data.target == 1]
non_spam_counts = spam_data['alpha_num'][spam_data.target == 0]
spam_non_word = np.mean(spam_counts)
non_spam_non_word = np.mean(non_spam_counts)

print('Average # of non-word characters in spam text = {}'.format(spam_non_word))
print('Average # of non-word characters in non-spam text = {}'.format(non_spam_non_word))

Average # of non-word characters in spam text = 29.041499330655956
Average # of non-word characters in non-spam text = 17.29181347150259


## Model 5

Fit and transform the training data X_train using a Count Vectorizer ignoring terms that have a document frequency strictly lower than **5** and using **character n-grams from n=2 to n=5.**

Passing `analyzer='char_wb'` to the Count Vectorizer tells it to use character n-grams, creating them only from text inside word boundaries. This should make the model more robust to spelling mistakes.

Using this document-term matrix and the following additional features:
* the length of document (number of characters)
* number of digits per document
* **number of non-word characters (anything other than a letter, digit or underscore.)**

Fit a Logistic Regression model with regularization C=100. Then compute the area under the curve (AUC) score using the transformed test data.

In [26]:
def char_digit_nonword_counts(texts):
    """
    Build the three hand-crafted features used by Model 5.

    texts is a pandas Series of message strings. Returns a three-element list,
    in this order: character count per message, number of characters for which
    str.isdigit() is true, and number of regex \\W (non-word) matches.
    """
    digit_counts = texts.apply(lambda text: sum(ch.isdigit() for ch in text))
    non_word_counts = texts.str.findall(r'(\W)').str.len()
    return [texts.str.len(), digit_counts, non_word_counts]


# ngram_range is given as a tuple: scikit-learn 1.2+ validates the parameter type
# and rejects a list.
vec = CountVectorizer(min_df=5, ngram_range=(2, 5), analyzer='char_wb')

# The same helper builds the extra columns for train and test, so the two matrices
# cannot drift apart in column order.
X_train_vec = vec.fit_transform(X_train)
X_train_trans = add_feature(X_train_vec, char_digit_nonword_counts(X_train))
X_test_vec = vec.transform(X_test)
X_test_trans = add_feature(X_test_vec, char_digit_nonword_counts(X_test))

clf = LogisticRegression(C=100, solver='liblinear').fit(X_train_trans, y_train)

# NOTE(review): roc_auc_score receives hard class labels from predict() rather than
# probability scores, so the reported figure reflects a single decision threshold.
y_predicted = clf.predict(X_test_trans)
auc_score = roc_auc_score(y_test, y_predicted)

# Reuse the score computed above instead of evaluating the metric a second time.
print('AUC score of Logistic Regression model is {}'.format(auc_score))

AUC score of Logistic Regression model is 0.9788593110707434


In [27]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old accessor on older releases.
get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names

# Names follow the column order of the training matrix: the vectorizer's n-grams first,
# then the three hand-built features in the order they were appended. list() is needed
# because the newer accessor returns an ndarray, which cannot be concatenated with `+`.
features = np.array(list(get_names()) + ['length_of_doc', 'digit_count', 'non_word_char_count'])

# Ascending order of the fitted coefficients: the first ten are the smallest values,
# the last ten (taken in reverse) are the largest.
sorted_coef_index = clf.coef_[0].argsort()

# tolist() yields plain Python strings, so the printed lists do not depend on NumPy's
# scalar repr.
smallest = features[sorted_coef_index[:10]].tolist()
largest = features[sorted_coef_index[:-11:-1]].tolist()

print('Features with smallest coefficients:')
print(smallest)
print('\n')
print('Features with largest coefficients:')
print(largest)

Features with smallest coefficients:
['. ', '..', '? ', ' i', ' y', ' go', ':)', ' h', 'go', ' m']


Features with largest coefficients:
['digit_count', 'ne', 'ia', 'co', 'xt', ' ch', 'mob', ' x', 'ww', 'ar']
