# SMS Spam Classifier
- A short exercise of positive/negative spam prediction.
- comparison of approaches using:
  - CountVectorizer
  - Term Frequency-Inverse Document Frequency
  - n-grams of words
- text preprocessing:
  - drop stop words like: 'the', 'is', etc.
- feature engineering:
  - length of text
  - number of digits contained in text
  - number of non-word characters in text

In [1]:
import pandas as pd
import numpy as np

# Load the SMS corpus and encode the label column as an integer: 1 = spam, 0 = anything else.
spam_data = pd.read_csv('./data/spam_sms.csv')
is_spam_label = spam_data['target'] == 'spam'
spam_data['target'] = np.where(is_spam_label, 1, 0)
spam_data

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will Ì_ b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


# EDA, feature engineering

In [2]:

##? PROPORTION OF SPAM TEXTS
spam_mask = spam_data.target == 1
num_spam = len(spam_data[spam_mask])
num_total = len(spam_data)
spam_share = num_spam / num_total

print(
    'Proportion of Spam Text: {:.3f} \n'
    '# Spam Texts : {:>5}\n'
    '# Total Texts: {:>5}\n'.format(spam_share, num_spam, num_total)
)


##? MEAN LENGTH OF SPAM VS NON-SPAM TEXTS
# Split once by label, then average the character length of each subset.
isspam = spam_data[spam_mask]
nospam = spam_data[spam_data.target == 0]
spamlength = isspam['text'].str.len().mean()
notspamlength = nospam['text'].str.len().mean()

print(
    'Mean length of non-spam texts : {:>8.3f}\n'
    'Mean length of spam texts     : {:>8.3f}\n'.format(notspamlength, spamlength)
)


##? MEAN NUMBER OF NON-WORD CHARACTERS IN SPAM VS NON-SPAM TEXTS (NOT A LETTER, DIGIT, UNDERSCORE)
import re
def nonwordlengths(row: str) -> int:
    """Count the non-word characters in one text.

    A non-word character is anything matched by the regex ``\\W``,
    i.e. not a letter, digit or underscore (whitespace and punctuation count).

    Parameters
    ----------
    row : str
        A single message; despite the name this is one cell value, not a DataFrame row.

    Returns
    -------
    int
        Number of ``\\W`` matches in ``row``.
    """
    return len(re.findall(r'\W', row))

# Engineered feature: per-message count of non-word characters.
spam_data['nonwlength'] = spam_data['text'].map(nonwordlengths)

##? MEAN NUMBER OF DIGIT CHARACTERS IN SPAM VS NON-SPAM TEXTS
def digitlengths(row: str) -> int:
    """Count the digit characters in one text.

    Parameters
    ----------
    row : str
        A single message; despite the name this is one cell value, not a DataFrame row.

    Returns
    -------
    int
        Number of characters in ``row`` matched by the regex ``\\d``.
    """
    return len(re.findall(r'\d', row))

# Engineered feature: per-message count of digit characters.
spam_data['digitlength'] = spam_data['text'].map(digitlengths)

# Re-split by label so both subsets include the engineered columns.
isspam = spam_data[spam_data.target == 1]
nospam = spam_data[spam_data.target == 0]

digit_means = (nospam['digitlength'].mean(), isspam['digitlength'].mean())
print(
    'Mean # digits in non-spam texts : {:>8.3f}\n'
    'Mean # digits in spam texts     : {:>8.3f}\n'.format(*digit_means)
)
# 0.2992746113989637 15.759036144578314


nonword_means = (nospam['nonwlength'].mean(), isspam['nonwlength'].mean())
print(
    'Mean # non-word chars in non-spam texts : {:>8.3f}\n'
    'Mean # non-word chars in spam texts     : {:>8.3f}\n'.format(*nonword_means)
)
# 17.29181347150259 29.041499330655956

Proportion of Spam Text: 0.134 
# Spam Texts :   747
# Total Texts:  5572

Mean length of non-spam texts :   71.024
Mean length of spam texts     :  138.866

Mean # digits in non-spam texts :    0.299
Mean # digits in spam texts     :   15.759

Mean # non-word chars in non-spam texts :   17.292
Mean # non-word chars in spam texts     :   29.041



# Baseline Models

#### using CountVectorizer bag-of-words approach
- ignore words that appear in less than 3 samples, or more than half the samples
- drop english stop words like 'the' 'is'...
- strip accent characters
- review the model and the word occurrences. Where an application calls for it, we could iteratively improve the model by taking words from the 'high occurrence words' list that we deem uninformative and adding them to the stop words.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Hold out a test set (default split proportions); fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

##? MNB using CountVectorizer
# Vocabulary is learned from the training texts only, then applied to both splits.
vect = CountVectorizer(
    strip_accents = 'unicode',
    stop_words='english',
    lowercase = True,
    max_df = 0.5,
    min_df = 3
).fit(X_train)
X_train_vect = vect.transform(X_train)
X_test_vect  = vect.transform(X_test)

mnb = MultinomialNB(alpha=0.1).fit(X_train_vect, y_train)
y_pred = mnb.predict(X_test_vect)
# The metric is precision on the spam class (hard 0/1 predictions), not ROC AUC,
# so the printed label names it accordingly.
score = precision_score(y_test, y_pred)
print('MNB using CountVectorizer, test precision score: {:.4f}'.format(score) )

feature_names = np.array(vect.get_feature_names_out())

##? Smallest and Largest Word Counts
# max(0) takes, per vocabulary term, the largest count found in any single training
# message (a column-wise maximum, not the total number of occurrences).
train_vect_count = pd.DataFrame({'feature_name':feature_names, 'count':X_train_vect.max(0).toarray()[0]}).sort_values('count', ascending=False)
print('HIGH OCCURRENCE WORDS')
display(train_vect_count.head(10))
print('LOW OCCURRENCE WORDS')
display(train_vect_count.tail(10))

##? Smallest and Largest Coefs
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1.
# For a binary problem it was a view of feature_log_prob_[1:], so row 1 of
# feature_log_prob_ (log P(term | class 1 = spam)) yields the same values.
spam_log_prob = mnb.feature_log_prob_[1]
train_coefs = pd.DataFrame({'feature_name':feature_names, 'coef':spam_log_prob}).sort_values('coef', ascending=False)
print('WORDS ASSOCIATED WITH SPAM')
display(train_coefs.head(10))
# NOTE(review): the tail lists the terms least likely under the spam class; it is not a
# spam-vs-ham log-odds ranking.
print('WORDS ASSOCIATED WITH NOT SPAM')
display(train_coefs.tail(10))

MNB using CountVectorizer, test ROC AUC score: 0.9579
HIGH OCCURRENCE WORDS


Unnamed: 0,feature_name,count
1105,lt,18
808,gt,18
836,happy,15
670,face,10
534,day,6
1180,missing,6
1900,ur,6
1638,smile,5
858,hi,5
1614,simple,5


LOW OCCURRENCE WORDS


Unnamed: 0,feature_name,count
757,game,1
756,gals,1
754,fyi,1
752,funny,1
751,fun,1
749,ful,1
748,fucking,1
747,fuckin,1
745,fromm,1
2073,zed,1


WORDS ASSOCIATED WITH SPAM




Unnamed: 0,feature_name,coef
730,free,-3.758523
1869,txt,-4.075317
1900,ur,-4.157941
1702,stop,-4.276693
1783,text,-4.306223
1188,mobile,-4.378718
432,claim,-4.468565
2041,www,-4.529117
1488,reply,-4.529117
1415,prize,-4.706208


WORDS ASSOCIATED WITH NOT SPAM


Unnamed: 0,feature_name,coef
1100,loverboy,-11.214978
1089,loses,-11.214978
1098,lovely,-11.214978
1097,loved,-11.214978
320,bowl,-11.214978
1095,lovable,-11.214978
1094,loud,-11.214978
1092,lot,-11.214978
1090,loss,-11.214978
1275,nyc,-11.214978


#### using TfidfVectorizer bag-of-words approach
- ignore words that appear in less than 3 samples, or more than half the samples
- drop english stop words like 'the' 'is'...
- strip accent characters
- review the model and the word importances. Where an application calls for it, we could iteratively improve the model by taking words from the 'low importance words' list that we deem uninformative and adding them to the stop words.

In [11]:
##? MNB using TfidfVectorizer
# Same vocabulary filters as the CountVectorizer baseline, but with tf-idf weighting.
vect = TfidfVectorizer(
    strip_accents = 'unicode',
    stop_words='english',
    lowercase = True,
    max_df = 0.5,
    min_df = 3
).fit(X_train)
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)
feature_names = vect.get_feature_names_out()
# Column-wise maximum tf-idf over the training messages (2-D array with a single row).
tfidf_values = X_train_vect.max(axis=0).toarray()

mnb = MultinomialNB(alpha=0.1).fit(X_train_vect, y_train)
y_pred = mnb.predict(X_test_vect)
# The metric is precision on the spam class (hard 0/1 predictions), not ROC AUC,
# so the printed label names it accordingly.
score = precision_score(y_test, y_pred)
print('MNB using TfidfVectorizer, test precision score: {:.4f}'.format(score) )

feature_names = np.array(vect.get_feature_names_out())

##? Smallest and Largest tf-idfs
# Rank terms by their largest tf-idf weight in any single training message.
train_vect_tfidf = pd.DataFrame({'feature_name':feature_names, 'tfidf':X_train_vect.max(0).toarray()[0]}).sort_values('tfidf', ascending=False)
print('HIGH IMPORTANCE WORDS')
display(train_vect_tfidf.head(10))
print('LOW IMPORTANCE WORDS')
display(train_vect_tfidf.tail(10))

##? Smallest and Largest Coefs
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1.
# For a binary problem it was a view of feature_log_prob_[1:], so row 1 of
# feature_log_prob_ (log P(term | class 1 = spam)) yields the same values.
spam_log_prob = mnb.feature_log_prob_[1]
train_coefs = pd.DataFrame({'feature_name':feature_names, 'coef':spam_log_prob}).sort_values('coef', ascending=False)
print('WORDS ASSOCIATED WITH SPAM')
display(train_coefs.head(10))
# NOTE(review): the tail lists the terms least likely under the spam class; it is not a
# spam-vs-ham log-odds ranking.
print('WORDS ASSOCIATED WITH NOT SPAM')
display(train_coefs.tail(10))


MNB using TfidfVectorizer, test ROC AUC score: 0.9781
HIGH IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
1131,marriage,1.0
1977,weather,1.0
1789,thanx,1.0
190,amp,1.0
1269,number,1.0
1266,nt,1.0
800,gotta,1.0
454,coming,1.0
1255,nite,1.0
1253,night,1.0


LOW IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
1262,norm150p,0.22968
80,36504,0.225531
496,crack,0.225277
844,havnt,0.225277
1024,laughed,0.225277
877,honeybee,0.225277
1746,sweetest,0.225277
771,genuine,0.224859
29,100percent,0.224859
1206,moral,0.2209


WORDS ASSOCIATED WITH SPAM




Unnamed: 0,feature_name,coef
730,free,-4.228695
1869,txt,-4.533689
1783,text,-4.627443
1702,stop,-4.63154
432,claim,-4.704872
1188,mobile,-4.713391
1900,ur,-4.787524
2041,www,-4.837347
1488,reply,-4.843838
1415,prize,-4.874564


WORDS ASSOCIATED WITH NOT SPAM


Unnamed: 0,feature_name,coef
1108,lunch,-9.930952
1098,lovely,-9.930952
338,bslvyl,-9.930952
340,btw,-9.930952
1105,lt,-9.930952
341,bucks,-9.930952
1102,loving,-9.930952
1101,loves,-9.930952
1100,loverboy,-9.930952
1284,oic,-9.930952


# Improved Model with engineered features

- fit using TfidfVectorizer, ignoring terms that appear in fewer than 5 samples or in more than half the samples
- using word n-grams from 1 to 3
- engineered features
  - message length
  - number of digits in message
  - number of non-word characters in message

In [12]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a sparse feature matrix.

    X is the existing matrix; feature_to_add is either a single sequence of
    per-row values or a list of such sequences (one per new column).
    Returns the horizontally stacked result in CSR format.
    """
    from scipy.sparse import csr_matrix, hstack

    # Each input sequence becomes a row of the temporary matrix; transposing
    # turns those rows into columns aligned with the rows of X.
    extra_columns = csr_matrix(feature_to_add).T
    return hstack([X, extra_columns], format='csr')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

# tf-idf over word uni-, bi- and tri-grams, fitted on the training texts only.
# Alternative left by the author: ngram_range=(2,5) with analyzer='char_wb'.
vect = TfidfVectorizer(
    strip_accents = 'unicode',
    stop_words='english',
    lowercase = True,
    max_df = 0.5,
    min_df = 5,
    ngram_range=(1,3),
).fit(X_train)
X_train_vect = vect.transform(X_train)
X_test_vect  = vect.transform(X_test)

# Engineered per-message features: character length, non-word count, digit count.
X_train_txtlengths  = X_train.str.len()
X_test_txtlengths   = X_test.str.len()
X_train_nonwords    = X_train.apply(nonwordlengths)
X_test_nonwords     = X_test.apply(nonwordlengths)
X_train_digits      = X_train.apply(digitlengths)
X_test_digits       = X_test.apply(digitlengths)

# Appended as three extra columns after the vocabulary columns, in this order.
# NOTE(review): these are raw counts, on a much larger scale than the tf-idf
# columns; consider scaling before a regularised linear model.
X_train_vect = add_feature(X_train_vect, [X_train_txtlengths, X_train_nonwords, X_train_digits] )
X_test_vect  = add_feature(X_test_vect,  [X_test_txtlengths,  X_test_nonwords,  X_test_digits ] )


lr = LogisticRegression(C=100, max_iter=3000).fit(X_train_vect, y_train)
y_pred = lr.predict(X_test_vect)
# The model is logistic regression and the metric is precision on the spam class,
# so the printed label says so (it previously read "MNB ... ROC AUC").
score = precision_score(y_test, y_pred)

print('LogisticRegression using TfidfVectorizer, n-grams, test precision score: {:.4f}'.format(score) )

# Column labels: the vectorizer vocabulary followed by the three engineered
# features, in the order they were appended to the matrix.
engineered_names = ['TEXTLENGTH','NUM_NONWORDCHARS','NUM_DIGITS']
feature_names = np.append(np.array(vect.get_feature_names_out()), engineered_names)

##? Smallest and Largest tf-idfs
# Column-wise maximum over the training matrix; for the engineered columns this
# is the largest raw value rather than a tf-idf weight.
column_max = X_train_vect.max(0).toarray()[0]
train_vect_tfidf = (
    pd.DataFrame({'feature_name': feature_names, 'tfidf': column_max})
    .sort_values('tfidf', ascending=False)
)
print('HIGH IMPORTANCE WORDS')
display(train_vect_tfidf.head(10))
print('LOW IMPORTANCE WORDS')
display(train_vect_tfidf.tail(10))

##? Smallest and Largest Coefs
# Logistic-regression weights: positive pushes towards spam (class 1), negative away from it.
lr_weights = lr.coef_[0]
train_coefs = (
    pd.DataFrame({'feature_name': feature_names, 'coef': lr_weights})
    .sort_values('coef', ascending=False)
)
print('WORDS ASSOCIATED WITH SPAM')
display(train_coefs.head(10))
print('WORDS ASSOCIATED WITH NOT SPAM')
display(train_coefs.tail(10))

MNB using TfidfVectorizer, n-grams, test ROC AUC score: 0.9789
HIGH IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
1876,TEXTLENGTH,790.0
1877,NUM_NONWORDCHARS,253.0
1878,NUM_DIGITS,41.0
805,ii,1.0
878,late,1.0
877,lar,1.0
876,laptop,1.0
868,know,1.0
158,amp,1.0
857,keeping,1.0


LOW IMPORTANCE WORDS


Unnamed: 0,feature_name,tfidf
49,1st week no1,0.144664
852,just txt nokia,0.144664
89,36504,0.144664
1567,tell ur mates,0.144664
657,getzed uk pobox,0.138909
1008,mates www getzed,0.138909
1671,uk pobox,0.138909
1672,uk pobox 36504,0.138909
1007,mates www,0.138909
1699,ur mates www,0.138909


WORDS ASSOCIATED WITH SPAM


Unnamed: 0,feature_name,coef
1670,uk,9.913764
1389,sale,9.898754
661,girls,8.653641
393,content,8.46888
1473,sms,7.843521
369,comes,7.797163
1573,text,7.752478
1624,tones,7.674236
442,darling,7.655989
1432,sexy,7.35681


WORDS ASSOCIATED WITH NOT SPAM


Unnamed: 0,feature_name,coef
1360,ring,-4.364748
688,got,-4.404382
796,i_,-4.982019
1751,wan,-5.25017
1619,tomorrow,-5.27563
150,ah,-5.530506
47,1st,-5.80786
1395,say,-6.186969
1208,park,-6.266788
798,i_ wan,-6.612087
