Importing Dependencies

In [0]:
import numpy as np
import pandas as pd

In [0]:
from gensim.parsing.preprocessing import strip_punctuation, remove_stopwords

In [0]:
import spacy

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [0]:
from sklearn.naive_bayes import MultinomialNB

Loading CSV & Spacy Model

In [0]:
nlp = spacy.load("en_core_web_sm")

In [0]:
# Loading as latin characters as typical utf-8 wouldn't load
df = pd.read_csv('spam.csv', encoding='latin-1')

Data Cleaning

In [0]:
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [0]:
df = df.rename(columns={'v1':'y', 'v2':'raw_text'})

In [0]:
# Spam is 1, Real is 0
df['y'] = df['y'].astype('str').str.replace('ham', '0').replace('spam','1').astype('int')

In [0]:
df['y'] = df['y'].astype('bool')

In [0]:
# counting number of words that are fully capitalized
df['num_cap_wrds'] = [sum(map(str.isupper, i.split())) for i in df['raw_text']]

Creating a dataframe column for any SMS with more than three capitalized words 

(I played around with the number of fully capitalized words, and +3 words had the highest spam:ham ratio)

In [0]:
df[df['num_cap_wrds'] > 3].groupby('y').count()

Unnamed: 0_level_0,raw_text,num_cap_wrds
y,Unnamed: 1_level_1,Unnamed: 2_level_1
False,179,179
True,202,202


In [0]:
df['cap_plus_three'] = np.where(df['num_cap_wrds'] > 3, 1, 0)

In [0]:
# Dropping the count of the number of capitalized words, as cap_plus_three is a stronger predictor
df = df.drop(columns='num_cap_wrds')

In [0]:
## Text Processing/Feature Engineering ##

In [0]:
# Lowercase everything, remove gensim's list of stopwords, and strip punctuation (I am removing stopwords now to improve PoS Processing)
df['raw_text'] = df['raw_text'].apply(str.lower)
df['raw_text'] = df['raw_text'].apply(remove_stopwords)
df['raw_text'] = df['raw_text'].apply(strip_punctuation)

In [0]:
# replace non unicode characters
df['raw_text'] = df['raw_text'].replace({r'[^\x00-\x7F]+':''}, regex=True)

In [0]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
# lemmatizing (turning words into their base)
def preprocess(text):
    doc = nlp(text, disable=['ner', 'parser'])
    lemmas = [token.lemma_ for token in doc]
    a_lemmas = [lemma for lemma in lemmas 
            if lemma.isalpha() or lemma.isnumeric() and lemma not in stopwords]
    
    return ' '.join(a_lemmas)

In [0]:
df['raw_text'] = df['raw_text'].apply(preprocess)

In [0]:
# Parts of Speech (PoS) Tagging

In [0]:
# Takes text, returns number of proper nouns, nouns, pronounces, (pre)determiners, adverbs, verbs, and numbers in text
def pos_count(text):
    doc = nlp(text)
    pos = [token.pos_ for token in doc]
    return [pos.count('PROPN'), pos.count('NOUN'), pos.count('PRON'),  pos.count('DET'), pos.count('ADV'), pos.count('VERB'), pos.count('NUM')]

In [0]:
# apply above function to dataframe
df['PoS_Vals'] = df['raw_text'].apply(pos_count)


In [0]:
# Assign values to dataframe
df[['PROPN', 'NOUN', 'PRON', 'DET', 'ADV', 'VERB', 'NUM']] = pd.DataFrame(df.PoS_Vals.tolist(), index= df.index)

In [0]:
# dropping original column-lists for PoS results as they are no longer necessary
df = df.drop(columns='PoS_Vals')

Term Frequency - Inverse Document Freqeuncy Processing

In [0]:
# I remove stopwords using two separate libraries to filter out a large pool of unnessesary words. This removes meaningless values from the dataset
vectorizer = TfidfVectorizer(use_idf=True, strip_accents='ascii', stop_words='english')

In [0]:
tfidf_text = vectorizer.fit_transform(df['raw_text'])

In [0]:
tfidf_df = pd.DataFrame(tfidf_text.toarray())

In [0]:
df = pd.concat([df, tfidf_df], axis=1)

In [0]:
y = df[['y']]
X = df.drop(columns=['raw_text', 'y'])

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [0]:
# Classifier

In [0]:
model = MultinomialNB()

In [0]:
model.fit(X_train, np.ravel(y_train))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
y_pred = model.predict(X_test)

In [0]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

       False      0.955     0.997     0.976       965
        True      0.972     0.700     0.814       150

    accuracy                          0.957      1115
   macro avg      0.964     0.848     0.895      1115
weighted avg      0.958     0.957     0.954      1115



95% accuracy


In [0]:
results = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[962   3]
 [ 45 105]]


In [0]:
print(f"True Positive: {results[0][0]} \n False Positive: {results[0][1]} \
      \n False Negative: {results[1][0]} \n True Negative: {results[1][1]}")

True Positive: 962 
 False Positive: 3       
 False Negative: 45 
 True Negative: 105
