# SPAM or HAM

In [1]:
import os
import math
import nltk
import tqdm
import joblib
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 15]
plt.rcParams['figure.dpi'] = 72
from collections import Counter
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer

import gensim
from gensim.models.fasttext import FastText


from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/mantunes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mantunes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mantunes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mantunes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import warnings
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

In [3]:
def div_norm(x):
   norm_value = np.linalg.norm(x)
   if norm_value > 0:
       return x * ( 1.0 / norm_value)
   else:
       return x


def word_vector_to_sentence_vector(sentence:list, model):
    vectors = []
    # for all the tokens in the setence
    for token in sentence:
        if token in model:
            vectors.append(model[token])
    # add the EOS token
    if '\n' in model:
        vectors.append(model['\n'])
    # normalize all the vectors
    vectors = [div_norm(x) for x in vectors]
    return np.mean(vectors, axis=0)

In [4]:
df = pl.read_csv('../datasets/spam.csv')
df

Target,SMS
str,str
"""ham""","""Go until juron…"
"""ham""","""Ok lar... Joki…"
"""spam""","""Free entry in …"
"""ham""","""U dun say so e…"
"""ham""","""Nah I don't th…"
"""spam""","""FreeMsg Hey th…"
"""ham""","""Even my brothe…"
"""ham""","""As per your re…"
"""spam""","""WINNER!! As a …"
"""spam""","""Had your mobil…"


In [5]:
dataset = df.rows()
dataset = [(text, label) for (label, text) in dataset]
targets = [label for _, label in dataset]

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

corpus = [text for text, _ in dataset]
tokens = [nltk.word_tokenize(sample) for sample in corpus]
tokens_clean = [[lemmatizer.lemmatize(w).lower() for w in t if len(lemmatizer.lemmatize(w)) > 2 and w.isalpha() and w not in stop_words] for t in tokens]
tokens_clean

[['jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'cup',
  'final',
  'tkts',
  'may',
  'text',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even',
  'brother',
  'like',
  'speak',
  'they',
  'treat',
  'like',
  'aid',
  'patent'],
 ['per',
  'request',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'callers',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
  'prize',
  'reward',
  'claim',
  'call',
  'claim',


In [7]:
text_model = FastText(vector_size=256, window=7, min_count=3, workers=os.cpu_count(), seed=42)
text_model.build_vocab(tokens_clean)

In [8]:
X = np.array([word_vector_to_sentence_vector(sentence, text_model.wv) for sentence in tqdm.tqdm(tokens_clean)])

100%|██████████| 5521/5521 [00:00<00:00, 8916.23it/s]


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, targets, stratify=targets, test_size=0.2, random_state=42)

In [10]:
print(f'Training Data : {len(X_train)}')
print(f'Testing Data  : {len(X_test)}')

# define the list of classifiers
clfs = [
    ('LR', LogisticRegression(random_state=42, multi_class='auto', max_iter=600)),
    ('KNN', KNeighborsClassifier(n_neighbors=1)),
    ('NB', GaussianNB()),
    ('RFC', RandomForestClassifier(random_state=42)),
    ('MLP', MLPClassifier(random_state=42, learning_rate='adaptive', max_iter=1000))
]

# whenever possible used joblib to speed-up the training
with joblib.parallel_backend('loky', n_jobs=-1):
    for label, clf in clfs:
        # train the model
        clf.fit(X_train, y_train)

        # generate predictions
        predictions = clf.predict(X_test)

        # compute the performance metrics
        mcc = matthews_corrcoef(y_test, predictions)
        acc = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average='weighted')
        print(f'{label:3} {acc:.2f} {f1:.2f} {mcc:.2f}')

Training Data : 4416
Testing Data  : 1105


LR  0.91 0.88 0.52
KNN 0.95 0.95 0.80
NB  0.76 0.80 0.46
RFC 0.93 0.92 0.66
MLP 0.96 0.96 0.83
