In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [85]:
sms_data = pd.read_csv('./SMSSpamCollection.zip', sep='	', names=['target', 'text'], header=None)

sms_data.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [86]:
def text_cleanup(data):
    data['text'] = data['text'].str.replace('\W+', ' ', regex=True)
    data['text'] = data['text'].str.replace('\s+', ' ', regex=True)
    data['text'] = data['text'].str.strip()
    data['text'] = data['text'].str.lower()

text_cleanup(sms_data)

In [87]:
vocabulary = set(word for sms in sms_data['text'].values for word in sms.split())


In [88]:
len(vocabulary)

8753

In [89]:
sms_data.loc[:, 'target'] = sms_data['target'] == 'spam'
sms_data.loc[:, 'target'] = np.int8(sms_data['target'])

In [90]:
X = sms_data['text']
y = sms_data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape)
print(X_test.shape)

(4457,)
(1115,)


In [91]:
word_stats = sms_data.copy()
word_stats.rename(columns={'text': 'orig_text'}, inplace=True)

word_stats

for word in vocabulary:
    word_stats.loc[:, word] = 0

  self.obj[key] = value


In [92]:
for (idx, row) in zip(word_stats.index, word_stats.values):
    orig = row[1]
    for word in orig.split():
        word_stats.loc[idx, word] += 1

In [93]:
words_list = list(vocabulary)
m_spam = word_stats['target'] == 1
m_nospam = word_stats['target'] == 0
word_is_spam = word_stats[m_spam][words_list].sum(axis=0)
word_is_nospam = word_stats[m_nospam][words_list].sum(axis=0)

In [94]:
word_usage = word_stats[words_list].sum(axis=0)

In [95]:
spam_count = m_spam.sum()
ham_count = m_nospam.sum()

words_probabilities = pd.DataFrame(
    data=[
        [word_is_spam[word] / spam_count for word in words_list],
        [word_is_nospam[word] / ham_count for word in words_list],
    ],
    columns=words_list,
    index=['word_is_spam_p', 'word_is_ham_p']
)

In [96]:
words_probabilities

Unnamed: 0,conditions,ended,jan,quality,someplace,25p,idiot,lazy,fireplace,82050,...,peripherals,costume,tamilnadu,senrd,icon,out,buz,mutai,winds,massive
word_is_spam_p,0.005355,0.0,0.0,0.002677,0.0,0.01071,0.001339,0.0,0.0,0.002677,...,0.0,0.0,0.0,0.0,0.0,0.080321,0.0,0.0,0.0,0.0
word_is_ham_p,0.000207,0.001244,0.000622,0.001036,0.000207,0.0,0.000622,0.001865,0.000207,0.0,...,0.000207,0.000207,0.000207,0.000207,0.000207,0.044352,0.000207,0.000207,0.000207,0.000415


In [97]:
sms_is_spam_p = spam_count / (spam_count + ham_count)
sms_is_ham_p = ham_count / (spam_count + ham_count)

In [98]:
def multiply_list(num_list):
    res = 1
    for n in num_list:
        res *= n

    return res

def detect_spam(sms, show=True):
    sms_df = pd.DataFrame({
        'text': [sms],
    })

    text_cleanup(sms_df)

    sms_cleaned = sms_df['text'].values[0]

    sms_words = sms_cleaned.split()

    is_spam_p = sms_is_spam_p * multiply_list([words_probabilities.loc['word_is_spam_p', word] for word in sms_words])
    is_ham_p = sms_is_ham_p * multiply_list([words_probabilities.loc['word_is_ham_p', word] for word in sms_words])

    is_spam_res = is_spam_p >= is_ham_p

    if show:
        print(f'is_ham_p: {is_ham_p}')
        print(f'is_spam_p: {is_spam_p}')
        print('SPAM' if is_spam_res else 'ham')
    else:
        return 1 if is_spam_res else 0



In [99]:
def predict(X):
    return X.apply(lambda sms: detect_spam(sms, False))

y_test_pred = predict(X_test)

In [100]:
from sklearn import metrics

# print(f'Accuracy: {metrics.accuracy_score(y_test, y_test_pred)}')
# print(f'Precision: {metrics.precision_score(y_test, y_test_pred)}')
# print(f'Recall: {metrics.recall_score(y_test, y_test_pred)}')
# print(f'F₁ score: {metrics.f1_score(y_test, y_test_pred)}')
print(metrics.classification_report(y_test, y_test_pred))

# confusion_matrix = metrics.confusion_matrix(y_test, y_test_pred)
# sns.heatmap(confusion_matrix, annot=True, fmt='')

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       966
           1       0.97      1.00      0.98       149

    accuracy                           1.00      1115
   macro avg       0.98      1.00      0.99      1115
weighted avg       1.00      1.00      1.00      1115

