In [1]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

### import data from spam.csv
#### data is email messages from kaggle dataset, just email content without email metadata

In [2]:
data = pd.read_csv("spam.csv")
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data['length'] = data['message'].apply(len)
data

Unnamed: 0,labels,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,161
5568,ham,Will Ã_ b going to esplanade fr home?,37
5569,ham,"Pity, * was in mood for that. So...any other s...",57
5570,ham,The guy did some bitching but I acted like i'd...,125


### text processing function: removes punctuation and removes stopwords from messages
#### stopwords include: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [3]:
def text_process(text):
    nopunc = [char for char in text if text not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

### sectioning dataset into training and testing (80-20 split)

In [4]:
message_train,message_test,label_train,label_test = train_test_split(data['message'],data['labels'],test_size=0.2)

### creating a pipeline to train data:
#### bow count vectorizer: runs text_process and convert messages into large matrix counting occurrences of each word
#### tfidf transformer calculates the frequency of the term in the document
#### multinomial naive bayes classifier can classify the messages which are now represented as vectors

In [5]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [6]:
pipeline.fit(message_train, label_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x1a21aa1710>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [7]:
predictions = pipeline.predict(message_test)
print(classification_report(predictions, label_test))

              precision    recall  f1-score   support

         ham       1.00      0.95      0.98      1023
        spam       0.66      1.00      0.80        92

    accuracy                           0.96      1115
   macro avg       0.83      0.98      0.89      1115
weighted avg       0.97      0.96      0.96      1115

