In [1]:
import pandas as pd
import numpy as np

In [4]:
import os

# Peek inside the dataset folder to see which files it contains.
dataset_dir = "../Spam Classifier /smsspamcollection"
os.listdir(dataset_dir)

['readme', 'SMSSpamCollection']

In [7]:
# Dataset is taken from UCI Machine Learning Repository.
# The file has no header row, so the two tab-separated columns come back
# numbered 0 and 1 and are renamed in a later cell.
# NOTE(review): the parser's default quote handling can merge rows when a
# message contains an unbalanced double quote -- confirm the row count against
# the dataset's readme (quoting=csv.QUOTE_NONE disables that behaviour).
data = pd.read_csv(
    "../Spam Classifier /smsspamcollection/SMSSpamCollection",
    sep="\t",
    header=None,
)

data.head(n=5)

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Replace the positional column names (0, 1) with descriptive ones:
# the class label and the raw SMS text.
column_names = ["label", "message"]
data.columns = column_names
data.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Summary of both text columns: row count, distinct values, most frequent value.
data_summary = data.describe()
data_summary

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [14]:
# Class balance: number of messages per label.
data["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [15]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_dict = {
    "ham" : 0,
    "spam" : 1
}

# Values that are not keys of label_dict (for example labels already encoded
# by an earlier run of this cell) pass through unchanged. A bare
# `.map(label_dict)` would turn them into NaN, so re-running the cell would
# silently wipe out the whole column.
data["label"] = data["label"].map(lambda value: label_dict.get(value, value))
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


<hr>

## Get bag of words 

<hr>

#### Order does not matter in bag of words.
#### By default, `CountVectorizer` removes punctuation marks (treating them as delimiters) and converts all words to lower case.

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a vectorizer with every option left at its default; ending the cell
# with the bare name displays the resulting configuration.
count_vector = CountVectorizer()
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

<hr>

## Sample data

<hr>

In [24]:
# Three toy sentences used to show how the bag-of-words vocabulary is built.
documents = [
    "Are you bored?",
    "Do ypu want to earn BILLIONS?",
    "How are you you?"
]

# Learn the vocabulary (one entry per distinct token) from the toy sentences.
count_vector.fit(documents)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`. Prefer the new accessor when it
# exists and fall back to the old one so the cell runs on either version.
# `.tolist()` keeps the displayed value a plain list of Python strings.
if hasattr(count_vector, "get_feature_names_out"):
    feature_names = count_vector.get_feature_names_out().tolist()
else:
    feature_names = count_vector.get_feature_names()

feature_names

['are', 'billions', 'bored', 'do', 'earn', 'how', 'to', 'want', 'you', 'ypu']

In [25]:
# Count the vocabulary terms in each sentence, then densify the sparse result
# so it can be displayed as a regular array (one row per sentence).
doc_matrix = count_vector.transform(documents)
doc_array = doc_matrix.toarray()
doc_array

array([[1, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 2, 0]])

<hr>

## Make a dataframe out of document array.

<hr>

In [27]:
# Column labels are the learned vocabulary terms. `get_feature_names()` was
# removed in scikit-learn 1.2, so use `get_feature_names_out()` when it is
# available and fall back to the old accessor on earlier versions.
if hasattr(count_vector, "get_feature_names_out"):
    vocabulary_terms = count_vector.get_feature_names_out()
else:
    vocabulary_terms = count_vector.get_feature_names()

# One row per sentence, one column per vocabulary term.
doc_df = pd.DataFrame(doc_array, columns=vocabulary_terms)

doc_df.head()

Unnamed: 0,are,billions,bored,do,earn,how,to,want,you,ypu
0,1,0,1,0,0,0,0,0,1,0
1,0,1,0,1,1,0,1,1,0,1
2,1,0,0,0,0,1,0,0,2,0


<hr>

## Splitting data

<hr>

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["message"],
    data["label"],
    test_size=0.2,
    random_state=0,
)

In [33]:
# Fresh vectorizer for the real data, this time dropping English stop words.
count_vector = CountVectorizer(stop_words="english")

# The vocabulary is learned from the training messages only; the test messages
# are then encoded against that same vocabulary.
train_data, test_data = (
    count_vector.fit_transform(X_train),
    count_vector.transform(X_test),
)

<hr>

## Building model

<hr>

##### We use the multinomial Naive Bayes classifier because our features are bag-of-words counts.
##### Multinomial Naive Bayes is suitable for discrete features.
##### Gaussian (normal distribution) Naive Bayes is suitable for continuous data.

In [35]:
from sklearn.naive_bayes import MultinomialNB

# `fit` returns the estimator itself, so construction and training can be
# chained; the bare name on the last line displays the fitted model.
naive_bayes = MultinomialNB().fit(train_data, y_train)
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
# Predict the encoded label (0 = ham, 1 = spam) for every held-out message.
y_pred = naive_bayes.predict(test_data)
y_pred

array([0, 1, 0, ..., 0, 1, 0])

In [41]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, precision_recall_fscore_support, classification_report

# Each entry pairs the printed heading with the metric function to evaluate;
# every metric is called as metric(y_test, y_pred).
evaluation_steps = [
    ("Accuracy Score : ", accuracy_score),
    ("Precision Score : ", precision_score),
    ("Precision, Recall, Fscore, Support : \n", precision_recall_fscore_support),
    ("Classification Report : \n", classification_report),
    ("Confusion Matrix : \n", confusion_matrix),
]

for heading, metric in evaluation_steps:
    print(heading, metric(y_test, y_pred))

Accuracy Score :  0.9865470852017937
Precision Score :  0.9617834394904459
Precision, Recall, Fscore, Support : 
 (array([0.99060543, 0.96178344]), array([0.99371728, 0.94375   ]), array([0.99215891, 0.95268139]), array([955, 160]))
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.96      0.94      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix : 
 [[949   6]
 [  9 151]]
