# Naive Bayes Classifiers

In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

## Naive Bayes
### Using Naive Bayes to predict spam

In [3]:
# The file contains characters that are not valid UTF-8, so decode it as Latin-1.
data = pd.read_csv("spam.csv", encoding="latin-1")
print(data.shape)
data.head(n=5)

(5572, 2)


Unnamed: 0,type,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Raw message text is the feature; the ham/spam label is the target.
X, y = data["email"], data["type"]

## Vectorization: Transforming Text to Vectors

In [5]:
# Learn the vocabulary and build the TF-IDF document-term matrix.
# The raw text is read from `data.email` rather than from `X`, so this cell can
# be re-run safely: `X` is overwritten with the matrix below and would no
# longer hold text on a second run.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.email)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method on
# older installations, and keep `feature_names` a plain list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [6]:
# Vocabulary size: number of distinct tokens the vectorizer extracted.
len(feature_names)

8672

In [7]:
# Spot-check ten consecutive vocabulary entries (indices 8000-8009).
feature_names[8000:8010]

['unfortuntly',
 'unhappiness',
 'unhappy',
 'uni',
 'unicef',
 'uniform',
 'unintentional',
 'unintentionally',
 'unique',
 'united']

In [13]:
# Convert the sparse TF-IDF matrix to a dense NumPy array.
# The guard makes the cell idempotent: once X is an ndarray it has no
# `.toarray`, so an unguarded second run would raise AttributeError.
# NOTE(review): a dense (5572, 8672) array is roughly 0.4 GB if the dtype is
# float64 -- confirm densifying is actually needed downstream.
if hasattr(X, "toarray"):
    X = X.toarray()

In [14]:
# (n_messages, vocabulary_size) of the feature matrix.
X.shape

(5572, 8672)

In [None]:
# Length of the label vector; should match the number of rows in X.
y.shape

In [10]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [11]:
# Fit a Bernoulli Naive Bayes classifier on the training split, then predict
# labels for the held-out messages.
# GaussianNB, MultinomialNB and BernoulliNB are already imported in the
# notebook's import cell, so the duplicate mid-notebook import is dropped.
model = BernoulliNB(alpha=0.75)  # alpha: additive smoothing parameter
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [12]:
# Overall accuracy, per-class precision/recall/F1, and finally a table of true
# labels (rows) against predicted labels (columns).
test_accuracy = accuracy_score(y_test, y_predict)
print(test_accuracy)

report_text = classification_report(y_test, y_predict)
print(report_text)

pd.crosstab(y_test, y_predict)

0.9856459330143541
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1446
        spam       1.00      0.89      0.94       226

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672



col_0,ham,spam
type,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1446,0
spam,24,202


## Checking new email for spam

In [13]:
# Message to classify. The commented-out lines are alternative inputs that can
# be swapped in to try a different message.
#NewEmail = pd.Series(["Hi team, We have meeting tomorrow"])
#NewEmail = pd.Series(['**FREE MESSAGE**Thanks for using the Auction Subscription Service. 18 . 150p/MSGRCVD 2 Skip an Auction txt OUT. 2 Unsubscribe txt STOP CustomerCare 08718726270'])
sample_text = 'hi get free entry win price? '
NewEmail = pd.Series([sample_text])
NewEmail


0    hi get free entry win price? 
dtype: object

In [14]:
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the new
# message is encoded with the same vocabulary columns as the training matrix.
NewEmail_transformed = vectorizer.transform(NewEmail)

In [15]:
# One row for the single message; the column count should equal the vocabulary size.
NewEmail_transformed.shape

(1, 8672)

In [16]:
# Classify the new message with the trained model; the result is a one-element
# array holding the predicted label ('ham' or 'spam').
model.predict(NewEmail_transformed)

array(['ham'], dtype='<U4')

In [None]:
# Preview only the first few rows of the test matrix instead of displaying the
# whole array.
X_test[:5]

In [43]:
import scipy as sp  # alias kept so any cell that still references `sp` keeps working

# Single entropy term -p * log2(p) for p = 0.01.
# np.log2 is used instead of sp.log2: the NumPy functions re-exported from the
# top-level scipy namespace are deprecated aliases.
-1*0.01*np.log2(0.01)

0.06643856189774724

In [44]:
# np.log2 replaces the deprecated top-level SciPy alias sp.log2.
# NOTE(review): the weight is 0.4 but the log argument is 0.6. For a single
# entropy term -p * log2(p) both would be the same p -- confirm which value is
# intended. The expression is left numerically unchanged here.
-1*0.4*np.log2(0.6)

0.2947862376664825

In [None]:
def entropy(probabilities):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Computes H = -sum(p * log2(p)) over the given probabilities. The original
    cell held the shorthand ``-pElog2(p)``, which is not runnable (``pElog2``
    is undefined); it is presumably a note for this formula -- confirm.

    Parameters
    ----------
    probabilities : array-like of float
        Probabilities of each outcome. Zero entries contribute nothing, using
        the convention 0 * log2(0) = 0.

    Returns
    -------
    float
        Entropy in bits.
    """
    p = np.asarray(probabilities, dtype=float)
    nonzero = p[p > 0]  # skip zeros so log2(0) is never evaluated
    return float(-(nonzero * np.log2(nonzero)).sum())


# Example: entropy of a two-outcome distribution.
entropy([0.4, 0.6])