<a href="https://colab.research.google.com/github/gbothra38/MachineLearning/blob/main/Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Naive Bayes

* Bayes Theorem:

  P(A|B)=P(B|A)*P(A) / P(B)

* In our case:

 P(y|X)=P(X|y)*P(y) / P(X),  
  X=(x1,x2,.....,xn) are the features
 
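 Assuming that all features are conditionally independent given the class y:

 P(y|X)=P(x1|y)*P(x2|y)*....*P(xn|y)*P(y) / P(X)

 P(X) is the same for every class, so the predicted class is the y that maximizes P(x1|y)*P(x2|y)*....*P(xn|y)*P(y).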


In [None]:
import pandas as pd
import numpy as np

In [None]:
%ls drive/

[0m[01;34mMyDrive[0m/


In [None]:
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
# Read the spam/ham e-mail corpus (a CSV in the current working directory) into a DataFrame.
dataset=pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')

In [None]:
class NaiveBayes():
  """Gaussian Naive Bayes classifier.

  Every feature is modelled, per class, as an independent normal distribution
  whose mean and variance are estimated from the training data. A sample is
  assigned to the class with the largest log-posterior
  ``log P(y) + sum_i log P(x_i | y)``.

  Attributes set by ``fit``:
    classes   -- sorted unique labels, shape (no_of_classes,)
    mean      -- per-class feature means, shape (no_of_classes, no_of_features)
    variance  -- per-class (smoothed) feature variances, same shape as ``mean``
    prior     -- per-class relative frequency, shape (no_of_classes,)
  """

  def fit(self,input,output,var_smoothing=1e-9):
    """Estimate per-class priors, feature means and feature variances.

    Args:
      input: 2-D array-like of shape (no_of_examples, no_of_features).
      output: 1-D array-like of labels, one per row of ``input``. Labels may
        be any values ``np.unique`` can sort (they need not be 0..k-1).
      var_smoothing: fraction of the largest feature variance that is added
        to every variance so that constant features (e.g. words that never
        occur in one class) do not produce a zero variance.
    """
    input=np.asarray(input,dtype=float)
    output=np.asarray(output)
    self.no_of_examples,self.no_of_features=input.shape
    self.classes=np.unique(output)
    self.no_of_classes=len(self.classes)

    self.mean=np.zeros((self.no_of_classes,self.no_of_features))
    self.variance=np.zeros((self.no_of_classes,self.no_of_features))
    self.prior=np.zeros(self.no_of_classes)

    # Variance floor; falls back to var_smoothing itself when every feature is constant.
    max_variance=input.var(axis=0).max() if input.size else 0.0
    epsilon=var_smoothing*(max_variance if max_variance>0 else 1.0)

    # Rows of mean/variance/prior are addressed by the class *position* in
    # self.classes, not by the label value, so arbitrary labels work.
    for index,c in enumerate(self.classes):
      input_class=input[output==c]
      self.mean[index,:]=input_class.mean(axis=0)
      self.variance[index,:]=input_class.var(axis=0)+epsilon
      self.prior[index]=input_class.shape[0]/float(self.no_of_examples)


  def predicts(self,input_test):
    """Return a list with the predicted label for every row of ``input_test``."""
    return [self.predict(row) for row in np.asarray(input_test,dtype=float)]

  def safe_log(self,input_):
    """Element-wise natural log that never returns -inf or NaN.

    Values that are not strictly greater than the smallest positive normal
    float (zeros, negatives, NaN) are clamped to that float before the log is
    taken, so the result stays finite and ordered like the input.
    """
    floor=np.finfo(float).tiny
    values=np.asarray(input_,dtype=float)
    # NaN compares False, so it is clamped as well.
    values=np.where(values>floor,values,floor)
    return np.log(values)

  def predict(self,input_instance):
    """Return the label with the highest log-posterior for one sample."""
    input_instance=np.asarray(input_instance,dtype=float)
    posteriors=[]

    for index in range(self.no_of_classes):
      prior=np.log(self.prior[index])
      # Summing log-densities avoids the underflow of multiplying many small densities.
      class_conditional=np.sum(self._log_pdf(index,input_instance))
      posteriors.append(prior+class_conditional)

    return self.classes[np.argmax(posteriors)]

  def _log_pdf(self,index,input_instance):
    """Log of the normal density of each feature under class ``index``."""
    mean=self.mean[index]
    # Guard against a zero variance (possible when var_smoothing=0 was used).
    variance=np.maximum(self.variance[index],np.finfo(float).tiny)
    return -0.5*np.log(2*np.pi*variance)-(input_instance-mean)**2/(2*variance)


  def pdf(self,index,input_instance):
    """Normal density of each feature of ``input_instance`` under class ``index``.

    Features whose stored variance is zero get a density of 0.
    """
    mean=self.mean[index]
    variance=self.variance[index]
    # Silence divide/invalid warnings locally instead of changing NumPy's global state.
    with np.errstate(divide='ignore',invalid='ignore'):
      numerator=np.exp(-(input_instance-mean)**2/(2*variance))
      denominator=np.sqrt(2*np.pi*variance)
      # `out` gives masked-out positions a defined value (0) instead of
      # whatever happened to be in uninitialised memory.
      return np.divide(numerator,denominator,out=np.zeros_like(numerator),where=denominator!=0)

In [None]:
import nltk
from nltk.corpus import words
from sklearn.model_selection import train_test_split

In [None]:
# Download the NLTK `words` corpus; it is used further down as the English dictionary.
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
# Materialise the English word list once as a frozenset so that `x in words`
# is a hash lookup rather than a linear scan of a list.
# The corpus reader is reached through the `nltk` package rather than through
# the name `words`, which this line rebinds; that keeps the cell re-runnable.
words=frozenset(nltk.corpus.words.words())

Functions to perform preprocessing tasks

In [None]:
def build_vocabulary(mail,vocab,dictionary=None):
  """Add every new dictionary word found in ``mail`` to ``vocab`` (in place).

  Tokens are obtained by splitting ``mail`` on whitespace and are compared
  case-insensitively: the lower-cased token is both the key that is looked up
  and the key that is stored, so each word gets exactly one index.

  Args:
    mail: text of one e-mail.
    vocab: dict mapping word -> column index; new words are appended with
      index ``len(vocab)``.
    dictionary: container of accepted lower-case words. Defaults to the
      module-level ``words`` collection.
  """
  if dictionary is None:
    dictionary=words
  for word in mail.split():
    key=word.lower()
    if key not in vocab and key in dictionary:
      vocab[key]=len(vocab)

In [None]:
dataset.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

The first two columns (`Unnamed: 0` and `label`) are not used in this notebook, so they are dropped.

In [None]:
# Drop the columns the rest of the notebook never reads. Rebinding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to run more than once.
dataset=dataset.drop(columns=['Unnamed: 0','label'],errors='ignore')

In [None]:
dataset.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


Remove rows containing NA or None values

In [None]:
# Discard rows with missing values, then renumber the index from 0 so that
# label-based and positional lookups agree.
dataset=dataset.dropna().reset_index(drop=True)

In [None]:
dataset.shape

(5171, 2)

In [None]:
# Separate the raw e-mail text (model input) from the numeric label (target).
input_dataset,output_dataset=dataset['text'],dataset['label_num']
input_dataset.shape

(5171,)

In [None]:
# Accumulate the word -> column-index mapping over every e-mail in the corpus.
vocab=dict()
for mail in input_dataset:
  build_vocabulary(mail,vocab)

In [None]:
len(vocab)

18007

In [None]:
# Dense count matrix: one row per e-mail, one column per vocabulary entry.
input_dataset_numeric=np.zeros(shape=(len(input_dataset),len(vocab)),dtype=float)

In [None]:
# Start from zero so that re-running this cell does not double the counts.
input_dataset_numeric.fill(0)
for i,mail in enumerate(input_dataset):
  for word in mail.split():
    # Look the token up case-insensitively; fall back to the exact spelling in
    # case the vocabulary was keyed by the original token.
    column=vocab.get(word.lower(),vocab.get(word))
    # Tokens outside the vocabulary (punctuation, numbers, unknown words) are skipped.
    if column is not None:
      input_dataset_numeric[i,column]+=1

In [None]:
# Hold out a test split (default test size); the fixed seed keeps the split reproducible.
input_train,input_test,output_train,output_test=train_test_split(
    input_dataset_numeric,
    output_dataset,
    random_state=10,
)

In [None]:
# Fit the Gaussian Naive Bayes model defined above on the training split.
naive=NaiveBayes()
naive.fit(input_train,output_train)

In [None]:
# Classify every held-out e-mail; `predicts` returns a plain list of labels.
predicted=naive.predicts(input_test)

In [None]:
predicted.count(1)

16

In [None]:
# Accuracy: fraction of held-out e-mails whose predicted label equals the true label.
matches=predicted==output_test
score=np.sum(matches)/len(output_test)
score

0.6983758700696056