# Bayesian Modelling (Naive Bayes)

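Bayesian modelling is a statistical approach that updates the probability of a hypothesis as more evidence becomes available. Naive Bayes applies this idea to classification by assuming that the features are conditionally independent given the class.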

In [1]:
import numpy as np

import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Toy training corpus: every message is stored next to its label
# (1 = spam, 0 = not spam) so the pairing is visible at a glance.
_labelled_emails = [
    ("Get rich quick!", 1),
    ("Hello, how are you?", 0),
    ("Win a free iPhone now!", 1),
    ("Meeting at 3 PM", 0),
    ("Claim your prize money!", 1),
    ("Project deadline tomorrow", 0),
]

# Split the pairs back into the two parallel lists used by the later cells.
emails = [text for text, _flag in _labelled_emails]
labels = [flag for _text, flag in _labelled_emails]

In [3]:
# Learn the vocabulary from the training emails, then encode each email as a
# row of token counts.
vectorizer = CountVectorizer()
vectorizer.fit(emails)
encoded = vectorizer.transform(emails)

# Train a multinomial Naive Bayes classifier on the token counts.
# The fit call is kept as the last expression so the notebook still displays
# the fitted estimator.
model = MultinomialNB()
model.fit(encoded, labels)



In [4]:
# An unseen message, wrapped in a list just like the training emails.
new_email = ["Congratulations! You've won a lottery!"]

# Encode it with the vocabulary learned during training (transform only, no refit).
new_emails_transformed = vectorizer.transform(new_email)

# Hard class prediction plus the per-class probabilities.
# pred_prob is computed here but not displayed by this cell.
pred = model.predict(new_emails_transformed)
pred_prob = model.predict_proba(new_emails_transformed)

# Map each numeric class to a human-readable label before printing.
readable_labels = ["Spam" if label == 1 else "Not Spam" for label in pred]
print(readable_labels)
pred

['Not Spam']


array([0])

In [5]:
"""

More complex example using a pipeline

"""


# Column names, in the order the DataFrame should expose them.
_FIELDS = ['email_content', 'sender_domain', 'time_sent', 'is_spam']

# One tuple per email, laid out in the same order as _FIELDS.
_RECORDS = [
    ("Get rich quick!", 'unknown.com', '23:45', 1),
    ("Hello, how are you?", 'company.com', '09:30', 0),
    ("Win a free iPhone now!", 'freeprizes.com', '02:15', 1),
    ("Meeting at 3 PM", 'company.com', '14:00', 0),
    ("Claim your prize money!", 'luckywinner.com', '22:30', 1),
    ("Project deadline tomorrow", 'company.com', '11:00', 0),
]

# Pivot the row-oriented records into a column-oriented dict of lists.
data = {
    field: list(values)
    for field, values in zip(_FIELDS, zip(*_RECORDS))
}

df = pd.DataFrame(data)

# Target column first, then everything else as the feature frame.
y = df['is_spam']
X = df.drop(columns='is_spam')

X


Unnamed: 0,email_content,sender_domain,time_sent
0,Get rich quick!,unknown.com,23:45
1,"Hello, how are you?",company.com,09:30
2,Win a free iPhone now!,freeprizes.com,02:15
3,Meeting at 3 PM,company.com,14:00
4,Claim your prize money!,luckywinner.com,22:30
5,Project deadline tomorrow,company.com,11:00


In [6]:
# Per-column preprocessing feeding a multinomial Naive Bayes classifier:
#   - email_content: bag-of-words token counts (CountVectorizer)
#   - sender_domain: one-hot encoding; handle_unknown='ignore' lets the
#                    pipeline score domains that were not seen during fit
#   - time_sent:     token counts as well (CountVectorizer)
#
# The text columns are selected with a plain string so the vectorizer receives
# a 1-D column of documents; the one-hot encoder is given a list so it
# receives a 2-D frame.
#
# NOTE(review): with CountVectorizer's default tokenisation a value such as
# '23:45' appears to be split into the separate tokens '23' and '45', so hours
# and minutes share one vocabulary — confirm this is the intended encoding.
preprocessor = ColumnTransformer(
    # `transformers` is documented as a list of (name, transformer, columns) tuples.
    transformers=[
        ('email_content', CountVectorizer(), 'email_content'),
        ('sender_domain', OneHotEncoder(handle_unknown='ignore'), ['sender_domain']),
        ('time_sent', CountVectorizer(), 'time_sent'),
    ]
)

# Chain preprocessing and the classifier so fit/predict apply both steps together.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', MultinomialNB()),
    ]
)

# Kept as the last expression so the notebook displays the fitted pipeline.
pipeline.fit(X, y)



In [7]:
# A single-row frame carrying the same feature columns the pipeline was fitted on.
new_email = pd.DataFrame([
    {
        'email_content': "Congratulations! You've won a lottery!",
        'sender_domain': 'luckydraw.com',
        'time_sent': '21:30',
    }
])

# One row per input email; index 0 is used below for non-spam and index 1 for spam.
probabilities = pipeline.predict_proba(new_email)

print(f"Probability of being spam: {probabilities[0][1]:.2f}")
print(f"Probability of being non-spam: {probabilities[0][0]:.2f}")

Probability of being spam: 0.33
Probability of being non-spam: 0.67


In [8]:
# Hard class decision for the single email scored above.
prediction = pipeline.predict(new_email)

# Translate the numeric class (1 = spam) into a readable label before printing.
predicted_label = 'Spam' if prediction[0] == 1 else 'Non-spam'
print(f"Predicted class: {predicted_label}")

Predicted class: Non-spam
