# Naive Bayes

### 1. Import des libraries utiles

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from collections import Counter

### 2. Traitement du dataset (idem SVM)

In [10]:
sms = pd.read_csv('SMS_spam.csv', encoding='latin-1')

sms.drop(sms.columns[range(2,5)],axis=1,inplace=True)

vec = CountVectorizer(stop_words = 'english')
X = vec.fit_transform(sms.v2).toarray()
is_spam = sms.v1.map({'spam':1,'ham':0})

y = sms.v1
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=1,stratify=y)

 ### 3. Modélisation et Performances

In [36]:
#1 paramètres grid pour GaussianNB
param_grid = {'var_smoothing': np.logspace(0,-9, num=4)}
param_grid
#2 fit
gscv = GridSearchCV(
    GaussianNB(),
    param_grid,
    cv = 3)
gscv.fit(X_train,Y_train)
#3 best_score et params
print(gscv.best_score_,gscv.best_params_)
#4 pred
best_model = gscv.best_estimator_
pred_train = best_model.predict(X_train)
pred_test = best_model.predict(X_test)
print(pred_train,pred_test)
#5 report
print(classification_report(Y_train,pred_train),'\n',classification_report(Y_test,pred_test))

0.9 {'var_smoothing': 0.001}
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham'] ['spam' 'ham' 'ham' ... 'spam' 'ham' 'spam']
              precision    recall  f1-score   support

         ham       1.00      0.94      0.97      3377
        spam       0.71      1.00      0.83       523

    accuracy                           0.94      3900
   macro avg       0.85      0.97      0.90      3900
weighted avg       0.96      0.94      0.95      3900
 
               precision    recall  f1-score   support

         ham       0.99      0.89      0.94      1448
        spam       0.57      0.95      0.71       224

    accuracy                           0.90      1672
   macro avg       0.78      0.92      0.82      1672
weighted avg       0.93      0.90      0.91      1672



In [30]:
#1 paramètres grid pour MultinomialNB
param_grid = {'alpha': np.linspace(0.25,1, num=4)}
param_grid
#2 fit
gscv = GridSearchCV(
    MultinomialNB(),
    param_grid,
    cv = 3)
gscv.fit(X_train,Y_train)
#3 best_score et params
print(gscv.best_score_,gscv.best_params_)
#4 pred
best_model = gscv.best_estimator_
pred_train = best_model.predict(X_train)
pred_test = best_model.predict(X_test)
print(pred_train,pred_test)
#5 report
print(classification_report(Y_train,pred_train),'\n',classification_report(Y_test,pred_test))

0.9797435897435897 {'alpha': 1}
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham'] ['spam' 'ham' 'ham' ... 'ham' 'ham' 'spam']
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3377
        spam       0.98      0.98      0.98       523

    accuracy                           1.00      3900
   macro avg       0.99      0.99      0.99      3900
weighted avg       1.00      1.00      1.00      3900
 
               precision    recall  f1-score   support

         ham       0.99      0.98      0.99      1448
        spam       0.90      0.96      0.93       224

    accuracy                           0.98      1672
   macro avg       0.95      0.97      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [40]:
#1 paramètres grid pour ComplementNB
param_grid = {'alpha': np.linspace(0.25,1, num=4)}
param_grid
#2 fit
gscv = GridSearchCV(
    ComplementNB(),
    param_grid,
    cv = 3)
gscv.fit(X_train,Y_train)
#3 best_score et params
print(gscv.best_score_,gscv.best_params_)
#4 pred
best_model = gscv.best_estimator_
pred_train = best_model.predict(X_train)
pred_test = best_model.predict(X_test)
print(pred_train,pred_test)
#5 report
print(classification_report(Y_train,pred_train),'\n',classification_report(Y_test,pred_test))

0.9382051282051282 {'alpha': 0.25}
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham'] ['spam' 'ham' 'ham' ... 'ham' 'ham' 'spam']
              precision    recall  f1-score   support

         ham       1.00      0.98      0.99      3377
        spam       0.91      0.99      0.95       523

    accuracy                           0.99      3900
   macro avg       0.95      0.99      0.97      3900
weighted avg       0.99      0.99      0.99      3900
 
               precision    recall  f1-score   support

         ham       1.00      0.94      0.97      1448
        spam       0.72      0.98      0.83       224

    accuracy                           0.95      1672
   macro avg       0.86      0.96      0.90      1672
weighted avg       0.96      0.95      0.95      1672



In [41]:
#1 paramètres grid pour BernoulliNB
param_grid = {'alpha': np.linspace(0.25,1, num=4),'binarize':}
param_grid
#2 fit
gscv = GridSearchCV(
    BernoulliNB(),
    param_grid,
    cv = 3)
gscv.fit(X_train,Y_train)
#3 best_score et params
print(gscv.best_score_,gscv.best_params_)
#4 pred
best_model = gscv.best_estimator_
pred_train = best_model.predict(X_train)
pred_test = best_model.predict(X_test)
print(pred_train,pred_test)
#5 report
print(classification_report(Y_train,pred_train),'\n',classification_report(Y_test,pred_test))

0.9835897435897435 {'alpha': 0.25}
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham'] ['spam' 'ham' 'ham' ... 'ham' 'ham' 'spam']
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      3377
        spam       1.00      0.97      0.98       523

    accuracy                           0.99      3900
   macro avg       1.00      0.98      0.99      3900
weighted avg       0.99      0.99      0.99      3900
 
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1448
        spam       0.94      0.94      0.94       224

    accuracy                           0.98      1672
   macro avg       0.97      0.97      0.97      1672
weighted avg       0.98      0.98      0.98      1672



Conclusion : pour is_spam, BernoulliNB semble la méthode la plus précise - ce qui se comprend car variable cible binaire spam/ham