### Multinomial Naive Bayes

O classificador Multinomial Naive Bayes é adequado para classificação com variáveis discretas (por exemplo, contagens de palavras para a classificação de texto). A distribuição multinomial normalmente requer contagens inteiras de atributos (features). No entanto, na prática, contagens fracionárias, como tf-idf, também podem funcionar.

#### Classificador de Notícias

In [4]:
import numpy as np 
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [5]:
# Newsgroup categories to load.
# Only 4 of the 20 available ones are used so that classification runs faster.
categories = [
    "alt.atheism",
    "soc.religion.christian",
    "comp.graphics",
    "sci.med",
]

In [7]:
# Training split: fetch only the selected categories, shuffled with a fixed
# seed so the document order is reproducible across runs.
twenty_train = fetch_20newsgroups(
    subset = "train",
    categories = categories,
    shuffle = True,
    random_state = 42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [8]:
# Class names of the loaded training split (one entry per selected category)
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [9]:
# Number of training documents loaded for the selected categories
len(twenty_train.data)

2257

In [10]:
# Peek at the raw data: show the first three lines of the first training document
first_doc_lines = twenty_train.data[0].split("\n")
print(*first_doc_lines[:3], sep = "\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [11]:
# Show the target of the first document: its numeric label is used as an
# index into target_names to get the readable class name.
first_label = twenty_train.target[0]
print(twenty_train.target_names[first_label])

comp.graphics


In [12]:
# Scikit-learn stores the labels as an array of numbers, for speed;
# each number is an index into twenty_train.target_names.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [13]:
# Show the class name of each of the first 10 training records
first_ten_labels = twenty_train.target[:10]
for label_idx in first_ten_labels:
    class_name = twenty_train.target_names[label_idx]
    print(class_name)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


#### Bag of words (Saco de palavras)

In [14]:
# Tokenizing: learn the vocabulary from the training documents and build the
# document-term count matrix in a single fit_transform pass.
# A term's column index can be looked up afterwards with
# count_vect.vocabulary_.get(term); the lookup is not kept as a bare statement
# here because its value would be discarded (only the last expression of a
# cell is displayed).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# (number of documents, vocabulary size)
X_train_counts.shape

(2257, 35788)

In [15]:
# From raw occurrences to frequencies.
# Although TfidfTransformer is used, use_idf = False turns the inverse
# document frequency reweighting off, so the result is term frequencies only
# (hence the "tf" names), not full TF-IDF.
tf_transformer = TfidfTransformer(use_idf = False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Same shape as the count matrix: the transform only rescales the values
X_train_tf.shape

(2257, 35788)

In [17]:
# Build the Multinomial Naive Bayes model and fit it on the term-frequency
# features and the numeric training labels
classifier = MultinomialNB().fit(X_train_tf, twenty_train.target)

In [20]:
# Predictions on unseen text.
# New documents must go through the same fitted vectorizer and transformer
# as the training data (transform only, never fit) before being classified.
docs_new = ["God is love", "OpenGL on the GPU is fast"]
X_new_counts = count_vect.transform(docs_new)
# NOTE: despite the name, tf_transformer was created with use_idf = False,
# so these are term-frequency features without IDF weighting.
X_new_tfidf = tf_transformer.transform(X_new_counts)
predicted = classifier.predict(X_new_tfidf)

# Map each numeric prediction back to its readable category name
predicted_names = [twenty_train.target_names[label] for label in predicted]
for doc, category_name in zip(docs_new, predicted_names):
    print("%r => %s" % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [33]:
# Build a pipeline - a composite classifier that chains
# vectorizer => transformer => classifier
# The step names ("vect", "tfidf", "clf") are the prefixes used to address
# each step's parameters in the grid search parameter grid.
pipeline_steps = [
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB()),
]
text_classifier = Pipeline(pipeline_steps)

In [34]:
# Fit the whole pipeline on the raw training texts and their numeric labels;
# the name is re-bound to the value returned by fit()
text_classifier = text_classifier.fit(twenty_train.data, twenty_train.target)

In [35]:
# Model accuracy on the held-out test split (same categories and seed as training)
twenty_test = fetch_20newsgroups(
    subset = "test",
    categories = categories,
    shuffle = True,
    random_state = 42,
)
docs_test = twenty_test.data
predicted = text_classifier.predict(docs_test)

# Fraction of test documents whose predicted label matches the true label
is_correct = predicted == twenty_test.target
is_correct.mean()

0.8348868175765646

In [36]:
# Per-class metrics (precision, recall, f1-score, support) on the test split
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names = twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



In [37]:
# Confusion matrix of true test labels (first argument) against the
# pipeline's predictions (second argument)
metrics.confusion_matrix(twenty_test.target, predicted)

array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]], dtype=int64)

In [38]:
# Parameter grid for GridSearchCV.
# Each key is "<pipeline step name>__<parameter name>", so the search can
# reach into the individual steps of text_classifier.
parameters = dict(
    vect__ngram_range = [(1, 1), (1, 2)],
    tfidf__use_idf = (True, False),
    clf__alpha = (1e-2, 1e-3),
)

In [39]:
# Grid search over the pipeline's parameter grid; n_jobs = -1 requests
# parallel execution of the candidate fits.
gridSearch_classifier = GridSearchCV(
    estimator = text_classifier,
    param_grid = parameters,
    n_jobs = -1,
)

In [40]:
# Fit the grid search on a small subset only: the first 400 training
# documents and their labels (the data was shuffled when it was loaded).
n_search_docs = 400
search_docs = twenty_train.data[:n_search_docs]
search_labels = twenty_train.target[:n_search_docs]
gridSearch_classifier = gridSearch_classifier.fit(search_docs, search_labels)



In [46]:
# Quick test: classify a single sentence with the tuned model and show the
# readable name of the predicted class
sample_docs = ["the brain is the most important organ"]
predicted_label = gridSearch_classifier.predict(sample_docs)[0]
twenty_train.target_names[predicted_label]

'sci.med'

In [48]:
# Best score found by the grid search. Note that the search was fitted on
# only the first 400 training documents, so this value is not directly
# comparable to the test-set accuracy computed earlier.
gridSearch_classifier.best_score_

0.93

In [49]:
# Parameter values selected by the grid search, listed in alphabetical order
best_params = gridSearch_classifier.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_params[param_name]))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)


### FIM