# Bibliotecas Utilizadas


In [16]:
from sklearn.datasets import fetch_20newsgroups 
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
import numpy as np
from sklearn.linear_model import SGDClassifier 
from sklearn.model_selection import GridSearchCV
import nltk 
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import warnings
import matplotlib.pyplot as plt 

# Ignore every warning for the rest of the session so cell outputs stay compact.
# NOTE(review): this presumably also hides scikit-learn warnings (e.g. about
# convergence) raised by the models trained below — confirm that is intended.
warnings.simplefilter('ignore')
# One-time setup, kept disabled: uncomment to download NLTK resources.
#nltk.download()

# Carregamento do Dataset

In [17]:
# Load the training split with all newsgroups; in the cells shown here it is
# only used to list the available category names.
newsgroups = fetch_20newsgroups(subset='train')

In [18]:
# Display the name of every category available in the dataset.
list(newsgroups.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

# Escolha de 10 categorias para o teste do modelo

In [19]:
# The ten newsgroups kept for this experiment.
categories = [
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'comp.windows.x',
    'comp.sys.mac.hardware',
    'sci.electronics',
    'sci.space',
    'sci.med',
    'talk.politics.mideast',
    'misc.forsale',
    'talk.politics.misc',
]

# Fetch both splits restricted to those categories. random_state=42 is the
# loader's default seed; it is spelled out on both calls so the two fetches
# read the same way and the shuffle order is visibly reproducible.
df_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
df_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [20]:
# Category names of the training split loaded with the chosen categories.
# The order shown in the output differs from the order of the `categories` list.
df_train.target_names

['comp.graphics',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'talk.politics.mideast',
 'talk.politics.misc']

In [21]:
# Bag-of-words step: learn the vocabulary from the training documents and turn
# each document into a sparse vector of raw token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_train.data)
# Shape is (number of documents, vocabulary size).
X_train_counts.shape

(5737, 66307)

In [22]:
# TF-IDF re-weights the raw counts: a term scores high when it is frequent in a
# document but rare across the whole collection (it weights terms, not categories).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Treinamento do Modelo 1: Naive Bayes Multinomial

In [23]:
# Multinomial Naive Bayes classifier, fitted on the TF-IDF features and the
# integer category labels of the training split.
clf = MultinomialNB()
clf.fit(X_train_tfidf, df_train.target)

In [24]:
# Machine-learning pipeline: token counts -> TF-IDF -> Naive Bayes, chained
# into a single estimator so raw text can be passed straight to fit/predict.
# The step names are the prefixes used in grid-search parameter keys
# ('vect__', 'tfidf__', 'clf__').
clf_1 = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [25]:
# Train the whole pipeline on the raw training texts. Pipeline.fit returns the
# pipeline itself, so clf_trained refers to the same object as clf_1.
clf_trained = clf_1.fit(df_train.data, df_train.target)

In [26]:
# Predict a category index for every document of the test split.
pred = clf_trained.predict(df_test.data)

In [27]:
# Accuracy: fraction of test documents whose predicted label equals the true label.
acc = (pred == df_test.target).mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8259162303664922


In [28]:
# Per-category precision, recall and F1 of the Naive Bayes pipeline on the test split.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                          precision    recall  f1-score   support

           comp.graphics       0.82      0.75      0.78       389
comp.sys.ibm.pc.hardware       0.69      0.85      0.76       392
   comp.sys.mac.hardware       0.84      0.84      0.84       385
          comp.windows.x       0.89      0.82      0.85       395
            misc.forsale       0.96      0.78      0.86       390
         sci.electronics       0.81      0.75      0.78       393
                 sci.med       0.88      0.90      0.89       396
               sci.space       0.82      0.94      0.88       394
   talk.politics.mideast       0.74      0.99      0.84       376
      talk.politics.misc       0.96      0.61      0.74       310

                accuracy                           0.83      3820
               macro avg       0.84      0.82      0.82      3820
            weighted avg       0.84      0.83      0.82      3820



# Tuning de parâmetros (Naive Bayes com Grid Search)

In [29]:
# Search space for the Naive Bayes pipeline; each key is '<step name>__<parameter>'.
parameters = {
    # unigrams only vs. unigrams + bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # with / without inverse-document-frequency weighting
    'tfidf__use_idf': (True, False),
    # smoothing strength of the Naive Bayes classifier
    'clf__alpha': (1e-2, 1e-3),
}

In [31]:
# Exhaustive cross-validated search over `parameters` on the training split,
# using all available CPU cores (n_jobs=-1). fit() returns the search object
# itself, so construction and training are chained into one assignment.
gs_clf = GridSearchCV(
    estimator=clf_trained,
    param_grid=parameters,
    n_jobs=-1,
).fit(df_train.data, df_train.target)

In [32]:
# best_score_ is the mean cross-validated score of the best parameter
# combination, computed on the training split only, so it is not directly
# comparable to an accuracy measured on the test split.
print(gs_clf.best_score_)
# Parameter combination that achieved that score.
gs_clf.best_params_

0.9170288193104874


{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

# Treinamento do Modelo 2: SVM linear (SGDClassifier)

In [33]:
# Second model: same text features, but the final step is a linear SVM
# (hinge loss, L2 penalty) trained with stochastic gradient descent.
# The step is named 'clf-svm', which is the prefix used in grid-search keys.
clf_2 = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf-svm',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                alpha=1e-3,
                max_iter=25,
                random_state=42,
            ),
        ),
    ]
)

In [34]:
# Train the SVM pipeline on the raw training texts. Pipeline.fit returns the
# pipeline itself, so svm_trained refers to the same object as clf_2.
svm_trained = clf_2.fit(df_train.data, df_train.target)

In [35]:
# Predict the test split with the SVM pipeline.
# Note: this overwrites the `pred` array produced by the Naive Bayes pipeline.
pred = svm_trained.predict(df_test.data)

In [36]:
# Test accuracy of the SVM pipeline: share of predictions matching the true labels.
acc = (pred == df_test.target).mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8489528795811518


In [37]:
# Per-category precision, recall and F1 of the SVM pipeline on the test split.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                          precision    recall  f1-score   support

           comp.graphics       0.79      0.78      0.79       389
comp.sys.ibm.pc.hardware       0.78      0.74      0.76       392
   comp.sys.mac.hardware       0.84      0.85      0.85       385
          comp.windows.x       0.84      0.82      0.83       395
            misc.forsale       0.82      0.93      0.87       390
         sci.electronics       0.86      0.73      0.79       393
                 sci.med       0.88      0.89      0.89       396
               sci.space       0.89      0.95      0.92       394
   talk.politics.mideast       0.90      0.95      0.92       376
      talk.politics.misc       0.88      0.85      0.87       310

                accuracy                           0.85      3820
               macro avg       0.85      0.85      0.85      3820
            weighted avg       0.85      0.85      0.85      3820



In [38]:
# NOTE(review): gs_clf is the grid search of the Naive Bayes pipeline, not of the
# SVM trained in this section; this cell repeats an earlier summary. Presumably
# kept for side-by-side comparison — confirm it is intentional.
print(gs_clf.best_score_)
gs_clf.best_params_

0.9170288193104874


{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

# Tuning de Parâmetros

Modelo usando SGD com Grid Search

In [39]:
# Search space for the SVM pipeline; each key is '<step name>__<parameter>'.
parameters_svm = {
    # unigrams only vs. unigrams + bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # with / without inverse-document-frequency weighting
    'tfidf__use_idf': (True, False),
    # regularization strength of the SGD classifier
    'clf-svm__alpha': (1e-2, 1e-3),
}

In [40]:
# Exhaustive cross-validated search over `parameters_svm` on the training split,
# using all available CPU cores (n_jobs=-1). fit() returns the search object
# itself, so construction and training are chained into one assignment.
gs_clf_svm = GridSearchCV(
    estimator=svm_trained,
    param_grid=parameters_svm,
    n_jobs=-1,
).fit(df_train.data, df_train.target)

In [41]:
# Mean cross-validated score of the best SVM parameter combination (computed on
# the training split only), followed by that combination.
print(gs_clf_svm.best_score_)
gs_clf_svm.best_params_

0.9126705327334752


{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [42]:
# Predict the test split with the tuned SVM: GridSearchCV.predict uses the best
# estimator found, refit on the whole training split (refit=True is the default).
# Note: this overwrites the `pred` array of the untuned SVM pipeline.
pred = gs_clf_svm.predict(df_test.data)

In [43]:
# Test accuracy of the tuned SVM: share of predictions matching the true labels.
acc = (pred == df_test.target).mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8479057591623037


In [44]:
# Per-category precision, recall and F1 of the tuned SVM on the test split.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                          precision    recall  f1-score   support

           comp.graphics       0.80      0.77      0.78       389
comp.sys.ibm.pc.hardware       0.78      0.75      0.76       392
   comp.sys.mac.hardware       0.86      0.84      0.85       385
          comp.windows.x       0.84      0.83      0.83       395
            misc.forsale       0.85      0.93      0.89       390
         sci.electronics       0.84      0.74      0.78       393
                 sci.med       0.92      0.86      0.89       396
               sci.space       0.88      0.96      0.92       394
   talk.politics.mideast       0.83      0.98      0.90       376
      talk.politics.misc       0.90      0.83      0.86       310

                accuracy                           0.85      3820
               macro avg       0.85      0.85      0.85      3820
            weighted avg       0.85      0.85      0.85      3820



Após os testes realizados, o segundo modelo (SVM linear treinado com SGD) apresentou **maior acurácia** no conjunto de teste do que o primeiro (Naive Bayes): cerca de 0,85 contra 0,83.

Porém, sua precisão em algumas categorias **não foi satisfatória**.

O recall de 3 categorias está abaixo do esperado, e a precisão de 1 delas também está abaixo do requisitado.