In [3]:
from sklearn.datasets import fetch_20newsgroups_vectorized
X, y = fetch_20newsgroups_vectorized(return_X_y=True)
X.shape, y.shape

((11314, 130107), (11314,))

In [4]:
from sklearn.model_selection import train_test_split
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, random_state=42)
X_treino.shape, X_teste.shape, y_treino.shape, y_teste.shape

((8485, 130107), (2829, 130107), (8485,), (2829,))

## Escolha três algoritmos de classificação

In [5]:
#testando o classificador KNN
from sklearn.neighbors import KNeighborsClassifier
modelo = KNeighborsClassifier()
modelo.fit(X_treino, y_treino)
knn_pr = modelo.predict(X_teste)
knnhits = knn_pr == y_teste
knnhits, sum(knnhits)/len(knnhits)

(array([ True, False, False, ...,  True,  True, False]), 0.5563803464121597)

In [6]:
#testando o classificador Logistic Regression
from sklearn.linear_model import LogisticRegression
modelo = LogisticRegression()
modelo.fit(X_treino, y_treino)
lgr_pr = modelo.predict(X_teste)
lgrhits = lgr_pr == y_teste
lgrhits, sum(lgrhits)/len(lgrhits)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(array([ True,  True,  True, ...,  True,  True,  True]), 0.7925061859314245)

In [7]:
# testando com o classificador Perceptrom
from sklearn.linear_model import Perceptron
modelo = Perceptron()
modelo.fit(X_treino, y_treino)
per_pr = modelo.predict(X_teste)
perhits = per_pr == y_teste
perhits, sum(perhits)/len(perhits)

(array([ True,  True,  True, ...,  True,  True,  True]), 0.8525980911983033)

In [8]:
import numpy as np
hits = np.stack((knnhits, lgrhits, perhits))
hits.T

array([[ True,  True,  True],
       [False,  True,  True],
       [False,  True,  True],
       ...,
       [ True,  True,  True],
       [ True,  True,  True],
       [False,  True,  True]])

In [9]:
y_pr = np.stack((knn_pr, lgr_pr, per_pr))
y_pr.T

array([[ 8,  8,  8],
       [ 0,  1,  1],
       [18, 10, 10],
       ...,
       [ 7,  7,  7],
       [ 4,  4,  4],
       [15, 14, 14]])

In [10]:
# COMBINAÇÃO DOS CLASSIFICADORES
from scipy import stats
y_pr = stats.mode(y_pr)[0][0]
vohits = y_pr == y_teste  #acertos do hits
vohits, sum(vohits)/len(vohits)


(array([ True,  True,  True, ...,  True,  True,  True]), 0.8253799929303641)

## Combine os classificadores de duas formas diferentes

In [11]:
#Combinando os classificadores através do VotingClassifier
from sklearn.ensemble import VotingClassifier
modelo = VotingClassifier([
('knn', KNeighborsClassifier()),
('logistic', LogisticRegression()),
('perceptron', Perceptron())
])
modelo.fit(X_treino, y_treino)
vo_pr = modelo.predict(X_teste)
vohits = vo_pr == y_teste
vohits, sum(vohits)/len(vohits)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(array([ True,  True,  True, ...,  True,  True,  True]), 0.8253799929303641)

In [12]:
# verificar se o classificador é bom ou ruim usando uma arvore de decisao e comparando a acurácia
from sklearn.tree import DecisionTreeClassifier
modelo = VotingClassifier([
('knn', KNeighborsClassifier()),
('logistic', LogisticRegression()),
('arvore', DecisionTreeClassifier())
])
modelo.fit(X_treino, y_treino)
vo_pr = modelo.predict(X_teste)
vohits = vo_pr == y_teste
vohits, sum(vohits)/len(vohits)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(array([ True, False, False, ...,  True,  True, False]), 0.7635206786850477)

In [13]:
# testando a arvore sozinha 
modelo = DecisionTreeClassifier(random_state=42)
modelo.fit(X_treino, y_treino)
dt_pr = modelo.predict(X_teste)
dthits = dt_pr == y_teste
dthits, sum(dthits)/len(dthits)


(array([ True, False, False, ..., False,  True, False]), 0.6284906327324142)

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [19]:
#Combinando os classificadores através do StackingClassifier

from sklearn.ensemble import StackingClassifier
voting = VotingClassifier([
('knn', KNeighborsClassifier()),
('logistic', LogisticRegression()),
('perceptron', Perceptron())
])

modelo = StackingClassifier([
('voting', voting),
('arvore', DecisionTreeClassifier(random_state=42)),
], cv=3, passthrough=True)
modelo.fit(X_treino, y_treino)
sc_pr = modelo.predict(X_teste)
schits = sc_pr == y_teste
schits, sum(schits)/len(schits)

(array([ True,  True, False, ..., False,  True, False]), 0.6709084482149169)

## usando Gridsearch para ajustar os parêmetros

In [22]:
from sklearn.model_selection import GridSearchCV

parametros = {
    'atributosDesejados__excluirName': [True, False],
    'classificador__max_depth': [5]
}
modelo = GridSearchCV(modelo, param_grid=parametros)