In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
categorias = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']

In [3]:
data_train = fetch_20newsgroups(subset='train',categories=categorias,shuffle=True)

In [4]:
data_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
len(data_train.data)

2257

In [6]:
len(data_train.filenames)

2257

In [7]:
print(data_train.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [8]:
print(data_train.filenames[0])

C:\Users\Lucas\scikit_learn_data\20news_home\20news-bydate-train\comp.graphics\38440


In [9]:
data_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [10]:
data_train.target[0]

1

In [11]:
for t in data_train.target[:20]:
    print(data_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med
soc.religion.christian
comp.graphics
alt.atheism
alt.atheism
comp.graphics
comp.graphics
sci.med
alt.atheism
soc.religion.christian
alt.atheism


In [12]:
import pandas as pd
import numpy as np

In [13]:
data = pd.DataFrame(data=np.array(data_train.data).reshape(-1,1),columns=['texto'])

In [14]:
data_train.target

array([1, 1, 3, ..., 2, 2, 2], dtype=int64)

In [15]:
X = data_train.data
y = data_train.target

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X)

In [18]:
X_train.shape

(2257, 35788)

In [19]:
X_train

<2257x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 365886 stored elements in Compressed Sparse Row format>

In [20]:
count_vect.vocabulary_.get(u'computer')

9338

In [21]:
count_vect.vocabulary_.get(u'graphics')

15699

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train)
X_train = tf_transformer.transform(X_train)

In [23]:
X_train.shape

(2257, 35788)

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
nb = MultinomialNB()
nb.fit(X_train,y)

MultinomialNB()

In [26]:
testes = ['God is love','OpenGL on the GPU is fast']

In [27]:
teste_count = count_vect.transform(testes)

In [28]:
teste_tf = tf_transformer.transform(teste_count)

In [29]:
preds = nb.predict(teste_tf)

In [30]:
for doc, cat in zip(testes,preds):
    print(f'Texto: {doc} \nClasse: {data_train.target_names[cat]}\n')

Texto: God is love 
Classe: soc.religion.christian

Texto: OpenGL on the GPU is fast 
Classe: comp.graphics



In [31]:
from sklearn.pipeline import Pipeline

In [32]:
clf = Pipeline([
                ('count',CountVectorizer()),
                ('tfidf',TfidfTransformer()),
                ('nb',MultinomialNB())
                ])

In [33]:
clf.fit(X,y)

Pipeline(steps=[('count', CountVectorizer()), ('nb', MultinomialNB())])

In [34]:
dados_teste = fetch_20newsgroups(subset='test',categories=categorias,shuffle=True)

In [35]:
X_test = dados_teste.data
y_test = dados_teste.target

In [36]:
preds = clf.predict(X_test)

In [32]:
clf = Pipeline([
                ('count',CountVectorizer()),
                ('tfidf',TfidfTransformer()),
                ('nb',MultinomialNB())
                ])

In [33]:
clf.fit(X,y)

Pipeline(steps=[('count', CountVectorizer()), ('nb', MultinomialNB())])

In [34]:
dados_teste = fetch_20newsgroups(subset='test',categories=categorias,shuffle=True)

In [35]:
X_test = dados_teste.data
y_test = dados_teste.target

In [36]:
preds = clf.predict(X_test)

In [140]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [144]:
clf = Pipeline([
                ('count',CountVectorizer()),
                ('tfidf',TfidfTransformer(use_idf=True)),
                ('knn',KNeighborsClassifier())
                ])

knn_params = {'knn__n_neighbors':np.arange(1,11),
              'knn__metric':['minkowski','cosine'],
              'knn__n_jobs':[-1]}
knn_model = GridSearchCV(clf,param_grid=knn_params,n_jobs=-1)
knn_model.fit(X,y)

GridSearchCV(estimator=Pipeline(steps=[('count', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('knn', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'knn__metric': ['minkowski', 'cosine'],
                         'knn__n_jobs': [-1],
                         'knn__n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])})

In [145]:
dados_teste = fetch_20newsgroups(subset='test',categories=categorias,shuffle=True)

In [146]:
X_test = dados_teste.data
y_test = dados_teste.target

In [148]:
preds = knn_model.predict(X_test)

In [150]:
np.mean(preds == y_test)

0.796271637816245

In [151]:
knn_model.best_params_

{'knn__metric': 'minkowski', 'knn__n_jobs': -1, 'knn__n_neighbors': 1}

In [152]:
from sklearn.linear_model import SGDClassifier

In [153]:
clf = Pipeline([
                ('count',CountVectorizer()),
                ('tfidf',TfidfTransformer(use_idf=True)),
                ('knn',SGDClassifier())
                ])

In [154]:
clf.fit(X,y)

Pipeline(steps=[('count', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('knn', SGDClassifier())])

In [155]:
dados_teste = fetch_20newsgroups(subset='test',categories=categorias,shuffle=True)

In [156]:
X_test = dados_teste.data
y_test = dados_teste.target

In [159]:
preds = clf.predict(X_test)

In [160]:
np.mean(preds == y_test)

0.9307589880159787