In [19]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [20]:
# The eight newsgroups used in this experiment.
categories = [
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
    'talk.politics.misc',
    'sci.med',
    'rec.autos',
    'comp.os.ms-windows.misc',
    'rec.sport.baseball',
]

# Load the train split first, then the test split, passing the same
# `remove` and `categories` options to both calls.
train, test = (
    fetch_20newsgroups(
        subset=split,
        remove=('headers', 'footers', 'quotes'),
        categories=categories,
    )
    for split in ('train', 'test')
)

# Aprendendo sobre os dados

In [21]:
# Category names present in the loaded training subset.
train.target_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'rec.autos',
 'rec.sport.baseball',
 'sci.med',
 'sci.space',
 'talk.politics.misc',
 'talk.religion.misc']

In [22]:
# Wrap the integer labels in a one-column frame; the column is named 0.
df = pd.DataFrame(train.target)
# Boolean Series: True where a document's label equals 0.
df[0].eq(0)

0       False
1       False
2       False
3       False
4       False
        ...  
4390    False
4391     True
4392    False
4393    False
4394     True
Name: 0, Length: 4395, dtype: bool

In [23]:
# Map each integer label to its category name. enumerate() yields
# (index, name) pairs starting at 0, the same keys the index loop produced.
categories_enum = dict(enumerate(train.target_names))
categories_enum

{0: 'comp.graphics',
 1: 'comp.os.ms-windows.misc',
 2: 'rec.autos',
 3: 'rec.sport.baseball',
 4: 'sci.med',
 5: 'sci.space',
 6: 'talk.politics.misc',
 7: 'talk.religion.misc'}

In [24]:
# Fit the TF-IDF vectorizer on the training documents only, then reuse the
# fitted vectorizer to transform the test documents.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train.data)
test_vectors = vectorizer.transform(test.data)

In [25]:
# Dimensions of the TF-IDF training matrix: one row per training document.
train_vectors.shape

(4395, 66960)

In [26]:
# Number of training documents; should match the row count of train_vectors.
len(train.data)

4395

In [27]:
# First row of the TF-IDF matrix, i.e. the vector built from train.data[0].
train_vectors[0]

<1x66960 sparse matrix of type '<class 'numpy.float64'>'
	with 87 stored elements in Compressed Sparse Row format>

In [28]:
# Raw text of the first training document.
train.data[0]

'\nWhere were you brought up?  In the former USSR?  Is Innocent until proven  \nguilty by a jury of your peers, NOT Dan Rather, dead in this country?  Seems  \nso.  Is tax evasion, the only charge brought against the BDs, punishable by  \ndeath in this country, now?\n\n\nNot really.  You are a blind idiot.\n\n\n"Not sure", yet you condem them to death for it?  If the BATF had stayed home,  \nall would be alive, now.  So who murdered who?\nYou have a short memory.\n\nnext.\n\nSleep well, tonite, heartless idiot.  Sleep the sleep of the simple-minded.\n\nI shall weep for my country, myself.\n\n\nI\'m short of patience tonite, but rabid dogs deserve and get better treatment  \nthan the BDs got.\n\nJim\n--\njmd@handheld.com'

In [29]:
# Integer label of the first training document.
train.target[0]

7

In [30]:
# Look up the category name for the first training document's integer label.
categories_enum[train.target[0]]

'talk.religion.misc'

In [31]:
# Raw text of another sample training document (index 5).
train.data[5]

"My comments about the Feingold Diet have no relevance to your\ndaughter's purported FrostedFlakes-related seizures.  I can't imagine\nwhy you included it.\n"

# Usando SVM

In [32]:
# Fit a support-vector classifier with scikit-learn's default settings on the
# TF-IDF training matrix. fit() is kept as the last expression so the fitted
# estimator is echoed as the cell output.
classifier = SVC()
classifier.fit(train_vectors, train.target)

SVC()

In [33]:
# Predict labels for the test split with the current `classifier` and report
# the macro-averaged F1 (the unweighted mean of the per-class F1 scores).
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')


0.7311447514680061

# Naive Bayes

In [34]:
# Multinomial Naive Bayes with a small smoothing value (alpha=.01).
# This rebinds `classifier` and `test_prediction`, replacing the objects
# created by the previous model's cells.
classifier = MultinomialNB(alpha=.01)
classifier.fit(train_vectors, train.target)
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')


0.7869255262555797

# Decision Tree

In [35]:
# Decision tree with default hyper-parameters. The seed is fixed because the
# tree permutes features at every split and breaks ties between equally good
# splits at random, so an unseeded fit can give a different score on each run.
# NOTE: an output recorded before the seed was added may differ slightly.
# This rebinds `classifier` and `test_prediction`.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(train_vectors, train.target)
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')

0.5046598061765181

# Regressão logística

In [36]:
# Logistic regression with scikit-learn's default settings, scored with the
# same macro-averaged F1 as the other models.
# This rebinds `classifier` and `test_prediction`.
classifier = LogisticRegression()
classifier.fit(train_vectors, train.target)
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')

0.7520120141732767