In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Fetch the train and test splits of 20 Newsgroups. Headers, footers and
# quoted replies are stripped so that only the body of each post is kept.
train, test = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)

# Aprendendo sobre os dados

In [3]:
# List the newsgroup (class) names that come with the training split.
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Single-column frame (column label 0) with the integer class id of every
# training document.
df = pd.DataFrame(train.target)
# Boolean mask: True for documents whose class id is 0 ('alt.atheism').
df[0].eq(0)

0        False
1        False
2        False
3        False
4        False
         ...  
11309    False
11310    False
11311    False
11312    False
11313    False
Name: 0, Length: 11314, dtype: bool

In [5]:
# Lookup table from integer class id to newsgroup name, in the order the
# names appear in train.target_names.
categories_enum = dict(enumerate(train.target_names))
categories_enum

{0: 'alt.atheism',
 1: 'comp.graphics',
 2: 'comp.os.ms-windows.misc',
 3: 'comp.sys.ibm.pc.hardware',
 4: 'comp.sys.mac.hardware',
 5: 'comp.windows.x',
 6: 'misc.forsale',
 7: 'rec.autos',
 8: 'rec.motorcycles',
 9: 'rec.sport.baseball',
 10: 'rec.sport.hockey',
 11: 'sci.crypt',
 12: 'sci.electronics',
 13: 'sci.med',
 14: 'sci.space',
 15: 'soc.religion.christian',
 16: 'talk.politics.guns',
 17: 'talk.politics.mideast',
 18: 'talk.politics.misc',
 19: 'talk.religion.misc'}

In [7]:
# Turn the raw posts into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
# The vocabulary and IDF weights are learned from the training texts only...
train_vectors = vectorizer.fit_transform(train.data)
# ...and then reused unchanged on the test texts, so nothing from the test
# split influences the fitted vectorizer.
test_vectors = vectorizer.transform(test.data)

In [8]:
# Dimensions of the TF-IDF training matrix: (documents, vocabulary terms).
train_vectors.shape

(11314, 101631)

In [9]:
# Number of raw training documents; expected to equal the row count of train_vectors.
len(train.data)

11314

In [10]:
# TF-IDF row of the first training document (shown as a sparse-matrix summary).
train_vectors[0]

<1x101631 sparse matrix of type '<class 'numpy.float64'>'
	with 64 stored elements in Compressed Sparse Row format>

In [11]:
# Raw text of the first training document.
train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [12]:
# Integer class id of the first training document.
train.target[0]

7

In [13]:
# Translate the first document's class id into its newsgroup name.
categories_enum[train.target[0]]

'rec.autos'

In [14]:
# Another raw sample (index 5), showing what a post looks like once headers,
# footers and quoted replies have been removed.
train.data[5]

'\n\n\n\n\nOf course.  The term must be rigidly defined in any bill.\n\n\nI doubt she uses this term for that.  You are using a quote allegedly\nfrom her, can you back it up?\n\n\n\n\nI read the article as presenting first an argument about weapons of mass\ndestruction (as commonly understood) and then switching to other topics.\nThe first point evidently was to show that not all weapons should be\nallowed, and then the later analysis was, given this understanding, to\nconsider another class.\n\n\n\n'

# Usando SVM

In [15]:
# Support-vector classifier with scikit-learn's default hyperparameters,
# trained on the TF-IDF training matrix and the integer class ids.
classifier = SVC()

# Kept as the cell's last expression so the fitted estimator is displayed.
classifier.fit(train_vectors, train.target)

SVC()

In [18]:
# Score the fitted classifier on the test split. average='macro' is the
# unweighted mean of the per-class F1 scores, so every newsgroup weighs the
# same regardless of how many test documents it has.
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')


0.644061323734714

# Naive Bayes

In [20]:
# Multinomial naive Bayes on the TF-IDF features. alpha is the additive
# smoothing strength; 0.01 is much lighter than the library default of 1.0.
classifier = MultinomialNB(alpha=0.01)
classifier.fit(train_vectors, train.target)

# Macro-averaged F1 on the test split (same metric as the other models).
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')


0.682861129525057

# Decision Tree

In [22]:
# Decision tree with default hyperparameters on the TF-IDF features.
# NOTE(review): no random_state is passed, so tie-breaking between equally
# good splits can differ between runs and the score below may not reproduce
# exactly. Consider pinning random_state and re-running the cell.
classifier = DecisionTreeClassifier()
classifier.fit(train_vectors, train.target)

# Macro-averaged F1 on the test split (same metric as the other models).
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')

0.39466911856207776

# Regressão logística

In [23]:
# Logistic regression with default hyperparameters on the TF-IDF features.
classifier = LogisticRegression()
classifier.fit(train_vectors, train.target)

# Macro-averaged F1 on the test split (same metric as the other models).
test_prediction = classifier.predict(test_vectors)
f1_score(test.target, test_prediction, average='macro')

0.6585894332744863