In [31]:
# NLP for news classification (20 Newsgroups dataset)

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the predefined "train" and "test" subsets of the 20 Newsgroups corpus,
# each shuffled, and bind them to `train` and `test` respectively.
train, test = (
    fetch_20newsgroups(subset=subset_name, shuffle=True)
    for subset_name in ("train", "test")
)

In [3]:
# Show the newsgroup category names. The integer labels stored in
# `train.target` are used as indices into this list.
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Resolve the integer label of the training document at index 5 to its
# human-readable category name.
label_index = train.target[5]
print(train.target_names[label_index])

talk.politics.guns


In [5]:
# Print the raw text of the training document at index 9 to inspect what a
# sample looks like (the post is stored with its header lines and quoted text).
print(train.data[9])

From: kerr@ux1.cso.uiuc.edu (Stan Kerr)
Subject: Re: Sigma Designs Double up??
Article-I.D.: ux1.C52u8x.B62
Organization: University of Illinois at Urbana
Lines: 29

jap10@po.CWRU.Edu (Joseph A. Pellettiere) writes:


>	I am looking for any information about the Sigma Designs
>	double up board.  All I can figure out is that it is a
>	hardware compression board that works with AutoDoubler, but
>	I am not sure about this.  Also how much would one cost?

I've had the board for over a year, and it does work with Diskdoubler,
but not with Autodoubler, due to a licensing problem with Stac Technologies,
the owners of the board's compression technology. (I'm writing this
from memory; I've lost the reference. Please correct me if I'm wrong.)

Using the board, I've had problems with file icons being lost, but it's
hard to say whether it's the board's fault or something else; however,
if I decompress the troubled file and recompress it without the board,
the icon usually reappears. Because of the

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words step: learn the vocabulary from the training posts and encode
# them as a document-term matrix of token counts in a single call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.data)

# (number of training documents, vocabulary size)
X_train_counts.shape

(11314, 130107)

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw count matrix into TF-IDF features.
# NOTE(review): X_train_tfidf is not referenced by the later cells visible
# here; the pipeline further down builds its own vectorizer and transformer.
tfidf_transform = TfidfTransformer()
X_train_tfidf = tfidf_transform.fit_transform(X_train_counts)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np 

In [21]:
# End-to-end text classifier: token counts -> TF-IDF weighting -> linear
# model trained with SGD. Hinge loss with an L2 penalty makes the final step
# a linear SVM. The step names ("vect", "tfidf", "clf-svm") are the prefixes
# used when addressing step parameters, e.g. "clf-svm__alpha".
svm_steps = [
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    (
        "clf-svm",
        SGDClassifier(
            loss="hinge",
            penalty="l2",
            alpha=1e-3,
            n_iter_no_change=5,
            random_state=42,  # fixed seed so repeated runs give the same model
        ),
    ),
]
text_clf_svm = Pipeline(steps=svm_steps)

In [25]:
# Fit the whole pipeline on the raw training posts, then label the held-out
# test posts. `fit` returns the fitted pipeline, so the calls can be chained.
predicted_svm = text_clf_svm.fit(train.data, train.target).predict(test.data)

In [26]:
# Test-set accuracy: fraction of test posts whose predicted label matches
# the true label.
(predicted_svm == test.target).mean()

0.8240839086563994

In [28]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid, keyed as "<pipeline step>__<parameter>":
# unigrams vs. unigrams+bigrams, IDF weighting on/off, and two
# regularisation strengths (2 x 2 x 2 = 8 candidate settings).
param_svm = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "clf-svm__alpha": (1e-2, 1e-3),
}

# Cross-validated exhaustive search over the grid using all available cores;
# `fit` returns the fitted search object.
gs_clf_svm = GridSearchCV(text_clf_svm, param_svm, n_jobs=-1).fit(
    train.data, train.target
)

# Mean cross-validated score of the best setting (computed on training data).
gs_clf_svm.best_score_

0.9051618841994754

In [29]:
# Label the test posts with the tuned model found by the grid search and
# report the fraction of predictions that match the true labels.
predicted_gs_svm = gs_clf_svm.predict(test.data)
gs_hits = predicted_gs_svm == test.target
gs_hits.mean()

0.8351035581518853