## Data Exploration

In [35]:
# Load the training split of the 20 Newsgroups corpus and list every category in it.
from pprint import pprint
from sklearn.datasets import fetch_20newsgroups

news_groups_train = fetch_20newsgroups(subset='train')
all_category_names = list(news_groups_train.target_names)
pprint(all_category_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [36]:
# Restrict the corpus to four categories so that the later cells run faster.
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
partial_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,  # fixed seed so the shuffled order is the same on every run
)

# Exploration: show the first document with its label, then summarize the dataset.
first_doc = partial_data.data[0]
first_label = partial_data.target[0]
print(first_doc)
print(partial_data.target_names[first_label])
print(len(partial_data.data))     # number of documents in the subset
print(partial_data.target_names)  # distinct classes (categories) of the documents
print(partial_data.target[:10])   # labels are indices into target_names


From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.

comp.graphics
2257
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
[1 1 3 3 3 3 3 2 2 2]


## Feature Extraction

In [37]:
# Bag-of-words features.
# CountVectorizer tokenizes the text, builds a vocabulary of the terms it sees, and
# turns every document into a vector of term counts. With the default arguments used
# here no stop-word list is applied (that would require passing `stop_words`).
from sklearn.feature_extraction.text import CountVectorizer

train_docs = partial_data.data
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(train_docs)
print(X_train_count.shape)  # (number of documents, vocabulary size)

(2257, 35788)


`partial_data.data` is a list of 2257 documents; each document was converted into its bag-of-words vector representation.  
The length of the vocabulary is 35788.

In [38]:
# Improve on raw counts by re-weighting them as tf-idf
# (term frequency times inverse document frequency).
# The transformer is fitted on the training counts and kept so that the same
# weighting can be applied to new documents later.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_count)
X_train_tfidf = tfidf_transformer.transform(X_train_count)
print(X_train_tfidf.shape)  # same shape as the count matrix

(2257, 35788)


## Training a Naive Bayesian Classifier

In [39]:
# Fit a multinomial Naive Bayes model on the tf-idf training features.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train_tfidf, partial_data.target)

# Quick sanity check on two hand-written sentences. New text goes through the
# already-fitted vectorizer and tf-idf transformer (transform only, never refit).
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_test_count = count_vect.transform(docs_new)
X_test_tfidf = tfidf_transformer.transform(X_test_count)
predicted = clf.predict(X_test_tfidf)

# Map the predicted class indices back to their category names.
predictions = []
for label_idx in predicted:
    predictions.append(partial_data.target_names[label_idx])
print(predictions)

['soc.religion.christian', 'comp.graphics']


## Evaluating Model Performance

In [40]:
# Evaluate the Naive Bayes model on the test split of the same four categories.
import numpy as np

test_data = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Apply the feature transformers fitted on the training data (transform only).
X_test_count = count_vect.transform(test_data.data)
X_test_tfidf = tfidf_transformer.transform(X_test_count)
predicted = clf.predict(X_test_tfidf)

# Accuracy is the fraction of predictions that equal the true label.
is_correct = predicted == test_data.target
print(np.mean(is_correct))



0.834886817577


## Fitting Better Models

In [41]:
# Attempt to improve the accuracy: use a linear SVM instead of the Naive Bayes
# classifier. SGDClassifier with hinge loss and an L2 penalty trains a linear SVM
# by stochastic gradient descent.
from sklearn.linear_model import SGDClassifier
# Pipeline chains feature extraction and the classifier into a single estimator,
# so raw documents can be passed straight to fit() and predict().
from sklearn.pipeline import Pipeline

# Initialize the text classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # The `n_iter` argument was deprecated in scikit-learn 0.19 and removed in 0.21.
    # `max_iter=5` with `tol=None` keeps the fixed five passes over the data
    # (tol=None disables early stopping). Requires scikit-learn >= 0.19.
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, random_state=42)),
])
# Assign to `_` so the fitted pipeline's repr is not echoed as cell output.
_ = text_clf.fit(partial_data.data, partial_data.target)
predicted = text_clf.predict(test_data.data)
# Accuracy on the test split; left as the last expression so the notebook displays it.
np.mean(predicted == test_data.target)


0.9127829560585885