In [1]:
import itertools
import numpy as np
import nltk
from sklearn import (
    datasets, feature_extraction, model_selection, pipeline,
    svm, metrics
)
import matplotlib.pyplot as plt


def extract_features(corpus):
    '''Extract TF-IDF features from corpus.

    Tokenizes every document with the NLTK word tokenizer, builds a
    vocabulary of unigrams and bigrams, and returns the TF-IDF weighted
    document-term matrix for the corpus.

    Args:
        corpus: the documents to vectorize; expected to be an iterable of
            raw text strings, as accepted by the vectorizer's
            ``fit_transform``.

    Returns:
        The TF-IDF weighted document-term matrix returned by
        ``fit_transform`` (one row per document, one column per retained
        term).

    Note:
        The fitted vectorizer is local to this function and is not
        returned, so the learned vocabulary cannot be reused to transform
        other documents afterwards.
        NOTE(review): this presumably requires the NLTK stopword corpus and
        tokenizer models to be installed locally -- confirm in the target
        environment.
    '''
    stop_words = nltk.corpus.stopwords.words("english")

    # TfidfVectorizer is CountVectorizer followed by TfidfTransformer in a
    # single estimator; the TF-IDF settings are left at their defaults, as
    # they were when the two steps were applied separately.
    vectorizer = feature_extraction.text.TfidfVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        # Minimum document frequency: a term is kept only if it occurs in
        # at least 2 *documents* (not merely twice in the corpus).
        min_df=2,
        ngram_range=(1, 2),  # unigrams and bigrams
        stop_words=stop_words,
    )

    return vectorizer.fit_transform(corpus)

if __name__ == '__main__':
    # Load the text corpus from the '20_newsgroups' directory, shuffled with
    # a fixed seed so that runs are reproducible.
    newsgroups_data = datasets.load_files(
        '20_newsgroups',
        shuffle=True,
        random_state=42,
        encoding='ISO-8859-1',
    )

    # Report the class names and the number of documents loaded.
    print('Data loaded.\nClasses = {classes}\n{datapoints}'.format(
        datapoints=len(newsgroups_data.data),
        classes=newsgroups_data.target_names,
    ))

    # Show the first raw document as a quick sanity check.
    print(newsgroups_data.data[0])

    # Hold out 33% of the documents for evaluation (fixed seed again).
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        newsgroups_data.data,
        newsgroups_data.target,
        test_size=0.33,
        random_state=42,
    )

    # Raw text -> token counts -> TF-IDF weights -> linear SVM classifier.
    model = pipeline.Pipeline([
        ('counts', feature_extraction.text.CountVectorizer()),
        ('tfidf', feature_extraction.text.TfidfTransformer()),
        ('svm', svm.LinearSVC()),
    ])

    # fit() returns the fitted pipeline, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Accuracy is the fraction of test documents whose predicted label
    # matches the true label.
    print('Accuracy of SVM= {}'.format((y_pred == y_test).mean()))

    # Per-class precision / recall / F1 summary.
    report = metrics.classification_report(
        y_test, y_pred, target_names=newsgroups_data.target_names)
    print(report)


Data loaded.
Classes = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
19997
Newsgroups: rec.sport.hockey
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!zaphod.mps.ohio-state.edu!uwm.edu!cs.utexas.edu!utnut!alchemy.chem.utoronto.ca!golchowy
From: golchowy@alchemy.chem.utoronto.ca (Gerald Olchowy)
Subject: Re: RUMOUR - Keenan signs with Rangers?
Message-ID: <1993Apr16.222232.17393@alchemy.chem.utoronto.ca>
Organization: University of Toronto Chemistry Department
References: <1993Apr16.171347.784@news.columbia.edu> <1993Apr16.183110.838@alchemy.chem.utoronto.ca> <1993Apr16.18582