In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [3]:
vectorizer = TfidfVectorizer()

In [4]:
vectorizer.fit(newsgroups_train.data)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
vectors_train = vectorizer.transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

In [6]:
train_labels = newsgroups_train.target
test_labels = newsgroups_test.target

In [7]:
top200 = SelectKBest(chi2, k=200)

In [8]:
X_new_train = top200.fit_transform(vectors_train, train_labels)

In [9]:
X_new_test = top200.fit_transform(vectors_test, test_labels)

In [10]:
model = LogisticRegression()

In [11]:
model.fit(X_new_train, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
predictions = model.predict(X_new_test)

In [13]:
accuracy_score(test_labels, predictions)

0.19211364843335105

In [14]:
from sklearn.feature_selection import mutual_info_classif

In [15]:
top200_mic = mutual_info_classif(vectors_train, train_labels, n_neighbors=200,copy=True)

In [18]:
X_new_train.shape

(11314, 200)

In [20]:
X_new_test.shape

(7532, 200)

In [21]:
top200_mic.shape

(130107,)

In [22]:
import numpy as np

In [23]:
top200_mic_ind = np.argpartition(top200_mic, -200)[-200:]

In [24]:
top200_mic = top200_mic[top200_mic_ind]

In [25]:
top200_mic.shape

(200,)

In [27]:
vectors_train200 = vectors_train.T[top200_mic_ind]
vectors_test200 = vectors_test.T[top200_mic_ind]

vectors_train200 = vectors_train200.T
vectors_test200 = vectors_test200.T

In [28]:
vectors_train200.shape

(11314, 200)

In [29]:
vectors_test200.shape

(7532, 200)

In [30]:
model = LogisticRegression()

In [31]:
model.fit(vectors_train200, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
predictions = model.predict(vectors_test200)

In [33]:
accuracy_score(test_labels, predictions)

0.29580456718003184