In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [3]:
vectorizer = TfidfVectorizer()

In [4]:
vectorizer.fit(newsgroups_train.data)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
vectors_train = vectorizer.transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

In [6]:
train_labels = newsgroups_train.target
test_labels = newsgroups_test.target

In [7]:
model = LogisticRegression(penalty='l1', C=1.0)

In [8]:
model.fit(vectors_train, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
predictions = model.predict(vectors_test)

In [10]:
accuracy_score(test_labels, predictions)

0.75212426978226232

In [11]:
predictions_train = model.predict(vectors_train)

In [12]:
accuracy_score(train_labels, predictions_train)

0.86865829945200634

In [13]:
model = LogisticRegression(penalty='l1', C=1.5)

In [14]:
model.fit(vectors_train, train_labels)

LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
predictions = model.predict(vectors_test)

In [19]:
accuracy_score(test_labels, predictions)

0.77071163037705792

In [20]:
predictions = model.predict(vectors_train)

In [21]:
accuracy_score(train_labels, predictions)

0.90852041718225207

### Top 200 coefficients

In [22]:
model.coef_

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.31647077,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [23]:
model.coef_.shape

(20, 130107)

In [31]:
import numpy as np

In [32]:
coefficients = np.sum(model.coef_, axis=0)

In [33]:
ind_lr = np.argpartition(coefficients, -200)[-200:]

In [34]:
f_coeffs = coefficients[ind_lr]

In [35]:
f_coeffs.shape

(200,)

In [30]:
f_coeffs

array([ 23.77128736,  24.00379703,  24.31286774,  26.11921975,
        26.6183117 ,  24.32343952,  24.56541431,  25.73392407,
        24.4457139 ,  25.49792307,  26.89761346,  27.50739754,
        27.68308247,  27.13916781,  27.50218272,  27.1333492 ,
        28.31784881,  28.41220035,  34.80307927,  30.91440908,
        43.73426672,  46.49337363,  38.5627637 ,  55.70716168,
        28.94837958,  70.72063029,  28.66593337,  31.26614434,
        43.35379598,  41.04746688])

### Reconstructing dataset using top 200 features

In [36]:
vectors_train200 = vectors_train.T[ind_lr]
vectors_test200 = vectors_test.T[ind_lr]

vectors_train200 = vectors_train200.T
vectors_test200 = vectors_test200.T

In [37]:
vectors_train200.shape

(11314, 200)

In [39]:
vectors_test200.shape

(7532, 200)

### Re-run Logistic Regression

In [40]:
model_new = LogisticRegression()

In [41]:
model_new.fit(vectors_train200, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
predictions = model_new.predict(vectors_test200)

In [44]:
accuracy_score(test_labels, predictions)

0.59399893786510882

In [45]:
predictions = model_new.predict(vectors_train200)

In [47]:
accuracy_score(train_labels, predictions)

0.65564786989570445