### The hashing trick

In [6]:
import sys
import string
import nltk
import numpy as np

import sklearn
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.feature_selection import chi2

import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
import sklearn.feature_extraction
import sklearn.datasets
import scipy
import scipy.sparse as sp

from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Load each split of the 20 Newsgroups corpus exactly once and reuse the
# returned bunch for both the documents (.data) and the labels (.target).
# Previously every variable triggered its own fetch_20newsgroups() call,
# i.e. five loads of the dataset for two distinct splits.
newsgroups_train = sklearn.datasets.fetch_20newsgroups(subset="train")
newsgroups_test = sklearn.datasets.fetch_20newsgroups(subset="test")

# fetch_20newsgroups() defaults to subset="train", so the bare call that used
# to populate `X` returned the same content as the train bunch. The name is
# kept so any cell that still reads `X` continues to work.
X = newsgroups_train

X_train = newsgroups_train.data    # raw training documents
y_train = newsgroups_train.target  # training labels
X_test = newsgroups_test.data      # raw test documents
y_test = newsgroups_test.target    # test labels

In [17]:
# Hashing-trick features over unigrams and bigrams; every other
# HashingVectorizer option is left at its default.
# NOTE(review): the recorded repr of the fitted pipeline shows
# HashingVectorizer() without ngram_range, so the saved outputs may come from
# an earlier version of this cell -- re-run to confirm.
hashvectorizer = sklearn.feature_extraction.text.HashingVectorizer(
    ngram_range=(1, 2),
)
logistic = sklearn.linear_model.LogisticRegression()

# Vectorizer and classifier chained so raw documents can be passed straight in.
model_pipe_hash = sklearn.pipeline.Pipeline(
    steps=[
        ("hashvectorizer", hashvectorizer),
        ("logisticregression", logistic),
    ]
)

In [18]:
# Fit vectorizer + classifier on the raw training documents. The recorded
# output shows the solver stopping at its iteration limit, so the fitted
# coefficients are not fully converged.
model_pipe_hash.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('hashvectorizer', HashingVectorizer()),
                ('logisticregression', LogisticRegression())])

In [19]:
# Training accuracy: fraction of training documents whose predicted label
# equals the true label.
(model_pipe_hash.predict(X_train) == y_train).mean()

0.894908962347534

In [20]:
# Test accuracy: fraction of held-out documents whose predicted label equals
# the true label.
(model_pipe_hash.predict(X_test) == y_test).mean()

0.7267657992565055

### Hashing Pipeline (no normalization)
We can also try the hashing pipeline without normalization (`norm=None`), so the hashed n-gram counts are passed to the classifier unscaled.

In [27]:
# Hashed unigram + bigram features again, this time with norm=None so the
# vectorizer does not normalize its output rows.
hashvectorizer = sklearn.feature_extraction.text.HashingVectorizer(
    norm=None,
    ngram_range=(1, 2),
)
logistic = sklearn.linear_model.LogisticRegression()

model_pipe_hash2 = sklearn.pipeline.Pipeline(
    steps=[
        ("hashvectorizer", hashvectorizer),
        ("logisticregression", logistic),
    ]
)

# Fit as the last expression so the cell displays the fitted pipeline.
model_pipe_hash2.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('hashvectorizer',
                 HashingVectorizer(ngram_range=(1, 2), norm=None)),
                ('logisticregression', LogisticRegression())])

In [28]:
# Training accuracy of the un-normalized hashing pipeline.
(model_pipe_hash2.predict(X_train) == y_train).mean()

0.9999116139296447

In [29]:
# Test accuracy of the un-normalized hashing pipeline.
(model_pipe_hash2.predict(X_test) == y_test).mean()

0.7966011683483802

### Logistic Pipeline (CountVectorizer)

In [22]:
# Same classifier, but fed by CountVectorizer (unigram + bigram counts)
# instead of a HashingVectorizer.
logistic = sklearn.linear_model.LogisticRegression()
model_pipe = sklearn.pipeline.Pipeline(
    steps=[
        ("count_vec", CountVectorizer(ngram_range=(1, 2))),
        ("logisticregression", logistic),
    ]
)

# Fit as the last expression so the cell displays the fitted pipeline.
model_pipe.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('count_vec', CountVectorizer(ngram_range=(1, 2))),
                ('logisticregression', LogisticRegression())])

In [23]:
# Training accuracy of the CountVectorizer pipeline.
(model_pipe.predict(X_train) == y_train).mean()

0.9999116139296447

In [24]:
# Test accuracy of the CountVectorizer pipeline.
(model_pipe.predict(X_test) == y_test).mean()

0.7979288369622942