# The hashing trick

In [1]:
import sys
import string
import nltk
import numpy as np

import sklearn
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.feature_selection import chi2

import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
import sklearn.feature_extraction
import sklearn.datasets
import scipy
import scipy.sparse as sp

from sklearn.feature_extraction.text import CountVectorizer

## Dataset loading

In [2]:
# Fetch each split of the 20 Newsgroups corpus exactly once. Every call to
# fetch_20newsgroups reloads (and, on first use, downloads) the dataset, so
# calling it separately for .data and .target doubles the work for no benefit.
newsgroups_train = sklearn.datasets.fetch_20newsgroups(subset="train")
newsgroups_test = sklearn.datasets.fetch_20newsgroups(subset="test")

# `X` is kept for backward compatibility: the original cell bound it to the
# default fetch, and fetch_20newsgroups() defaults to subset="train".
X = newsgroups_train

# Raw documents (list of str) and their integer newsgroup labels.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

## Hash vectors

In [3]:
# Pipeline: hashed unigram + bigram features -> logistic regression.
# NOTE(review): the recorded run of this pipeline stops at LogisticRegression's
# iteration limit (see the warning in the output below); consider raising
# max_iter if a converged fit is required.
logistic = sklearn.linear_model.LogisticRegression()
hashvectorizer = sklearn.feature_extraction.text.HashingVectorizer(
    ngram_range=(1, 2),
)
model_pipe_hash = sklearn.pipeline.Pipeline(
    steps=[
        ("hashvectorizer", hashvectorizer),
        ("logisticregression", logistic),
    ]
)

In [4]:
%%time
model_pipe_hash.fit(X_train, y_train)
train_accuray = np.mean(model_pipe_hash.predict(X_train) == y_train)
test_accuracy = np.mean(model_pipe_hash.predict(X_test) == y_test)
print(f'train_accuray = {train_accuray}')
print(f'test_accuray = {test_accuracy}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


train_accuray = 0.9284956690825525
test_accuray = 0.7327403080191184
CPU times: user 4min 49s, sys: 54.4 s, total: 5min 44s
Wall time: 3min 14s


## Hashing Pipeline (no normalization)

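We can try the hashing pipeline without normalization (`norm=None`), which tends to help in document classification, although the resulting model overfits the training set (see the train/test accuracies below).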

In [7]:
# Same pipeline as before, except that the hashed feature vectors are left
# un-normalized (norm=None).
text_module = sklearn.feature_extraction.text
hashvectorizer = text_module.HashingVectorizer(norm=None, ngram_range=(1, 2))
logistic = sklearn.linear_model.LogisticRegression()

hash2_steps = [
    ("hashvectorizer", hashvectorizer),
    ("logisticregression", logistic),
]
model_pipe_hash2 = sklearn.pipeline.Pipeline(hash2_steps)


In [8]:
%%time
model_pipe_hash2.fit(X_train, y_train)
train_accuray = np.mean(model_pipe_hash2.predict(X_train) == y_train)
test_accuracy = np.mean(model_pipe_hash2.predict(X_test) == y_test)
print(f'train_accuray = {train_accuray}')
print(f'test_accuray = {test_accuracy}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


train_accuray = 0.9999116139296447
test_accuray = 0.7968667020711631
CPU times: user 5min 19s, sys: 55.1 s, total: 6min 14s
Wall time: 3min 16s


## Hash vectors vs Count Vectors

In this example, count vectors give slightly better test accuracy than un-normalized hash vectors, but the improvement is marginal (third decimal place): `0.797` for hash vectors vs `0.799` for count vectors.

In [9]:
# NOTE(review): `sklearn` is already imported in the first cell, so this
# re-import is redundant; the `neural_network` import would conventionally
# live in the top import cell as well.
import sklearn
from sklearn import neural_network
# Multi-layer perceptron classifier created with scikit-learn's default
# hyperparameters (no arguments are passed).
clf = sklearn.neural_network.MLPClassifier()

In [5]:
# Pipeline: raw unigram + bigram counts -> logistic regression. It mirrors the
# hashing pipelines so that the vectorizer is the only thing that differs in
# the hash-vs-count comparison.
countvectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2))

# The final step must be this LogisticRegression: the original cell created it
# but then plugged an MLP classifier from a later-executed cell into the
# pipeline (under the misspelled step name "classifer"), which does not match
# the recorded LogisticRegression output or the comparison described above.
logistic = sklearn.linear_model.LogisticRegression()
model_pipe_count = sklearn.pipeline.Pipeline([("countvectorizer", countvectorizer),
                                              ("logisticregression", logistic)] )

In [6]:
%%time
for xbatch, y_batch in get_batches(X_train, y_batch)
    model_pipe_count.partial_fit(xbatch, y_batch)
    
train_accuray = np.mean(model_pipe_count.predict(X_train) == y_train)
test_accuracy = np.mean(model_pipe_count.predict(X_test) == y_test)
print(f'train_accuray = {train_accuray}')
print(f'test_accuray = {test_accuracy}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


train_accuray = 0.9999116139296447
test_accuray = 0.7993892724375996
CPU times: user 5min 16s, sys: 53.2 s, total: 6min 9s
Wall time: 3min 32s
