# Trying Logistic Regression on Topic Models

I'd like to compare logistic regression on standard vectorizer output against logistic regression on topic-model features, to see whether the denser, lower-dimensional features help significantly.

In [1]:
# Reload imported modules automatically so edits to source files are picked up.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook.
%matplotlib inline

In [2]:
from fastai import *
from fastai.text import *
from fastai.utils.mem import GPUMemTrace #call with mtrace

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
import sklearn.feature_extraction.text as sklearn_text
import pickle 

In [5]:
# Fetch the IMDB dataset through fastai's `untar_data` and show where it lives on disk.
path = untar_data(URLs.IMDB)
path

PosixPath('/home/david/.fastai/data/imdb')

In [6]:
import pandas as pd
pd.options.display.max_colwidth = 500

In [7]:
# List the contents of the training folder.
(path/'train').ls()

[PosixPath('/home/david/.fastai/data/imdb/train/labeledBow.feat'),
 PosixPath('/home/david/.fastai/data/imdb/train/unsupBow.feat'),
 PosixPath('/home/david/.fastai/data/imdb/train/neg'),
 PosixPath('/home/david/.fastai/data/imdb/train/pos')]

In [8]:
%%time
# Labelling sometimes throws `BrokenProcessPool`; simply trying again usually works.
# Only that error is retried, and only up to MAX_ATTEMPTS times, so that any other
# failure (bad path, Ctrl-C, out of memory, ...) surfaces immediately instead of
# being swallowed and retried forever.
from concurrent.futures.process import BrokenProcessPool

MAX_ATTEMPTS = 20  # give up (and re-raise) after this many failed attempts
count = 0          # number of failed attempts so far
error = True       # stays True until preprocessing succeeds
while error:
    try:
        # Preprocessing steps
        reviews_full = (TextList.from_folder(path)
             #  Make a `TextList` object that is a list of `Path` objects,
             #     each of which contains the full path to one of the data files.
             .split_by_folder(valid='test')
             # Generate a `LabelLists` object that splits files by training and validation folders
             # Note: .label_from_folder in next line causes the `BrokenProcessPool` error
             .label_from_folder(classes=['neg', 'pos']))
             # Create a `CategoryLists` object which contains the data and
             #   its labels that are derived from folder names
        error = False
        print(f'failure count is {count}\n')
    except BrokenProcessPool:
        # accumulate failure count
        count = count + 1
        print(f'failure count is {count}')
        if count >= MAX_ATTEMPTS:
            # Not a transient hiccup any more: let the error propagate.
            raise


failure count is 0

CPU times: user 7.9 s, sys: 1.96 s, total: 9.86 s
Wall time: 39.4 s


In [9]:
# Keep a handle on the vocabulary attached to the labelled dataset.
vocab = reviews_full.vocab

In [10]:
# Number of surplus (duplicated) entries in the index-to-token list; 0 would mean every token is unique.
len(vocab.itos) - len(set(vocab.itos))

2

In [13]:
# Tally how often each token appears in `itos` to find the duplicated one(s).
pd.Series(vocab.itos).value_counts()

xxfake          3
titanium        1
strident        1
anachronism     1
raimi           1
               ..
nationals       1
disciplining    1
mafioso         1
marketing       1
blades          1
Length: 38438, dtype: int64

In [16]:
# Every (position, token) pair at which 'xxfake' occurs in the vocabulary.
[(position, token)
 for position, token in enumerate(vocab.itos)
 if token == 'xxfake']

[(38437, 'xxfake'), (38438, 'xxfake'), (38439, 'xxfake')]

In [21]:
# Fixes the multiplicity of 'xxfake'
#
# The duplicates sit at the very end of `itos`, so only the surplus trailing
# copies are dropped. Counting them (instead of always slicing off two entries)
# keeps this cell idempotent: re-running it leaves the vocabulary unchanged
# rather than deleting real tokens.
n_surplus = 0
while (n_surplus + 2 <= len(vocab.itos)
       and vocab.itos[-1 - n_surplus] == 'xxfake'
       and vocab.itos[-2 - n_surplus] == 'xxfake'):
    n_surplus += 1

# Rebind to a trimmed copy that keeps exactly one trailing 'xxfake'.
vocab.itos = vocab.itos[:len(vocab.itos) - n_surplus]
vocab.itos[-10:]

['zizola',
 'cornette',
 'knobs',
 'elster',
 'dm',
 "o'daniel",
 'lucianna',
 'marcie',
 'magalhães',
 'xxfake']

In [22]:
# Point 'xxfake' in the token-to-index map at its first occurrence in `itos`.
vocab.stoi['xxfake'] = vocab.itos.index('xxfake')

In [24]:
# Confirm that some token in `stoi` maps to index 38437.
38437 in vocab.stoi.values()

True

In [25]:
# Look up the index now stored for 'xxfake'.
vocab.stoi['xxfake']

38437

In [26]:
# How many tokens in `stoi` map to each index.
pd.Series(vocab.stoi).value_counts()

0        79719
2708         1
14994        1
8849         1
10896        1
         ...  
13612        1
3371         1
1322         1
7465         1
28626        1
Length: 38438, dtype: int64

## Building the term-document matrix from fast.ai's vocabulary

In [28]:
# Count vectorizer restricted to the fastai vocabulary.
# NOTE(review): CountVectorizer still applies its own default lowercasing and
# tokenization to the text — confirm that this matches the tokens in `vocab.itos`.
cv = sklearn_text.CountVectorizer(vocabulary=vocab.itos)

In [29]:
# Peek at the text of the first training review.
reviews_full.train.x[0].text

'xxbos i enjoyed the previous xxmaj ittenbach movie that i \'d seen , " xxmaj burning xxmaj moon " . xxmaj but while that movie was rather grim and nasty , " xxmaj premutos " seems to mostly play it for laughs . xxmaj while its admirable how xxmaj ittenbach made this movie with no money in his spare time ( and the xxup dvd documentary is worthwhile to see this ) , i found myself constantly battling not to fast - forward to the next gore scene . xxmaj sure , there \'s gore , and if that \'s all you want then go ahead and enjoy . xxmaj but be warned : there \'s an inordinate amount of lame comedy and tedious story exposition . xxmaj many are comparing this to xxmaj peter xxmaj jackson \'s movies , especially " xxmaj braindead " . xxmaj but looking at what xxmaj jackson did on a similar budget in " xxmaj bad xxmaj taste " , it \'s clear xxmaj ittenbach is lacking one thing that xxmaj jackson has - talent . 3 / 10 ( for pretty good and plentiful gore effects , and for getting the most out 

In [32]:
%%time
# Document-term count matrix for the training reviews.
doc_term_mat = cv.fit_transform([x.text for x in reviews_full.train.x])

CPU times: user 4.86 s, sys: 58.7 ms, total: 4.92 s
Wall time: 4.8 s


In [34]:
# Apply the same vectorizer to the validation reviews.
valid_doc_term = cv.transform([x.text for x in reviews_full.valid.x])

In [35]:
# Display the validation matrix summary.
valid_doc_term

<25000x38438 sparse matrix of type '<class 'numpy.int64'>'
	with 3360072 stored elements in Compressed Sparse Row format>

In [36]:
# Display the label list of the dataset.
reviews_full.y

CategoryList (25000 items)
neg,neg,neg,neg,neg
Path: /home/david/.fastai/data/imdb

In [37]:
# Class names of the label list.
reviews_full.y.classes

['neg', 'pos']

In [38]:
# Integer-encoded labels of the training split.
reviews_full.train.y.items

array([0, 0, 0, 0, ..., 1, 1, 1, 1])

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.pipeline import make_pipeline

In [47]:
# Baseline: logistic regression straight on the count matrix.
# liblinear uses random numbers while solving, so the seed is fixed to make
# the reported accuracy reproducible across runs.
lr_only = make_pipeline(
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

In [50]:
%%time

# Fit on the training counts, then report validation accuracy
# (fraction of predictions equal to the labels).
lr_only.fit(doc_term_mat, reviews_full.train.y.items)
(lr_only.predict(valid_doc_term) == reviews_full.valid.y.items).mean()

CPU times: user 8.19 s, sys: 27.8 ms, total: 8.22 s
Wall time: 8.24 s


0.86532

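NMF is not working well: with 100 components, accuracy drops to about 0.74, versus about 0.87 for plain logistic regression.  Kinda surprising to me.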

In [53]:
%%time

# NMF (100 topics) -> logistic regression on the count matrix.
# Seeds are fixed on both steps so the run is reproducible;
# `verbose=2` prints the solver's progress.
nmf = make_pipeline(NMF(n_components=100,
                        verbose=2,
                        random_state=42,
                       ),
                    LogisticRegression(dual=True, solver='liblinear', max_iter=500,
                                       random_state=42),
                   )

pipe = nmf

pipe.fit(doc_term_mat, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_doc_term) == reviews_full.valid.y.items).mean()

violation: 1.0
violation: 0.3451984883220355
violation: 0.3924725757634428
violation: 0.36359922386274257
violation: 0.3129004257175245
violation: 0.2700128556340622
violation: 0.22959302727181125
violation: 0.1977870342618297
violation: 0.17282701295307032
violation: 0.1556238192227033
violation: 0.1426735384340764
violation: 0.13251497083772987
violation: 0.12180697048645311
violation: 0.10822876366292548
violation: 0.09498537728808175
violation: 0.08454099229448633
violation: 0.07640475418903786
violation: 0.068934119622936
violation: 0.062244958017542595
violation: 0.05606276660804158
violation: 0.05103184882304673
violation: 0.046385138248229894
violation: 0.04236428412331383
violation: 0.03896012046218541
violation: 0.03618439287896018
violation: 0.03383196321334917
violation: 0.031847104335207224
violation: 0.030171440243737066
violation: 0.028455237309270975
violation: 0.02719454134737227
violation: 0.02617031276665348
violation: 0.025251869505587596
violation: 0.02442070298048

0.74152

`TruncatedSVD` is working much better.  It matches the performance of plain LR at about 500 topics, and starts to beat raw LR a bit as it goes over that.  Since the vocabulary is about 38,000 words, that's a very sizable reduction.

It's also **MUCH** faster than `NMF`.  I only tried to run up to about 100 components there, where it was starting to get too slow for my lunch break.

In [58]:
# (number of training reviews, vocabulary size) of the count matrix.
doc_term_mat.shape

(25000, 38438)

In [62]:
%%time

# TruncatedSVD (2000 components) -> logistic regression on the count matrix.
# Seeds are fixed because TruncatedSVD's default randomized solver and liblinear
# both use random numbers; without them the accuracy changes from run to run.
svd_lr = make_pipeline(
    TruncatedSVD(n_components=2000, random_state=42),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = svd_lr

pipe.fit(doc_term_mat, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_doc_term) == reviews_full.valid.y.items).mean()



CPU times: user 4min 26s, sys: 20 s, total: 4min 46s
Wall time: 3min 1s


0.86992

In [63]:
%%time

# TruncatedSVD (1500 components) -> logistic regression on the count matrix.
# Seeds are fixed because TruncatedSVD's default randomized solver and liblinear
# both use random numbers; without them the accuracy changes from run to run.
svd_lr = make_pipeline(
    TruncatedSVD(n_components=1500, random_state=42),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = svd_lr

pipe.fit(doc_term_mat, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_doc_term) == reviews_full.valid.y.items).mean()



CPU times: user 2min 57s, sys: 13.4 s, total: 3min 11s
Wall time: 2min 7s


0.8734

In [64]:
%%time

# TruncatedSVD (1000 components) -> logistic regression on the count matrix.
# Seeds are fixed because TruncatedSVD's default randomized solver and liblinear
# both use random numbers; without them the accuracy changes from run to run.
svd_lr = make_pipeline(
    TruncatedSVD(n_components=1000, random_state=42),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = svd_lr

pipe.fit(doc_term_mat, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_doc_term) == reviews_full.valid.y.items).mean()



CPU times: user 1min 49s, sys: 4.28 s, total: 1min 53s
Wall time: 1min 16s


0.86664

## Trying the pipeline with TF-IDF

TF-IDF is working much better than the `CountVectorizer`.

It's much harder to beat the default performance (according to accuracy) with TF-IDF, but it does eventually happen for `TruncatedSVD` after about 3000 topics.

I should look more into Precision and Recall, see what those metrics look like.

In [71]:
# Raw review texts for the training and validation splits, in that order.
train_docs, valid_docs = (
    [review.text for review in split.x]
    for split in (reviews_full.train, reviews_full.valid)
)

First, with no topics:

In [75]:
%%time

# TF-IDF -> logistic regression on the raw review text (no topic model).
# liblinear uses random numbers while solving, so the seed is fixed to make
# the reported accuracy reproducible across runs.
tfidf_lr = make_pipeline(
    sklearn_text.TfidfVectorizer(),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = tfidf_lr

pipe.fit(train_docs, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_docs) == reviews_full.valid.y.items).mean()

CPU times: user 7.87 s, sys: 46.5 ms, total: 7.91 s
Wall time: 7.92 s


0.88144

In [79]:
%%time

# TF-IDF -> TruncatedSVD (1000 components) -> logistic regression on raw text.
# Seeds are fixed because TruncatedSVD's default randomized solver and liblinear
# both use random numbers; without them the accuracy changes from run to run.
tfidf_svd_lr = make_pipeline(
    sklearn_text.TfidfVectorizer(),
    TruncatedSVD(n_components=1000, random_state=42),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = tfidf_svd_lr

pipe.fit(train_docs, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_docs) == reviews_full.valid.y.items).mean()

CPU times: user 1min 27s, sys: 5.19 s, total: 1min 32s
Wall time: 55.8 s


0.87816

In [80]:
%%time

# TF-IDF -> TruncatedSVD (1500 components) -> logistic regression on raw text.
# Seeds are fixed because TruncatedSVD's default randomized solver and liblinear
# both use random numbers; without them the accuracy changes from run to run.
tfidf_svd_lr = make_pipeline(
    sklearn_text.TfidfVectorizer(),
    TruncatedSVD(n_components=1500, random_state=42),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = tfidf_svd_lr

pipe.fit(train_docs, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_docs) == reviews_full.valid.y.items).mean()

CPU times: user 2min 24s, sys: 9.58 s, total: 2min 33s
Wall time: 1min 30s


0.87976

In [81]:
%%time

# TF-IDF -> TruncatedSVD (2000 components) -> logistic regression on raw text.
# Seeds are fixed because TruncatedSVD's default randomized solver and liblinear
# both use random numbers; without them the accuracy changes from run to run.
tfidf_svd_lr = make_pipeline(
    sklearn_text.TfidfVectorizer(),
    TruncatedSVD(n_components=2000, random_state=42),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = tfidf_svd_lr

pipe.fit(train_docs, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_docs) == reviews_full.valid.y.items).mean()

CPU times: user 4min 1s, sys: 22.8 s, total: 4min 24s
Wall time: 2min 20s


0.88128

In [82]:
%%time

# TF-IDF -> TruncatedSVD (3000 components) -> logistic regression on raw text.
# Seeds are fixed because TruncatedSVD's default randomized solver and liblinear
# both use random numbers; without them the accuracy changes from run to run.
tfidf_svd_lr = make_pipeline(
    sklearn_text.TfidfVectorizer(),
    TruncatedSVD(n_components=3000, random_state=42),
    LogisticRegression(dual=True, solver='liblinear', max_iter=500, random_state=42),
)

pipe = tfidf_svd_lr

pipe.fit(train_docs, reviews_full.train.y.items)
# Accuracy on the validation split.
(pipe.predict(valid_docs) == reviews_full.valid.y.items).mean()

CPU times: user 6min 45s, sys: 29.6 s, total: 7min 14s
Wall time: 3min 40s


0.8818