In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from glob import glob
import numpy as np
import os,re,string
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

## IMDB dataset and the sentiment classification task

In [122]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [123]:
!gunzip aclImdb_v1.tar.gz

In [124]:
!tar -xvf aclImdb_v1.tar

### Tokenizing and term document matrix creation

In [125]:
# Root of the extracted IMDB dataset; contains the train/ and test/ folders
PATH='data/aclImdb/'
# Class sub-folder names; the list index is used as the integer label (neg=0, pos=1)
names = ['neg','pos']

In [126]:
!ls {PATH}

README     imdb.vocab imdbEr.txt test       train


In [127]:
!ls {PATH}train

labeledBow.feat pos             unsupBow.feat   urls_pos.txt
neg             unsup           urls_neg.txt    urls_unsup.txt


In [128]:
!ls {PATH}train/pos | head

0_9.txt
10000_8.txt
10001_10.txt
10002_7.txt
10003_8.txt
10004_8.txt
10005_7.txt
10006_7.txt
10007_7.txt
10008_7.txt


In [129]:
!ls {PATH}test

labeledBow.feat neg             pos             urls_neg.txt    urls_pos.txt


In [130]:
!ls {PATH}test/pos | head

0_10.txt
10000_7.txt
10001_9.txt
10002_8.txt
10003_8.txt
10004_9.txt
10005_8.txt
10006_7.txt
10007_10.txt
10008_8.txt


In [142]:
def load_texts_labels_from_folders(path, folders):
    """Read every text file under `path/<folder>` and label it by folder index.

    Args:
        path: directory that contains one sub-directory per class.
        folders: sub-directory names; the position of a name in this
            sequence becomes the integer label of the files inside it.

    Returns:
        (texts, labels): a list holding each file's contents and a matching
        np.int8 array with one label per text.
    """
    texts,labels = [],[]
    for idx,label in enumerate(folders):
        # glob returns paths in arbitrary order; sorting keeps positional
        # lookups such as texts[0] reproducible across machines.
        for fname in sorted(glob(os.path.join(path, label, '*.*'))):
            # The context manager closes each handle promptly instead of leaking
            # one per file; the explicit encoding avoids depending on the
            # platform's default locale.
            with open(fname, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(idx)
    # stored as np.int8 to save space
    return texts, np.array(labels).astype(np.int8)

In [143]:
# Training reviews and labels come from train/; the test/ split serves as the validation set
trn,trn_y = load_texts_labels_from_folders(f'{PATH}train',names)
val,val_y = load_texts_labels_from_folders(f'{PATH}test',names)

In [144]:
# Texts and labels must have matching lengths within each split
len(trn),len(trn_y),len(val),len(val_y)

(25000, 25000, 25000, 25000)

In [145]:
len(trn_y[trn_y==1]),len(val_y[val_y==1])

(12500, 12500)

In [148]:
type(trn_y),type(trn)

(numpy.ndarray, list)

In [147]:
np.unique(trn_y)

array([0, 1], dtype=int8)

In [149]:
??texts_labels_from_folders

Here is the text of the first review:

In [152]:
print(trn[0])
print()
print(f"Review's label: {trn_y[0]}")
# 0 represents a negative review

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.

Review's label: 0


In [104]:
trn_y[0] # 0 represents a negative review

0

[`CountVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) converts a collection of text documents to a matrix of token counts (part of `sklearn.feature_extraction.text`).

In [2]:
# Characters that become their own token. `string.punctuation` is escaped
# because it is spliced into a character class: unescaped, its `\]` pair is
# read as an escaped `]`, so a literal backslash was never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Split `s` on whitespace, emitting each punctuation mark as its own token.

    Every character matched by `re_tok` is surrounded with spaces before the
    string is split, so "isn’t" becomes ['isn', '’', 't'].

    Args:
        s: the text to tokenize.

    Returns:
        A list of token strings (empty for an empty or all-whitespace input).
    """
    return re_tok.sub(r' \1 ', s).split()

In [3]:
s = "This 'movie' isn’t good." 
tokenize(s)

['This', "'", 'movie', "'", 'isn', '’', 't', 'good', '.']

In [171]:
# vectorizer that will build the term-document matrix, using the custom tokenizer
veczr = CountVectorizer(tokenizer=tokenize)

`fit_transform(trn)` finds the vocabulary in the training set and transforms the training set into a term-document matrix. Since we have to apply the *same transformation* to the validation set, the second line uses just the method `transform(val)`. `trn_term_doc` and `val_term_doc` are sparse matrices. `trn_term_doc[i]` represents training document i and contains the count of each vocabulary word in that document.

In [172]:
# Learn the vocabulary from the training reviews and count terms per review
trn_term_doc = veczr.fit_transform(trn)
# Important: use the same vocabulary for the validation set (transform only, no re-fit)
val_term_doc = veczr.transform(val)

In [174]:
trn_term_doc

<25000x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 3749745 stored elements in Compressed Sparse Row format>

In [187]:
trn_term_doc[5] #83 stored elements

<1x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 83 stored elements in Compressed Sparse Row format>

In [188]:
# Unique lower-cased words of training review 5, split on single spaces only
w0 = {word.lower() for word in trn[5].split(' ')}
w0

{'"it',
 '"ripping',
 'a',
 'accusations',
 'agree',
 'allen',
 "allen's",
 'and',
 'appears',
 'applauded',
 'are',
 'bergman',
 'bergman,',
 'bergman.',
 'brian',
 'but',
 'contemporaneous',
 'contrary',
 'critics',
 'cultural',
 'depalma',
 "don't",
 'drama',
 'excoriated',
 'films?',
 'find',
 'for',
 'form',
 'generally',
 'get',
 'good',
 'have',
 'his',
 'hitchcock',
 'i',
 'idea',
 'imitating',
 'imitations',
 'in',
 'is',
 "it's",
 'kid',
 "let's",
 'many',
 'mostly',
 'not',
 'notwithstanding.',
 'of',
 'off"',
 'originality',
 'ourselves:',
 'pretensions,',
 'pretentious',
 'reason:',
 'robin',
 'snobbery.',
 'strange',
 'supportive',
 'suspense/horror',
 'that',
 'that.',
 'the',
 'they',
 'this:',
 'to',
 'unbearably',
 'unpalatable."',
 'view,',
 'was',
 'were',
 'what',
 'whining',
 'why',
 'with',
 "wood's",
 'wooden',
 'woody',
 'would'}

In [189]:
len(w0)
# 78 unique words, close to the 83 stored elements of trn_term_doc[5];
# the difference arises because this split on spaces is not a real tokenizer.

78

In [190]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out` when available, falling back for older releases.
if hasattr(veczr, 'get_feature_names_out'):
    vocab = veczr.get_feature_names_out().tolist()  # plain list of str, like the old API
else:
    vocab = veczr.get_feature_names()
print(len(vocab))
vocab[5000:5005]

75132


['aussie', 'aussies', 'austen', 'austeniana', 'austens']

In [191]:
veczr.vocabulary_['absurd']

1297

In [192]:
trn_term_doc[0,1297]
# the word 'absurd' appears twice in the first document

2

In [198]:
vocab[4050]

'arching'

In [199]:
veczr.vocabulary_['arching']

4050

In [200]:
trn_term_doc[0,4050]
# word 'arching' does not appear in the first document

0

## Naive Bayes

In [219]:
def pr(y_i):
    """Smoothed per-feature ratio for class `y_i`.

    Reads the module-level `x` (feature matrix, one row per document) and `y`
    (one label per row). The column sums over the rows labelled `y_i` are
    divided by the number of such rows, with 1 added to both the numerator
    and the denominator.
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)

In [215]:
# pr() reads these two module-level names
x=trn_term_doc
y=trn_y

# log-count ratio of each vocabulary term between class 1 and class 0
r = np.log(pr(1)/pr(0))
# log ratio of the class priors
b = np.log((y==1).mean() / (y==0).mean())

In [210]:
r.shape,val_term_doc.shape

((1, 75132), (25000, 75132))

In [214]:
# NOTE(review): `preds` is not assigned anywhere above this cell; it appears to rely on
# a later cell having been run first — confirm and reorder for Restart & Run All.
preds

matrix([[False, False, False, ...,  True,  True,  True]])

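Here is the formula for Naive Bayes: a review is predicted positive when $x \cdot r^{T} + b > 0$, where $x$ is the review's row of the term-document matrix, $r$ is the log-count ratio of each feature and $b$ is the log of the ratio of the class priors.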

In [203]:
# Per-document score: term counts times the log-count ratios, plus the class-prior log ratio
pre_preds = val_term_doc @ r.T + b
# Predict class 1 (pos) when the score is positive
preds = pre_preds.T>0
# Fraction of validation labels matched by the predictions
(preds==val_y).mean()

0.81656

...and binarized Naive Bayes.

In [221]:
# Binarize the training counts with sign(); pr() reads this reassigned global x
x=trn_term_doc.sign()
# Recompute the log-count ratio from the binarized features
r = np.log(pr(1)/pr(0))

pre_preds = val_term_doc.sign() @ r.T + b # binarize the validation counts the same way
preds = pre_preds.T>0
(preds==val_y).mean()

0.83016