In [2]:
import matplotlib as mpl
%matplotlib inline
mpl.style.use('seaborn')

In [3]:
import os
import sys
import tarfile
import time

source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'

def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
        return
    progress_size = int(count * block_size)
    duration = time.time() - start_time
    if duration != 0:
        speed = progress_size / (1024 ** 2 * duration)
    else:
        speed = 99
    percent = count * block_size * 100. / total_size
    sys.stdout.write("\r%d%% | %d MB | %.2f MB/s | %d sec elapsed" %
                     (percent, progress_size / (1024. ** 2), speed, duration))
    sys.stdout.flush()

from pathlib import Path
bs_path=Path(os.getcwd()).joinpath('data-science').joinpath('Py_ML')
os.chdir(bs_path)
if not os.path.isdir('aclImdb') and not os.path.isfile('aclImdb_v1.tar.gz'):
    if (sys.version_info < (3, 0)):
        import urllib
        urllib.urlretrieve(source, target, reporthook)
    else:
        import urllib.request
        urllib.request.urlretrieve(source, target, reporthook)



In [4]:
os.getcwd()

if not os.path.isdir('aclImdb'):
    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall()

In [5]:
import pyprind
import pandas as pd
import os

# change the 'basepath' to the directory of the unzipped movie dataset

basepath = 'aclImdb'

labels = {'pos':1, 'neg':0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']





0% [#                             ] 100% | ETA: 00:02:59

0% [##                            ] 100% | ETA: 00:02:54

0% [###                           ] 100% | ETA: 00:02:46

0% [####                          ] 100% | ETA: 00:02:37

0% [#####                         ] 100% | ETA: 00:02:30

0% [######                        ] 100% | ETA: 00:02:24

0% [#######                       ] 100% | ETA: 00:02:19

0% [########                      ] 100% | ETA: 00:02:17

0% [#########                     ] 100% | ETA: 00:02:13

0% [##########                    ] 100% | ETA: 00:02:12

0% [###########                   ] 100% | ETA: 00:02:10

0% [############                  ] 100% | ETA: 00:02:03

0% [#############                 ] 100% | ETA: 00:01:56

0% [##############                ] 100% | ETA: 00:01:50

0% [###############               ] 100% | ETA: 00:01:44

0% [################              ] 100% | ETA: 00:01:38

0% [#################             ] 100% | ETA: 00:01:31

0% [##################            ] 100% | ETA: 00:01:24

0% [###################           ] 100% | ETA: 00:01:19

0% [####################          ] 100% | ETA: 00:01:16

0% [#####################         ] 100% | ETA: 00:01:12

0% [######################        ] 100% | ETA: 00:01:05

0% [#######################       ] 100% | ETA: 00:00:57

0% [########################      ] 100% | ETA: 00:00:49

0% [#########################     ] 100% | ETA: 00:00:41

0% [##########################    ] 100% | ETA: 00:00:33

0% [###########################   ] 100% | ETA: 00:00:25

0% [############################  ] 100% | ETA: 00:00:16

0% [############################# ] 100% | ETA: 00:00:08

0% [##############################] 100% | ETA: 00:00:00

0% [##############################] 100% | ETA: 00:00:00


Total time elapsed: 00:04:15


In [6]:
df.size
df.index

RangeIndex(start=0, stop=50000, step=1)

In [7]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

In [8]:
df


Unnamed: 0,review,sentiment
11841,I went and saw this movie last night after bei...,1
19602,Actor turned director Bill Paxton follows up h...,1
45519,As a recreational golfer with some knowledge o...,1
25747,"I saw this film in a sneak preview, and it is ...",1
42642,Bill Paxton has taken the true story of the 19...,1
31902,"I saw this film on September 1st, 2005 in Indi...",1
30346,"Maybe I'm reading into this too much, but I wo...",1
12363,I felt this film did have many good qualities....,1
32490,This movie is amazing because the fact that th...,1
26128,"""Quitting"" may be as much about exiting a pre-...",1


In [9]:
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
df.head(3)


Unnamed: 0,review,sentiment
11841,I went and saw this movie last night after bei...,1
19602,Actor turned director Bill Paxton follows up h...,1
45519,As a recreational golfer with some knowledge o...,1


In [10]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'])
print(docs)
bag = count.fit_transform(docs)

['The sun is shining' 'The weather is sweet'
 'The sun is shining, the weather is sweet, and one and one is two']


In [11]:
print(bag)

  (0, 3)	1
  (0, 1)	1
  (0, 4)	1
  (0, 6)	1
  (1, 5)	1
  (1, 8)	1
  (1, 1)	1
  (1, 6)	1
  (2, 7)	1
  (2, 2)	2
  (2, 0)	2
  (2, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 1)	3
  (2, 4)	1
  (2, 6)	2


In [12]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [13]:
print(bag.toarray())


[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [14]:
type(bag)
bag

In [15]:
np.set_printoptions(precision=2)

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

tfid = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

print(tfid.fit_transform(count.fit_transform(docs)).toarray())


[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [17]:
tf_is = 3
n_docs = 3
idf_is = np.log((n_docs+1) / (3+1))
tfidf_is = tf_is * (idf_is + 1)
print('tf-idf of term "is" = %.2f' % tfidf_is)

tf-idf of term "is" = 3.00


In [18]:
tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
raw_tfidf = tfidf.fit_transform(count.fit_transform(docs)).toarray()[-1]
raw_tfidf

array([3.39, 3.  , 3.39, 1.29, 1.29, 1.29, 2.  , 1.69, 1.29])

In [19]:
l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
l2_tfidf

array([0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19])

In [20]:
df.loc[11841, 'review'][:10]

'I went and'

In [21]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text

In [22]:
preprocessor(df.loc[0, 'review'][-50:])

'great way to spend an hour and a half '

In [23]:
preprocessor("</a>This :) is :( a test :-)!")


'this is a test :) :( :)'

In [24]:
df['review'] = df['review'].apply(preprocessor)

In [25]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]


In [26]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [27]:
tokenizer_porter('runners like running and thus they run')


['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [28]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garciag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [30]:
X_train = df.loc[:2500, 'review'].values
y_train = df.loc[:2500, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [str.split],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range':[(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [str.split], # [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, 
                           verbose=1, n_jobs=-1)



In [32]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.2min


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 16.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [34]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <method 'split' of 'str' objects>} 
CV Accuracy: 0.886


In [35]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.866


In [44]:
from sklearn.linear_model import LogisticRegression
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(25)]
X = (y + np.random.randn(25)).reshape(-1, 1)

print(y, '\n', X)
np.random.randn(100)

cv5_idx = list(StratifiedKFold(n_splits=5, shuffle=False, random_state=0).split(X, y))

print(cv5_idx)
cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5_idx)




[0, 1, 0, 1, 1, 2, 0, 2, 0, 0, 0, 2, 1, 2, 2, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1] 
 [[ 0.443863]
 [ 1.333674]
 [ 1.494079]
 [ 0.794842]
 [ 1.313068]
 [ 1.145904]
 [-2.55299 ]
 [ 2.653619]
 [ 0.864436]
 [-0.742165]
 [ 2.269755]
 [ 0.545634]
 [ 1.045759]
 [ 1.812816]
 [ 3.532779]
 [ 1.469359]
 [ 1.154947]
 [ 1.378163]
 [ 0.112214]
 [-0.980796]
 [-0.347912]
 [ 1.156349]
 [ 1.230291]
 [ 1.20238 ]
 [ 0.612673]]
[(array([ 4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24]), array([0, 1, 2, 3, 5])), (array([ 0,  1,  2,  3,  5,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24]), array([ 4,  6,  7,  8, 12])), (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 12, 13, 14, 15, 18, 19, 20, 21,
       22, 23, 24]), array([ 9, 10, 11, 16, 17])), (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 16, 17, 21,
       22, 23, 24]), array([13, 15, 18, 19, 20])), (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 15, 16, 17,
       18, 19, 2

array([0.6, 0.4, 0.6, 0.2, 0.6])

In [45]:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(LogisticRegression(), {}, cv=cv5_idx, verbose=3).fit(X, y)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ...................................... , score=0.6, total=   0.0s
[CV]  ................................................................
[CV] ...................................... , score=0.4, total=   0.0s
[CV]  ................................................................
[CV] ...................................... , score=0.6, total=   0.0s
[CV]  ................................................................
[CV] ...................................... , score=0.2, total=   0.0s
[CV]  ................................................................
[CV] ...................................... , score=0.6, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [46]:
gs.best_score_

0.48

In [47]:
cross_val_score(LogisticRegression(), X, y, cv=cv5_idx).mean()

0.48

In [1]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
            

In [5]:
next(stream_docs(path='movie_data.csv'))

('"I went and saw this movie last night after being coaxed to by a few friends of mine. I\'ll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."',
 1)

In [8]:
next(stream_docs(path='movie_data.csv'))
next(stream_docs(path='movie_data.csv'))

('"I went and saw this movie last night after being coaxed to by a few friends of mine. I\'ll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."',
 1)

In [13]:
for ele in next(stream_docs(path='movie_data.csv')):
    print(type(ele))

<class 'str'>
<class 'int'>


In [18]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn