# Applying Machine Learning to Sentiment Analysis

Source: http://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
import pyprind
import pandas as pd
import os

reload_data = False

In [2]:
if reload_data:
    pbar = pyprind.ProgBar(50000)
    labels = {'pos':1, 'neg':0}
    df = pd.DataFrame()
    for s in ('test', 'train'):
        for l in ('pos', 'neg'):
            path ='./downloads/aclImdb/%s/%s' % (s, l)
            for file in os.listdir(path):
                with open(os.path.join(path, file), 'r', encoding="utf8") as infile:
                    txt = infile.read()
                df = df.append([[txt, labels[l]]], ignore_index=True)
                pbar.update()

    df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:18


In [3]:
if reload_data:
    df.to_csv('./downloads/aclImdb/movie_data.csv', index=False, encoding='utf8')

In [2]:
df = pd.read_csv('./downloads/aclImdb/movie_data.csv', encoding='utf8')
print(df.head())

import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

                                              review  sentiment
0  I went and saw this movie last night after bei...          1
1  Actor turned director Bill Paxton follows up h...          1
2  As a recreational golfer with some knowledge o...          1
3  I saw this film in a sneak preview, and it is ...          1
4  Bill Paxton has taken the true story of the 19...          1


### Bag of words concept

In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining', 'The water is sweet', 'The sun is shining and the water is sweet'])
bag = count.fit_transform(docs)

In [6]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'water': 6, 'sweet': 4, 'and': 0}


In [7]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [8]:
# inverse document frequency and the importance of words/tokens
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.          0.43370786  0.55847784  0.55847784  0.          0.43370786
   0.        ]
 [ 0.          0.43370786  0.          0.          0.55847784  0.43370786
   0.55847784]
 [ 0.40474829  0.47810172  0.30782151  0.30782151  0.30782151  0.47810172
   0.30782151]]


In [9]:
# data cleansing
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [10]:
df['review'] = df['review'].apply(preprocessor)

In [11]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [12]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    #return [porter.stem(word) for word in text.split()]
    return pd.Series(text.split()).apply(porter.stem).tolist()
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grzegorz.melniczak\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [15]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import tokenizers
import pickle

estimate = False

if estimate:
    param_grid = [{'vect__ngram_range': [(1,1)],
                   'vect__stop_words': [stop, None],
                   'vect__tokenizer': [tokenizers.tokenizer, 
                                       tokenizers.tokenizer_porter],
                   'clf__penalty': ['l1', 'l2'],
                   'clf__C': [1.0, 10.0, 100.0]},
                  {'vect__ngram_range': [(1,1)],
                   'vect__stop_words': [stop, None],
                   'vect__tokenizer': [tokenizers.tokenizer, 
                                       tokenizers.tokenizer_porter],
                   'vect__use_idf':[False],
                   'vect__norm':[None],
                   'clf__penalty': ['l1', 'l2'],
                   'clf__C': [1.0, 10.0, 100.0]}]

    lr_tfidf = Pipeline([('vect', TfidfVectorizer(strip_accents=None, 
                                                  lowercase=False, 
                                                  preprocessor=None)),
                         ('clf', LogisticRegression(random_state=0))])
    gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, 
                               scoring='accuracy',
                               cv=5, verbose=2,
                               n_jobs=4)

    gs_lr_tfidf.fit(X_train, y_train)
    
    with open('gs_lr_tfidf', 'wb') as f:
        pickle.dump(gs_lr_tfidf, f, pickle.HIGHEST_PROTOCOL)
else:
    with open("gs_lr_tfidf", "rb") as f:
        gs_lr_tfidf = pickle.load(f)

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(gs_lr_tfidf.predict(X_test), y_test)

0.89720163773559369

In [58]:
gs_lr_tfidf.predict(['I like this movie'])

array([0], dtype=int64)

In [20]:
gs_lr_tfidf.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function tokenizers.tokenizer>}

In [21]:
gs_lr_tfidf.best_score_

0.89152973395931145

In [23]:
gs_lr_tfidf.score(X_test, y_test)

0.89720163773559369

## Working with bigger data – online algorithms and out-of-core learning

In [3]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

tokenizer('tralalala:)')

['tralalala', ':)']

In [15]:
def stream_docs(path):
    with open(path, 'r', encoding='utf8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [33]:
sd = stream_docs(path='./downloads/aclImdb/movie_data.csv')
print(next(sd))
print(next(sd))

('"I went and saw this movie last night after being coaxed to by a few friends of mine. I\'ll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."', 1)
('"Actor turned director Bill Paxton follows up his promising debut, the Gothic-horror ""Frailty"", with this family friendly sports drama about the 1913 U.S. Open where a young American caddy ri

In [16]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [14]:
np.random.seed()
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./downloads/aclImdb/movie_data_unsorted.csv', index=False, encoding='utf8')

In [17]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21, 
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', n_iter=1)
doc_stream = stream_docs(path='./downloads/aclImdb/movie_data_unsorted.csv')

In [18]:
import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:28


In [19]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.869


In [20]:
clf = clf.partial_fit(X_test, y_test)

In [21]:
clf.predict_proba(vect.transform(['I love it']))

array([[ 0.07257276,  0.92742724]])

### Chapter 9: Serializing objects

In [22]:
import pickle
import os

dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
#pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'),'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
#pickle.dump(gs_lr_tfidf, open(os.path.join(dest, 'classifier_large.pkl'), 'wb'), protocol=4)