# Applying Machine Learning to Sentiment Analysis

Source: http://ai.stanford.edu/~amaas/data/sentiment/

In [3]:
import pyprind
import pandas as pd
import os

reload_data = False

In [4]:
if reload_data:
    pbar = pyprind.ProgBar(50000)
    labels = {'pos':1, 'neg':0}
    df = pd.DataFrame()
    for s in ('test', 'train'):
        for l in ('pos', 'neg'):
            path ='./downloads/aclImdb/%s/%s' % (s, l)
            for file in os.listdir(path):
                with open(os.path.join(path, file), 'r', encoding="utf8") as infile:
                    txt = infile.read()
                df = df.append([[txt, labels[l]]], ignore_index=True)
                pbar.update()

    df.columns = ['review', 'sentiment']

In [5]:
if reload_data:
    df.to_csv('./downloads/aclImdb/movie_data.csv', index=False, encoding='utf8')

In [6]:
df = pd.read_csv('./downloads/aclImdb/movie_data.csv', encoding='utf8')
print(df.head())

import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

                                              review  sentiment
0  I went and saw this movie last night after bei...          1
1  Actor turned director Bill Paxton follows up h...          1
2  As a recreational golfer with some knowledge o...          1
3  I saw this film in a sneak preview, and it is ...          1
4  Bill Paxton has taken the true story of the 19...          1


### Bag of words concept

In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining', 'The water is sweet', 'The sun is shining and the water is sweet'])
bag = count.fit_transform(docs)

In [8]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'water': 6, 'sweet': 4, 'and': 0}


In [9]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [10]:
# inverse document frequency and the importance of words/tokens
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.          0.43370786  0.55847784  0.55847784  0.          0.43370786
   0.        ]
 [ 0.          0.43370786  0.          0.          0.55847784  0.43370786
   0.55847784]
 [ 0.40474829  0.47810172  0.30782151  0.30782151  0.30782151  0.47810172
   0.30782151]]


In [11]:
# data cleansing
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [12]:
df['review'] = df['review'].apply(preprocessor)

In [13]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [14]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    #return [porter.stem(word) for word in text.split()]
    return pd.Series(text.split()).apply(porter.stem).tolist()
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [15]:
import nltk
nltk.download('stopwords')

removing collection member with no package: panlex_lite
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Grzegorz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [17]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [18]:
from pathos.multiprocessing import ProcessingPool as Pool
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import tokenizers

param_grid = [{'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizers.tokenizer, 
                                   tokenizers.tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizers.tokenizer, 
                                   tokenizers.tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', TfidfVectorizer(strip_accents=None, 
                                              lowercase=False, 
                                              preprocessor=None)),
                     ('clf', LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, 
                           scoring='accuracy',
                           cv=5, verbose=2,
                           n_jobs=4)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 25.9min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 148.0min
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed: 229.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=4,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', '...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
     

In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(gs_lr_tfidf.predict(X_test), y_test)

0.89720163773559369

In [20]:
import pickle

with open('gs_lr_tfidf', 'wb') as f:
    pickle.dump(gs_lr_tfidf, f, pickle.HIGHEST_PROTOCOL)
    

In [28]:
gs_lr_tfidf.predict(['I like this movie very much'])

array([0], dtype=int64)