## Sentiment analysis with word embedings

### Read in Data - IMBD Dataset

In [1]:
import os
import pandas as pd
import numpy as np

#### Read training data

In [2]:
labels = {'pos': 1, 'neg': 0}

train = pd.DataFrame()
for sentiment in ('pos', 'neg'):
    path =r'aclImdb/train/%s' %(sentiment)
    for review_file in os.listdir(path):
        with open(os.path.join(path, review_file), 'r') as input_file:
            review = input_file.read()
        train = train.append([[review, labels[sentiment]]],ignore_index=True)

train.columns = ['review', 'Sentiment']
indices = train.index.tolist()
np.random.shuffle(indices)
indices = np.array(indices)
train = train.reindex(index=indices)

In [11]:
train.iloc(1).review

AttributeError: '_iLocIndexer' object has no attribute 'review'

#### Read test data

In [5]:
labels = {'pos': 1, 'neg': 0}
test = pd.DataFrame()
for sentiment in ('pos', 'neg'):
    path =r'aclImdb/test/%s' %(sentiment)
    for review_file in os.listdir(path):
        with open(os.path.join(path, review_file), 'r') as input_file:
            review = input_file.read()
        test = test.append([[review, labels[sentiment]]],ignore_index=True)

test.columns = ['review', 'Sentiment']
indices = test.index.tolist()
np.random.shuffle(indices)
indices = np.array(indices)
test = test.reindex(index=indices)

### Load glove

In [33]:
def load_word_embedings(file):
    embeddings = {}
    with open(file, 'r') as infile:
        for line in infile:
            values = line.split()
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

fpath = 'glove.6B.300d.txt'
embeddings = load_word_embedings(fpath)

### Use the libary spacy to tokenize data.

In [25]:
import spacy
import string
import re
from spacy.symbols import ORTH

In [62]:
#Replace breaks, numbers and punctuations
def rm_punct(sentence):
    sentence = sentence.replace('<br />',"")
    regex = re.compile('[' + re.escape(string.punctuation) +'0-9\\r\\t\\n]')
    nopunct = regex.sub(' ', sentence)
    return nopunct

In [68]:
#Tokenize with Spacy and remove empty strings
my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(rm_punct(x)) if tok.text.isalpha()]

In [71]:
#remove stopwords
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

def get_non_stopwords(review):
    """Returns a list of non-stopwords"""
    return {x:1 for x in spacy_tok(str(review).lower()) if x not in stops}.keys()

### Create Mean embedding feature

In [72]:
def sentence_features_mean(s, embeddings=embeddings, emb_size=300):
    words = get_non_stopwords(s)
    words = [w for w in words if w.isalpha() and w in embeddings]
    if len(words) == 0:
        return np.hstack([np.zeros(emb_size)])
    M = np.array([embeddings[w] for w in words])
    return M.mean(axis=0)

In [73]:
x_train = np.array([sentence_features_mean(x) for x in train["review"].values])

In [75]:
x_test = np.array([sentence_features_mean(x) for x in test["review"].values])

In [76]:
y_train = train["Sentiment"].values
y_test = test["Sentiment"].values

### Run XGBOOST with Average Embedding

In [85]:
import xgboost as xgb
xgb_pars = {"min_child_weight": 50, "eta": 0.05, "max_depth": 8,
            "subsample": 0.8, "silent" : 1, "nthread": 4,
            "eval_metric": "logloss", "objective": "binary:logistic"}

d_train = xgb.DMatrix(x_train, label=y_train)
d_test = xgb.DMatrix(x_test, label=y_test)

watchlist = [(d_train, 'train'), (d_test, 'test')]

bst2 = xgb.train(xgb_pars, d_train, 800, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.679917	test-logloss:0.681763
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.412032	test-logloss:0.471938
[100]	train-logloss:0.326947	test-logloss:0.421436
[150]	train-logloss:0.281358	test-logloss:0.400522
[200]	train-logloss:0.251767	test-logloss:0.389223
[250]	train-logloss:0.229758	test-logloss:0.382391
[300]	train-logloss:0.212306	test-logloss:0.377866
[350]	train-logloss:0.196957	test-logloss:0.374057
[400]	train-logloss:0.184643	test-logloss:0.371871
[450]	train-logloss:0.173985	test-logloss:0.370368
[500]	train-logloss:0.164318	test-logloss:0.369074
[550]	train-logloss:0.155187	test-logloss:0.368571
[600]	train-logloss:0.147052	test-logloss:0.368241
Stopping. Best iteration:
[596]	train-logloss:0.147687	test-logloss:0.368151



### Bag of Words Approach

In [86]:
from sklearn.feature_extraction.text import CountVectorizer

In [108]:
veczr = CountVectorizer(tokenizer = spacy_tok, max_features= 10000)
train_term_doc = veczr.fit_transform(train['review'])
test_term_doc = veczr.transform(test['review'])

In [112]:
xgb_pars = {"min_child_weight": 50, "eta": 0.1, "max_depth": 8,
            "subsample": 0.8, "silent" : 1, "nthread": 4,
            "eval_metric": "logloss", "objective": "binary:logistic"}

d2_train = xgb.DMatrix(train_term_doc, label=y_train)
d2_test = xgb.DMatrix(test_term_doc, label=y_test)

watchlist = [(d2_train, 'train'), (d2_test, 'test')]

bst_bow = xgb.train(xgb_pars, d2_train, 1200, watchlist, early_stopping_rounds=50, verbose_eval=100)

[0]	train-logloss:0.669804	test-logloss:0.669906
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[100]	train-logloss:0.346665	test-logloss:0.380222
[200]	train-logloss:0.286714	test-logloss:0.341599
[300]	train-logloss:0.25397	test-logloss:0.327673
[400]	train-logloss:0.229206	test-logloss:0.321084
[500]	train-logloss:0.211619	test-logloss:0.318603
[600]	train-logloss:0.196303	test-logloss:0.317063
Stopping. Best iteration:
[623]	train-logloss:0.193595	test-logloss:0.316881



From the above，BOW approach actually outperforms embedding. However, for the first model, we only used the average embedding of the review, therefore only 300 features, while the BOW has 10000 features.