In [88]:
import re
import numpy as np
import pickle
import pandas as pd
from collections import Counter

from nltk import tokenize as token
import spacy
import textacy
from textblob import TextBlob

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

data_path = '/Users/eb/Google Drive/Research/text_words_meaning/twm-slides/notebooks/data/'

In [89]:
def spacy_pipe(nlp):
    """Custom spacy pipeline."""
    return(nlp.tagger, nlp.parser)


def get_lem(doc):
    """Return lemma of spacy doc if lemma is noun / adjective."""
    interesting_pos = ('NOUN', 'PROPN', 'ADJ')
    lems = [negated_word(word) for word in doc if word.pos_ in interesting_pos]

    return lems


def get_chunk(noun_chunk):
    """Return interesting parts of noun chunks."""
    interesting_pos = ('NOUN', 'PROPN', 'ADJ', 'ADV', 'VERB')
    chunk = [tok.lemma_ for tok in noun_chunk if tok.pos_ in interesting_pos]

    if len(chunk) > 1:
        return '_'.join(chunk)
    else:
        return ''

    
def negated_word(token):
    if token.pos and any(c.dep_ == 'neg' for c in token.children):
        return token.lemma_ + '_NEG'
    else:
        return token.lemma_

def term_list(doc):
    """Return term list item which is used to create term document matrix."""
    tl = []

    # lemmata of nouns and adjectives
    tl.extend(get_lem(doc))

    # noun chunks
    chunks = [get_chunk(chunk) for chunk in doc.noun_chunks]
    chunks = [chunk for chunk in chunks if chunk]
    tl.extend(chunks)

    return tl


def accuracy(pred, actual):
    """Calculate accuracy of predictions."""
    return sum(pred == actual) / len(pred)


def count_sentiment_words(tokens, lexicon):
    count = Counter(tokens)
    count = {key: count[key] for key in count if key in lexicon}
    return sum(list(count.values()))


def label_sentiment(review, lexica):
    tokens = [re.sub('[^A-Za-z]+', ' ', token)
              for token in token.word_tokenize(review)]

    n_token = len(tokens)
    n_positive = count_sentiment_words(tokens, lexica['positive'])
    n_negative = count_sentiment_words(tokens, lexica['negative'])
    fract_positive = n_positive / n_token
    fract_negative = n_negative / n_token
    
    if fract_positive > fract_negative:
        return 1
    else:
        return -1

In [16]:
seed = 123
n_keep = 3000

beer_reviews = pickle.load(open(data_path + '2_styles_sample.p', 'rb'))

In [21]:
# too much data. keep 2000 highest and 2000 lowest ratings
review_dict = [{'rating': meta['rating'], 'text': text}
               for meta, text in zip(beer_reviews[0], beer_reviews[1])]
df = pd.DataFrame(review_dict).sort_values(by='rating').reset_index(drop=True)
df = df.head(n_keep).append(df.tail(n_keep)).reset_index(drop=True)
df['sentiment'] = pd.Series([-1] * n_keep + [1] * n_keep)

In [50]:
reviews = df['text'].tolist()
rating = df['sentiment'].tolist()

In [25]:
nlp = spacy.load('en', create_pipeline=spacy_pipe)
texts = [doc for doc in nlp.pipe(reviews, n_threads=-1)]

# some additional pre-processing
texts = [' '.join(term_list(doc)) for doc in texts]

In [45]:
count_words = [doc.split(' ') for doc in texts]
count_words = [item for sublist in count_words for item in sublist]
len(Counter(count_words))

46208

In [79]:
# Use dimension reduction
texty_texts = textacy.Corpus('en', texts=texts)
tl_textacy = (doc.to_terms_list(ngrams=(1, 2), named_entities=True, as_strings=True) 
              for doc in texty_texts)

vec_specs = {'weighting': 'tfidf',
             'normalize': 'True',
             'smooth_idf': 'True',
             'min_df': 5,
             'max_df': 0.95}
vectorizer = textacy.Vectorizer(**vec_specs)
doc_term_matrix = vectorizer.fit_transform(tl_textacy)

model = textacy.tm.TopicModel('lda', n_topics=100, random_state=seed)
model.fit(doc_term_matrix)

doc_topic_matrix = model.transform(doc_term_matrix)

for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term):
    print('topic', topic_idx, ':', '   '.join(top_terms))



topic 0 : caramel   sweet   amber   head smell   taste sweet   chocolate caramel   caramel malt   candy   thin   sweet caramel
topic 1 : lace smell   small   spotty   goodness   lace   sour   chance   small white   brewery   carbonation smooth
topic 2 : depth   exceptional   beer perfect   thick mouthfeel   carbonation perfect   cloud   pours pitch   robust flavor   smell great   little coffee
topic 3 : silky   s-   m-   a-   bitter hop   just_hint   o-   family   crispness   book
topic 4 : nice   note   malty   head head   chocolatey   head nice   color nice   sticky   brown body   note taste
topic 5 : excellent   fruity   beer   summer   real   light beer   beer taste   sense   company   500ml
topic 6 : lemon   similar   biscuit   lime   tequila   citrus   soda   small head   hop malt   odd
topic 7 : wonderful   night   special   beer   texture   -pron-   -pron- beer   extra   quality   kind
topic 8 : mix   f   l   glass beer   pale golden   cigar   dirty   body smooth   chalky   dan

In [80]:
# Split in train and test
X_train, X_test, y_train, y_test = train_test_split(
    doc_topic_matrix, rating, test_size=0.33, random_state=seed)

In [81]:
parameters = {'clf__kernel': ('rbf', 'poly', 'linear', 'sigmoid'),
              'clf__gamma': ('auto', 1),
              'clf__C': (10, 1.0, 0.1)}

piper = Pipeline([('clf', SVC(random_state=seed))])

grid_search = GridSearchCV(piper, parameters, n_jobs=3, verbose=1,
                           refit=True, cv=3)

grid_search.fit(X_train, y_train)

print('Best score: %0.3f' % grid_search.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   43.4s
[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed:  1.3min finished


Best score: 0.857


In [82]:
y_pred = grid_search.predict(X_test)
res = pd.DataFrame({'y_test': pd.Series(y_test)})
res['y_pred'] = y_pred


print(pd.crosstab(res['y_test'], res['y_pred'], rownames=['True'],
                  colnames=['Predicted']))
print('Accuracy in test set: %0.3f' % accuracy(res['y_pred'], res['y_test']))

Predicted   -1    1
True               
-1         842  139
 1         116  883
Accuracy in test set: 0.871


In [83]:
grid_search.best_estimator_.get_params

<bound method Pipeline.get_params of Pipeline(memory=None,
     steps=[('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False))])>

### Check BL as Comparison

In [91]:
lexica = {
    sentiment: open(data_path + 'bl_%s.txt' % sentiment, encoding='utf-8',
                    errors='ignore').read().splitlines()
    for sentiment in ['positive', 'negative']
}

df['bl_pred'] = [label_sentiment(text, lexica)
                     for text in df['text']]

In [93]:
print(pd.crosstab(df['sentiment'], df['bl_pred'], rownames=['True'],
                  colnames=['Predicted']))
print('Accuracy in test set: %0.3f' % accuracy(df['sentiment'], df['bl_pred']))

Predicted    -1     1
True                 
-1         1023  1977
 1          403  2597
Accuracy in test set: 0.603
