# Sentiment analysis

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy

nlp = spacy.load("en_core_web_sm", exclude=["ner"])

In [3]:
df = pd.read_parquet('s3://ling583/sentiment.parquet', storage_options={'anon': True})

In [4]:
docs = DocBin().from_disk('parsed.docbin')
df['doc'] = list(docs.get_docs(nlp.vocab))

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [6]:
train,test = train_test_split(df,
                             test_size=0.2,
                             stratify=df['sentiment'],
                             random_state=619)

----

In [7]:
displacy.render(nlp("They didn't have any clean towels."))

In [8]:
from spacy.tokens import Token
Token.set_extension('neg', default=False)

In [9]:
for doc in df['doc']:
    for t in doc:
        if t.dep_ == 'neg':
            t.head._.neg = True

In [10]:
def add_neg(token):
    return 'NOT:'+token.norm_ if token._.neg else token.norm_

In [11]:
def tokenize(doc):
    return [add_neg(t) for t in doc]

In [12]:
m1 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=tokenize),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m1.fit(train['doc'], train['sentiment'])
m1.score(test['doc'], test['sentiment'])

0.8989

In [13]:
def print_top_feats(M, k=0):
    V = M.named_steps['countvectorizer'].get_feature_names()
    coef = M.named_steps['sgdclassifier'].coef_[0]
    order = coef.argsort()
    for w1, w2 in zip(order[-k:][::-1],order[:k]):
        print(f'{V[w1]:20s} {coef[w1]:7.3f} | {V[w2]:20s} {coef[w2]:7.3f}')

In [14]:
print_top_feats(m1, 25)

great                  5.594 | ok                    -4.987
loved                  4.958 | average               -4.909
perfect                4.910 | dirty                 -4.903
excellent              4.608 | NOT:stay              -4.885
amazing                4.420 | poor                  -4.657
definitely             4.052 | ruined                -4.540
wonderful              4.019 | unhelpful             -4.410
comfortable            3.907 | not                   -4.238
appointed              3.871 | tiny                  -4.236
spacious               3.778 | dated                 -4.195
minor                  3.765 | worst                 -4.173
pleasantly             3.731 | filthy                -3.934
spotless               3.630 | dingy                 -3.907
NOT:beat               3.607 | terrible              -3.843
complaint              3.527 | outdated              -3.833
downside               3.503 | update                -3.795
elegant                3.418 | poorly   

In [15]:
def negify(tok):
    tok._.neg = True
    for child in tok.children:
        negify(child)

In [16]:
for doc in df['doc']:
    for t in doc:
        t._.neg = False
    for t in doc:        
        if t.dep_ == 'neg':
            t.head._.neg = True
            for r in t.head.rights:
                if r.dep_ in ['acomp', 'advmod', 'attr', 'dobj', 'prep', 'xcomp']:
                    negify(r)

In [17]:
m2 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=tokenize),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m2.fit(train['doc'], train['sentiment'])
m2.score(test['doc'], test['sentiment'])

0.9007

In [18]:
print_top_feats(m2, 25)

NOT:hesitate           5.485 | dirty                 -5.126
great                  5.425 | average               -4.899
loved                  4.949 | ok                    -4.669
perfect                4.603 | poor                  -4.539
excellent              4.219 | ruined                -4.302
wonderful              4.061 | dated                 -4.295
comfortable            4.042 | disappointed          -4.217
amazing                3.991 | unhelpful             -4.041
pleasantly             3.815 | worst                 -4.021
definitely             3.743 | outdated              -4.003
downside               3.738 | filthy                -3.981
NOT:better             3.649 | NOT:again             -3.979
NOT:beat               3.611 | terrible              -3.874
appointed              3.596 | not                   -3.855
lovely                 3.578 | tiny                  -3.848
spacious               3.554 | rude                  -3.632
minor                  3.525 | horrible 

In [19]:
def mod_tokenizer(doc):
    return [ add_neg(w.head) + '_' + add_neg(w) for w in doc 
            if w.dep_ in ['amod', 'advmod'] ] + \
            [ add_neg(w) for w in doc]

In [20]:
mod_tokenizer(nlp("The didn't have any clean towels."))

['towels_clean', 'the', 'do', 'not', 'have', 'any', 'clean', 'towels', '.']

In [21]:
m3 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=mod_tokenizer),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m3.fit(train['doc'], train['sentiment'])
m3.score(test['doc'], test['sentiment'])

0.908

In [22]:
print_top_feats(m3, 25)

NOT:hesitate           5.061 | dirty                 -4.706
loved                  4.702 | average               -4.288
great                  4.552 | poor                  -4.049
perfect                4.296 | ok                    -3.874
excellent              3.840 | terrible              -3.802
lovely                 3.500 | tiny                  -3.740
amazing                3.471 | ruined                -3.700
quiet                  3.430 | worst                 -3.682
wonderful              3.245 | filthy                -3.660
immaculate             3.099 | unhelpful             -3.576
thing_bad              3.093 | not                   -3.541
NOT:eat                3.093 | dated                 -3.531
NOT:disappointed       3.088 | outdated              -3.359
NOT:better             3.067 | horrible              -3.297
NOT:beat               3.001 | disappointed          -3.222
spotless               2.977 | small_so              -2.958
spacious               2.893 | rude     

In [23]:
def everything(doc):
    return [ add_neg(w.head) + '_' + add_neg(w) for w in doc ] + \
            [ add_neg(w) for w in doc]

In [24]:
everything(nlp("The didn't have any clean towels."))

['have_the',
 'have_do',
 'have_not',
 'have_have',
 'towels_any',
 'towels_clean',
 'have_towels',
 'have_.',
 'the',
 'do',
 'not',
 'have',
 'any',
 'clean',
 'towels',
 '.']

In [25]:
m4 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=everything),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m4.fit(train['doc'], train['sentiment'])
m4.score(test['doc'], test['sentiment'])

0.9133

In [26]:
print_top_feats(m4, 50)

great                  5.480 | average               -4.770
excellent              4.179 | ok                    -4.652
perfect                3.821 | dirty                 -4.311
wonderful              3.542 | not                   -4.110
comfortable            3.281 | poor                  -3.997
amazing                3.238 | terrible              -3.578
lovely                 3.123 | worst                 -3.541
quiet                  3.112 | no                    -3.392
clean_very             2.796 | tiny                  -3.139
definitely             2.756 | nothing               -3.048
minor                  2.734 | rude                  -3.035
loved                  2.681 | dated                 -2.923
awesome                2.633 | disappointed          -2.906
comfortable_very       2.619 | horrible              -2.894
fantastic              2.548 | unhelpful             -2.653
appointed              2.422 | NOT:again             -2.635
beautiful              2.396 | bad      

In [43]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=everything, max_df=.93, min_df=1),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

0.9122

It seems that the best model to pick is the m4 moddel with the everything tokenizer because after trying out different parameters like min_df, max_df, and alpha for sgd classifier, the score doesn't get higher than 0.9133. The closest I could get to that score was when I set the min_df = 1 and max_df = 0.93, but even then the score still couldn't get higher than m4.