# Sentiment analysis

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy

nlp = spacy.load("en_core_web_sm", exclude=["ner"])

In [3]:
df = pd.read_parquet('s3://ling583/sentiment.parquet', storage_options={'anon': True})

In [4]:
docs = DocBin().from_disk('parsed.docbin')
df['doc'] = list(docs.get_docs(nlp.vocab))

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# sentiment label and uses a fixed seed so it is the same on every run.
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['sentiment'],
    random_state=619,
)

----

In [7]:
displacy.render(nlp("They didn't have any clean towels."))

In [8]:
from spacy.tokens import Token

# Register a custom boolean attribute, reachable as `token._.neg`, that
# defaults to False; later cells set it to True on negated tokens.
# `force=True` makes this cell safe to re-run in the same session: without
# it spaCy raises a ValueError because the extension already exists.
Token.set_extension('neg', default=False, force=True)

In [13]:
# First negation pass: flag only the word each negation attaches to.
for doc in df['doc']:
    for tok in doc:
        if tok.dep_ != 'neg':
            continue
        # The head of a `neg` dependent is the word being negated.
        tok.head._.neg = True

In [14]:
def add_neg(token):
    """Return the token's normalized form as a feature string.

    Tokens whose `_.neg` flag is truthy get a 'NOT:' prefix so that a
    negated word becomes a different feature from the plain word.
    """
    prefix = 'NOT:' if token._.neg else ''
    return prefix + token.norm_

In [15]:
def tokenize(doc):
    """Turn a parsed document into a list of unigram features.

    Each token is mapped through `add_neg`, so negated tokens appear with
    the 'NOT:' prefix and all others as their normalized form.
    """
    return list(map(add_neg, doc))

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

In [25]:
# Model 1: unigram features with head-only negation marking.
# `preprocessor=identity` hands each parsed doc to the tokenizer unchanged.
m1 = make_pipeline(
    CountVectorizer(preprocessor=identity, tokenizer=tokenize),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
).fit(train['doc'], train['sentiment'])
# Accuracy on the held-out test split (displayed as the cell output).
m1.score(test['doc'], test['sentiment'])

0.8986

In [26]:
def print_top_feats(M, k=0):
    """Print the k highest- and k lowest-weighted features side by side.

    Parameters
    ----------
    M : fitted pipeline
        Must expose `named_steps['countvectorizer']` (source of the feature
        names) and `named_steps['sgdclassifier']` (source of `coef_`).
    k : int, default 0
        Number of rows to print. With the default of 0 nothing is printed.

    Each row shows a feature from the high end of the weight ranking on the
    left and one from the low end on the right. Only the first row of
    `coef_` is used.
    """
    vectorizer = M.named_steps['countvectorizer']
    # `get_feature_names()` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        V = vectorizer.get_feature_names_out()
    else:
        V = vectorizer.get_feature_names()
    coef = M.named_steps['sgdclassifier'].coef_[0]
    # Indices sorted by ascending weight: lowest first, highest last.
    order = coef.argsort()
    for w1, w2 in zip(order[-k:][::-1], order[:k]):
        print(f'{V[w1]:20s} {coef[w1]:7.3f} | {V[w2]:20s} {coef[w2]:7.3f}')

In [28]:
print_top_feats(m1, 25)

great                  5.614 | NOT:stay              -5.157
perfect                4.925 | average               -5.054
loved                  4.842 | ok                    -4.973
excellent              4.418 | dirty                 -4.793
amazing                4.232 | poor                  -4.536
wonderful              4.001 | ruined                -4.519
definitely             3.978 | unhelpful             -4.460
appointed              3.820 | not                   -4.435
comfortable            3.813 | tiny                  -4.222
spacious               3.745 | dated                 -4.095
pleasantly             3.719 | worst                 -4.024
minor                  3.629 | filthy                -3.988
NOT:beat               3.623 | dingy                 -3.974
downside               3.603 | outdated              -3.841
spotless               3.498 | update                -3.805
complaint              3.478 | terrible              -3.782
elegant                3.462 | poorly   

In [29]:
def negify(tok):
    """Flag `tok` and every token below it in the dependency tree as negated.

    Sets `_.neg = True` on the token itself, on each of its children, and
    on their children in turn. Returns nothing.
    """
    pending = [tok]
    while pending:
        node = pending.pop()
        node._.neg = True
        # Queue the dependents so the flag reaches the whole subtree.
        pending.extend(node.children)

In [30]:
# Second negation pass: widen the scope from the negated head to selected
# dependents on its right-hand side.
for doc in df['doc']:
    # Start from a clean slate so flags set earlier do not carry over.
    for tok in doc:
        tok._.neg = False
    for tok in doc:
        if tok.dep_ != 'neg':
            continue
        governor = tok.head
        governor._.neg = True
        # Only right-hand dependents with these relations (and everything
        # beneath them) are pulled into the negation's scope.
        for dependent in governor.rights:
            if dependent.dep_ in ('acomp', 'advmod', 'attr', 'dobj', 'prep', 'xcomp'):
                negify(dependent)

In [31]:
# Model 2: same unigram pipeline as before, refit after the negation flags
# were recomputed with the wider scope.
m2 = make_pipeline(
    CountVectorizer(preprocessor=identity, tokenizer=tokenize),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
).fit(train['doc'], train['sentiment'])
# Accuracy on the held-out test split (displayed as the cell output).
m2.score(test['doc'], test['sentiment'])

0.901

In [33]:
print_top_feats(m2, 25)

great                  5.559 | dirty                 -5.075
NOT:hesitate           5.338 | average               -5.043
loved                  5.027 | ok                    -4.680
perfect                4.702 | poor                  -4.566
excellent              4.313 | dated                 -4.338
comfortable            4.088 | ruined                -4.320
wonderful              3.947 | NOT:again             -4.188
amazing                3.895 | disappointed          -4.160
pleasantly             3.748 | worst                 -4.111
downside               3.656 | unhelpful             -4.055
definitely             3.654 | filthy                -4.033
NOT:better             3.652 | outdated              -4.005
NOT:beat               3.647 | terrible              -3.991
appointed              3.640 | not                   -3.984
minor                  3.590 | tiny                  -3.935
spacious               3.571 | horrible              -3.625
lovely                 3.550 | rude     

In [34]:
def mod_tokenizer(doc):
    """Return modifier-pair features followed by unigram features.

    For every token whose dependency label is 'amod' or 'advmod', a
    'head_modifier' feature is produced. These pairs come first in the
    result, followed by one unigram feature per token. Every word is
    rendered through `add_neg`.
    """
    modifier_pairs = []
    for w in doc:
        if w.dep_ in ('amod', 'advmod'):
            modifier_pairs.append(f'{add_neg(w.head)}_{add_neg(w)}')
    unigrams = list(map(add_neg, doc))
    return modifier_pairs + unigrams

In [35]:
mod_tokenizer(nlp("The didn't have any clean towels."))

['towels_clean', 'the', 'do', 'not', 'have', 'any', 'clean', 'towels', '.']

In [36]:
# Model 3: unigrams plus head_modifier pairs for amod/advmod dependents.
m3 = make_pipeline(
    CountVectorizer(preprocessor=identity, tokenizer=mod_tokenizer),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
).fit(train['doc'], train['sentiment'])
# Accuracy on the held-out test split (displayed as the cell output).
m3.score(test['doc'], test['sentiment'])

0.909

In [37]:
print_top_feats(m3, 25)

NOT:hesitate           5.042 | dirty                 -4.713
great                  4.607 | average               -4.223
loved                  4.549 | ok                    -3.921
perfect                4.293 | poor                  -3.890
excellent              3.652 | terrible              -3.841
quiet                  3.458 | tiny                  -3.781
amazing                3.449 | worst                 -3.741
lovely                 3.376 | ruined                -3.669
wonderful              3.233 | filthy                -3.647
NOT:disappointed       3.084 | unhelpful             -3.628
NOT:beat               3.054 | not                   -3.588
thing_bad              3.036 | dated                 -3.486
immaculate             3.030 | outdated              -3.216
NOT:better             3.030 | horrible              -3.199
NOT:eat                3.018 | disappointed          -3.178
spotless               2.920 | rude                  -2.989
minor                  2.880 | thing_bes

In [38]:
def everything(doc):
    """Return a 'head_dependent' feature for every token, then all unigrams.

    Unlike `mod_tokenizer`, no dependency label is filtered out: each token
    contributes one pair with its head. The example output in this notebook
    includes 'have_have', i.e. a token paired with itself. Every word is
    rendered through `add_neg`.
    """
    head_pairs = ['_'.join((add_neg(w.head), add_neg(w))) for w in doc]
    unigrams = list(map(add_neg, doc))
    return head_pairs + unigrams

In [39]:
everything(nlp("The didn't have any clean towels."))

['have_the',
 'have_do',
 'have_not',
 'have_have',
 'towels_any',
 'towels_clean',
 'have_towels',
 'have_.',
 'the',
 'do',
 'not',
 'have',
 'any',
 'clean',
 'towels',
 '.']

In [40]:
# Model 4: unigrams plus a head_dependent pair for every token.
m4 = make_pipeline(
    CountVectorizer(preprocessor=identity, tokenizer=everything),
    TfidfTransformer(),
    SGDClassifier(alpha=1e-5),
).fit(train['doc'], train['sentiment'])
# Accuracy on the held-out test split (displayed as the cell output).
m4.score(test['doc'], test['sentiment'])

0.9114

In [41]:
print_top_feats(m4, 50)

great                  5.502 | average               -4.809
excellent              4.077 | ok                    -4.636
perfect                3.792 | dirty                 -4.281
wonderful              3.514 | poor                  -4.011
comfortable            3.283 | not                   -3.934
amazing                3.227 | worst                 -3.629
lovely                 3.203 | terrible              -3.626
quiet                  3.166 | no                    -3.487
clean_very             2.849 | tiny                  -3.238
definitely             2.789 | rude                  -3.026
minor                  2.775 | nothing               -2.924
loved                  2.678 | dated                 -2.922
awesome                2.655 | horrible              -2.868
comfortable_very       2.585 | disappointed          -2.851
fantastic              2.527 | unhelpful             -2.641
spacious               2.488 | NOT:again             -2.605
appointed              2.411 | bad      

### Summary 

This approach to sentiment analysis has several advantages: the model can account for where a word appears and for how its meaning changes depending on whether the attitude expressed is positive or negative. The final model has an accuracy of 91%, which is really good. The top 50 positive and top 50 negative features look accurate and make sense for their polarity. For example, positive word pairs from the hotel reviews include "great hotel", "stay again", "very clean", "very comfortable", and "pleasantly surprised". On the other hand, negative word pairs include "very small", "average hotel", "need of", "very expensive", and "stay away". These features make perfect sense and seem very accurate. To improve the model, we could tune hyperparameters such as `min_df` and `max_df` to filter out words that occur too often or too rarely. In addition, we could also try different models, such as the SGD classifier or the binomial model, to find better results.  

We also tried to predict sentiment using an SGD classifier with a TfidfTransformer. The output of this model looks very promising: the accuracy and macro-average F1 scores are really high, and the model is very simple, too. However, this model has an accuracy of 90%, which is 1% less than the sentiment analysis above. Thus, we conclude that using the sentiment analysis and accounting for words in different positions is more accurate and reliable, although the differences are really small. 