## Lexicon-based Methods in Practice


In [1]:
%pip install ekorpkit
from ekorpkit.models.metrics import evaluate_classification_performance

In [4]:
import nltk

# Fetch the corpus before touching it (a no-op when it is already up to date).
nltk.download("movie_reviews")

from nltk.corpus import movie_reviews

# Ask the corpus reader for the file-id listing once and reuse it everywhere below.
fileids = movie_reviews.fileids()

# Corpus overview: total size, a few ids, the label set and per-label counts.
print("#review count:", len(fileids))
print("#samples of file ids:", fileids[:10])
print("#categories of reviews:", movie_reviews.categories())
print('#num of "neg" reviews:', len(movie_reviews.fileids(categories="neg")))
print('#num of "pos" reviews:', len(movie_reviews.fileids(categories="pos")))

# Peek at the first review: its id, the first 500 characters and its label.
fileid = fileids[0]
print("#id of the first review:", fileid)
print("#part of the first review:", movie_reviews.raw(fileid)[:500])
print("#sentiment of the first review:", movie_reviews.categories(fileid))

# Parallel lists used by every classifier below: raw text and its label.
# `categories(fid)` returns a list, so `[0]` picks the first entry as the label.
reviews = [movie_reviews.raw(fid) for fid in fileids]
categories = [movie_reviews.categories(fid)[0] for fid in fileids]


#review count: 2000
#samples of file ids: ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']
#categories of reviews: ['neg', 'pos']
#num of "neg" reviews: 1000
#num of "pos" reviews: 1000
#id of the first review: neg/cv000_29416.txt
#part of the first review: plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt
#sentiment of the first review: ['neg']


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


### TextBlob

- https://textblob.readthedocs.io/en/dev/quickstart.html


In [7]:
%%capture
%pip install -U textblob

In [5]:
from textblob import TextBlob

# Sanity check on a single document: score the first review and print its
# `sentiment` attribute (shown below as a polarity / subjectivity pair).
result = TextBlob(reviews[0])
print(result.sentiment)

Sentiment(polarity=0.06479782948532947, subjectivity=0.5188408350908352)


In [6]:
def sentiment_TextBlob(docs):
    """Label every document "pos" or "neg" from its TextBlob polarity.

    Args:
        docs: iterable of raw text documents.

    Returns:
        list of str: one label per document, in input order. A document is
        "pos" only when its polarity is strictly greater than 0; a polarity
        of exactly 0 (or below) yields "neg".
    """
    return [
        "pos" if TextBlob(text).sentiment.polarity > 0 else "neg"
        for text in docs
    ]


# Label the whole corpus with the TextBlob-based classifier defined above.
predictions = sentiment_TextBlob(reviews)


In [7]:
# Compare the predicted labels with the corpus labels; the call prints the
# metrics report shown below.
# NOTE(review): `cm` is presumably a confusion matrix returned by ekorpkit — confirm.
cm = evaluate_classification_performance(categories, predictions)

Accuracy:  0.6
Precison:  0.7225010902553423
Recall:  0.6
F1 Score:  0.5361560556566348
Model Report: 
___________________________________________________
              precision    recall  f1-score   support

         neg       0.89      0.23      0.36      1000
         pos       0.56      0.97      0.71      1000

    accuracy                           0.60      2000
   macro avg       0.72      0.60      0.54      2000
weighted avg       0.72      0.60      0.54      2000



### AFINN

- https://github.com/fnielsen/afinn
- http://corpustext.com/reference/sentiment_afinn.html


In [14]:
%%capture
%pip install afinn

In [8]:
from afinn import Afinn


def sentiment_Afinn(docs):
    """Label every document "pos" or "neg" from its AFINN score.

    Args:
        docs: iterable of raw text documents.

    Returns:
        list of str: one label per document, in input order. A document is
        "pos" only when its score is strictly greater than 0; a score of
        exactly 0 (or below) yields "neg".
    """
    # One scorer (built with emoticons=True) is shared across all documents.
    scorer = Afinn(emoticons=True)
    return ["pos" if scorer.score(text) > 0 else "neg" for text in docs]


In [9]:
# Label the whole corpus with the AFINN-based classifier, then print the
# metrics report against the corpus labels.
# NOTE(review): this rebinds `predictions` and `cm` from the earlier cells;
# `cm` is presumably a confusion matrix — confirm.
predictions = sentiment_Afinn(reviews)
cm = evaluate_classification_performance(categories, predictions)

Accuracy:  0.664
Precison:  0.6783880680137142
Recall:  0.664
F1 Score:  0.6570854714462421
Model Report: 
___________________________________________________
              precision    recall  f1-score   support

         neg       0.73      0.52      0.61      1000
         pos       0.63      0.81      0.71      1000

    accuracy                           0.66      2000
   macro avg       0.68      0.66      0.66      2000
weighted avg       0.68      0.66      0.66      2000



### VADER

- https://github.com/cjhutto/vaderSentiment


In [10]:
%%capture
import nltk
# Download the 'vader_lexicon' resource through NLTK (reported as already
# up to date when it is present locally).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def sentiment_vader(docs):
    """Label every document "pos" or "neg" from its VADER compound score.

    Args:
        docs: iterable of raw text documents.

    Returns:
        list of str: one label per document, in input order. A document is
        "pos" only when the "compound" entry of its polarity scores is
        strictly greater than 0; a value of exactly 0 (or below) yields "neg".
    """
    # One analyzer instance is shared across all documents.
    sia = SentimentIntensityAnalyzer()
    labels = []
    for text in docs:
        compound = sia.polarity_scores(text)["compound"]
        labels.append("pos" if compound > 0 else "neg")
    return labels


In [12]:
# Label the whole corpus with the VADER-based classifier, then print the
# metrics report against the corpus labels.
# NOTE(review): this rebinds `predictions` and `cm` from the earlier cells;
# `cm` is presumably a confusion matrix — confirm.
predictions = sentiment_vader(reviews)
cm = evaluate_classification_performance(categories, predictions)

Accuracy:  0.635
Precison:  0.6580655585685583
Recall:  0.635
F1 Score:  0.6211802777111816
Model Report: 
___________________________________________________
              precision    recall  f1-score   support

         neg       0.72      0.44      0.55      1000
         pos       0.60      0.83      0.69      1000

    accuracy                           0.64      2000
   macro avg       0.66      0.64      0.62      2000
weighted avg       0.66      0.64      0.62      2000

