In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as tkr
import seaborn as sns
import scipy as stats

import nltk # sentiment library
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
nltk.download('vader_lexicon') # download vader lexicon
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.sentiment import SentimentIntensityAnalyzer as SIA

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kevinvo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevinvo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kevinvo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# import sys
# !{sys.executable} -m pip install nltk
# pip install nltk

In [3]:
#provides relative paths for csv as referenced from a folder
import os
# Flatten the directory walk into one lazy stream of paths, then print them in walk order.
data_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('data')  # path to folder
    for name in names
)
for data_file_path in data_file_paths:
    print(data_file_path)

In [4]:
# Display the kernel's working directory; the relative 'data' folder walked earlier is resolved against it.
os.getcwd()

'/Users/kevinvo/CUBigDataClass'

**README**
- emot: library used to convert emojis and emoticons into text descriptions

- first, compare TextBlob polarity scores before and after emoji conversion

- then, make the same before/after comparison with NLTK's sentiment analyzer

**processing emojis via emot library:**
----

In [5]:
# 1. Via pip:
# $ pip install emot --upgrade

import sys
!{sys.executable} -m pip install emot --upgrade

# 2. From master branch: 
# $ git clone https://github.com/NeelShah18/emot.git
# $ cd emot
# $ python setup.py install

import emot

def clean_mean(val):
    """Return ``val`` with every underscore, hyphen and colon swapped for a space."""
    separators_to_space = str.maketrans({'_': ' ', '-': ' ', ':': ' '})
    return val.translate(separators_to_space)

def convert_emojicon(text, isPrint=True):
    """Replace emoticons and emojis in ``text`` with their cleaned descriptions.

    The three lookup tables ``EMOTICONS``, ``UNICODE_EMO`` and
    ``EMOTICONS_EMO`` from ``emot.emo_unicode`` are applied in that order.
    Every key that occurs in the text is replaced by its mapped value after
    the value has been passed through ``clean_mean``.

    Args:
        text: String to convert.
        isPrint: When truthy, print each matched key as it is replaced.

    Returns:
        The text with every matched key replaced.
    """
    lookup_tables = (
        emot.emo_unicode.EMOTICONS,
        emot.emo_unicode.UNICODE_EMO,
        emot.emo_unicode.EMOTICONS_EMO,
    )
    for table in lookup_tables:
        for emoti, meaning in table.items():
            # Membership is tested against the running text, so replacements
            # made for earlier keys are visible to later ones.
            if emoti in text:
                text = text.replace(emoti, clean_mean(meaning))
                if isPrint:
                    print(emoti)
    return text



**via TextBlob Sentiment Analyser:**
----


In [6]:
from textblob import TextBlob

def compareAdjustedScores(text):
    """Print the TextBlob polarity of ``text`` before and after emoji conversion.

    Args:
        text: String whose polarity is scored as-is and again after
            ``convert_emojicon`` has rewritten it.
    """
    # Pass the flag by keyword. The previous positional `isPrint==False` was a
    # comparison against a notebook-level global: it raised NameError when that
    # global did not exist yet and flipped meaning whenever the global was False.
    adjusted_text = convert_emojicon(text, isPrint=False)
    raw_polarity = TextBlob(text).sentiment.polarity
    adjusted_polarity = TextBlob(adjusted_text).sentiment.polarity
    print("pre-adjustments: {}; post-adjustments: {}".format(raw_polarity, adjusted_polarity))

In [7]:
def stem_tokenize(text):
    """Lower-case, tokenize, drop stop words, lemmatize and stem ``text``.

    Args:
        text: String to normalise.

    Returns:
        The processed tokens joined by single spaces.
    """
    # A set gives constant-time membership tests in the filter below.
    stop_words = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    lemmatizer = WordNetLemmatizer()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    # With gaps=True the pattern is used to find the separators between tokens.
    tokenzr = RegexpTokenizer(r'\s+', gaps=True)

    tokenized_text = tokenzr.tokenize(text.lower())
    words = [lemmatizer.lemmatize(w) for w in tokenized_text if w not in stop_words]
    return " ".join(stemmer.stem(w) for w in words)

In [8]:
isPrint = True  # notebook-level flag; kept because other cells may still read it

# Each sample emoji is passed through stem_tokenize before conversion and scoring.
sample_sad = stem_tokenize("😞")
sample_sad2 = stem_tokenize("🙁")
sample_happy = stem_tokenize("😀")
sample_happy2 = stem_tokenize("🙂")

for position, sample in enumerate((sample_sad, sample_sad2, sample_happy, sample_happy2)):
    if position:
        print("\n---\n")  # separator between cases; none after the last one
    # Keyword argument instead of the positional comparison `isPrint==True`,
    # which only meant "print" while the global flag happened to be True.
    convert_emojicon(sample, isPrint=True)
    compareAdjustedScores(sample)

😞
pre-adjustments: 0.0; post-adjustments: -0.75

---

🙁
pre-adjustments: 0.0; post-adjustments: -0.16666666666666666

---

😀
pre-adjustments: 0.0; post-adjustments: 0.0

---

🙂
pre-adjustments: 0.0; post-adjustments: -0.16666666666666666


**via NLTK Sentiment Analyser:**
---

In [9]:
sia = SIA()
# initialize object first, as SIA (from NLTK) is an object class


def _report_emoji_case(header, emoji_text):
    """Print NLTK polarity scores for ``emoji_text`` before and after emoji conversion.

    Args:
        header: Format string with two ``{}`` slots, filled with the score
            dict of the raw text and the score dict of the converted text.
        emoji_text: Text to score.

    Returns:
        Tuple ``(raw_scores, processed_scores)`` of the two results from
        ``sia.polarity_scores``.
    """
    raw_scores = sia.polarity_scores(emoji_text)
    # Flags are passed by keyword; the old positional `isPrint==False` /
    # `isPrint==True` comparisons depended on a notebook-level global.
    processed_scores = sia.polarity_scores(convert_emojicon(emoji_text, isPrint=False))
    print(header.format(raw_scores, processed_scores))
    print("before: {}; after: {}".format(raw_scores['compound'], processed_scores['compound']))
    # With printing enabled the matched emoji is echoed first, then the converted text.
    print(convert_emojicon(emoji_text, isPrint=True))
    return raw_scores, processed_scores


###
sample_text_sad1a = "😞"
score_sad1a, processedScore_sad1a = _report_emoji_case(
    "\nCase 1a:  disappointed face\n pre-emoji processing: {}\n post-emoji processing: {}\n",
    sample_text_sad1a,
)
####

sample_text_sad1b = "🙁"
score_sad1b, processedScore_sad1b = _report_emoji_case(
    "\nCase 1b:  slightly frowning face \n pre-emoji processing: {}\n post-emoji processing: {}\n",
    sample_text_sad1b,
)
###

sample_text_happy2a = "😀"
_report_emoji_case(
    "\nCase 2a: grinning face \npre-emoji processing: {}\npost-emoji processing: {}\n",
    sample_text_happy2a,
)
###

sample_text_happy2b = "🙂"
score_happy2b, processedScore_happy2b = _report_emoji_case(
    "\nCase 2b: slightly smiling face \n pre-emoji processing: {}\n post-emoji processing: {}\n",
    sample_text_happy2b,
)


Case 1a:  disappointed face
 pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
 post-emoji processing: {'neg': 0.756, 'neu': 0.244, 'pos': 0.0, 'compound': -0.4767}

before: 0.0; after: -0.4767
😞
 disappointed face 

Case 1b:  slightly frowning face 
 pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
 post-emoji processing: {'neg': 0.513, 'neu': 0.487, 'pos': 0.0, 'compound': -0.2748}

before: 0.0; after: -0.2748
🙁
 slightly frowning face 

Case 2a: grinning face 
pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
post-emoji processing: {'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'compound': 0.3612}

before: 0.0; after: 0.3612
😀
 grinning face 

Case 2b: slightly smiling face 
 pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
 post-emoji processing: {'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.4033}

before: 0.0; after: 0.4033
🙂
 slightly smiling face 


----
- adjusting the score without tokenization

- adjusting the score, then applying tokenization

(to evaluate how tokenization influences the adjusted polarity scores for emojis)


In [10]:
### convert the emoji only (no tokenization)
toBe_Adjusted = "😀"
# isPrint is passed by keyword; the old positional `isPrint==False` depended on a global flag.
sample_adjusted = convert_emojicon(toBe_Adjusted, isPrint=False)
scores_raw = sia.polarity_scores(toBe_Adjusted)
scores_converted = sia.polarity_scores(sample_adjusted)
print("Case 3: no tokenization, grinning face \npre-emoji processing: {}\npost-emoji processing: {}\n".format(scores_raw, scores_converted))

print("adjust score without tokenization: ", scores_raw['compound'], scores_converted['compound'], '\n')


### convert the emoji, then tokenize the converted text
toBe_Adjusted_Tokenized = "😀"
sample_adjusted = convert_emojicon(toBe_Adjusted_Tokenized, isPrint=False)
sample_adjustedtokenized = stem_tokenize(sample_adjusted)
# The second, discarded convert_emojicon call that used to follow was dead code and is removed.
scores_raw_for_tokenized = sia.polarity_scores(toBe_Adjusted_Tokenized)
scores_converted_tokenized = sia.polarity_scores(sample_adjustedtokenized)
print("Case 3: +tokenization, grinning face \npre-emoji processing: {}\npost-emoji processing: {}\n".format(scores_raw_for_tokenized, scores_converted_tokenized))

print("adjust score WITH tokenization: ", scores_raw_for_tokenized['compound'], scores_converted_tokenized['compound'], '\n')


Case 3: no tokenization, grinning face 
pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
post-emoji processing: {'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'compound': 0.3612}

adjust score without tokenization:  0.0 0.3612 

Case 3: +tokenization, grinning face 
pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
post-emoji processing: {'neg': 0.0, 'neu': 0.244, 'pos': 0.756, 'compound': 0.4767}

adjust score WITH tokenization:  0.0 0.4767 



In [11]:
# Inspect the converted text after stem_tokenize was applied to it.
sample_adjustedtokenized

'grin face'

In [12]:
# Inspect the converted text as returned by convert_emojicon, before tokenization.
sample_adjusted

' grinning face '

In [13]:
# Inspect the original emoji input for comparison with the converted strings.
toBe_Adjusted

'😀'