In [8]:
import string
import pandas as pd
from nltk.corpus import stopwords

In [9]:
# Build the EmoLex lookup table used by the emotion counter.
# The raw lexicon file is tab-separated with one (word, emotion, association)
# triple per line and no header row, so the column names are supplied here.
emolex_df = pd.read_csv(
    'data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    names=['word', 'emotion', 'association'],
)
# Keep only the pairs that are actually associated (association == 1).
emolex_df = emolex_df.loc[emolex_df['association'] == 1]
# Reshape to wide form: one row per word, one column per emotion. Emotions a
# word is not associated with come out as NaN. `word` is then moved from the
# index back to an ordinary column so it can be matched with `.isin`.
emolex_words = emolex_df.pivot(index='word', columns='emotion', values='association').reset_index()

In [10]:
# Both lookups below are identical for every review, so they are built once
# instead of on every call (count_emotions is applied row by row).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def count_emotions(text):
    """Count the EmoLex emotions present in one piece of text.

    The text is stripped of ASCII punctuation, lower-cased, split on
    whitespace and filtered against NLTK's English stop words. The remaining
    tokens are matched against the `word` column of the module-level
    `emolex_words` table.

    Parameters
    ----------
    text : str
        The review text. A missing value (None / NaN) is treated as empty
        text; any other non-string value is converted with ``str()``.

    Returns
    -------
    pandas.Series
        The column-wise sum of the matching `emolex_words` rows, plus a
        `word_count` entry holding the number of tokens left after stop-word
        removal. Each lexicon word contributes at most once per text, however
        often it is repeated, because matching uses `isin`. The `word` entry
        is a by-product of summing the string column and is kept so that
        callers that drop it keep working.
    """
    if not isinstance(text, str):
        # read_csv represents an empty text cell as NaN (a float), which has
        # no .translate(); score it as an empty review instead of crashing.
        text = '' if pd.isna(text) else str(text)

    cleaned = text.translate(_PUNCTUATION_TABLE).lower()
    stop_words = _english_stop_words()
    words = [word for word in cleaned.split() if word not in stop_words]

    emotions_count = emolex_words[emolex_words.word.isin(words)].sum()
    emotions_count['word_count'] = len(words)

    return emotions_count

In [11]:
# Load the restaurant reviews that already carry their VADER sentiment scores.
df = pd.read_csv(filepath_or_buffer='data/vader_newstar_df.csv')

In [12]:
# Score every review text with count_emotions. Because the function returns a
# Series, `apply` expands the results into a frame with one column per entry.
emotions_count = df['text'].apply(count_emotions)
# Attach the emotion columns to the review rows, side by side.
# NOTE(review): the lexicon's `positive` / `negative` columns share their
# names with the existing VADER columns, so the combined frame ends up with
# duplicate column labels (visible in the head() output). Consider renaming
# one set if downstream code selects these columns by name.
vader_emolex_rest = pd.concat(objs=[df, emotions_count], axis='columns')

In [13]:
# Index the combined table by review id and discard the `word` column, which
# is not needed in the final table.
# The frame still has its default integer index at this point, so it can be
# replaced directly: there is no need to materialise it as an `index` column
# via reset_index() only to drop it again. Rebinding the result (instead of
# inplace=True) keeps the whole step a single expression.
vader_emolex_rest = (
    vader_emolex_rest
    .set_index('review_id')
    .drop(columns=['word'])
)

In [14]:
# Preview the first five rows of the combined table.
vader_emolex_rest.head(n=5)

Unnamed: 0_level_0,text,stars,positive,neutral,negative,compound_sentiment,newstar,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
fj7N9Lp6AvEEy6LHrDZzjw,When I was shown to my seat of was still wet s...,2,0.036,0.846,0.118,-0.6437,1,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,26.0
YX2cFHDxlUfGnQ8bHPq4cA,Not Impressed at all. Ordered a omelette and b...,1,0.078,0.715,0.207,-0.8337,1,0.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,0.0,1.0,33.0
pDN3hRBarmGWXbK64A83MA,never coming back here again. all of the glass...,1,0.065,0.865,0.07,-0.3773,1,0.0,1.0,3.0,1.0,1.0,7.0,2.0,2.0,0.0,1.0,64.0
ae5On6KCPiglMQJ--1JcTQ,I don't recommend this place for breakfast. Th...,2,0.0,0.86,0.14,-0.7672,1,0.0,1.0,0.0,0.0,3.0,3.0,6.0,0.0,1.0,3.0,23.0
i5jMeyoJSbUrQ7T-AU22_A,"Well, lots to say. Managers were busy makin co...",1,0.124,0.741,0.134,-0.3129,1,2.0,4.0,2.0,1.0,1.0,4.0,3.0,1.0,0.0,1.0,60.0


Unnamed: 0_level_0,text,stars,positive,neutral,negative,compound_sentiment,newstar,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
fj7N9Lp6AvEEy6LHrDZzjw,When I was shown to my seat of was still wet s...,2,0.036,0.846,0.118,-0.6437,1,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,26.0
YX2cFHDxlUfGnQ8bHPq4cA,Not Impressed at all. Ordered a omelette and b...,1,0.078,0.715,0.207,-0.8337,1,0.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,0.0,1.0,33.0
pDN3hRBarmGWXbK64A83MA,never coming back here again. all of the glass...,1,0.065,0.865,0.07,-0.3773,1,0.0,1.0,3.0,1.0,1.0,7.0,2.0,2.0,0.0,1.0,64.0
ae5On6KCPiglMQJ--1JcTQ,I don't recommend this place for breakfast. Th...,2,0.0,0.86,0.14,-0.7672,1,0.0,1.0,0.0,0.0,3.0,3.0,6.0,0.0,1.0,3.0,23.0
i5jMeyoJSbUrQ7T-AU22_A,"Well, lots to say. Managers were busy makin co...",1,0.124,0.741,0.134,-0.3129,1,2.0,4.0,2.0,1.0,1.0,4.0,3.0,1.0,0.0,1.0,60.0


In [16]:
# Persist the combined VADER + EmoLex table as CSV; the frame's index is
# written as the first column (pandas' default for to_csv).
# NOTE(review): the JSON export below is left disabled, presumably because
# to_json rejects the duplicated positive/negative column labels. Confirm
# before re-enabling.
# vader_emolex_rest.to_json('data/vader_emolex_rest.json')
vader_emolex_rest.to_csv(path_or_buf='data/vader_emolex_rest.csv')