# IR Project 4

## Emotion Classification

In [46]:
#Adapted from https://www.geeksforgeeks.org/emotion-classification-using-nrc-lexicon-in-python/

In [5]:
#!pip install nrclex

In [22]:
# Import libraries
import json
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import demoji
from nrclex import NRCLex 
import pandas as pd

[nltk_data] Downloading package stopwords to /Users/maga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Load data
infile = open('tweets.json','rb')
tweets = json.load(infile)
infile.close()

In [27]:
# Function to remove stopwords
def remove_stopwords(text, lang): 
    text_wo_stopwords = []
    # remove stopwords   
    if lang == 'en':
        stop_words = set(stopwords.words('english'))
    else:
        stop_words = set(stopwords.words('spanish'))
    text = re.split('\s',text)
    for t in text:
        if t not in stop_words:
            text_wo_stopwords.append(t)
    text_wo_stopwords = ' '.join(text_wo_stopwords)
    return text_wo_stopwords

In [26]:
# Function to preprocess tweets
def preprocess(raw_tweet,lang):
    text = raw_tweet.lower() # convert to lowercase
    text = re.sub('\n',' ',text) # remove '\n'
    text = re.sub(r"http\S+", "",text) # remove urls
    text = re.sub('#',' ',text) # remove '#' but leave text from hashtag
    text = re.sub('@[a-zA-Z]+',' ', text) # remove mentions
    text = re.sub('^rt ',' ', text) # remove 'rt'
    text = re.sub('[,\.\:\!¡\?\¿\_–-\’\$%|]',' ',text) # remove punctuation
    text = re.sub('[0-9]+', ' ',text) # remove numbers
    # emojis = list(demoji.findall(text).keys()) # in case we want to store emojis
    text = demoji.replace(text, '')
    text = re.sub('\s+',' ',text) # remove extra whitespaces
    text = re.sub('^\s+','',text) # remove space(s) at start
    text = re.sub('\s+$','',text) # remove space(s) at end
    #text = re.split('\s',text) # tokenize in terms of white space
    if lang == 'en' or 'es': # no stopwrods for hindi
        text = remove_stopwords(text,lang)
    return text

In [28]:
# Extract tweets per language
def get_tweets(tweets_dic):
    
    data_en, data_es, data_hi = [], [], []
    
    for tweet in tweets:
        text = tweet['tweet_text']
        if tweet['tweet_lang'] == 'en':
            text = preprocess(text,'en')
            data_en.append(text)
        elif tweet['tweet_lang'] == 'es':
            text = preprocess(text,'es')
            data_es.append(text)
        elif tweet['tweet_lang'] == 'hi':
            text = preprocess(text,'hi')
            data_hi.append(text) 
            
    return data_en, data_es, data_hi

In [29]:
data_en, data_es, data_hi = get_tweets(tweets)

In [45]:
data_en[0]

'th pragati meeting today reviewed eight projects spread across ministries railways roads power pet…'

In [33]:
df = pd.DataFrame(data_en, columns =['Tweets'])

In [34]:
df

Unnamed: 0,Tweets
0,th pragati meeting today reviewed eight projec...
1,tomorrow th november major day india uttar pra...
2,indeed sir lachit diwas bow courageous lachit ...
3,today lachit diwas pay tributes brave lachit b...
4,"""agenda groups transcends modi targeting india..."
...,...
16050,@ walterp resist dd covid variants around time...
16051,heard yet doin' bojo amazing live uk fed track...
16052,@ lokeshsharma rajasthan coronaupdate covid ca...
16053,covid status report - - fightagainstcoronaviru...


In [47]:
# Assign emotion
i = 0
emotions = []
for tweet in data_en:
    text = ''.join(tweet)
    emotion = NRCLex(text)
    emotions.append(emotion.top_emotions)

In [43]:
df['Emotions'] = emotions

In [44]:
df.head()

Unnamed: 0,Tweets,Emotions
0,th pragati meeting today reviewed eight projec...,"[(fear, 0.0), (anger, 0.0), (anticip, 0.0), (t..."
1,tomorrow th november major day india uttar pra...,"[(positive, 0.5), (anticipation, 0.5)]"
2,indeed sir lachit diwas bow courageous lachit ...,"[(positive, 0.4444444444444444)]"
3,today lachit diwas pay tributes brave lachit b...,"[(trust, 0.25), (positive, 0.25), (joy, 0.25),..."
4,"""agenda groups transcends modi targeting india...","[(trust, 0.5), (positive, 0.5)]"


In [None]:
# Using methods to classify emotion
#print('\n', emotion.words)
#print('\n', emotion.sentences)
#print('\n', emotion.affect_list)
#print('\n', emotion.affect_dict)
#print('\n', emotion.raw_emotion_scores)
print('\n', emotion.top_emotions)
#print('\n', emotion.affect_frequencies)