In [1]:
# General packages
import pandas as pd
import pickle

# Text Processing
import nltk
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import re

# Sentiment Analysis

In [2]:
# Sentiments
def get_sentiments(df):
    df = vader_sentiment(df)
    df = textblob_sentiment(df)
    df['final_sentiment'] = df['Vader_compound_score'] + df['tb_polarity']

    return df

def vader_sentiment(df):
    df['Vader_compound_score'] = df['Comment'].apply(lambda x: vader_compound_score(x))
    return df

def vader_compound_score(x):
    vader_analyser = SentimentIntensityAnalyzer()
    score = vader_analyser.polarity_scores(x)
    return score['compound']

def textblob_sentiment(df):
    df['tb_polarity'] = df['Comment'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['tb_subjectivity'] = df['Comment'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    return df

In [3]:
dominant_topics = pd.read_csv('../Data/travel_topics_topic_labels.csv')

# remove rows containing empty comments
print('Travel Comments')
print('-'*20)
print(f"Before cleaning: {len(dominant_topics['Comment'])}")
dominant_topics.dropna(subset=['Comment'], inplace=True)
print(f"After cleaning: {len(dominant_topics['Comment'])}")

Travel Comments
--------------------
Before cleaning: 7058
After cleaning: 6982


### Get overview sentiments of comments for all topics

In [4]:
def get_num_comments_sentiments_by_topic(topics_df, topic):
    df = topics_df[topics_df[f'topic_{topic}'] == 1.0].reset_index()

    df_sentiments = get_sentiments(df)

    df_sentiments_pos = df_sentiments[df_sentiments['final_sentiment'] > 0]
    df_sentiments_neu = df_sentiments[df_sentiments['final_sentiment'] == 0]
    df_sentiments_neg = df_sentiments[df_sentiments['final_sentiment'] < 0]
    
    print(f"There are {len(df_sentiments_pos)} positive comments regarding topic {topic}.")
    print(f"There are {len(df_sentiments_neu)} neutral comments regarding topic {topic}.")
    print(f"There are {len(df_sentiments_neg)} negative comments regarding topic {topic}.\n")

In [5]:
get_num_comments_sentiments_by_topic(topics_df=dominant_topics, topic=1)
get_num_comments_sentiments_by_topic(topics_df=dominant_topics, topic=2)
get_num_comments_sentiments_by_topic(topics_df=dominant_topics, topic=3)
get_num_comments_sentiments_by_topic(topics_df=dominant_topics, topic=4)
get_num_comments_sentiments_by_topic(topics_df=dominant_topics, topic=5)

There are 1229 positive comments regarding topic 1.
There are 231 neutral comments regarding topic 1.
There are 607 negative comments regarding topic 1.

There are 677 positive comments regarding topic 2.
There are 71 neutral comments regarding topic 2.
There are 413 negative comments regarding topic 2.

There are 723 positive comments regarding topic 3.
There are 50 neutral comments regarding topic 3.
There are 413 negative comments regarding topic 3.

There are 733 positive comments regarding topic 4.
There are 86 neutral comments regarding topic 4.
There are 488 negative comments regarding topic 4.

There are 716 positive comments regarding topic 5.
There are 80 neutral comments regarding topic 5.
There are 426 negative comments regarding topic 5.



### Get top positive/neutral/negative comments under a topic

In [6]:
def get_corex_top_positive_comments(topics_df, topic, n):
    df = topics_df[topics_df[f'topic_{topic}'] == 1.0].reset_index()

    df_sentiments = get_sentiments(df)

    df_sentiments_pos = df_sentiments[df_sentiments['final_sentiment'] > 0]
    # df_sentiments_pos.to_csv('df_sentiments_pos_topic_2.csv')

    print(f"There are {len(df_sentiments_pos)} positive comments regarding topic {topic}. {round(len(df_sentiments_pos)/len(df_sentiments), 2) * 100} % of the comments are positive for this topic.\n\n")
    
    positive_comments = df_sentiments_pos.sort_values(by='final_sentiment', ascending=False)['Comment'].to_list()

    print_top_comments(positive_comments, n)

def get_corex_neutral_comments(topics_df, topic, n):
    df = topics_df[topics_df[f'topic_{topic}'] == 1.0].reset_index()

    df_sentiments = get_sentiments(df)


    df_sentiments_neu = df_sentiments[df_sentiments['final_sentiment'] == 0]
    # df_sentiments_neu_2.to_csv('df_sentiments_neu_topic_2.csv')

    print(f"There are {len(df_sentiments_neu)} neutral comments regarding topic {topic}. {round(len(df_sentiments_neu)/len(df_sentiments), 2) * 100} % of the comments are neutral for this topic.\n\n")
    
    comments = df_sentiments_neu['Comment'].sample(n=n).to_list()

    print_comments(comments, n)

def get_corex_top_negative_comments(topics_df, topic, n):
    df = topics_df[topics_df[f'topic_{topic}'] == 1.0].reset_index()

    df_sentiments = get_sentiments(df)

    df_sentiments_neg = df_sentiments[df_sentiments['final_sentiment'] < 0]
    print(f"There are {len(df_sentiments_neg)} negative comments regarding topic {topic}. {round(len(df_sentiments_neg)/len(df_sentiments), 2) * 100} % of the comments are negative for this topic.\n\n")
    
    # comments = df_sentiments_neg['Comment'].sample(n=n).to_list()
    negative_comments = df_sentiments_neg.sort_values(by='final_sentiment', ascending=True)['Comment'].to_list()

    print_top_comments(negative_comments, n)

def print_top_comments(comments, n):
    for i in range(n):
        print(f'Rank {i+1} comment:')
        print(f'{comments[i]}')
        print()

def print_comments(comments, n):
    for i in range(n):
        print(f'Comment {i+1}')
        print(f'{comments[i]}')
        print()

##### Topic 1

In [7]:
# positive comments for Topic 1
get_corex_top_positive_comments(topics_df=dominant_topics, topic=1, n=5)

There are 1229 positive comments regarding topic 1. 59.0 % of the comments are positive for this topic.


Rank 1 comment:
the best thing that has ever happened united states allows united states wfh

Rank 2 comment:
mohd countries like congo indonesia philippines and malaysia who are corrupted are invited they got best democracy even united states where got riots the capitol building and donald trump can talk abt democracy

Rank 3 comment:
eight continues can cme here spread but got chance spread sounds fair singapore ministers fucxxx fuxx off don talk please tell united states something make united states happy

Rank 4 comment:
mark zuckerbird like someone bought umbrella but they need united states buy one protect them from the rain lol

Rank 5 comment:
thailand please can not eat all the delicious and cheap thai foods and see all their beautiful women lol



In [8]:
# negative comments for Topic 1
get_corex_top_negative_comments(topics_df=dominant_topics, topic=1, n=5)

There are 607 negative comments regarding topic 1. 28.999999999999996 % of the comments are negative for this topic.


Rank 1 comment:
andylau google reporters without borders just deleted malaysia posts again one linked another explanation google contemptible and disgusting shitty paper

Rank 2 comment:
pap trying earn money and does not care about united states getting covid nineteen become sick die earning money out our misery and the corpse covid nineteen death absolutely inhumane

Rank 3 comment:
then why allow vaccinated travel lane into singapore all affected are coming singapore aggravate the situation disgusting

Rank 4 comment:
lin nothing with singapore how about germany then they are too having worst outbreak pandemic eversince vaccinated travel lane with singapore coincidence

Rank 5 comment:
schadenfreude that insulting singapore education system



In [9]:
# neutral comments for Topic 1
get_corex_neutral_comments(topics_df=dominant_topics, topic=1, n=5)

There are 231 neutral comments regarding topic 1. 11.0 % of the comments are neutral for this topic.


Comment 1
expect mass infection singapore following the dimwit strategy

Comment 2
why not india

Comment 3
singapore goes another circuit breaker then who will take responsibility this time

Comment 4
philippines are allowed travel singapore

Comment 5
germany just announced wave day



##### Topic 4

In [10]:
# positive comments for Topic 4
get_corex_top_positive_comments(topics_df=dominant_topics, topic=4, n=5)

There are 733 positive comments regarding topic 4. 56.00000000000001 % of the comments are positive for this topic.


Rank 1 comment:
avijeet well tell that the doctor who presented his work and lab studies the covid gop conference the guy has been working covid since sars unbiaised has one the best credentials with regards this virus

Rank 2 comment:
angmo boss very happy more rrt thought won hope contract covid lorry soon

Rank 3 comment:
slumps perfect one hundred ranking covid recovery index source nikkei asia five nov

Rank 4 comment:
peter mish agree with you covid nineteen taught united states the importance having plan good all believe all can relate this

Rank 5 comment:
love singapore but not take medicines and injections and stay home but know the maks not anything but will wear one keep from getting fined malaysia groceries were delivered malaysia home before the call pandemic moved queretaro mexico eleven two two thousand and where safe beautiful and clean like singapore



In [11]:
# negative comments for Topic 4
get_corex_top_negative_comments(topics_df=dominant_topics, topic=4, n=5)

There are 488 negative comments regarding topic 4. 37.0 % of the comments are negative for this topic.


Rank 1 comment:
andylau google reporters without borders just deleted malaysia posts again one linked another explanation google contemptible and disgusting shitty paper

Rank 2 comment:
pap trying earn money and does not care about united states getting covid nineteen become sick die earning money out our misery and the corpse covid nineteen death absolutely inhumane

Rank 3 comment:
hate covid hate covid hate covid hate covid hate covid

Rank 4 comment:
dislike comment section because has been infected with dangerous virus worst than covid

Rank 5 comment:
expect covid cases get worst



In [12]:
# neutral comments for Topic 4
get_corex_neutral_comments(topics_df=dominant_topics, topic=4, n=5)

There are 86 neutral comments regarding topic 4. 7.000000000000001 % of the comments are neutral for this topic.


Comment 1
but there are vaccinated people dying from covid

Comment 2
and now rebranded its creator covid pills

Comment 3
omicron coming

Comment 4
mohan sinnapillay covid nineteen only

Comment 5
its because theres still ample space hospitals for covid patients



In [13]:
df_sentiments = get_sentiments(dominant_topics)
df_sentiments.to_csv('Output/travel_topics_sentiment_labels.csv')

# Emotion Analysis

### Label emotions for each comment

In [14]:
def preprocess_and_tokenize(data):    

    #remove html markup
    data = re.sub("(<.*?>)", "", data)

    #remove urls
    data = re.sub(r'http\S+', '', data)
    
    #remove hashtags and @names
    data= re.sub(r"(#[\d\w\.]+)", '', data)
    data= re.sub(r"(@[\d\w\.]+)", '', data)

    #remove punctuation and non-ascii digits
    data = re.sub("(\\W|\\d)", " ", data)
    
    #remove whitespace
    data = data.strip()
    
    # tokenization with nltk
    data = word_tokenize(data)
    
    # stemming with nltk
    porter = PorterStemmer()
    stem_data = [porter.stem(word) for word in data]
        
    return stem_data

In [15]:
def emotions(df):
    emotions_clf = pickle.load(open('../Data/tfidf_svm.sav', 'rb'))
    df['Emotion'] = df['Comment'].apply(lambda x: emotions_clf.predict([x]))
    return df

df_emot = emotions(dominant_topics)
df_emot.head(5)



Unnamed: 0.1,Unnamed: 0,Comment,topic_1,topic_2,topic_3,topic_4,topic_5,Vader_compound_score,tb_polarity,tb_subjectivity,final_sentiment,Emotion
0,0,you have book the pcr test klia before arrival,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[fear]
1,1,access good information what investors need pr...,0.0,0.0,0.0,0.0,0.0,0.891,0.3625,0.425,1.2535,[neutral]
2,2,please include philippines also are super stre...,0.0,0.0,1.0,0.0,0.0,0.7579,0.333333,0.666667,1.091233,[joy]
3,3,omicron mutated times thats making existing va...,1.0,0.0,1.0,1.0,0.0,-0.9168,-0.058788,0.499318,-0.975588,[fear]
4,4,they are welcoming omicron into singapore when...,1.0,0.0,0.0,1.0,1.0,-0.3736,0.0,0.0,-0.3736,[anger]


In [16]:
df_emot.to_csv('Output/travel_topics_emotion_labels.csv')

### Get overview emotions of comments for all topics

In [17]:
def get_num_comments_by_emotions_by_topic(topics_df, topic):
    df = topics_df[topics_df[f'topic_{topic}'] == 1.0].reset_index()

    comments_joy = df[df['Emotion'] == 'joy'].reset_index()
    comments_sad = df[df['Emotion'] == 'sadness'].reset_index()
    comments_anger = df[df['Emotion'] == 'anger'].reset_index()
    comments_neu = df[df['Emotion'] == 'neutral'].reset_index()
    comments_fear = df[df['Emotion'] == 'fear'].reset_index()

    print(f"There are {len(comments_joy)} joy comments regarding topic {topic}.")
    print(f"There are {len(comments_sad)} sadness comments regarding topic {topic}.")
    print(f"There are {len(comments_anger)} anger comments regarding topic {topic}.")
    print(f"There are {len(comments_neu)} neutral comments regarding topic {topic}.")
    print(f"There are {len(comments_fear)} fear comments regarding topic {topic}.\n")

In [18]:
get_num_comments_by_emotions_by_topic(topics_df=df_emot, topic=1)
get_num_comments_by_emotions_by_topic(topics_df=df_emot, topic=2)
get_num_comments_by_emotions_by_topic(topics_df=df_emot, topic=3)
get_num_comments_by_emotions_by_topic(topics_df=df_emot, topic=4)
get_num_comments_by_emotions_by_topic(topics_df=df_emot, topic=5)

There are 230 joy comments regarding topic 1.
There are 245 sadness comments regarding topic 1.
There are 260 anger comments regarding topic 1.
There are 946 neutral comments regarding topic 1.
There are 386 fear comments regarding topic 1.

There are 151 joy comments regarding topic 2.
There are 162 sadness comments regarding topic 2.
There are 206 anger comments regarding topic 2.
There are 358 neutral comments regarding topic 2.
There are 284 fear comments regarding topic 2.

There are 165 joy comments regarding topic 3.
There are 148 sadness comments regarding topic 3.
There are 178 anger comments regarding topic 3.
There are 464 neutral comments regarding topic 3.
There are 231 fear comments regarding topic 3.

There are 161 joy comments regarding topic 4.
There are 174 sadness comments regarding topic 4.
There are 219 anger comments regarding topic 4.
There are 448 neutral comments regarding topic 4.
There are 305 fear comments regarding topic 4.

There are 143 joy comments regar

### Get top comments by emotions

In [21]:
def get_top_comments_by_emotions(df, emotion, n):
    df_sentiments = get_sentiments(df)

    comments_emotion = df_sentiments[df_sentiments['Emotion'] == emotion].reset_index()

    if emotion == 'joy':
        comments = comments_emotion.sort_values(by='final_sentiment', ascending=False)['Comment'].to_list()
    else:
        comments = comments_emotion.sort_values(by='final_sentiment', ascending=True)['Comment'].to_list()
    

    print_top_comments(comments, n)

def get_corex_top_comments_by_emotions(topics_df, topic, emotion, n):
    df = topics_df[topics_df[f'topic_{topic}'] == 1.0].reset_index()
    
    df_sentiments = get_sentiments(df)

    comments_emotion = df_sentiments[df_sentiments['Emotion'] == emotion].reset_index()

    if emotion == 'joy':
        comments = comments_emotion.sort_values(by='final_sentiment', ascending=False)['Comment'].to_list()
    else:
        comments = comments_emotion.sort_values(by='final_sentiment', ascending=True)['Comment'].to_list()
    

    print_top_comments(comments, n)

In [22]:
# top comments containing joy emotion
get_top_comments_by_emotions(df=df_emot, emotion='joy', n=5)

Rank 1 comment:
avijeet well tell that the doctor who presented his work and lab studies the covid gop conference the guy has been working covid since sars unbiaised has one the best credentials with regards this virus

Rank 2 comment:
mohd countries like congo indonesia philippines and malaysia who are corrupted are invited they got best democracy even united states where got riots the capitol building and donald trump can talk abt democracy

Rank 3 comment:
eight continues can cme here spread but got chance spread sounds fair singapore ministers fucxxx fuxx off don talk please tell united states something make united states happy

Rank 4 comment:
mark zuckerbird like someone bought umbrella but they need united states buy one protect them from the rain lol

Rank 5 comment:
hope can visit batam soon beautiful adventure and looking forward



In [23]:
# top comments containing joy emotion for topic 4
get_corex_top_comments_by_emotions(topics_df=df_emot, topic=4, emotion='joy', n=5)

Rank 1 comment:
avijeet well tell that the doctor who presented his work and lab studies the covid gop conference the guy has been working covid since sars unbiaised has one the best credentials with regards this virus

Rank 2 comment:
angmo boss very happy more rrt thought won hope contract covid lorry soon

Rank 3 comment:
peter mish agree with you covid nineteen taught united states the importance having plan good all believe all can relate this

Rank 4 comment:
now the spike protein has been improved and made one hundred more lethal knowing that the virus was never isolated and thus lab manufactured welcome covid two zero upgraded version

Rank 5 comment:
singapore welcome omicron would like sincerely welcome you our country here you are exempted from any taxes and are free roam you get what mean hope you enjoy your stay



In [24]:
# top comments containing anger emotion for topic 4
get_corex_top_comments_by_emotions(topics_df=df_emot, topic=4, emotion='anger', n=5)

Rank 1 comment:
andylau google reporters without borders just deleted malaysia posts again one linked another explanation google contemptible and disgusting shitty paper

Rank 2 comment:
hate covid hate covid hate covid hate covid hate covid

Rank 3 comment:
demonthatgotlordkeith you are totally lame why don you just come out and say you are anti vaccine anti mask and anti lockdown during the pandemic your are being cowardly that what your talking about just say

Rank 4 comment:
don hate covid understand that the virus not responsible for making everyone suffer

Rank 5 comment:
cases yesterday and yet today simplify rules for travelers open doors for covid nineteen come stupid repeat the mistake stupid believing peoples action party stupidest

