# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from wordcloud import WordCloud
import geopandas
pd.options.display.max_columns = 100
plt.style.use(['default'])

# Constants

In [2]:
themes = [
    'Authorities & Politics', 'Cases and deaths', 'Economic impact', 'Educational impact', 'People stories',
       'Preventive measures', 'Vaccines and vaccination', 'Virus spreading'
]

subthemes = [
    'subtheme11', 'subtheme12', 'subtheme21', 'subtheme22', 'subtheme41', 'subtheme42', 'subtheme43', 
    'subtheme44', 'subtheme51', 'subtheme52', 'subtheme53', 'subtheme61', 'subtheme62', 'subtheme63', 
    'subtheme64', 'subtheme65', 'subtheme66', 'subtheme71', 'subtheme72', 'subtheme73', 'subtheme74'
]

emotions = [
    'anger', 'sadness', 'optimism', 'joy'
]

keywords = [
    'remote working', 'homeschooling', 'panic buying', 'sars-cov-2', 'wearing masks', 'ncov', 'wuhan', 
    'social distancing', 'vaccination', 'quarantine', 'outbreak', 'vaccine', 'lockdown', 'pandemic', 
    'coronavirus', 'covid'
]
keywords2 = [w.replace(' ', '_').replace('-', '_') for w in keywords]

# Functions

In [3]:
def millions(x, pos):
    'The two args are the value and tick position'
    return '%1.1fM' % (x * 1e-6)

def thousands(x, pos):
    'The two args are the value and tick position'
    return '%1.fm' % (x * 1e-3)

# Load and process lookup tables

In [4]:
# Load lookup tables
media_list = pd.read_csv('./../data/external/MediaList.csv', sep=";")
theme_desc = pd.read_csv('./../data/external/theme_desc.csv')
subtheme_desc = pd.read_csv('./../data/external/subtheme_desc.csv')
topics = pd.read_parquet('./news_tweets_topics.parquet')
news_accounts = pd.read_parquet('./../data/raw/news_accounts.parquet')
users = pd.read_parquet('./../data/raw/users.parquet')

# Process lookup tables
topics['tweetId'] = topics.tweetId.astype('float64')
news_accounts = news_accounts.merge(media_list)
news_accounts['userId'] = news_accounts.userId.astype('float64')

# Load and process News Tweets

In [5]:
# Load news tweets
news_tweets = pd.read_parquet('./../data/raw/news_tweets_with_em_scores.parquet')

# Define prevalent emotion and emotion score
news_tweets['emotion_score'] = news_tweets[['anger','joy','optimism','sadness']].max(axis=1)
news_tweets['prevalent_emotion'] = news_tweets[['anger','joy','optimism','sadness']].idxmax(axis=1)
news_tweets['prevalent_emotion'] = np.where(news_tweets.emotion_score>0.5, news_tweets.prevalent_emotion, 'undefined')

# Include country information and filter valid news accountsw
news_tweets = news_tweets.merge(news_accounts[['userId','country']])
news_tweets = news_tweets[~news_tweets[emotions].isnull().any(axis=1)]
news_tweets = news_tweets.sort_values('date').drop_duplicates('conversationId', keep='first')
news_tweets = news_tweets[news_tweets.lang=='en']
print('Number of valid news tweets:', news_tweets.shape[0])

# Include theme and subtheme
news_tweets = news_tweets[~news_tweets.conversationId.isnull()].merge(topics[['theme'] + subthemes + ['tweetId']], how='left', on='tweetId')
news_tweets['subtheme'] = news_tweets[subthemes].idxmax(axis=1)
news_tweets['subtheme'] = news_tweets.subtheme.apply(lambda x: int(x[-2:]))
news_tweets['aux'] = news_tweets[subthemes].sum(axis=1)
news_tweets.loc[news_tweets.aux==0,'subtheme'] = news_tweets.loc[news_tweets.aux==0,'theme']*10
news_tweets = news_tweets.drop(subthemes + ['aux'], axis=1)

# Include date and week features
news_tweets['ds'] = news_tweets.date.dt.date
news_tweets['dsw'] = (news_tweets['date'] + pd.offsets.Week(weekday=6)).dt.date

Number of valid news tweets: 1678217


# Load and process Comments

In [6]:
# Load valid comments
plain_comments = pd.read_parquet('./../data/raw/comments.parquet')
comments = pd.read_parquet('./../data/raw/user_tweets_with_em_scores.parquet')
comments = comments[comments.tweetId.isin(plain_comments.tweetId)]; del plain_comments
comments = comments[comments.conversationId.isin(news_tweets.conversationId)]
comments = comments[~comments[emotions].isnull().any(axis=1)]
comments = comments[comments.lang=='en']
print('Number of valid comments:', comments.shape[0])

# Include news tweet data
comments = comments.merge(news_tweets.rename(columns={
    'date':'news_date', 'userId':'news_id', 'prevalent_emotion':'news_prevalent_emotion', 'emotion_score':'news_emotion_score',
    'anger':'news_anger', 'sadness':'news_sadness', 'optimism':'news_optimism', 'joy':'news_joy'
})[[
    'conversationId', 'news_date', 'news_id', 'news_prevalent_emotion', 'news_emotion_score', 
    'news_anger', 'news_sadness', 'news_optimism', 'news_joy', 'theme', 'subtheme', 'country'
]], how='left')

# Define prevalent emotion and emotion score
comments['emotion_score'] = comments[['anger','joy','optimism','sadness']].max(axis=1)
comments['prevalent_emotion'] = comments[['anger','joy','optimism','sadness']].idxmax(axis=1)
comments['prevalent_emotion'] = np.where(comments.emotion_score>0.5, comments.prevalent_emotion, 'undefined')

# Define continent information
comments = comments.merge(pd.DataFrame({
    'country':['AU', 'UK', 'US', 'CA', 'NZ', 'ZA', 'KE', 'NG', 'IN', 'PH', 'MY','IE'],
    'continent':['Oceania', 'Europe', 'America', 'America', 'Oceania', 'Africa', 'Africa', 'Africa', 'Asia', 'Asia', 'Asia', 'Europe']
}))

# Include date, week and month features
comments['ds'] = comments.news_date.dt.date
comments['dsw'] = (comments['news_date'] + pd.offsets.Week(weekday=6)).dt.date
comments['dsm'] = pd.to_datetime(comments.news_date.dt.year.astype(str) + '-' + comments.news_date.dt.month.astype(str) + '-1')

Number of valid comments: 17620904


In [7]:
news_tweets.userId.nunique()

275

# Tables

### Table: News Tweets Engagement

In [79]:
np.round(news_tweets.groupby('prevalent_emotion')[['replyCount', 'retweetCount', 'likeCount', 'quoteCount']].mean(),1)

Unnamed: 0_level_0,replyCount,retweetCount,likeCount,quoteCount
prevalent_emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
anger,20.7,36.8,113.2,11.0
joy,8.0,22.5,97.7,7.5
optimism,11.0,22.6,87.8,6.7
sadness,9.5,24.9,66.7,6.6
undefined,10.6,22.4,73.0,7.2


### Table: Emotion over Themes

In [84]:
eot = comments.groupby(['theme']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    eot['lift_' + col] = np.round(eot[col] / g[col], 2)
    
eot = eot[['theme'] + ['lift_' + col for col in emotions]]
eot

Unnamed: 0,theme,lift_anger,lift_sadness,lift_optimism,lift_joy
0,1,0.94,1.17,1.02,0.96
1,2,1.01,0.94,1.02,1.02
2,3,0.96,1.05,1.01,1.08
3,4,0.93,1.19,0.97,1.04
4,5,1.08,0.87,0.93,0.93
5,6,1.02,0.95,0.98,1.02
6,7,0.98,1.05,1.01,0.99
7,8,0.93,0.98,1.22,1.09


# Tweets examples

In [8]:
news_tweets.sample(25, random_state=99).merge(news_accounts, on='userId')

Unnamed: 0,tweetId,conversationId,userId,date,content,lang,sourceLabel,replyCount,retweetCount,likeCount,quoteCount,longitude,latitude,place,anger,joy,optimism,sadness,emotion_score,prevalent_emotion,country_x,theme,subtheme,ds,dsw,username,displayname,description,rawDescription,verified,created,followersCount,friendsCount,statusesCount,favouritesCount,listedCount,mediaCount,location,protected,linkUrl,linkTcourl,profileImageUrl,profileBannerUrl,country_y,media_outlet,media_category
0,1.280805e+18,1.280805e+18,9294762.0,2020-07-08 10:06:06+00:00,Covid-19 cases continue unabated in Andhra Pra...,en,SocialFlow,0.0,0.0,2.0,0.0,,,,0.07,0.333,0.542,0.056,0.542,optimism,IN,1,10,2020-07-08,2020-07-12,businessline,Business Line,Business Daily from The Hindu group of newspap...,Business Daily from The Hindu group of newspap...,True,2007-10-07 15:53:59+00:00,81101,753,259377,1210,1303,87187,"Chennai, India",False,http://www.thehindubusinessline.com,https://t.co/BgBFj6enus,https://pbs.twimg.com/profile_images/135936651...,https://pbs.twimg.com/profile_banners/9294762/...,IN,The Hindu,Newspaper
1,1.255242e+18,1.255242e+18,15933690.0,2020-04-28 21:07:52+00:00,#BREAKING: Another 10 people have died after c...,en,SocialFlow,13.0,20.0,27.0,8.0,,,,0.053,0.015,0.028,0.903,0.903,sadness,US,4,40,2020-04-28,2020-05-03,NBCDFW,NBC DFW,The first TV 📺 station in #Texas & the best pl...,The first TV 📺 station in #Texas & the best pl...,True,2008-08-21 17:03:55+00:00,477498,2180,241866,6956,2912,63149,,False,http://www.nbcdfw.com,https://t.co/5g4qcL90td,https://pbs.twimg.com/profile_images/967969509...,https://pbs.twimg.com/profile_banners/15933690...,US,NBC,Television
2,1.241605e+18,1.241605e+18,69271270.0,2020-03-22 05:59:10+00:00,Coronavirus: Being precautious is best https:/...,en,WordPress.com,0.0,2.0,13.0,0.0,,,,0.122,0.041,0.16,0.677,0.677,sadness,NG,5,50,2020-03-22,2020-03-29,daily_trust,Daily Trust,Official Twitter handle of Daily Trust Newspap...,Official Twitter handle of Daily Trust Newspap...,True,2009-08-27 11:58:58+00:00,1858489,839,554081,305,1326,139317,"Abuja, Nigeria",False,http://dailytrust.com,https://t.co/jQsNIpEy9h,https://pbs.twimg.com/profile_images/119901745...,https://pbs.twimg.com/profile_banners/69271273...,NG,Daily Trust,Newspaper
3,1.288342e+18,1.288342e+18,373978900.0,2020-07-29 05:13:31+00:00,"Nigeria discharges 829 COVID-19 patients, high...",en,Twitter for Android,4.0,16.0,76.0,1.0,,,,0.144,0.454,0.314,0.088,0.454,undefined,NG,1,10,2020-07-29,2020-08-02,PremiumTimesng,Premium Times,A multimedia Nigerian publication based in Abu...,A multimedia Nigerian publication based in Abu...,True,2011-09-15 14:08:13+00:00,1667280,322,298754,963,1198,14240,"Abuja, Nigeria",False,http://www.premiumtimesng.com,http://t.co/u2eUV6yuAw,https://pbs.twimg.com/profile_images/913362770...,https://pbs.twimg.com/profile_banners/37397890...,NG,Premium Times,Newspaper
4,1.24627e+18,1.24627e+18,15250660.0,2020-04-04 02:53:26+00:00,Pink has become the latest Hollywood star to a...,en,TweetDeck,16.0,17.0,59.0,3.0,,,,0.033,0.832,0.071,0.065,0.832,joy,AU,5,50,2020-04-04,2020-04-05,newscomauHQ,news.com.au,Australia's number one news site. Bringing you...,Australia's number one news site. Bringing you...,True,2008-06-27 02:03:07+00:00,559636,4225,264447,1905,3632,46717,Australia,False,http://www.news.com.au,https://t.co/2imUDGKnpC,https://pbs.twimg.com/profile_images/109800714...,https://pbs.twimg.com/profile_banners/15250661...,AU,News,Website
5,1.251307e+18,1.251307e+18,18993400.0,2020-04-18 00:28:14+00:00,"""It's not just about -- as far as we know -- s...",en,SocialNewsDesk,0.0,1.0,7.0,0.0,,,,0.052,0.014,0.863,0.071,0.863,optimism,US,1,10,2020-04-18,2020-04-19,abc7newsbayarea,ABC7 News,"#1 source for breaking news, weather, and spor...","#1 source for breaking news, weather, and spor...",True,2009-01-14 20:20:22+00:00,560390,24339,405331,5062,4866,55001,San Francisco Bay Area,False,http://www.abc7news.com,http://t.co/2kLV4sehau,https://pbs.twimg.com/profile_images/875793011...,https://pbs.twimg.com/profile_banners/18993395...,US,ABC (American Broadcasting Company),Television
6,1.295416e+18,1.295416e+18,19000030.0,2020-08-17 17:45:01+00:00,The AMO has at times clashed with the provinci...,en,Buffer,3.0,3.0,2.0,2.0,,,,0.736,0.007,0.043,0.214,0.736,anger,CA,2,20,2020-08-17,2020-08-23,CBCToronto,CBC Toronto,"Your source for GTA breaking, investigative & ...","Your source for GTA breaking, investigative & ...",True,2009-01-14 22:47:52+00:00,614709,934,173191,1814,4336,85153,Toronto,False,http://www.cbc.ca/toronto,https://t.co/B6RD7NB6QA,https://pbs.twimg.com/profile_images/108056115...,https://pbs.twimg.com/profile_banners/19000033...,CA,CBC,Television
7,1.256147e+18,1.256147e+18,5392522.0,2020-05-01 09:02:04+00:00,"So, what exactly is the World Health Organizat...",en,SocialFlow,18.0,82.0,159.0,7.0,,,,0.545,0.023,0.207,0.224,0.545,anger,US,7,71,2020-05-01,2020-05-03,NPR,NPR,News. Arts & Life. Music & more. This is NPR.\...,News. Arts & Life. Music & more. This is NPR.\...,True,2007-04-22 05:10:15+00:00,8744182,68649,215493,2553,69037,9667,,False,http://www.npr.org,http://t.co/SoL86ga4RI,https://pbs.twimg.com/profile_images/138920350...,https://pbs.twimg.com/profile_banners/5392522/...,US,NPR,Radio
8,1.276206e+18,1.276206e+18,3150940000.0,2020-06-25 17:30:00+00:00,"""Breaking up the Goa Forward Party doesn’t rea...",en,TweetDeck,4.0,27.0,110.0,2.0,,,,0.209,0.016,0.158,0.617,0.617,sadness,IN,4,40,2020-06-25,2020-06-28,thewire_in,The Wire,"India's foremost independent news-site, carryi...","India's foremost independent news-site, carryi...",True,2015-04-12 07:32:06+00:00,1077831,287,110239,470,2286,40493,,False,https://thewire.in,https://t.co/uB40W7hztX,https://pbs.twimg.com/profile_images/131159870...,https://pbs.twimg.com/profile_banners/31509402...,IN,The Hindu,Newspaper
9,1.281351e+18,1.281351e+18,16973330.0,2020-07-09 22:15:31+00:00,Girls at risk of never going back to school in...,en,Echobox,0.0,13.0,23.0,1.0,,,,0.058,0.009,0.025,0.907,0.907,sadness,UK,7,70,2020-07-09,2020-07-12,Independent,The Independent,"News, comment and features from The Independen...","News, comment and features from The Independen...",True,2008-10-26 00:00:29+00:00,3566294,646,1122124,54,24941,232882,"London, England",False,http://independent.co.uk,https://t.co/ENon5VCZ03,https://pbs.twimg.com/profile_images/114856191...,https://pbs.twimg.com/profile_banners/16973333...,UK,The Independent,Newspaper


In [8]:
news_tweets.columns

Index(['tweetId', 'conversationId', 'userId', 'date', 'content', 'lang',
       'sourceLabel', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
       'longitude', 'latitude', 'place', 'anger', 'joy', 'optimism', 'sadness',
       'emotion_score', 'prevalent_emotion', 'country', 'theme', 'subtheme',
       'ds', 'dsw'],
      dtype='object')

In [9]:
news_tweets.sample(25, random_state=99).merge(news_accounts, on=('userId','country')).iloc[[4,5,8,18]]

Unnamed: 0,tweetId,conversationId,userId,date,content,lang,sourceLabel,replyCount,retweetCount,likeCount,quoteCount,longitude,latitude,place,anger,joy,optimism,sadness,emotion_score,prevalent_emotion,country,theme,subtheme,ds,dsw,username,displayname,description,rawDescription,verified,created,followersCount,friendsCount,statusesCount,favouritesCount,listedCount,mediaCount,location,protected,linkUrl,linkTcourl,profileImageUrl,profileBannerUrl,media_outlet,media_category
4,1.24627e+18,1.24627e+18,15250660.0,2020-04-04 02:53:26+00:00,Pink has become the latest Hollywood star to a...,en,TweetDeck,16.0,17.0,59.0,3.0,,,,0.033,0.832,0.071,0.065,0.832,joy,AU,5,50,2020-04-04,2020-04-05,newscomauHQ,news.com.au,Australia's number one news site. Bringing you...,Australia's number one news site. Bringing you...,True,2008-06-27 02:03:07+00:00,559636,4225,264447,1905,3632,46717,Australia,False,http://www.news.com.au,https://t.co/2imUDGKnpC,https://pbs.twimg.com/profile_images/109800714...,https://pbs.twimg.com/profile_banners/15250661...,News,Website
5,1.251307e+18,1.251307e+18,18993400.0,2020-04-18 00:28:14+00:00,"""It's not just about -- as far as we know -- s...",en,SocialNewsDesk,0.0,1.0,7.0,0.0,,,,0.052,0.014,0.863,0.071,0.863,optimism,US,1,10,2020-04-18,2020-04-19,abc7newsbayarea,ABC7 News,"#1 source for breaking news, weather, and spor...","#1 source for breaking news, weather, and spor...",True,2009-01-14 20:20:22+00:00,560390,24339,405331,5062,4866,55001,San Francisco Bay Area,False,http://www.abc7news.com,http://t.co/2kLV4sehau,https://pbs.twimg.com/profile_images/875793011...,https://pbs.twimg.com/profile_banners/18993395...,ABC (American Broadcasting Company),Television
8,1.276206e+18,1.276206e+18,3150940000.0,2020-06-25 17:30:00+00:00,"""Breaking up the Goa Forward Party doesn’t rea...",en,TweetDeck,4.0,27.0,110.0,2.0,,,,0.209,0.016,0.158,0.617,0.617,sadness,IN,4,40,2020-06-25,2020-06-28,thewire_in,The Wire,"India's foremost independent news-site, carryi...","India's foremost independent news-site, carryi...",True,2015-04-12 07:32:06+00:00,1077831,287,110239,470,2286,40493,,False,https://thewire.in,https://t.co/uB40W7hztX,https://pbs.twimg.com/profile_images/131159870...,https://pbs.twimg.com/profile_banners/31509402...,The Hindu,Newspaper
18,1.249748e+18,1.249748e+18,24700880.0,2020-04-13 17:16:19+00:00,GO Transit ridership down 90% during pandemic ...,en,Echobox,5.0,1.0,5.0,0.0,,,,0.194,0.009,0.021,0.776,0.776,sadness,CA,2,20,2020-04-13,2020-04-19,TheTorontoSun,Toronto Sun,"Toronto's best local news, sports, entertainme...","Toronto's best local news, sports, entertainme...",True,2009-03-16 14:45:06+00:00,334443,586,232156,877,2157,145768,"365 Bloor St. E., Toronto, ON",False,http://www.torontosun.com,https://t.co/syvPNjzI8W,https://pbs.twimg.com/profile_images/797145490...,https://pbs.twimg.com/profile_banners/24700876...,Toronto Sun,Newspaper


In [10]:
n = news_tweets.sample(25, random_state=99).merge(news_accounts, on=('userId','country'))
n['engagement'] = n[['replyCount', 'retweetCount', 'likeCount', 'quoteCount']].sum(axis=1)
n.iloc[[4,5,8,18]][['content', 'theme', 'subtheme', 'country']]

Unnamed: 0,content,theme,subtheme,country
4,Pink has become the latest Hollywood star to a...,5,50,AU
5,"""It's not just about -- as far as we know -- s...",1,10,US
8,"""Breaking up the Goa Forward Party doesn’t rea...",4,40,IN
18,GO Transit ridership down 90% during pandemic ...,2,20,CA


In [11]:
list(news_tweets.sample(25, random_state=99).merge(news_accounts, on='userId').content)

['Covid-19 cases continue unabated in Andhra Pradesh with the detection of 1,062 new positive cases. https://t.co/mvWswPuRqa',
 '#BREAKING: Another 10 people have died after contracting COVID-19, including a 17-year-old Lancaster girl, according to county health officials who also confirm another 135 new infections. https://t.co/eXPAJdCCN8 https://t.co/K45Dpz5l8I',
 'Coronavirus: Being precautious is best https://t.co/xU2PKXcDCn',
 'Nigeria discharges 829 COVID-19 patients, highest number in one day https://t.co/Ci0WkL5khy',
 'Pink has become the latest Hollywood star to announce she has tested positive for coronavirus.\nhttps://t.co/3kpIeN8apQ',
 '"It\'s not just about -- as far as we know -- social distancing. It\'s about taking a number of steps, implementing a number of procedures to make sure that people feel safe." - @RobertIger https://t.co/ZtLfuVaV43',
 "The AMO has at times clashed with the provincial government during the COVID-19 pandemic, saying last month that emergency fu

In [12]:
theme_desc

Unnamed: 0,theme,theme_desc
0,1,Cases and deaths
1,2,Economic impact
2,3,Educational impact
3,4,People stories
4,5,Authorities & Politics
5,6,Preventive measures
6,7,Virus spreading
7,8,Vaccines and vaccination


In [13]:
list(news_tweets.sample(25, random_state=73).merge(news_accounts, on='userId').content)

["Chile's strong vaccination campaign is getting rewarded in financial markets as traders bet on a faster recovery. https://t.co/IHrrhOw7jw",
 'South African business confidence jumps to the highest level in more than two years as further easing of lockdown restrictions spurred a resurgence in activity https://t.co/aT9ZHbifMw',
 'Himachal Pradesh hoteliers not happy with Unlock 2.0 guidelines, they fear allowing tourists without quarantine could lead to a spike in cases. \n#HimachalPradesh #coronavirus\n(@manjeet_sehgal)\nhttps://t.co/rMkOkHSirH',
 '11 Nigerians die of COVID-19 in South Africa https://t.co/q21wdvZ7uF https://t.co/6SsRRyoKbF',
 'Second man dies of #coronavirus in US: health officials  (AFP) \n\n#CoronavirusOutbreak #CoronaVirusUpdate \n\nKeep following our LIVE blog: https://t.co/DCrjmPTiy9 https://t.co/rOQhow6fzY',
 'Toronto has administered more than 923,000 vaccine doses in total. https://t.co/5flVuhTYVB',
 'Melania Trump defends her work on the White House tennis pa

In [14]:
list(news_tweets.sample(25, random_state=13).merge(news_accounts, on='userId').iloc[[3,9,16,21]].content)

['Mike Pompeo Says "Enormous Evidence" Virus Came From Wuhan Lab https://t.co/AhjvLi6Iuj',
 'Students were informed via email that a student has contracted the coronavirus and contact tracing is underway. \n\nUCC did not specify the department the student was in.\n\n#Coronavirus\n#COVID19\n\nhttps://t.co/7sgRrL9WUX',
 'Maureen said she has been in home quarantine for two weeks and the experience has taken a toll on her mental health. She assured, however, that she is "much better now" and she is "grateful to be alive."\n\nhttps://t.co/7kON6kpBhH',
 'PNP Caraga to probe source of flight information of 1st COVID-19 patient https://t.co/DLJxkIRBwa https://t.co/6i4DxF17iG']

In [15]:
comments['clen'] = comments.content.apply(lambda x: len(x))

In [16]:
c = pd.concat([
    comments[(comments.prevalent_emotion=='anger') & (comments.clen>100)].sample(5, random_state=13),
    comments[(comments.prevalent_emotion=='sadness') & (comments.clen>100)].sample(5, random_state=13),
    comments[(comments.prevalent_emotion=='optimism') & (comments.clen>100)].sample(5, random_state=13),
    comments[(comments.prevalent_emotion=='joy') & (comments.clen>100)].sample(5, random_state=13),
])
c

Unnamed: 0,tweetId,conversationId,userId,date,content,lang,sourceLabel,replyCount,retweetCount,likeCount,quoteCount,longitude,latitude,place,anger,joy,optimism,sadness,news_date,news_id,news_prevalent_emotion,news_emotion_score,news_anger,news_sadness,news_optimism,news_joy,theme,subtheme,country,emotion_score,prevalent_emotion,continent,ds,dsw,dsm,clen
4843470,1.275432e+18,1.275176e+18,3188441000.0,2020-06-23 14:15:11+00:00,"@ABC However, most people know when he's ""joki...",en,Twitter for Android,0.0,0.0,0.0,0.0,,,,0.834,0.01,0.109,0.047,2020-06-22 21:49:22+00:00,28785490.0,anger,0.662,0.662,0.156,0.159,0.024,5,52,US,0.834,anger,America,2020-06-22,2020-06-28,2020-06-01,254
15124836,1.258527e+18,1.25852e+18,1443213000.0,2020-05-07 22:38:33+00:00,@channelstv Which experts? What has they all d...,en,Twitter for Android,0.0,0.0,1.0,0.0,,,,0.722,0.016,0.1,0.162,2020-05-07 22:11:26+00:00,125346900.0,sadness,0.69,0.206,0.69,0.09,0.013,6,61,NG,0.722,anger,Africa,2020-05-07,2020-05-10,2020-05-01,142
238376,1.289478e+18,1.289427e+18,466426000.0,2020-08-01 08:27:13+00:00,@Mary__Darly @abcnews Did I argue against othe...,en,Twitter for iPhone,0.0,0.0,0.0,0.0,,,,0.89,0.006,0.033,0.071,2020-08-01 05:07:18+00:00,2768501.0,anger,0.722,0.722,0.232,0.035,0.01,6,61,AU,0.89,anger,Oceania,2020-08-01,2020-08-02,2020-08-01,147
16093835,1.246718e+18,1.246636e+18,398153700.0,2020-04-05 08:36:34+00:00,@IndiaToday @delayedjab Who is at fault for no...,en,Twitter for iPhone,0.0,0.0,0.0,0.0,,,,0.823,0.006,0.023,0.148,2020-04-05 03:10:49+00:00,19897140.0,sadness,0.96,0.017,0.96,0.01,0.012,3,30,IN,0.823,anger,Asia,2020-04-05,2020-04-12,2020-04-01,306
2650156,1.363747e+18,1.363746e+18,1.258519e+18,2021-02-22 07:08:31+00:00,"@BBCWorld Too early, given in to the pressure ...",en,Twitter for iPhone,1.0,0.0,0.0,0.0,,,,0.76,0.008,0.043,0.188,2021-02-22 07:02:07+00:00,742143.0,optimism,0.651,0.156,0.089,0.651,0.105,6,61,UK,0.76,anger,Europe,2021-02-22,2021-02-28,2021-02-01,156
12161058,1.286159e+18,1.285012e+18,9.991129e+17,2020-07-23 04:41:18+00:00,@mylesbecker @RoyThomas77 @Sheri_lp @ABC Excep...,en,Twitter Web App,0.0,0.0,0.0,0.0,,,,0.111,0.013,0.05,0.825,2020-07-20 00:41:22+00:00,28785490.0,undefined,0.392,0.223,0.392,0.374,0.012,4,40,US,0.825,sadness,America,2020-07-20,2020-07-26,2020-07-01,170
16135132,1.236284e+18,1.236128e+18,94761190.0,2020-03-07 13:32:41+00:00,"Two more cases, including one person from Tami...",en,Twitter Web App,14.0,112.0,158.0,28.0,,,,0.153,0.029,0.026,0.792,2020-03-07 03:12:28+00:00,94761190.0,undefined,0.473,0.225,0.473,0.274,0.028,7,72,IN,0.792,sadness,Asia,2020-03-07,2020-03-08,2020-03-01,257
17616466,1.242017e+18,1.241998e+18,1.210952e+18,2020-03-23 09:15:47+00:00,@STAYatHOMEfools @rtenews @SimonHarrisTD @morn...,en,Twitter for Android,1.0,0.0,2.0,0.0,,,,0.107,0.006,0.037,0.85,2020-03-23 07:57:51+00:00,8973062.0,undefined,0.401,0.26,0.401,0.303,0.037,5,52,IE,0.85,sadness,Europe,2020-03-23,2020-03-29,2020-03-01,307
3436332,1.256676e+18,1.256263e+18,1.249247e+18,2020-05-02 20:05:10+00:00,@maniac_angelo @2020DoOver @deidesk @CNN Do yo...,en,Twitter Web App,0.0,0.0,0.0,0.0,,,,0.325,0.013,0.036,0.627,2020-05-01 16:45:03+00:00,759251.0,undefined,0.392,0.089,0.204,0.315,0.392,2,21,US,0.627,sadness,America,2020-05-01,2020-05-03,2020-05-01,152
792617,1.383766e+18,1.383739e+18,977030500.0,2021-04-18 12:53:29+00:00,@MarkIrv64245171 @BBCScotlandNews My dad was t...,en,Twitter for Android,1.0,0.0,0.0,0.0,,,,0.157,0.008,0.015,0.82,2021-04-18 11:08:07+00:00,15687510.0,sadness,0.816,0.126,0.816,0.047,0.011,4,41,UK,0.82,sadness,Europe,2021-04-18,2021-04-25,2021-04-01,302


In [17]:
[str(cnt) + ' | ' + text for cnt, text in enumerate(c.content)]

['0 | @ABC However, most people know when he\'s "joking" and when he\'s not. If the comment indicates harm to a group, he\'s saying exactly what he wants to happen. If he says he\'s going to do something positive for people or the country,  you can bet he\'s lying.',
 '1 | @channelstv Which experts? What has they all done to make Nigeria better? These ones are just normal and random person jawe. Expert for where?',
 "2 | @Mary__Darly @abcnews Did I argue against other approaches as well? No. I don't know why this conversation is still going on so I'll leave it here.",
 '3 | @IndiaToday @delayedjab Who is at fault for not quarantine the youth at the airport ?Can modi answer why he lied to the world during teleconference declaring airport testing &amp;quarantining was inplace from jan 15 ?We had an OPPERTUNITY to learn from China then from italy but we missed that great chance',
 '4 | @BBCWorld Too early, given in to the pressure of a small group of loud voices in parliament again. @Bori

3 "(...) He doesn’t respect the reporters neither the people that are at the frontline fighting for this virus (...)"
7 "(...) so many doctors in Europe who have to decide who gets a respirator and who has to die (...)"
12 "(...) I do hope you and the rest of the 1743 family are well (...)"
17 "(...) I heard this as well via a reputable news source. You're not imagining it. :) (...)"

In [18]:
c['engagement'] = c[['replyCount', 'retweetCount', 'likeCount', 'quoteCount']].sum(axis=1)
c.iloc[[3,7,12,17]][['content', 'prevalent_emotion', 'emotion_score', 'country']]

Unnamed: 0,content,prevalent_emotion,emotion_score,country
16093835,@IndiaToday @delayedjab Who is at fault for no...,anger,0.823,IN
17616466,@STAYatHOMEfools @rtenews @SimonHarrisTD @morn...,sadness,0.85,IE
5909728,@CNN That news is encouraging but vaccines don...,optimism,0.822,US
5486092,@charles00263199 @Mingfan92868688 @Orangishly ...,joy,0.572,US


N/C | Content | Prevalent Emotion | Score | Theme | Subtheme | Engagement

# Analysis

In [9]:
g = comments[emotions].mean()
g

anger       0.538529
sadness     0.194306
optimism    0.143848
joy         0.123318
dtype: float64

In [10]:
c0 = comments.groupby(['country', 'theme', 'subtheme', 'news_prevalent_emotion']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c0['lift_' + col] = c0[col] / g[col]

c0[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.775207
lift_sadness     4.312794
lift_optimism    3.322962
lift_joy         7.533361
dtype: float64

In [11]:
c1 = comments.groupby(['country', 'theme', 'subtheme']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c1['lift_' + col] = c1[col] / g[col]

c1[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.775207
lift_sadness     3.427591
lift_optimism    2.280192
lift_joy         2.319745
dtype: float64

In [12]:
c2 = comments.groupby(['country', 'theme']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c2['lift_' + col] = c2[col] / g[col]

c2[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.121885
lift_sadness     1.579353
lift_optimism    1.735948
lift_joy         1.445416
dtype: float64

In [13]:
c3 = comments.groupby(['country']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c3['lift_' + col] = c3[col] / g[col]

c3[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.043752
lift_sadness     1.198346
lift_optimism    1.596843
lift_joy         1.185100
dtype: float64

In [14]:
c31 = comments.groupby(['continent', 'country', 'dsm']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c31['lift_' + col] = c31[col] / g[col]

c31[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.185070
lift_sadness     1.505039
lift_optimism    2.055937
lift_joy         1.462449
dtype: float64

In [15]:
c4 = comments.groupby(['theme']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c4['lift_' + col] = c4[col] / g[col]

c4[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.080210
lift_sadness     1.190233
lift_optimism    1.216322
lift_joy         1.093226
dtype: float64

In [16]:
c41 = comments.groupby(['theme', 'country']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c41['lift_' + col] = c41[col] / g[col]

c41[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.121885
lift_sadness     1.579353
lift_optimism    1.735948
lift_joy         1.445416
dtype: float64

In [17]:
c42 = comments.groupby(['theme', 'dsw']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c42['lift_' + col] = c42[col] / g[col]

c42[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.494194
lift_sadness     2.382523
lift_optimism    2.875389
lift_joy         1.566086
dtype: float64

In [18]:
c5 = comments.groupby(['subtheme']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c5['lift_' + col] = c5[col] / g[col]

c5[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.145032
lift_sadness     1.511870
lift_optimism    1.216322
lift_joy         1.120586
dtype: float64

In [19]:
c51 = comments.groupby(['subtheme', 'country']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c51['lift_' + col] = c51[col] / g[col]

c51[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.775207
lift_sadness     3.427591
lift_optimism    2.280192
lift_joy         2.319745
dtype: float64

In [20]:
c52 = comments.groupby(['subtheme', 'news_prevalent_emotion']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c52['lift_' + col] = c52[col] / g[col]

c52[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.217813
lift_sadness     1.713152
lift_optimism    1.350780
lift_joy         7.533361
dtype: float64

In [21]:
c6 = comments.groupby(['news_prevalent_emotion']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c6['lift_' + col] = c6[col] / g[col]

c6[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.156106
lift_sadness     1.162794
lift_optimism    1.174986
lift_joy         1.617616
dtype: float64

In [22]:
c61 = comments.groupby(['news_prevalent_emotion', 'country']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c61['lift_' + col] = c61[col] / g[col]

c61[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.188645
lift_sadness     1.410114
lift_optimism    1.948920
lift_joy         2.032791
dtype: float64

In [23]:
c7 = comments.groupby(['dsw']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c7['lift_' + col] = c7[col] / g[col]

c7[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.104867
lift_sadness     1.342395
lift_optimism    1.230986
lift_joy         1.108795
dtype: float64

In [24]:
c8 = comments.groupby(['continent']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c8['lift_' + col] = c8[col] / g[col]

c8[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.037933
lift_sadness     1.106692
lift_optimism    1.273735
lift_joy         1.173476
dtype: float64

In [25]:
c81 = comments.groupby(['continent', 'dsm']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c81['lift_' + col] = c81[col] / g[col]

c81[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.112782
lift_sadness     1.418359
lift_optimism    1.540460
lift_joy         1.364790
dtype: float64

In [26]:
c9 = comments.groupby(['continent', 'country', 'news_id']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c9['lift_' + col] = c9[col] / g[col]

c9[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.148388
lift_sadness     2.312080
lift_optimism    5.575347
lift_joy         5.817179
dtype: float64

In [27]:
c91 = c9[c9.tweetId>1000].merge(news_accounts.drop('country',axis=1), left_on='news_id', right_on='userId', how='left')

# Conclusions

#### T1: joyful news reflect in more joyful comments, mainly in IE, IN and PH

In [28]:
c6

Unnamed: 0,news_prevalent_emotion,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
0,anger,0.622596,0.149089,0.122179,0.106135,3114730,1.156106,0.767292,0.849364,0.860664
1,joy,0.47075,0.186018,0.143751,0.199481,739091,0.87414,0.957348,0.999329,1.617616
2,optimism,0.518069,0.18201,0.169019,0.130902,2679893,0.962008,0.936722,1.174986,1.061496
3,sadness,0.514258,0.225937,0.141646,0.118159,6931213,0.954931,1.162794,0.984697,0.958165
4,undefined,0.541249,0.184842,0.147544,0.126365,4156245,1.005051,0.951293,1.0257,1.024707


In [29]:
c6[c6.lift_joy>1.5]

Unnamed: 0,news_prevalent_emotion,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
1,joy,0.47075,0.186018,0.143751,0.199481,739091,0.87414,0.957348,0.999329,1.617616


In [95]:
c61[c61.news_prevalent_emotion=='joy'].sort_values('lift_joy')

Unnamed: 0,news_prevalent_emotion,country,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
13,joy,CA,0.492252,0.198934,0.146259,0.162554,56064,0.914068,1.023819,1.016767,1.318167
12,joy,AU,0.509158,0.184852,0.137281,0.1687,22123,0.945462,0.951348,0.95435,1.368006
23,joy,ZA,0.451983,0.218784,0.146514,0.182723,13690,0.839292,1.125977,1.018534,1.481719
16,joy,KE,0.423523,0.214678,0.172986,0.188811,6342,0.786445,1.104846,1.202566,1.531086
22,joy,US,0.4926,0.178055,0.135752,0.193592,415149,0.914715,0.916365,0.943723,1.569862
17,joy,MY,0.425265,0.216059,0.163382,0.195305,2614,0.789679,1.111954,1.135798,1.583748
18,joy,NG,0.431903,0.192928,0.162623,0.212556,32514,0.802006,0.992909,1.130523,1.723642
19,joy,NZ,0.4418,0.197077,0.146927,0.21421,3821,0.820384,1.014261,1.021406,1.737055
21,joy,UK,0.448647,0.194351,0.139845,0.217158,114511,0.833098,1.000234,0.972174,1.760955
15,joy,IN,0.397255,0.191865,0.174086,0.236793,43915,0.737667,0.98744,1.210211,1.920178


#### News about vaccines generate more positive feelings (optmism and joy), mainly in PH, In and NZ

In [32]:
c4

Unnamed: 0,theme,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
0,1,0.50849,0.226485,0.146497,0.118529,2060238,0.94422,1.165613,1.018416,0.961165
1,2,0.545863,0.181925,0.146863,0.125349,3170045,1.01362,0.936281,1.020964,1.016469
2,3,0.518199,0.203966,0.144652,0.133183,1433937,0.96225,1.049717,1.005593,1.079992
3,4,0.501165,0.231269,0.139131,0.128435,1131852,0.93062,1.190233,0.967212,1.041491
4,5,0.581724,0.168706,0.134432,0.115137,3395133,1.08021,0.868248,0.934547,0.933659
5,6,0.548773,0.185235,0.14075,0.125241,2421520,1.019023,0.953319,0.978468,1.015595
6,7,0.529069,0.203188,0.145168,0.122575,3225078,0.982435,1.045713,1.009177,0.993977
7,8,0.499114,0.191108,0.174965,0.134815,783369,0.926809,0.983544,1.216322,1.093226


In [33]:
c41[c41.theme==8]

Unnamed: 0,theme,country,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
84,8,AU,0.511112,0.206489,0.167495,0.114909,27426,0.949089,1.0627,1.164393,0.931808
85,8,CA,0.515879,0.197412,0.171567,0.115142,87985,0.957941,1.015986,1.192698,0.933701
86,8,IE,0.50255,0.203974,0.165118,0.12836,23588,0.93319,1.049758,1.147867,1.040884
87,8,IN,0.46005,0.188616,0.192018,0.159315,57862,0.854272,0.970718,1.33487,1.291904
88,8,KE,0.476089,0.212513,0.181214,0.130195,9204,0.884054,1.093704,1.259765,1.055767
89,8,MY,0.476425,0.210975,0.17862,0.133974,3112,0.88468,1.085791,1.24173,1.086412
90,8,NG,0.475462,0.194122,0.179305,0.151116,16202,0.882891,0.999056,1.24649,1.22542
91,8,NZ,0.489096,0.201345,0.18485,0.124717,2447,0.908207,1.036226,1.285038,1.011342
92,8,PH,0.409323,0.193287,0.241604,0.155786,22993,0.760077,0.994755,1.679581,1.263282
93,8,UK,0.490705,0.202529,0.171728,0.135039,142626,0.911196,1.042323,1.19382,1.095042


#### Comments in first weeks of 2020 had more sadness

In [34]:
c7[c7.lift_sadness>1.3]

Unnamed: 0,dsw,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
0,2020-01-05,0.490615,0.256561,0.177074,0.075777,148,0.911028,1.320398,1.230986,0.614484
2,2020-01-19,0.473413,0.260835,0.15333,0.112425,1604,0.879086,1.342395,1.065923,0.911668
6,2020-02-16,0.466971,0.256691,0.150745,0.125598,33225,0.867123,1.321067,1.047948,1.018489


In [35]:
c7.tail(16)

Unnamed: 0,dsw,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
54,2021-01-17,0.519596,0.200317,0.150824,0.129261,232456,0.964844,1.030937,1.048502,1.04819
55,2021-01-24,0.530544,0.194012,0.14723,0.128216,218409,0.985173,0.99849,1.023512,1.03972
56,2021-01-31,0.539758,0.189898,0.147214,0.123129,257593,1.002283,0.977317,1.023405,0.998469
57,2021-02-07,0.514059,0.205686,0.153554,0.126699,192210,0.954562,1.058571,1.06748,1.027412
58,2021-02-14,0.533318,0.193291,0.147002,0.126389,182424,0.990323,0.994779,1.02193,1.024901
59,2021-02-21,0.527288,0.191913,0.150863,0.129937,179161,0.979126,0.987685,1.04877,1.053674
60,2021-02-28,0.527865,0.197351,0.150858,0.123926,181998,0.980199,1.015672,1.048739,1.004929
61,2021-03-07,0.537935,0.18118,0.148929,0.131958,204437,0.998897,0.932449,1.035323,1.070059
62,2021-03-14,0.521773,0.190466,0.156458,0.131303,156749,0.968886,0.980241,1.087665,1.064753
63,2021-03-21,0.531838,0.197153,0.148465,0.122549,162834,0.987575,1.014655,1.032099,0.99376


#### News and comments related to 'Nursing homes and elderly victims', 'Mental health impact' & 'Family stories'

In [36]:
c5[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].min()

lift_anger       0.799227
lift_sadness     0.761694
lift_optimism    0.768057
lift_joy         0.853595
dtype: float64

In [98]:
c5.sort_values('lift_sadness')

Unnamed: 0,subtheme,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
19,63,0.616633,0.148001,0.123286,0.11208,76504,1.145032,0.761694,0.857057,0.908868
15,53,0.598094,0.148752,0.141654,0.111496,64886,1.110607,0.765559,0.984751,0.904136
14,52,0.60867,0.153665,0.126811,0.110852,1737928,1.130246,0.790841,0.881565,0.898913
13,51,0.584499,0.172799,0.137439,0.105264,339524,1.085362,0.889314,0.955449,0.853595
20,64,0.568206,0.17372,0.137663,0.120412,176446,1.055108,0.894055,0.957008,0.976431
5,22,0.559896,0.175178,0.148972,0.115953,651866,1.039677,0.901561,1.035621,0.940275
18,62,0.556871,0.181196,0.140668,0.121265,142288,1.03406,0.932532,0.977899,0.983351
17,61,0.558695,0.182442,0.136233,0.122631,1161968,1.037446,0.938942,0.947064,0.994425
4,21,0.548541,0.182998,0.148917,0.119546,882416,1.018592,0.941803,1.035239,0.969407
3,20,0.538827,0.184034,0.144915,0.132224,1635763,1.000553,0.947137,1.007421,1.072221


In [38]:
c52[(c52.subtheme>40) & (c52.subtheme<50)]

Unnamed: 0,subtheme,news_prevalent_emotion,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
40,41,anger,0.559051,0.238925,0.118639,0.083385,545,1.038109,1.229634,0.824752,0.676181
41,41,joy,0.326154,0.177666,0.174891,0.321268,2269,0.605639,0.914366,1.215809,2.605193
42,41,optimism,0.507413,0.212066,0.1787,0.10181,653,0.942222,1.091404,1.242286,0.825589
43,41,sadness,0.48985,0.297045,0.118391,0.094708,13738,0.909608,1.528754,0.82303,0.767997
44,41,undefined,0.440861,0.2503,0.125393,0.183427,806,0.81864,1.288178,0.87171,1.487428
45,42,anger,0.506,0.332875,0.145,0.016,8,0.939597,1.713152,1.008012,0.129746
46,42,joy,0.012,0.0145,0.0445,0.929,2,0.022283,0.074625,0.309355,7.533361
47,42,optimism,0.42443,0.250689,0.194306,0.130594,470,0.788128,1.290181,1.35078,1.058998
48,42,sadness,0.431752,0.296824,0.155469,0.115959,6601,0.801724,1.527615,1.080789,0.940322
49,42,undefined,0.385147,0.296062,0.168752,0.150124,129,0.715184,1.523693,1.17313,1.217372


In [39]:
c5[c5.lift_sadness>1]

Unnamed: 0,subtheme,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
0,10,0.505693,0.225869,0.146352,0.122087,1380703,0.939026,1.162442,1.017412,0.990014
1,11,0.515926,0.228263,0.145613,0.110199,611967,0.958029,1.174761,1.012272,0.893613
2,12,0.498289,0.222977,0.157452,0.121281,67568,0.925279,1.14756,1.094576,0.983483
6,30,0.518199,0.203966,0.144652,0.133183,1433937,0.96225,1.049717,1.005593,1.079992
7,40,0.504813,0.224564,0.141957,0.128667,934895,0.937392,1.155726,0.986859,1.043372
8,41,0.469766,0.275075,0.128016,0.127135,18011,0.872315,1.415681,0.889943,1.030949
9,42,0.430407,0.293765,0.158196,0.117639,7210,0.799227,1.51187,1.099747,0.953945
10,43,0.562409,0.196294,0.110483,0.130812,24403,1.044343,1.010233,0.768057,1.060772
11,44,0.47518,0.271194,0.126368,0.127257,147333,0.882366,1.39571,0.878486,1.031937
16,60,0.524864,0.19481,0.148838,0.131488,834849,0.974626,1.002595,1.034693,1.06625


In [40]:
c51[c51.lift_sadness>1.5]

Unnamed: 0,subtheme,country,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
20,11,PH,0.340909,0.293267,0.209146,0.156691,15996,0.633037,1.509308,1.453941,1.270622
32,12,PH,0.318746,0.301208,0.248849,0.131212,1471,0.591884,1.550177,1.72995,1.064013
97,41,CA,0.452612,0.331748,0.123237,0.092405,1349,0.840459,1.707352,0.856721,0.749326
98,41,IE,0.426029,0.356193,0.125415,0.092364,966,0.791098,1.833156,0.871861,0.748993
99,41,IN,0.159562,0.501062,0.143875,0.1955,16,0.296293,2.578734,1.000191,1.585331
100,41,MY,0.417,0.54,0.032,0.011,1,0.774332,2.779128,0.222458,0.0892
101,41,NG,0.3085,0.666,0.01475,0.011,4,0.572857,3.427591,0.102539,0.0892
103,41,PH,0.161533,0.394333,0.157933,0.286067,15,0.299953,2.029449,1.097922,2.319745
106,41,ZA,0.27855,0.354,0.13115,0.23635,20,0.517243,1.821872,0.911729,1.916588
110,42,IN,0.373195,0.346887,0.174714,0.105173,133,0.692991,1.785266,1.21458,0.852859


#### Emotions per continent
- America and Oceania have more anger
- Asia is more optimistic
- Europe has more sadness than average
- Africa have more joy and sadness

In [41]:
c8

Unnamed: 0,continent,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
0,Africa,0.503765,0.210874,0.151471,0.13389,1405274,0.935447,1.085269,1.052998,1.085729
1,America,0.558957,0.184328,0.138787,0.117928,10818015,1.037933,0.94865,0.964823,0.956289
2,Asia,0.460086,0.211979,0.183224,0.144711,1468385,0.85434,1.090956,1.273735,1.173476
3,Europe,0.514436,0.215037,0.14091,0.129617,3048041,0.955262,1.106692,0.979581,1.051079
4,Oceania,0.557224,0.189218,0.138358,0.1152,881457,1.034716,0.973818,0.961835,0.934167


In [42]:
c42.query("theme==8").tweetId.mean()

11353.173913043478

In [43]:
c42[c42.dsw>pd.to_datetime('2021-01-01')].sort_values(['dsw', 'theme']).tail(50)

Unnamed: 0,theme,dsw,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
483,7,2021-03-21,0.502419,0.217605,0.151112,0.128865,21063,0.932947,1.119909,1.050501,1.044983
552,8,2021-03-21,0.516568,0.205866,0.159567,0.118003,27458,0.959221,1.059498,1.109281,0.956902
64,1,2021-03-28,0.50932,0.222158,0.145194,0.123331,17208,0.945761,1.143345,1.009363,1.000102
134,2,2021-03-28,0.524075,0.194118,0.151275,0.13053,32379,0.973161,0.999036,1.051632,1.058481
204,3,2021-03-28,0.519259,0.186279,0.141227,0.153235,17272,0.964217,0.958692,0.98178,1.242597
274,4,2021-03-28,0.470035,0.233486,0.152711,0.143773,7019,0.872813,1.201645,1.061618,1.165868
344,5,2021-03-28,0.546214,0.179787,0.143127,0.13087,21913,1.01427,0.92528,0.994993,1.06124
414,6,2021-03-28,0.531679,0.193926,0.150564,0.123829,20856,0.98728,0.998044,1.046691,1.004141
484,7,2021-03-28,0.518256,0.222922,0.143355,0.115468,25546,0.962355,1.147273,0.996573,0.936345
553,8,2021-03-28,0.513803,0.188707,0.169442,0.128053,16149,0.954086,0.971188,1.177926,1.038399


#### African event of high incidence of anger
- In 2020 October there was a news published by a big Nigerian TV Channel saying that people were looting COVID-19 resources, and commenters angrily commented that this was not true.

In [44]:
c81[c81.continent=='Africa']

Unnamed: 0,continent,dsm,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
0,Africa,2020-01-01,0.513392,0.232159,0.141663,0.112805,3715,0.953323,1.194816,0.984812,0.914747
1,Africa,2020-02-01,0.532897,0.200581,0.130265,0.136258,37040,0.989542,1.032299,0.905575,1.104933
2,Africa,2020-03-01,0.476036,0.22222,0.160855,0.140889,238527,0.883956,1.143661,1.11823,1.142484
3,Africa,2020-04-01,0.49839,0.210906,0.155624,0.13508,369594,0.925466,1.085436,1.081867,1.09538
4,Africa,2020-05-01,0.504702,0.207826,0.157962,0.129511,199529,0.937188,1.069585,1.098124,1.050215
5,Africa,2020-06-01,0.506657,0.211102,0.146113,0.136128,78938,0.940818,1.086445,1.015748,1.103874
6,Africa,2020-07-01,0.4977,0.224397,0.147353,0.13055,87817,0.924185,1.154867,1.024366,1.058647
7,Africa,2020-08-01,0.51762,0.202318,0.144489,0.135573,49539,0.961174,1.041236,1.00446,1.099375
8,Africa,2020-09-01,0.510034,0.197073,0.147228,0.145663,31331,0.947088,1.014242,1.0235,1.181201
9,Africa,2020-10-01,0.599265,0.176383,0.116469,0.107883,53715,1.112782,0.907761,0.809668,0.874838


In [45]:
comments[(comments.country=='NG') & (comments.dsm=='2020-10-01') & (comments.prevalent_emotion=='anger')].conversationId.value_counts(normalize=True).cumsum().head(25).index

Float64Index([1.3166856469840773e+18, 1.3171886046077379e+18,
               1.319231803396395e+18, 1.3193416341155226e+18,
              1.3193295694684856e+18,  1.319997321745191e+18,
              1.3183019495995105e+18, 1.3206771328220856e+18,
               1.320977142126125e+18,  1.319737814427312e+18,
              1.3195245308141322e+18,  1.319717867835822e+18,
              1.3196266964091003e+18,  1.318750934030295e+18,
              1.3171733915148657e+18, 1.3210016620902728e+18,
              1.3209854470199828e+18, 1.3148242603392778e+18,
              1.3183316844634972e+18,  1.319709058312491e+18,
              1.3206903975517102e+18,  1.320967983154393e+18,
              1.3209706091285463e+18,  1.319993214435197e+18,
              1.3206377425822884e+18],
             dtype='float64')

In [46]:
c31[c31.continent=='Africa']

Unnamed: 0,continent,country,dsm,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
0,Africa,KE,2020-01-01,0.528234,0.240243,0.133353,0.09817,1274,0.980883,1.236416,0.927046,0.796074
1,Africa,KE,2020-02-01,0.58433,0.206586,0.126127,0.082958,11979,1.08505,1.063201,0.876808,0.672711
2,Africa,KE,2020-03-01,0.486895,0.23223,0.164273,0.116602,46117,0.90412,1.19518,1.141996,0.945538
3,Africa,KE,2020-04-01,0.489843,0.224633,0.169144,0.116382,44919,0.909595,1.156081,1.175856,0.943755
4,Africa,KE,2020-05-01,0.477085,0.234043,0.171259,0.117607,24751,0.885904,1.204511,1.190561,0.953691
5,Africa,KE,2020-06-01,0.475983,0.22458,0.160338,0.139101,12731,0.883857,1.155808,1.114635,1.127988
6,Africa,KE,2020-07-01,0.45918,0.248792,0.163743,0.128279,17545,0.852657,1.280414,1.138306,1.040226
7,Africa,KE,2020-08-01,0.4945,0.234219,0.149402,0.121878,10997,0.918242,1.205414,1.038617,0.988318
8,Africa,KE,2020-09-01,0.503485,0.204077,0.154163,0.138285,8642,0.934926,1.050288,1.071711,1.121366
9,Africa,KE,2020-10-01,0.495175,0.247604,0.149722,0.107505,7925,0.919496,1.2743,1.040836,0.871767


Filtering the 25 Nigerian news tweets in Oct 2020 that have more prevalent anger comments (they represent 60% of anger comments)

In [47]:
news_tweets[news_tweets.tweetId.isin(comments[(comments.country=='NG') & (comments.dsm=='2020-10-01') & (comments.prevalent_emotion=='anger')].conversationId.value_counts(normalize=True).cumsum().head(25).index)].content.values

array(['The FCT Security Committee has just announced a ban on all #EndSARS street demonstrations, protests, and processions anywhere in Abuja.\n\nThe Committee accused protesters of violating COVID-19 guidelines regulating public gatherings, as well as endangering their own lives. https://t.co/rhgvTEFpPq',
       '[BREAKING] Resumption: 181 students, staff contract COVID-19 in Lagos private school https://t.co/AxssGKbjwE',
       '181 Students Test Positive For COVID-19 In Lekki, Lagos\nhttps://t.co/2GX2XGWAWQ https://t.co/TT2YYdltbh',
       '#EndSARS protest: Nigeria should prepare for increase in COVID-19 cases ― FG warns https://t.co/gv0zy6sAqT',
       '#ENDSARS protest: Nigeria should prepare for increase in COVID-19 cases — FG https://t.co/4JVbQJI7Nh',
       'Protesters should kindly vacate the roads and allow supplies, especially food, to get to the people. The economy should not be asphyxiated, otherwise the combined effects of the protests and COVID-19 pandemic could collap

#### NewsId and emotions
- News accounts that have more anger comments are likely to be from America and related to politics
- News accounts with optimistic comments are from the philippines
- News accounts with sad comments are from india
- BB Radio 4 is a case with joyful comments

In [48]:
c9[c9.lift_joy>2]

Unnamed: 0,continent,country,news_id,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
7,Africa,KE,632859300.0,0.327115,0.200474,0.195462,0.276934,392,0.607423,1.031748,1.358812,2.245685
11,Africa,NG,15819470.0,0.343632,0.202714,0.182877,0.270777,1451,0.638094,1.043274,1.271327,2.195763
30,Africa,NG,1698798000.0,0.267,0.0668,0.3424,0.3234,5,0.495795,0.343788,2.380298,2.622485
95,America,US,15635600.0,0.438421,0.197737,0.071842,0.292316,19,0.814109,1.017659,0.499432,2.37042
125,Asia,IN,31632900.0,0.325366,0.177023,0.160806,0.336801,1379,0.604176,0.911056,1.117895,2.731152
133,Asia,IN,52535970.0,0.167969,0.192123,0.124723,0.515138,65,0.311904,0.988768,0.86705,4.177313
134,Asia,IN,92506190.0,0.319943,0.229429,0.137457,0.313343,35,0.594105,1.180762,0.955575,2.540931
156,Asia,IN,2348042000.0,0.268096,0.270543,0.210055,0.251247,470,0.49783,1.392356,1.460264,2.037387
162,Asia,IN,9.633674e+17,0.402309,0.193582,0.121673,0.282491,55,0.747052,0.996275,0.845845,2.290749
174,Asia,MY,477490500.0,0.2095,0.11475,0.271,0.4045,4,0.389023,0.590565,1.883939,3.280134


In [49]:
c9[c9.lift_optimism>2]

Unnamed: 0,continent,country,news_id,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy
30,Africa,NG,1698798000.0,0.267,0.0668,0.3424,0.3234,5,0.495795,0.343788,2.380298,2.622485
116,Asia,IN,9294762.0,0.25114,0.199962,0.307665,0.241182,650,0.466345,1.029109,2.138824,1.955767
141,Asia,IN,261113900.0,0.136,0.031,0.802,0.031,1,0.25254,0.159542,5.575347,0.251382
180,Asia,PH,17644830.0,0.264892,0.220842,0.395969,0.118279,2300,0.49188,1.136571,2.7527,0.959138
188,Asia,PH,63411620.0,0.296483,0.187066,0.344509,0.171927,1420,0.550543,0.962742,2.39496,1.394173
197,Asia,PH,8.233496e+17,0.287268,0.21767,0.329314,0.165775,1783,0.53343,1.120247,2.289327,1.344288


In [50]:
nt = news_tweets.merge(theme_desc)

In [51]:
nt.theme_desc.value_counts(normalize=True)

Economic impact             0.196295
Virus spreading             0.181703
Cases and deaths            0.166808
Preventive measures         0.143179
Authorities & Politics      0.111972
Educational impact          0.091092
People stories              0.057567
Vaccines and vaccination    0.051384
Name: theme_desc, dtype: float64

In [52]:
pd.options.display.max_rows=100

In [53]:
nt.groupby('country').theme_desc.value_counts(normalize=True)

country  theme_desc              
AU       Economic impact             0.167726
         Virus spreading             0.155865
         Preventive measures         0.148459
         People stories              0.144455
         Cases and deaths            0.138862
         Authorities & Politics      0.110512
         Educational impact          0.094835
         Vaccines and vaccination    0.039285
CA       Economic impact             0.195556
         Virus spreading             0.162582
         Cases and deaths            0.161789
         Authorities & Politics      0.160831
         Educational impact          0.118326
         Preventive measures         0.096161
         People stories              0.056719
         Vaccines and vaccination    0.048036
IE       Virus spreading             0.195870
         Economic impact             0.194974
         Cases and deaths            0.163955
         Preventive measures         0.155352
         Educational impact          0.096653


In [54]:
c9 = comments.groupby(['continent', 'country', 'news_id']) \
       .agg({'anger':'mean', 'sadness':'mean', 'optimism':'mean', 'joy':'mean', 'tweetId':'count'}) \
       .reset_index()

for col in ['anger', 'sadness', 'optimism', 'joy']:
    c9['lift_' + col] = c9[col] / g[col]

c9[['lift_'+col for col in ['anger', 'sadness', 'optimism', 'joy']]].max()

lift_anger       1.148388
lift_sadness     2.312080
lift_optimism    5.575347
lift_joy         5.817179
dtype: float64

In [94]:
c91.sort_values('anger').tail(5)

Unnamed: 0,continent,country,news_id,anger,sadness,optimism,joy,tweetId,lift_anger,lift_sadness,lift_optimism,lift_joy,username,displayname,userId,description,rawDescription,verified,created,followersCount,friendsCount,statusesCount,favouritesCount,listedCount,mediaCount,location,protected,linkUrl,linkTcourl,profileImageUrl,profileBannerUrl,media_outlet,media_category
72,America,US,2836421.0,0.60488,0.165092,0.127563,0.102465,635159,1.123209,0.849653,0.88679,0.830899,MSNBC,MSNBC,2836421.0,"The place for in-depth analysis, political com...","The place for in-depth analysis, political com...",True,2007-03-29 13:15:41+00:00,4219331,767,248262,818,28389,61955,,False,http://msnbc.com/live,https://t.co/YBwtJIhwY3,https://pbs.twimg.com/profile_images/132108981...,https://pbs.twimg.com/profile_banners/2836421/...,MSNBC,Television
103,America,US,457984599.0,0.610819,0.155204,0.11955,0.114424,182224,1.134237,0.798765,0.831091,0.92788,BreitbartNews,Breitbart News,457984599.0,"News, commentary, and destruction of the polit...","News, commentary, and destruction of the polit...",True,2012-01-08 01:50:52+00:00,1471106,110,145954,900,9202,3886,,False,http://breitbart.com,http://t.co/2sVbt3n6lO,https://pbs.twimg.com/profile_images/949270171...,https://pbs.twimg.com/profile_banners/45798459...,Breitbart News,Website
65,America,CA,373683618.0,0.617515,0.150138,0.1364,0.095944,21668,1.14667,0.77269,0.948225,0.778023,PnPCBC,Power & Politics,373683618.0,Breaking news and updates from the team at CBC...,Breaking news and updates from the team at CBC...,True,2011-09-15 01:25:56+00:00,222309,475,32699,1018,1793,9542,Ottawa,False,http://cbc.ca/powerandpolitics,https://t.co/mqaJv4m6Gx,https://pbs.twimg.com/profile_images/982295716...,https://pbs.twimg.com/profile_banners/37368361...,CBC,Television
70,America,US,1367531.0,0.618269,0.148134,0.125591,0.108006,199893,1.148071,0.762378,0.873081,0.875834,FoxNews,Fox News,1367531.0,"Follow America's #1 cable news network, delive...","Follow America's #1 cable news network, delive...",True,2007-03-17 19:01:26+00:00,20185319,260,425666,2,68612,138108,U.S.A.,False,http://www.foxnews.com,http://t.co/ZYG58XZtAC,https://pbs.twimg.com/profile_images/918480715...,https://pbs.twimg.com/profile_banners/1367531/...,Fox,Television
81,America,US,13850422.0,0.61844,0.152102,0.126491,0.102965,437591,1.148388,0.7828,0.879344,0.834956,CNNPolitics,CNN Politics,13850422.0,"Political news, campaign stories and Washingto...","Political news, campaign stories and Washingto...",True,2008-02-23 03:12:49+00:00,4136271,344,200996,4,20943,152532,"Washington, DC",False,http://cnn.com/politics,https://t.co/KWFMkrEjdY,https://pbs.twimg.com/profile_images/918899077...,https://pbs.twimg.com/profile_banners/13850422...,CNN,Television


In [91]:
nt[nt.userId==c91[c91.username.isin(c91.sort_values('anger').tail(5).username.values)].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.247222
Authorities & Politics      0.238889
Virus spreading             0.152778
Preventive measures         0.122222
Vaccines and vaccination    0.108333
People stories              0.055556
Educational impact          0.038889
Cases and deaths            0.036111
Name: theme_desc, dtype: float64

In [55]:
nt[nt.userId==c91[c91.username=='InsidersABC'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.297297
Authorities & Politics      0.267267
Preventive measures         0.102102
People stories              0.096096
Virus spreading             0.090090
Educational impact          0.081081
Cases and deaths            0.036036
Vaccines and vaccination    0.030030
Name: theme_desc, dtype: float64

In [56]:
nt[nt.userId==c91[c91.username=='BreitbartNews'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.235019
Authorities & Politics      0.201020
Virus spreading             0.178496
Preventive measures         0.131747
Educational impact          0.092648
Cases and deaths            0.084998
People stories              0.050999
Vaccines and vaccination    0.025074
Name: theme_desc, dtype: float64

In [57]:
nt[nt.userId==c91[c91.username=='PnPCBC'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.247222
Authorities & Politics      0.238889
Virus spreading             0.152778
Preventive measures         0.122222
Vaccines and vaccination    0.108333
People stories              0.055556
Educational impact          0.038889
Cases and deaths            0.036111
Name: theme_desc, dtype: float64

In [58]:
nt[nt.userId==c91[c91.username=='FoxNews'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.243740
Authorities & Politics      0.228715
Virus spreading             0.130217
Educational impact          0.095159
Preventive measures         0.093489
Cases and deaths            0.090150
Vaccines and vaccination    0.080134
People stories              0.038397
Name: theme_desc, dtype: float64

In [59]:
nt[nt.userId==c91[c91.username=='CNNPolitics'].userId.values[0]].theme_desc.value_counts(normalize=True)

Authorities & Politics      0.301386
Economic impact             0.277045
Virus spreading             0.143847
Preventive measures         0.072684
Educational impact          0.069473
Cases and deaths            0.067444
People stories              0.036849
Vaccines and vaccination    0.031271
Name: theme_desc, dtype: float64

In [60]:
c91.sort_values('sadness').tail(5).username

146         ABSCBNNews
48           CBCOttawa
126           fpjindia
120    NewIndianXpress
124          firstpost
Name: username, dtype: object

In [61]:
c91.sort_values('sadness').tail(5).description.values

array(["Stories, video, and multimedia for Filipinos worldwide, from ABS-CBN News and Current Affairs, the Philippines' most trusted news organization.",
       "Follow for breaking news, special reports and RTs of our journalists' latest work. Find us on Instagram.com/cbcottawanews or on Facebook.com/cbcottawa",
       'The voice of #Mumbai. The Free Press Journal is one of the oldest English daily newspapers from Mumbai with a heritage of more than 90 years.',
       'All the tweets for breaking news & views.\nFacebook: facebook.com/thenewindianxp…\n\nTelegram: t.me/thenewindianex…',
       'Incisive opinions, in-depth analysis and views that matter.'],
      dtype=object)

In [62]:
nt[nt.userId==c91[c91.username=='TelanganaToday'].userId.values[0]].theme_desc.value_counts(normalize=True)

Cases and deaths            0.257266
Virus spreading             0.207770
Economic impact             0.193094
Preventive measures         0.117602
Educational impact          0.082206
Authorities & Politics      0.068585
People stories              0.040096
Vaccines and vaccination    0.033381
Name: theme_desc, dtype: float64

In [63]:
nt[nt.userId==c91[c91.username=='CBCOttawa'].userId.values[0]].theme_desc.value_counts(normalize=True)

Authorities & Politics      0.245986
Cases and deaths            0.194381
Preventive measures         0.148222
Educational impact          0.122133
Economic impact             0.121273
Virus spreading             0.102351
Vaccines and vaccination    0.034690
People stories              0.030963
Name: theme_desc, dtype: float64

In [64]:
nt[nt.userId==c91[c91.username=='fpjindia'].userId.values[0]].theme_desc.value_counts(normalize=True)

Cases and deaths            0.298095
Virus spreading             0.161655
Economic impact             0.144752
Preventive measures         0.141763
Educational impact          0.086758
Authorities & Politics      0.076111
Vaccines and vaccination    0.052578
People stories              0.038289
Name: theme_desc, dtype: float64

In [65]:
nt[nt.userId==c91[c91.username=='NewIndianXpress'].userId.values[0]].theme_desc.value_counts(normalize=True)

Preventive measures         0.182353
Economic impact             0.181818
Virus spreading             0.181016
Cases and deaths            0.135027
Educational impact          0.109893
People stories              0.085027
Authorities & Politics      0.084225
Vaccines and vaccination    0.040642
Name: theme_desc, dtype: float64

In [66]:
nt[nt.userId==c91[c91.username=='firstpost'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.237861
Cases and deaths            0.187066
Preventive measures         0.158648
Virus spreading             0.140971
Authorities & Politics      0.092414
People stories              0.079436
Educational impact          0.059521
Vaccines and vaccination    0.044081
Name: theme_desc, dtype: float64

In [67]:
c91.sort_values('optimism').tail(5).username

150         ANCALERTS
144    manilabulletin
156          pnagovph
153    TheManilaTimes
147    BusinessMirror
Name: username, dtype: object

In [68]:
c91.sort_values('optimism').tail(5).description.values

array(['News updates & breaking news from the Philippines. Like our Facebook page:  facebook.com/ancalerts.',
       'Breaking news and stories from different sides. RTs from our journalists. Unparalleled journalism in the Philippines since 1900. #BeFullyInformed',
       'The official Twitter account of Philippine News Agency, the newswire service of the Philippine government.',
       'The Manila Times is one of the leading national daily broadsheets in the Philippines. It is one of the longest running having been founded in 1898.',
       'A broader look at today’s business'], dtype=object)

In [69]:
nt[nt.userId==c91[c91.username=='ANCALERTS'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.222869
Preventive measures         0.203236
Virus spreading             0.158960
Cases and deaths            0.146774
Educational impact          0.085438
Vaccines and vaccination    0.080563
Authorities & Politics      0.068174
People stories              0.033986
Name: theme_desc, dtype: float64

In [70]:
nt[nt.userId==c91[c91.username=='manilabulletin'].userId.values[0]].theme_desc.value_counts(normalize=True)

Preventive measures         0.201108
Cases and deaths            0.195007
Economic impact             0.185962
Virus spreading             0.174742
Authorities & Politics      0.078466
Vaccines and vaccination    0.067737
Educational impact          0.063179
People stories              0.033798
Name: theme_desc, dtype: float64

In [71]:
nt[nt.userId==c91[c91.username=='pnagovph'].userId.values[0]].theme_desc.value_counts(normalize=True)

Preventive measures         0.309050
Economic impact             0.214069
Virus spreading             0.126729
Cases and deaths            0.117244
Authorities & Politics      0.093927
Vaccines and vaccination    0.063365
Educational impact          0.050850
People stories              0.024766
Name: theme_desc, dtype: float64

In [72]:
nt[nt.userId==c91[c91.username=='TheManilaTimes'].userId.values[0]].theme_desc.value_counts(normalize=True)

Preventive measures         0.271564
Economic impact             0.252579
Virus spreading             0.139909
Cases and deaths            0.094305
Authorities & Politics      0.087908
Educational impact          0.061907
People stories              0.056954
Vaccines and vaccination    0.034874
Name: theme_desc, dtype: float64

In [73]:
nt[nt.userId==c91[c91.username=='BusinessMirror'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.326987
Preventive measures         0.208682
Virus spreading             0.118304
Authorities & Politics      0.085301
Cases and deaths            0.077177
Educational impact          0.071846
Vaccines and vaccination    0.057883
People stories              0.053821
Name: theme_desc, dtype: float64

In [74]:
c91.sort_values('joy').tail(5).username

129    airnewsalerts
130       DDNewslive
7         bellanaija
112          mid_day
182        BBCRadio4
Name: username, dtype: object

In [75]:
c91.sort_values('joy').tail(5).description.values

array(['Official account of News Services Division, All India Radio.\n\nहिंदी के समाचारों के लिए @AIRNewsHindi को फॉलो करें!\nFollow @AIRNewsUrdu for news updates in Urdu',
       'Official Twitter account of DD News, the Public Broadcaster of India. हिंदी में @DDNewsHindi. Follow us on- instagram.com/ddnews_official',
       'We LOVE Everything Fab & African - Fashion, Music, Movies & Weddings! Join us at BellaNaija.com | Twitter updates by the BN Squad.',
       'All things #MadeinMumbai \nNews | Entertainment | Sports and much more\nSubscribe to #MiddayDigitalTabloid\n\nepaper.mid-day.com',
       'Your friendly lockdown companion - documentaries, politics, news, comedy and drama.'],
      dtype=object)

In [76]:
nt[nt.userId==c91[c91.username=='DDNewslive'].userId.values[0]].theme_desc.value_counts(normalize=True)

Cases and deaths            0.243374
Authorities & Politics      0.207822
Economic impact             0.145120
Virus spreading             0.142211
Preventive measures         0.129929
Vaccines and vaccination    0.056561
Educational impact          0.045249
People stories              0.029735
Name: theme_desc, dtype: float64

In [77]:
nt[nt.userId==c91[c91.username=='bellanaija'].userId.values[0]].theme_desc.value_counts(normalize=True)

Economic impact             0.285523
Authorities & Politics      0.159517
Virus spreading             0.155496
Educational impact          0.112601
Preventive measures         0.109920
Cases and deaths            0.092493
People stories              0.075067
Vaccines and vaccination    0.009383
Name: theme_desc, dtype: float64

In [78]:
nt[nt.userId==c91[c91.username=='mid_day'].userId.values[0]].theme_desc.value_counts(normalize=True)

Cases and deaths            0.206660
Virus spreading             0.176665
Preventive measures         0.176411
Economic impact             0.157855
Educational impact          0.116167
Authorities & Politics      0.073462
People stories              0.060244
Vaccines and vaccination    0.032537
Name: theme_desc, dtype: float64

# Trunk0

In [95]:
nt[nt.userId==c91[c91.username=='RadioCitizenFM'].userId.values[0]].theme_desc.value_counts(normalize=True)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
nt[nt.userId==c91[c91.username=='BBCRadio4'].userId.values[0]].theme_desc.value_counts(normalize=True)

In [None]:
c91['politic'] = c91.description.apply(lambda x: 'politic' in x.lower())
c91['live_or_break'] = c91.description.apply(lambda x: 'live' in x.lower() or 'break' in x.lower())

In [None]:
c91[c91.politic].anger.mean() / c91[~c91.politic].anger.mean()

In [None]:
c91[c91.continent=='America'].anger.mean() / c91[~(c91.continent=='America')].anger.mean()

In [None]:
c91[c91.live_or_break].sadness.mean() / c91[~c91.live_or_break].sadness.mean()

In [None]:
c91[c91.live_or_break]

In [None]:
c91.sort_values('anger').tail(5).description.values

In [None]:
c91.sort_values('optimism')

In [None]:
c91.sort_values('sadness')

In [None]:
c91.sort_values('joy')

In [None]:
nt[nt.userId==23937508.0].sort_values('likeCount').tail(25).content.values

In [None]:
nt[nt.userId==23937508.0].sort_values('likeCount').tail(25)

In [None]:
nt[nt.userId==23937508.0].prevalent_emotion.value_counts(normalize=True)

In [None]:
c9.iloc[201,:]

In [None]:
news_accounts[news_accounts.userId==17644834.0]

In [None]:
comments.query("news_id==17644834.0").news_prevalent_emotion.value_counts()

In [None]:
comments.query("news_id==17644834.0").prevalent_emotion.value_counts()

In [None]:
news_tweets.query("userId==17644834.0 and prevalent_emotion=='optimism'").content.values

In [None]:
comments.query("news_id==17644834.0")

In [None]:
c9.iloc[234,:]

In [None]:
comments.query("news_id==23937508.0").news_prevalent_emotion.value_counts()

In [None]:
comments.query("news_id==23937508.0").prevalent_emotion.value_counts()

In [None]:
comments.query("news_id==23937508.0")

In [None]:
news_accounts[news_accounts.userId==23937508.0]

# Likes, replies and shares related with emotions

In [None]:
comments.columns

In [80]:
comments.groupby('prevalent_emotion')[['replyCount', 'retweetCount', 'likeCount', 'quoteCount']].mean()

Unnamed: 0_level_0,replyCount,retweetCount,likeCount,quoteCount
prevalent_emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
anger,0.401236,0.181391,2.172032,0.042514
joy,0.286419,0.186197,1.954152,0.064334
optimism,0.466677,0.388864,2.852092,0.210058
sadness,0.500163,0.524447,2.862427,0.14367
undefined,0.453811,0.404617,2.665493,0.15336


In [81]:
comments.groupby(['news_prevalent_emotion','prevalent_emotion']).tweetId.count()

news_prevalent_emotion  prevalent_emotion
anger                   anger                2030624
                        joy                   265385
                        optimism              223519
                        sadness               255259
                        undefined             339943
joy                     anger                 346331
                        joy                   135494
                        optimism               67078
                        sadness                91242
                        undefined              98946
optimism                anger                1402380
                        joy                   281880
                        optimism              317402
                        sadness               293520
                        undefined             384711
sadness                 anger                3592959
                        joy                   658633
                        optimism              621955
    

# Trunk

In [None]:
theme_desc[theme_desc.theme==4].theme_desc.values[0]

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = 1145364
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = 377531
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = 497696 
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k = np.random.randint(1,news_tweets.shape[0],1)[0]
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
for _ in range(50):
    k = np.random.randint(1,news_tweets.shape[0],1)[0]
    print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
k=0
print('k:', k, '| Tweet:', news_tweets.iloc[k,:].content, '| Theme:', theme_desc[theme_desc.theme==news_tweets.iloc[k,:].theme].theme_desc.values[0])

In [None]:
news_tweets.iloc[k,:]

In [None]:
topics[topics.tweetId==1331811812828803073]

In [None]:
theme_desc

In [None]:
topics = pd.read_parquet('./news_tweets_topics2.parquet')

In [None]:
news_tweets[news_tweets.tweetId==1331811812828803073].content.values

In [None]:
news_tweets.iloc[k,:]

In [None]:
k

In [None]:
news_tweets = pd.read_parquet('./../data/raw/news_tweets_with_em_scores.parquet')
news_tweets['emotion_score'] = news_tweets[['anger','joy','optimism','sadness']].max(axis=1)
news_tweets['prevalent_emotion'] = news_tweets[['anger','joy','optimism','sadness']].idxmax(axis=1)
news_tweets['prevalent_emotion'] = np.where(news_tweets.emotion_score>0.5, news_tweets.prevalent_emotion, 'undefined')

comments = comments[comments.tweetId.isin(plain_comments.tweetId)]
topics['tweetId'] = topics.tweetId.astype('float64')
news_tweets = news_tweets[~news_tweets.conversationId.isnull()].merge(topics[['theme'] + subthemes + ['tweetId']], how='left', on='tweetId')
news_accounts = news_accounts.merge(media_list, left_on='username', right_on='account')
news_accounts['userId'] = news_accounts.userId.astype('float64')
news_tweets = news_tweets.merge(news_accounts[['userId','country']], how='left')

# Define subtheme
news_tweets['subtheme'] = news_tweets[subthemes].idxmax(axis=1)
news_tweets['subtheme'] = news_tweets.subtheme.apply(lambda x: int(x[-2:]))
news_tweets['aux'] = news_tweets[subthemes].sum(axis=1)
news_tweets.loc[news_tweets.aux==0,'subtheme'] = news_tweets.loc[news_tweets.aux==0,'theme']*10
news_tweets = news_tweets.drop(subthemes + ['aux'], axis=1)
news_tweets['ds'] = news_tweets.date.dt.date

df = comments.merge(news_tweets.rename(columns={'date':'newsDate', 'userId':'newsId'})[[
    'conversationId', 'newsDate', 'newsId', 'prevalent_emotion', 'emotion_score', 'theme', 'subtheme', 'country'
]].drop_duplicates('conversationId'))
df['ds'] = (df['newsDate'] + pd.offsets.Week(weekday=6)).dt.date

In [None]:
df.groupby(['country', 'theme']).theme.count().rename('count').reset_index()

In [None]:
# import libraries
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# load dataset
tips = sns.load_dataset("tips")

# set the figure size
plt.figure(figsize=(14, 14))

# from raw value to percentage
total = tips.groupby('day')['total_bill'].sum().reset_index()
smoker = tips[tips.smoker=='Yes'].groupby('day')['total_bill'].sum().reset_index()
smoker['total_bill'] = [i / j * 100 for i,j in zip(smoker['total_bill'], total['total_bill'])]
total['total_bill'] = [i / j * 100 for i,j in zip(total['total_bill'], total['total_bill'])]

# bar chart 1 -> top bars (group of 'smoker=No')
bar1 = sns.barplot(x="day",  y="total_bill", data=total, color='darkblue')

# bar chart 2 -> bottom bars (group of 'smoker=Yes')
bar2 = sns.barplot(x="day", y="total_bill", data=smoker, color='lightblue')

# add legend
top_bar = mpatches.Patch(color='darkblue', label='smoker = No')
bottom_bar = mpatches.Patch(color='lightblue', label='smoker = Yes')
plt.legend(handles=[top_bar, bottom_bar])

# show the graph
plt.show()

# Rosie UK data

In [None]:
comments = comments.merge(news_tweets_with_em.rename(columns={'anger':'news_anger', 'sadness':'news_sadness', 'optimism':'news_optimism', 'joy':'news_joy'}).sort_values('date').drop_duplicates('conversationId', keep='first')[['conversationId', 'news_anger', 'news_sadness', 'news_optimism', 'news_joy']], how='left')
for i, g in df3[df3.newsName.isin(['BBC News (UK)', 'Daily Mail Online', 'The Guardian'])].groupby('newsName'):
    g.to_csv('./uk_users/{}.csv'.format(i), header=True, index_label=False, index=False)

# Google Data Studio

In [None]:
c2 = comments.merge(news_tweets_with_em.rename(columns={'anger':'news_anger', 'sadness':'news_sadness', 'optimism':'news_optimism', 'joy':'news_joy'}).sort_values('date').drop_duplicates('conversationId', keep='first')[['conversationId', 'news_anger', 'news_sadness', 'news_optimism', 'news_joy']], how='left')

In [None]:
c2[c2.country=='UK'].merge(theme_desc).merge(subtheme_desc).merge(news_accounts[['userId', 'displayname']].rename(columns={'userId':'newsId', 'displayname':'newsName'}), how='left').drop(['theme', 'subtheme', 'newsId'], axis=1)

In [None]:
df3 = df[df.country=='UK'].merge(theme_desc).merge(subtheme_desc).merge(news_accounts[['userId', 'displayname']].rename(columns={'userId':'newsId', 'displayname':'newsName'}), how='left').drop(['theme', 'subtheme', 'newsId'], axis=1)

In [None]:
c2

In [None]:
comments.news_emotion_score.isnull().sum()

In [None]:
comments

In [None]:
nt = pd.read_parquet('./../data/raw/news_tweets.parquet')

In [None]:
news_tweets['emotion_score'] = news_tweets[['anger','joy','optimism','sadness']].max(axis=1)
news_tweets['prevalent_emotion'] = news_tweets[['anger','joy','optimism','sadness']].idxmax(axis=1)
news_tweets['prevalent_emotion'] = np.where(news_tweets.emotion_score>0.5, news_tweets.prevalent_emotion, 'undefined')

comments = comments[comments.tweetId.isin(plain_comments.tweetId)]
topics['tweetId'] = topics.tweetId.astype('float64')
news_tweets = news_tweets[~news_tweets.conversationId.isnull()].merge(topics[['theme'] + subthemes + ['tweetId']], how='left', on='tweetId')
news_accounts = news_accounts.merge(media_list, left_on='username', right_on='account')
news_accounts['userId'] = news_accounts.userId.astype('float64')
news_tweets = news_tweets.merge(news_accounts[['userId','country']], how='left')

# Define subtheme
news_tweets['subtheme'] = news_tweets[subthemes].idxmax(axis=1)
news_tweets['subtheme'] = news_tweets.subtheme.apply(lambda x: int(x[-2:]))
news_tweets['aux'] = news_tweets[subthemes].sum(axis=1)
news_tweets.loc[news_tweets.aux==0,'subtheme'] = news_tweets.loc[news_tweets.aux==0,'theme']*10
news_tweets = news_tweets.drop(subthemes + ['aux'], axis=1)
news_tweets['ds'] = news_tweets.date.dt.date

In [None]:
df = comments.merge(news_tweets.rename(columns={'date':'newsDate', 'userId':'newsId', 'anger':'news_anger', 'sadness':'news_sadness', 'joy':'news_joy', 'optimism':'news_optimism'})[[
    'conversationId', 'newsDate', 'newsId', 'prevalent_emotion', 'emotion_score', 'theme', 'subtheme', 'country', 'news_anger', 'news_joy', 'news_sadness', 'news_optimism'
]].drop_duplicates('conversationId'))
df['ds'] = df.newsDate.dt.date

In [None]:
df3 = df[df.country=='UK'].merge(theme_desc).merge(subtheme_desc).merge(news_accounts[['userId', 'displayname']].rename(columns={'userId':'newsId', 'displayname':'newsName'}), how='left').drop(['theme', 'subtheme', 'newsId'], axis=1)

In [None]:
for i, g in df3[df3.newsName.isin(['BBC News (UK)', 'Daily Mail Online', 'The Guardian'])].groupby('newsName'):
    g.to_csv('./uk_users/{}.csv'.format(i), header=True, index_label=False, index=False)

In [None]:
news_tweets.rename(columns={'userId':'newsId'})

In [None]:
nt = news_tweets.rename(columns={'userId':'newsId'}).groupby(['ds', 'theme', 'subtheme', 'country', 'newsId']) \
       .agg({'content':'size', 'anger':'mean', 'joy':'mean', 'optimism':'mean', 'sadness':'mean', 'replyCount':'mean', 'retweetCount':'mean', 'likeCount':'mean', 'quoteCount':'mean'}) \
       .rename(columns={'content':'count'}) \
       .reset_index()
nt = nt.merge(theme_desc).merge(subtheme_desc)
nt = nt.drop(['theme_desc', 'subtheme_desc'], axis=1)
nt['newsId'] = nt.newsId.astype('int64')
nt.to_csv('agg_news_tweets.csv', index=False)

In [None]:
nt.columns

In [None]:
for col in ['replyCount', 'retweetCount', 'likeCount','quoteCount']:
    nt[col] = nt[col].astype('int32')

In [None]:
nt

In [None]:
nt.to_csv('agg_news_tweets.csv', index=False)

In [None]:
df['emotion_score'] = df[['anger','joy','optimism','sadness']].max(axis=1)
df['comment_emotion'] = df[['anger','joy','optimism','sadness']].idxmax(axis=1)
df['comment_emotion'] = np.where(df.emotion_score>0.5, df.comment_emotion, 'undefined')

In [None]:
df2 = df.groupby(['ds', 'theme', 'subtheme', 'country', 'newsId', 'prevalent_emotion']) \
       .agg({'content':'size', 'anger':'mean', 'joy':'mean', 'optimism':'mean', 'sadness':'mean', 'replyCount':'mean', 'retweetCount':'mean', 'likeCount':'mean', 'quoteCount':'mean'}) \
       .rename(columns={'content':'count'}) \
       .reset_index()
df2 = df2.merge(theme_desc).merge(subtheme_desc)
df2 = df2.drop(['theme_desc', 'subtheme_desc'], axis=1)
df2['newsId'] = df2.newsId.astype('int64')

In [None]:
df2[
    ~(df2.anger.isnull()) & ~(df2.joy.isnull()) & ~(df2.optimism.isnull()) & ~(df2.sadness.isnull())
].to_csv('agg_users_tweets.csv', index=False)

In [None]:
news_accounts['userId'] = news_accounts.userId.astype('int64')
news_accounts[['country', 'userId', 'displayname']].to_csv('lookup_news.csv', index=False)

In [None]:
df2

In [None]:
df2.dtypes

In [None]:
emotion_id = pd.DataFrame({'news_emotion_id':range(5),'prevalent_emotion':list(df2.prevalent_emotion.unique())})

In [None]:
country_id = pd.DataFrame({'country_id':range(12),'country':list(df2.country.unique())})

In [None]:
df2 = df2.merge(theme_desc).merge(subtheme_desc).merge(emotion_id).merge(country_id)
df2 = df2.drop(['country', 'prevalent_emotion', 'theme_desc', 'subtheme_desc'], axis=1)

In [None]:
df2.to_csv('emotion_per_theme.csv', index=False)
country_id.to_csv('lookup_coutry.csv', index=False)

In [None]:
df2[
    ~(df2.anger.isnull()) & ~(df2.joy.isnull()) & ~(df2.optimism.isnull()) & ~(df2.sadness.isnull())
].to_csv('agg_users_tweets.csv', index=False)

In [None]:
df2.isnull().sum()

In [None]:
df2[df2.anger.isnull()]

In [None]:
for t in [1,2,3,4,5,6,7,8]:
#for st in [11,12,21,22,41,42,43,44,51,52,53,61,62,63,64,65,66,71,72,73,74]:
    fig, axs = plt.subplots(figsize=(15, 6))
    #df = comments[comments.conversationId.isin(news_tweets[news_tweets.tweetId.isin(topics[topics[fr"subtheme{st}"]].tweetId)].conversationId)]
    df = comments[comments.conversationId.isin(news_tweets[news_tweets.tweetId.isin(topics[topics.theme==t].tweetId)].conversationId)]
    df = df.merge(news_tweets[~news_tweets.conversationId.isnull()].rename(columns={'date':'news_date'})[['conversationId', 'news_date']].drop_duplicates('conversationId'))
    #df['ds'] = pd.to_datetime(df.date.dt.year.astype(str) + '-' + df.date.dt.month.astype(str) + '-1')
    df['ds'] = df.news_date.dt.date
    df = df[['ds', 'anger', 'joy', 'optimism', 'sadness']]
    df = df.groupby('ds')[['anger', 'joy', 'optimism', 'sadness']].mean()
    df.plot.line(ax=axs)
    axs.set_xlabel("datetime")
    axs.set_ylabel("mean score per emotion")
    fig.savefig(fr"theme_monthly_{t}.png")

In [None]:
df = news_tweets[news_tweets.tweetId.isin(news_tweets_topics[news_tweets_topics.theme==8].tweetId)].copy()
df['ds'] = df.date.dt.date
df = df[['ds', 'anger', 'joy', 'optimism', 'sadness']]
df = df.groupby('ds')[['anger', 'joy', 'optimism', 'sadness']].mean()
df.plot.line(figsize=(15, 6))

In [None]:
df

In [None]:
df = df[['date', 'anger', 'joy', 'optimism', 'sadness']]
df.set_index('date').plot.line()

In [None]:
news_tweets_topics = pd.read_parquet('./news_tweets_topics.parquet')

In [None]:
news_tweets_topics[news_tweets_topics.topic_11 == tweets.iloc[14,:].topic_11]

In [None]:
news_tweets[news_tweets.tweetId==1315975830032592896]

In [None]:
news_tweets_topics[news_tweets_topics.tweetId==1315975830032592896]

In [None]:
comments[comments.conversationId==1315975830032592896]

In [None]:
tweets[tweets.conversationId==1315975830032592896]

In [None]:
tweets[(tweets.topic_11==tweets.iloc[14,:].topic_11)].head()

In [None]:
tweets[~tweets.topic_0.isnull()].head()

In [None]:
tweets.iloc[14,:].topic_11

In [None]:
tweets.shape

In [None]:
news_tweets.userId.nunique()

In [None]:
topics.topic.unique()

In [None]:
tweets = pd.read_parquet('./../data/raw/news_tweets.parquet').sample(frac=1, random_state=3)
tweets = tweets.reset_index(drop=True)

In [None]:
df = pd.read_csv('tweets_topics2.csv')

In [None]:
tweets['topic'] = df.Dominant_Topic