# Extracting keywords from tweets

Calculate the sentiment score of the tweets using lexicon-based models VADER and TextBlob. Both are constructed from a generalizable, valence-based, human-curated gold standard sentiment lexicon.

## Set up

In [1]:
import os
import re
import string
import pandas as pd
import numpy as np

from emot.emo_unicode import UNICODE_EMO, EMOTICONS

In [2]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)

/Users/alessiatosi/DS_projects/behavioural-sci-perception/venv/bin/python
3.8.1 (default, Apr  8 2020, 10:42:19) 
[Clang 11.0.0 (clang-1100.0.33.17)]
sys.version_info(major=3, minor=8, micro=1, releaselevel='final', serial=0)


In [3]:
%load_ext autoreload
from src.preproc_text import *
from src.utils import chain_functions
from src.analyse_text import get_sentiment_score_VDR, get_sentiment_score_TB

In [4]:
%reload_ext autoreload

In [5]:
os.getcwd()

'/Users/alessiatosi/DS_projects/behavioural-sci-perception/notebooks'

In [6]:
pd.options.display.max_seq_items = 10000
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

Environment variables and constants

In [7]:
DATA_DIR = os.environ.get("DIR_DATA_INTERIM")

In [8]:
FILENAME = "tweets_original_en"

### Dictionary of keywords

In [21]:
keywords_dict = {
    "thaler" : "Actors",
    "sunstein" : "Actors",
    "kahneman" : "Actors",
    "halpern" : "Actors",
    "michie" : "Actors",
    "chater" : "Actors",
    "spi-b" : "Actors",
    "spib" : "Actors",
    "nudge unit" : "Actors",
    "nudgeunit" : "Actors",
    "behavioural insights team" : "Actors",
    "behaviouralinsightsteam" : "Actors",

    "behavioural science" : "BehavSci",
    "behavioural sciences" : "BehavSci",
    "behavioural scientist" : "BehavSci",
    "behavioural scientists" : "BehavSci",
    "behaviouralscience" : "BehavSci",
    "behaviouralsciences" : "BehavSci",
    "behaviouralscientist" : "BehavSci",
    "behaviouralscientists" : "BehavSci",

    "behavioural insight" : "Behav_ins",
    "behavioural insights": "Behav_ins",
    "behaviouralinsight" : "Behav_ins",
    "behaviouralinsights": "Behav_ins",

    "behaviour change" : "Behav_chan",
    "behavioural change" : "Behav_chan",
    "behaviourchange" : "Behav_chan",
    "behaviouralchange" : "Behav_chan",

    "behavioural policy": "Behav_pol",
    "behaviouralpolicy": "Behav_pol",

    "behavioural analysis": "Behav_anal",
    "behavioural analyst": "Behav_anal",
    "behavioural analysts": "Behav_anal",
    "behaviouralanalysis": "Behav_anal",
    "behaviouralanalyst": "Behav_anal",
    "behaviouralanalysts": "Behav_anal",

    "psychologists": "Psych",
    "psychologist": "Psych",
    "psychology": "Psych",
    "psychological science": "Psych",
    "psychological policy": "Psych",
    "psychologicalscience": "Psych",
    "psychologicalpolicy": "Psych",

    "behavioural economics": "Econ_behav",
    "behavioural economist": "Econ_behav",
    "behavioural economists": "Econ_behav",
    "behaviouraleconomics": "Econ_behav",
    "behaviouraleconomist": "Econ_behav",
    "behaviouraleconomists": "Econ_behav",

    "irrational behaviour": "Econ_irrational",
    "irrational choice": "Econ_irrational",
    "irrational choices": "Econ_irrational",
    "irrationalbehaviour": "Econ_irrational",
    "irrationalchoice": "Econ_irrational",
    "irrationalchoices": "Econ_irrational",

    "nudges": "Nudge",
    "nudge": "Nudge",
    "nudging": "Nudge",
    "nudge theory": "Nudge",
    "nudge strategy": "Nudge",
    "nudgetheory": "Nudge",
    "nudgestrategy": "Nudge",

    "choice architecture": "Nudge_choice",
    "choice architect": "Nudge_choice",
    "choicearchitecture": "Nudge_choice",
    "choicearchitect": "Nudge_choice",

    "paternalism": "Nudge_pater",
    "libertarian paternalism": "Nudge_pater",
    "libertarianpaternalism": "Nudge_pater",
    "paternalistic": "Nudge_pater",

    "pandemic": "Covid",
    "epidemic": "Covid",
    "corona": "Covid",
    "coronavirus": "Covid",
    "covid": "Covid",
    "covid-19": "Covid",
    "covid19": "Covid",
    "health emergency": "Covid",
    "healthemergency": "Covid",

    "behavioural fatigue": "Fatigue",
    "behaviouralfatigue": "Fatigue",

    "herd immunity": "Immunity",
    "herdimmunity": "Immunity"

}

## Get Data

In [9]:
tweets_df = pd.read_csv(os.path.join(DATA_DIR, FILENAME + '.csv'))

In [10]:
tweets_df.shape

(3107, 18)

# Quick glimpse

In [18]:
tweets_df.head(10)

Unnamed: 0.1,Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone
0,5,Tue Apr 14 18:59:09 +0000 2020,NudgeUnit coronavirus HerdImmunity,0,1250136530238267392,,,en,,False,0,"""The introduction of the rules of behavior taken from the corporate sector into politics means that politicians no longer see people whom they rule as co-citizens but as employees."" – @BrankoMilan …sounds familiar UK? #NudgeUnit #coronavirus #HerdImmunity https://t.co/vABII5Y2jB",38one,784,UK,Denis Radenković,38one,
1,6,Sun Apr 19 07:41:53 +0000 2020,rotary handwashing behaviourchange handhygieneforhealth SPATAP COVID19,0,1251778026603294720,,,en,,False,0,@chprotary 🖐️🚰 #rotary #handwashing #behaviourchange project #handhygieneforhealth WATCH VIDEO https://t.co/uKEEDPh8e0 https://t.co/HtUOzsVYKT with #SPATAP Portable Tap making communities hygienic instantly https://t.co/7X3H0SlUB7 info@handhygieneforhealth.org #COVID19 https://t.co/0PkpOJsC5Q,RotaryHHFH,98,"Sunshine Coast, Queensland",HandHygieneForHealth,RotaryHHFH,
2,14,Tue Apr 07 21:48:18 +0000 2020,,2,1247642382293561345,,,en,,False,0,"""Of course, should behaviour change as governments would wish, actual outcomes will be less terrifying than the models had originally forecast. This is not evidence that the policy was unnecessary: Rather, it is evidence that it worked."" https://t.co/b5jKA0D6r7",FairDuty,496,"Treaty Six - Edmonton, Canada",Meera Nair,FairDuty,
3,17,Sat Mar 14 19:34:45 +0000 2020,coronavirus,10,1238911465303801856,,,en,,False,2,"My cartoon - he doesn’t trust doctors, he wants to see a behavioural scientist \n#coronavirus https://t.co/BLCsrb88cb",MartinShovel,13786,United Kingdom,Martin Shovel,MartinShovel,
4,26,Thu Mar 12 14:40:29 +0000 2020,,0,1238112632969801728,,,en,,False,0,So it seems clear that UK strategy is to let virus spread to achieve herd immunity and to try and protect vulnerable by ... cocooning elderly people? This doctor is a psychologist https://t.co/wbBIjLeOt2,Hippoclides,2109,Dublin 12,Michael O'Sullivan,Hippoclides,
5,34,Fri May 29 23:22:25 +0000 2020,,6,1266510234274672645,,,en,,False,4,Love it.👏👏👏👏 https://t.co/RjOEc3DYPw,trulyScotpics,869,"Scotland, Scotland",Focus On Scotland,trulyScotpics,
6,35,Thu Mar 19 19:58:53 +0000 2020,behaviouralscience nudgesinthewild COVID19,0,1240729478088790016,1.240444e+18,shayonislynn,en,,False,4,RT @shayonislynn: Using #behaviouralscience to improve behaviours #nudgesinthewild #COVID19 https://t.co/OyIDElZwR7,DonSmith_ca,300,"Toronto, ON",Don Smith 🇨🇦,DonSmith_ca,
7,37,Mon May 04 12:47:08 +0000 2020,covid19 lockdown,0,1257290666784235520,,,en,,False,1,"What role will behavioural science play in lifting the lockdown in the UK? Oxera Senior Consultants Leon Fields and Tim Hogg and Senior Adviser Peter Andrews explore underlying questions of nudging, empiricism and compliance here: https://t.co/2ZMVZTSCiV #covid19 #lockdown",OxeraConsulting,1503,"Amsterdam, Berlin, Brussels, London, Oxford, Paris, Rome",Oxera Consulting LLP,OxeraConsulting,
8,39,Wed May 06 14:28:43 +0000 2020,COVID19 KomeshaCorona coronaviruskenya,4,1258041006223765505,,,en,,False,1,"Behaviour Change in the context of #COVID19 , will the tried and tested still hold?. Join @AfricaSBC on 13/05/2020 for an indepth discussion on behaviour change in the context of COVID-19. Click the link to join- https://t.co/dqLFxF6ZHS #KomeshaCorona #coronaviruskenya https://t.co/dv6BcOXL99",TunzaHealth,83,,Tunza Health Network,TunzaHealth,
9,40,Thu Apr 09 20:45:48 +0000 2020,Winnipeg UpToSpeed Manitoba coronavirus COVID19 psychology mentalhealth behaviouralscience,3,1248351427644325890,,,en,,,0,"Hey #Winnipeg!! Looking forward to joining @CBCIsmaila on today's #UpToSpeed to talk isolation, stress, and how to safeguard your mental health in these unprecedented times. @CBCManitoba #Manitoba #coronavirus #COVID19 #psychology #mentalhealth #behaviouralscience @senecacollege",lcava,1082,Toronto,Laura Cavanagh,lcava,


## Are there still duplicates?

Looks like there are still duplicates in the dataset that we need to get rid of. Consider re-teweet counts when doing do.

In [12]:
# Find a duplicate texts
duplicate_tweets = tweets_df[tweets_df.duplicated(['text'])]
print(duplicate_tweets[['favorite_count', 'retweet_count', 'text']])

      favorite_count  retweet_count  \
1212               0              1   
1520               0              2   
1851               1              0   
2057               0              0   
2122               0              2   
2323               2              1   
2709               0              0   
2810               1              1   
2812               1              0   
2997               0              0   

                                                                                                                                                                                                                                                                               text  
1212                                                     The deployment of behavioural science is a feature of the public health response to the Covid-19 pandemic both internationally and in Ireland, https://t.co/o16Wjg5BRd #COVID19ireland #COVID19 #nudge #behaviouralscience  
1520  "It is c

Apparently they are all duplicates of one single tweets.

We will keep the one with the largest count of "favourites". 

In [13]:
duplicate_tweets[duplicate_tweets.favorite_count == max(duplicate_tweets.favorite_count)].index

Int64Index([2323], dtype='int64')

In [14]:
# get index
duplicate_tweets_index = duplicate_tweets[duplicate_tweets.favorite_count != 
                                          max(duplicate_tweets.favorite_count)].index

In [15]:
duplicate_tweets_index

Int64Index([1212, 1520, 1851, 2057, 2122, 2709, 2810, 2812, 2997], dtype='int64')

In [16]:
tweets_df = tweets_df.drop(duplicate_tweets_index, axis=0).copy()

In [17]:
tweets_df.shape

(3098, 18)

## Extract keyword occurrences

From each tweets, we extract the keywords that appear in the tweets

#### Lower text case

In [24]:
tweets_df['text_cl'] = [t.lower() for t in tweets_df.text]

### Extract keywords

In [None]:
# test
[[kword for kword in keywords_dict.keys() if kword in tweet] for 
 tweet in ['UK? #nudgeunit #coronavirus #herdimmunity', 'should behaviour change as governments would wish']]

In [41]:
tweets_df['subkeywords'] = [[kword for kword in keywords_dict.keys() if kword in tweet] for tweet in tweets_df.text_cl]

In [42]:
# take a look
tweets_df[['text_cl', 'subkeywords', 'favorite_count', 'retweet_count']]

Unnamed: 0,text_cl,subkeywords,favorite_count,retweet_count
0,"""the introduction of the rules of behavior taken from the corporate sector into politics means that politicians no longer see people whom they rule as co-citizens but as employees."" – @brankomilan …sounds familiar uk? #nudgeunit #coronavirus #herdimmunity https://t.co/vabii5y2jb","[nudgeunit, nudge, corona, coronavirus, herdimmunity]",0,0
1,@chprotary 🖐️🚰 #rotary #handwashing #behaviourchange project #handhygieneforhealth watch video https://t.co/ukeedph8e0 https://t.co/htuozsvykt with #spatap portable tap making communities hygienic instantly https://t.co/7x3h0slub7 info@handhygieneforhealth.org #covid19 https://t.co/0pkpojsc5q,"[behaviourchange, covid, covid19]",0,0
2,"""of course, should behaviour change as governments would wish, actual outcomes will be less terrifying than the models had originally forecast. this is not evidence that the policy was unnecessary: rather, it is evidence that it worked."" https://t.co/b5jka0d6r7",[behaviour change],2,0
3,"my cartoon - he doesn’t trust doctors, he wants to see a behavioural scientist \n#coronavirus https://t.co/blcsrb88cb","[behavioural scientist, corona, coronavirus]",10,2
4,so it seems clear that uk strategy is to let virus spread to achieve herd immunity and to try and protect vulnerable by ... cocooning elderly people? this doctor is a psychologist https://t.co/wbbijleot2,"[psychologist, herd immunity]",0,0
5,love it.👏👏👏👏 https://t.co/rjoec3dypw,[],6,4
6,rt @shayonislynn: using #behaviouralscience to improve behaviours #nudgesinthewild #covid19 https://t.co/oyidelzwr7,"[behaviouralscience, nudges, nudge, covid, covid19]",0,4
7,"what role will behavioural science play in lifting the lockdown in the uk? oxera senior consultants leon fields and tim hogg and senior adviser peter andrews explore underlying questions of nudging, empiricism and compliance here: https://t.co/2zmvztsciv #covid19 #lockdown","[behavioural science, nudging, covid, covid19]",0,1
8,"behaviour change in the context of #covid19 , will the tried and tested still hold?. join @africasbc on 13/05/2020 for an indepth discussion on behaviour change in the context of covid-19. click the link to join- https://t.co/dqlfxf6zhs #komeshacorona #coronaviruskenya https://t.co/dv6bcoxl99","[behaviour change, corona, coronavirus, covid, covid-19, covid19]",4,1
9,"hey #winnipeg!! looking forward to joining @cbcismaila on today's #uptospeed to talk isolation, stress, and how to safeguard your mental health in these unprecedented times. @cbcmanitoba #manitoba #coronavirus #covid19 #psychology #mentalhealth #behaviouralscience @senecacollege","[behaviouralscience, psychology, corona, coronavirus, covid, covid19]",3,0


## Tweets without a keyword mentioned

These are cases where the keywords is mentioned in another tweet linked from the current tweets

### How many are they?

In [45]:
len([l for l in tweets_df.subkeywords if len(l) == 0])

462

In [48]:
no_kword_tweets_df = tweets_df[tweets_df['subkeywords'].map(lambda d: len(d)) == 0]

In [49]:
no_kword_tweets_df

Unnamed: 0.1,Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,...,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone,text_cl,keywords,subkeywords
5,34,Fri May 29 23:22:25 +0000 2020,,6,1266510234274672645,,,en,,False,...,Love it.👏👏👏👏 https://t.co/RjOEc3DYPw,trulyScotpics,869,"Scotland, Scotland",Focus On Scotland,trulyScotpics,,love it.👏👏👏👏 https://t.co/rjoec3dypw,[],[]
12,54,Fri Mar 20 12:20:06 +0000 2020,,0,1240976408563908608,,,en,,False,...,WOO Join us in 15 mins time. &lt;3 Facebook Live for anyone who cannot get into Zoom x https://t.co/VbnsYXyDkg,TaraECooper,128,London,Tara Cooper,TaraECooper,,woo join us in 15 mins time. &lt;3 facebook live for anyone who cannot get into zoom x https://t.co/vbnsyxydkg,[],[]
26,116,Wed Apr 15 16:12:41 +0000 2020,,0,1250457025005457410,,,en,,False,...,"I'd add the fourth one: don't let the policy responses be influenced by the powerful, conservative religious lobbies. https://t.co/WPmGY3kVIz",tomdrabowicz,1150,"Lodz, Poland",Tomasz Drabowicz,tomdrabowicz,,"i'd add the fourth one: don't let the policy responses be influenced by the powerful, conservative religious lobbies. https://t.co/wpmgy3kviz",[],[]
34,148,Mon Mar 30 10:47:19 +0000 2020,,0,1244576935373467648,,,en,,False,...,It Bqs https://t.co/CUnK5fUgkE,JhaAmit91,212,"Faridabad, India",Amit Kumar Jha,JhaAmit91,,it bqs https://t.co/cunk5fugke,[],[]
38,159,Tue May 12 16:00:51 +0000 2020,,2,1260238520284839937,,,en,"Wandsworth, London",False,...,"Cashless is going to be impossible for those who don’t have bank accounts (needs to be a People’s Bank). If we can’t use cars, then better jump on jump off public transport (great that massive 4x4’s will go). https://t.co/hdplmEDKVN",zarosa,451,Wandsworth,MadSally,zarosa,,"cashless is going to be impossible for those who don’t have bank accounts (needs to be a people’s bank). if we can’t use cars, then better jump on jump off public transport (great that massive 4x4’s will go). https://t.co/hdplmedkvn",[],[]
39,164,Sat Apr 18 13:08:19 +0000 2020,,0,1251497788292182016,,,en,,False,...,Ghetto plans for society's vulnerable https://t.co/BmmWLd0JlR,MeAndMyBCause,697,,Plop of sentimental mush on the landscape,MeAndMyBCause,,ghetto plans for society's vulnerable https://t.co/bmmwld0jlr,[],[]
47,174,Thu Mar 12 13:51:30 +0000 2020,,2,1238100307906871296,,,en,,False,...,"OK, interesting. https://t.co/q4M7D4kM0e",frencovfefe,5188,🤡,Fren Covfefe,frencovfefe,,"ok, interesting. https://t.co/q4m7d4km0e",[],[]
57,229,Wed Mar 18 06:59:55 +0000 2020,,0,1240171053655171073,,,en,,False,...,This will be well worth a read https://t.co/QjlMvSoG2d,Clansman2,624,EDINBURGH,John A Morrison,Clansman2,,this will be well worth a read https://t.co/qjlmvsog2d,[],[]
58,240,Tue Mar 31 19:22:43 +0000 2020,,1,1245069030643425283,,,en,,False,...,This.... https://t.co/IHMJ7XfDHo,hilary_b,532,,Hilary Bruffell,hilary_b,,this.... https://t.co/ihmj7xfdho,[],[]
77,312,Tue Mar 24 23:38:27 +0000 2020,,14,1242596669801132032,,,en,,False,...,Honored to have been part of this great initiative. https://t.co/y2aDcHIZuc,jetten_j,1740,University of Queensland,Jolanda,jetten_j,,honored to have been part of this great initiative. https://t.co/y2adchizuc,[],[]


In [51]:
# can we quickly do it automatically?

import urllib.request
link = "https://t.co/cseXRzuw9H"


with urllib.request.urlopen(link) as url:
    s = url.read()
    # I'm guessing this would output the html source code ?
    print(s)
    
    
# too much noise

b'<!DOCTYPE html>\n<html dir="ltr" lang="en">\n<meta charset="utf-8" />\n<meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=0,viewport-fit=cover" />\n<link rel="preconnect" href="//abs.twimg.com" />\n<link rel="preconnect" href="//api.twitter.com" />\n<link rel="preconnect" href="//pbs.twimg.com" />\n<link rel="preconnect" href="//t.co" />\n<link rel="preconnect" href="//video.twimg.com" />\n<link rel="dns-prefetch" href="//abs.twimg.com" />\n<link rel="dns-prefetch" href="//api.twitter.com" />\n<link rel="dns-prefetch" href="//pbs.twimg.com" />\n<link rel="dns-prefetch" href="//t.co" />\n<link rel="dns-prefetch" href="//video.twimg.com" />\n<link rel="preload" as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/client-web-legacy/polyfills.18e394f5.js" nonce="ZGY5ZjJlOWItOTQ4OC00ZTA4LWI5NWUtMDRiZTRjZjdlODI0" />\n<link rel="preload" as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/clien

In [54]:
linked_tweet = "government #coronavirus science advisor dr david halpern tells me of plans to ‘cocoon’ vulnerable groups."

### Save the tweets containing no keywords

In [59]:
no_kword_tweets_df[['id', 'created_at', 'user_location', 'text_cl', 'subkeywords']].to_csv(os.path.join(DATA_DIR, "tweets_no_keywords.csv"))