## Set up

In [1]:
import os
import re
import string
import pandas as pd
import numpy as np

In [2]:
%load_ext autoreload

In [3]:
%reload_ext autoreload

In [4]:
os.getcwd()

'/Users/alessiatosi/DS_projects/behavioural-sci-perception/notebooks'

In [5]:
pd.options.display.max_seq_items = 10000
pd.set_option('display.max_colwidth', None)

Load environment variables

In [6]:
DATA_PATH = os.environ.get("DIR_DATA_RAW")

In [7]:
DATA_PATH

'/Users/alessiatosi/DS_projects/behavioural-sci-perception/data/raw'

In [8]:
OUTPUT_DIR = os.environ.get("DIR_DATA_INTERIM")

Constants

In [9]:
TWEETS_FLNM = "tweet-ids-10062020" 

In [10]:
VARS_TO_KEEP = ['created_at', 'hashtags',
       'favorite_count', 'id', 'reweet_id', 'retweet_screen_name', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'text',
       'user_screen_name', 'user_followers_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_time_zone']

## Get data

In [11]:
tweets_raw = pd.read_csv(os.path.join(DATA_PATH, TWEETS_FLNM + '.csv'))

In [23]:
tweets_raw.shape

# Missing tweets: were they not hydrated because the tweet was not found or because of rate limit of the Twitter API?

(14611, 17)

In [13]:
tweets_raw.columns

Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'reweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

In [14]:
# filter only relevant columns
tweets_raw = tweets_raw[VARS_TO_KEEP]

In [15]:
tweets_raw[:5]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone
0,Tue Apr 21 12:15:54 +0000 2020,coronavirus consumer behaviorchange restaurants theatre automobile health finance COVID19 CoronaVirusUpdate,1,1252571762627235841,,,en,,False,0,"As covid-19 sweeps the world, shoppers forced to change purchase behaviour.\n#coronavirus #consumer #behaviorchange #restaurants #theatre #automobile #health #finance #COVID19 #CoronaVirusUpdate \nLink: https://t.co/402gGrrCAA https://t.co/WUsB26ingV",BT_India,1020868,India,Business Today,BT_India,
1,Wed Apr 22 10:36:10 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics NWO RevolutionNow Censorship Stasi endthelockdown NoVaccineForMe,0,1252909051886919681,,,und,,False,1,#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #NWO #RevolutionNow #Censorship #Stasi #endthelockdown #NoVaccineForMe https://t.co/FnT77plTkD,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
2,Sat Mar 14 23:31:19 +0000 2020,Covid19 behaviourchange wecantdothisalone,0,1238970999175032832,1.238968e+18,GSGerShaw,en,,,15,RT @GSGerShaw: Public must follow safety measures for #Covid19 and play their part #behaviourchange #wecantdothisalone \nOur healthcare staf…,MarissaBMcC,567,,MarissaButler,MarissaBMcC,
3,Sat Apr 18 08:12:28 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics NWO RevolutionNow Censorship Stasi endthelockdown NoVaccineForMe,0,1251423335935811584,,,en,,False,0,They live on a different planet....\n\n#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #NWO #RevolutionNow #Censorship #Stasi #endthelockdown #NoVaccineForMe https://t.co/wyPcvRkL1C,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
4,Sat Mar 14 18:37:38 +0000 2020,,0,1238897091780456449,1.238828e+18,LindaBauld,en,,,23,RT @LindaBauld: Just in case you hadn't seen this - @bmj_latest blog from my colleagues @SusanMichie @robertjwest &amp; coauthors on #behaviour…,MinervaCardioa1,292,"Turin, Piedmont",Minerva Cardioangiologica,MinervaCardioa1,


## Explore

- number/% of geolocated tweets
- number/% of english language tweets (other languages to be dropped)
- number/% of uncommented retweets (to be dropped as simply duplicates)

In [16]:
# general look
tweets_raw.nunique()

created_at              14560
hashtags                 2259
favorite_count            112
id                      14611
reweet_id                1889
retweet_screen_name      1011
lang                       27
place                     103
possibly_sensitive          2
retweet_count              88
text                     6694
user_screen_name         9718
user_followers_count     3944
user_location            3617
user_name                9635
user_screen_name.1       9718
user_time_zone              0
dtype: int64

There are 6,694 unique texts.

And 14,611 unique tweets: WARNING this is less than the number of the dehydrated tweets acquired...

### Uncommented retweets

To be dropped as they are exact duplicates of other tweets already in the dataset.

How to identify them?

- If original tweets was longer than 140 characters: (1) the tweet is a retweet (`retweet_id` is not `NaN`), and (2) the tweet ends in an ellipsis.
    **Rationale**: for retweets that do not include a comment where the original tweet is longer than 140 characters, the Twitter API returns a truncated full_text.  
    
- If original tweets was not longer than 140 characters: (1) the tweet is a retweet, (2) it starts with RT, and (3) its text is the exact duplication of an existing tweet

In [17]:
uncommented_above140_retweets = tweets_raw[tweets_raw.text.str.endswith("\u2026")]

In [20]:
uncommented_above140_retweets.shape

(9252, 17)

In [24]:
# how many also have a null reweet_id? Should be none
uncommented_above140_retweets.isnull().reweet_id.value_counts()    

False    9032
True      220
Name: reweet_id, dtype: int64

In [25]:
# let's see who these 220 are
uncommented_above140_retweets[uncommented_above140_retweets.isnull().reweet_id]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone
14,Sat Apr 18 14:45:22 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics,0,1251522214589083655,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @pikachanyan: RT @DavidIHodgson: #COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
147,Mon Apr 27 12:45:13 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics,0,1254753467643629574,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @ScalarHumanity: RT @DavidIHodgson: #COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
188,Fri Apr 03 01:45:08 +0000 2020,BehavioralScience education covid19,0,1245890041387397123,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @ScalarHumanity: RT @imanchaya1: A few recommendations on how to improve communication and messaging to promote positive behaviors during COVID-19.\n#covid19…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
272,Wed Apr 22 10:45:14 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics,0,1252911333814743040,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @NanaPanther248: RT @DavidIHodgson: #COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
319,Mon Apr 27 07:45:07 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics,0,1254677946633195520,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @JustBeMentalist: RT @DavidIHodgson: #COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13987,Sat Apr 04 12:45:24 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceSta,0,1246418591974555654,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @Justice4Marky: RT @DavidIHodgson: Unless there are people with compromised immunity nearby it doesn't matter.\n\n#COVID19 #Coronavirus #ToriesOut #PoliceSta…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
14214,Wed Apr 15 11:45:14 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience,0,1250389717864062976,,,en,,,0,"Ask leaders to make #BehavioralScience core #education RT @KhawarOzayr: RT @DavidIHodgson: Come on, the people!\n\n#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience…",ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
14439,Fri May 01 14:45:16 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics,0,1256233230174470146,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @ricardo_ik_ahau: RT @DavidIHodgson: #COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,
14441,Wed Apr 15 11:45:22 +0000 2020,BehavioralScience education COVID19 Coronavirus ToriesOut PoliceSt,0,1250389752706215937,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @JustBeMentalist: RT @DavidIHodgson: Who'd have thought a gov't packed with NHS privateers would do such a thing?\n\n#COVID19 #Coronavirus #ToriesOut #PoliceSt…,ScalarHumanity,4466,,Scalar Humanity,ScalarHumanity,


Note that many are variations of the same core message. They will probably be discounted as duplicates when we remove duplicates after pre-processing the text of the tweets.

In [26]:
# let's remove these 220 from the uncommented retweets as that's not what they are
uncommented_above140_retweets = uncommented_above140_retweets[uncommented_above140_retweets.notnull().reweet_id]

In [28]:
uncommented_above140_retweets.shape

(9032, 17)

In [29]:
tweets_to_go_ids1 = uncommented_above140_retweets.id

**Important NOTE**

The original tweets of these uncommented >140cha retweets may not be present in our dataset as original tweets. However, as for Twitter's policy, uncommented retweets of tweets longer than 140 character are truncated when accessed via the Stream API, so there is no way we can get access to the full text, we will have to exclude them anyway. We cannot, in fact, analyse the sentiment of incomplete text in a reliable way. 

Let's try to identify any other (i.e., up to 140 character) uncommented retweet

In [30]:
retweets = tweets_raw[(tweets_raw.text.str.startswith("RT @"))]

In [31]:
retweets.reweet_id.isnull().value_counts()

False    9303
True        7
Name: reweet_id, dtype: int64

In [33]:
# How many of these retweets are not in the above-140 uncommented rewteets already identified?
retweets[~retweets.id.isin(uncommented_above140_retweets.id)].shape


(278, 17)

In [34]:
# We need to understand which ones of these retweets are also uncommented dulicates of exisiting tweets (so to be dropped)
extra_retweets = retweets[~retweets.id.isin(uncommented_above140_retweets.id)]

In [35]:
# Let's clean the text by removing the "RT @name-original-sender " initial part
extra_retweets = extra_retweets.copy()
extra_retweets['cleaned_retweet'] = [re.sub(r"^RT @.*: ", "", t) for t in extra_retweets.text]

In [36]:
extra_retweets[['text', 'cleaned_retweet']]

Unnamed: 0,text,cleaned_retweet
71,RT @normonics: .@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1,.@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1
95,RT @brianne_eby: I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W,I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W
177,RT @peterjukes: Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element
227,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
251,RT @peterjukes: Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element
...,...,...
14350,RT @neilgains: How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr,How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr
14505,RT @socratext: How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam,How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam
14542,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
14554,RT @TheDanWilson: Clever #Behaviouralinsight idea behind @DHSCgovuk’s messaging and advice for #coronavirus 👇 https://t.co/DI3QBzwoOz,Clever #Behaviouralinsight idea behind @DHSCgovuk’s messaging and advice for #coronavirus 👇 https://t.co/DI3QBzwoOz


In [37]:
pd.DataFrame(extra_retweets.cleaned_retweet.value_counts())

Unnamed: 0,cleaned_retweet
Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,95
Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,34
HE IS NOT CORONAVIRUS SCIENCE ADVISOR HE IS A PSYCHOLOGIST AND HEAD OF THE FUCKING NUDGE UNIT WHAT THE FUCK ARE YOU DOING,5
Incase you missed our latest blog - insights from #behaviouralscience for the #COVID19 pandemic by @_shanetimmons 👇👇👇,4
"Thank god for real experts on Pandemics and Healthcare, rather Nudge Unit behavioural scientists.",4
...,...
The #COVID19 Response - What can behavioral science offer? @reshmatrasi https://t.co/HOdIkTEQ88,1
"Have you registered for our next free webinar? The topic is behaviour change. 20 mins in your lunch break. Tuesday the 21st, 12:30pm AEST. \n\nhttps://t.co/ZzPml3HIcA\n\n#freeevent #webinar #behaviour #covid19 #stress #mindfullness #corpo… https://t.co/GoN8zUWTy7",1
This is a good review of behavioural science relevant to #covid19,1
If only Boris Johnson or anyone at the Nudge Unit had spent two minutes on Wikipedia...\n#COVID19\nhttps://t.co/BQ9P7MHWqC,1


Some, those whose count > 1, are definitely uncommented retweets of indentical original tweets - to be removed.

In [38]:
# let's keep one of each, in case the original tweet is not present in the dataset
extra_retweets.drop_duplicates(subset ="cleaned_retweet", keep = "first")

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone,cleaned_retweet
71,Sun Mar 08 17:00:22 +0000 2020,,0,1236698282736472065,1.236694e+18,normonics,fr,,False,2,RT @normonics: .@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1,LuisOje30542321,17,Madrid,Luis Ojeda,LuisOje30542321,,.@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1
95,Tue Apr 07 20:37:48 +0000 2020,transportation BehavioralScience COVID19,0,1247624638588387331,1.247618e+18,brianne_eby,en,,False,4,RT @brianne_eby: I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W,multimenon,746,"Tampa, FL",Nikhil Menon,multimenon,,I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W
177,Sat Mar 14 19:22:54 +0000 2020,,0,1238908479718465538,1.238906e+18,peterjukes,en,,,90,RT @peterjukes: Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,AndreaAndreamax,541,Devon,andrea #holdgovernmenttoaccount 🇪🇺✊🐟#SardinesUK,AndreaAndreamax,,Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element
227,Thu Mar 12 22:52:22 +0000 2020,,0,1238236419124088833,1.238092e+18,faisalislam,en,,,191,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,StFilansDream,6474,Scotland: a European country,Stephen Dedalus 🏴󠁧󠁢󠁳󠁣󠁴󠁿🇮🇪🇪🇺,StFilansDream,,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
403,Sat Apr 18 12:50:57 +0000 2020,uspol regulation COVID19,0,1251493421367218178,1.251492e+18,DrRimmer,en,,False,3,RT @DrRimmer: Why is Trump gutting regulations that save lives? @Casssunstein https://t.co/ycGP9rwBN6 #uspol #regulation #COVID19,Littlesparrow9,745,,Little sparrow,Littlesparrow9,,Why is Trump gutting regulations that save lives? @Casssunstein https://t.co/ycGP9rwBN6 #uspol #regulation #COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13964,Fri Apr 17 22:28:46 +0000 2020,covid19,0,1251276443469770753,1.251105e+18,JulieLeask,en,,,3,RT @JulieLeask: This is a good review of behavioural science relevant to #covid19,_HealthyComms,1816,"Melbourne, Australia",HealthyCommunicators,_HealthyComms,,This is a good review of behavioural science relevant to #covid19
14090,Fri Apr 24 09:55:03 +0000 2020,COVID19 BehavioralEconomics,3,1253623480718970880,,,en,,False,1,"RT @frmreyes: RT @behave4_: Asocial is the New Prosocial (Part 1)\n \n 👉 https://t.co/KzbZmpOQMc\n\n#COVID19 can inconspicuously hitch-hike through our social network until it finds its prey, which has marked the end to the most basic human need — being social.\n#BehavioralEconomics",davdittrich,7854,"Berlin, Germany",Dennis A V Dittrich,davdittrich,,"Asocial is the New Prosocial (Part 1)\n \n 👉 https://t.co/KzbZmpOQMc\n\n#COVID19 can inconspicuously hitch-hike through our social network until it finds its prey, which has marked the end to the most basic human need — being social.\n#BehavioralEconomics"
14350,Tue Apr 28 11:02:20 +0000 2020,stayathome coronavirus behaviourchange newmr,0,1255089963521019904,1.255089e+18,neilgains,en,,False,1,RT @neilgains: How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr,tapestryworks,370,Singapore,Neil Gains,tapestryworks,,How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr
14505,Thu Apr 09 19:35:53 +0000 2020,COVID19 coronavirus,0,1248333832937517062,1.248245e+18,socratext,en,,,2,RT @socratext: How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam,DrEstherCT,1530,"North East, England",Esther,DrEstherCT,,How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam


In [39]:
# ids of duplicate retweets (to be removed)
tweets_to_go_ids2 = extra_retweets[extra_retweets.duplicated('cleaned_retweet')].id

In [40]:
tweets_to_go_ids2

251      1239098806110691328
386      1238910144479727621
892      1238910438450094084
937      1237851829146234883
1063     1238907509328535552
                ...         
14224    1247309697977176064
14275    1247085494103674881
14303    1238383948906012672
14542    1238400129142579201
14571    1238128929422131201
Name: id, Length: 183, dtype: int64

In [41]:
# What's left?
extra_retweets[~extra_retweets.id.isin(tweets_to_go_ids2)]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone,cleaned_retweet
71,Sun Mar 08 17:00:22 +0000 2020,,0,1236698282736472065,1.236694e+18,normonics,fr,,False,2,RT @normonics: .@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1,LuisOje30542321,17,Madrid,Luis Ojeda,LuisOje30542321,,.@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1
95,Tue Apr 07 20:37:48 +0000 2020,transportation BehavioralScience COVID19,0,1247624638588387331,1.247618e+18,brianne_eby,en,,False,4,RT @brianne_eby: I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W,multimenon,746,"Tampa, FL",Nikhil Menon,multimenon,,I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W
177,Sat Mar 14 19:22:54 +0000 2020,,0,1238908479718465538,1.238906e+18,peterjukes,en,,,90,RT @peterjukes: Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,AndreaAndreamax,541,Devon,andrea #holdgovernmenttoaccount 🇪🇺✊🐟#SardinesUK,AndreaAndreamax,,Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element
227,Thu Mar 12 22:52:22 +0000 2020,,0,1238236419124088833,1.238092e+18,faisalislam,en,,,191,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,StFilansDream,6474,Scotland: a European country,Stephen Dedalus 🏴󠁧󠁢󠁳󠁣󠁴󠁿🇮🇪🇪🇺,StFilansDream,,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
403,Sat Apr 18 12:50:57 +0000 2020,uspol regulation COVID19,0,1251493421367218178,1.251492e+18,DrRimmer,en,,False,3,RT @DrRimmer: Why is Trump gutting regulations that save lives? @Casssunstein https://t.co/ycGP9rwBN6 #uspol #regulation #COVID19,Littlesparrow9,745,,Little sparrow,Littlesparrow9,,Why is Trump gutting regulations that save lives? @Casssunstein https://t.co/ycGP9rwBN6 #uspol #regulation #COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13964,Fri Apr 17 22:28:46 +0000 2020,covid19,0,1251276443469770753,1.251105e+18,JulieLeask,en,,,3,RT @JulieLeask: This is a good review of behavioural science relevant to #covid19,_HealthyComms,1816,"Melbourne, Australia",HealthyCommunicators,_HealthyComms,,This is a good review of behavioural science relevant to #covid19
14090,Fri Apr 24 09:55:03 +0000 2020,COVID19 BehavioralEconomics,3,1253623480718970880,,,en,,False,1,"RT @frmreyes: RT @behave4_: Asocial is the New Prosocial (Part 1)\n \n 👉 https://t.co/KzbZmpOQMc\n\n#COVID19 can inconspicuously hitch-hike through our social network until it finds its prey, which has marked the end to the most basic human need — being social.\n#BehavioralEconomics",davdittrich,7854,"Berlin, Germany",Dennis A V Dittrich,davdittrich,,"Asocial is the New Prosocial (Part 1)\n \n 👉 https://t.co/KzbZmpOQMc\n\n#COVID19 can inconspicuously hitch-hike through our social network until it finds its prey, which has marked the end to the most basic human need — being social.\n#BehavioralEconomics"
14350,Tue Apr 28 11:02:20 +0000 2020,stayathome coronavirus behaviourchange newmr,0,1255089963521019904,1.255089e+18,neilgains,en,,False,1,RT @neilgains: How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr,tapestryworks,370,Singapore,Neil Gains,tapestryworks,,How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr
14505,Thu Apr 09 19:35:53 +0000 2020,COVID19 coronavirus,0,1248333832937517062,1.248245e+18,socratext,en,,,2,RT @socratext: How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam,DrEstherCT,1530,"North East, England",Esther,DrEstherCT,,How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam


In [42]:
# are they maybe duplicates of tweets of original tweets in the dataset?
tweets_to_investigate = extra_retweets[~extra_retweets.id.isin(tweets_to_go_ids2)]

In [43]:
tweets_to_investigate_text = tweets_to_investigate.cleaned_retweet     

In [44]:
len(tweets_to_investigate_text)

95

In [45]:
tweets_raw[tweets_raw.text.isin(tweets_to_investigate_text)].count()   # 35 are "copies" of original tweets in the dataset

created_at              35
hashtags                27
favorite_count          35
id                      35
reweet_id                0
retweet_screen_name      0
lang                    35
place                    0
possibly_sensitive      34
retweet_count           35
text                    35
user_screen_name        35
user_followers_count    35
user_location           30
user_name               35
user_screen_name.1      35
user_time_zone           0
dtype: int64

In [46]:
pd.DataFrame(tweets_to_investigate_text.value_counts())

Unnamed: 0,cleaned_retweet
A little behavioural economics nudge has a big impact. \n\nNow do toilet paper. :) https://t.co/SFPRp01LNw,1
"Can't see what @casssunstein said because I'm blocked , but Rupert is obviously right.",1
👀 #BigData by #Google on #COVID19 behaviour change of #Anroid users...\n\nhttps://t.co/ohgOyt92a5,1
#healthliteracy + #behaviourchange = improved impact of #COVID19 efforts,1
Eww eewww eeeew eeeew eewww 💩 \n\n👏🏼 wash 👏🏼 your 👏🏼 hands 👏🏼 \n\n#behaviouralscience,1
...,...
#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance #BehavioralScience #behavioraleconomics,1
If only Boris Johnson or anyone at the Nudge Unit had spent two minutes on Wikipedia...\n#COVID19\nhttps://t.co/BQ9P7MHWqC,1
#handwashing #BehavioralScience,1
Some good tips on use of social media in these changing times #BehavioralScience #psychology #socialmedia #cyberpsychology,1


In [47]:
tweets_to_go_ids3 = tweets_to_investigate[tweets_to_investigate.cleaned_retweet.isin(
    tweets_raw[tweets_raw.text.isin(tweets_to_investigate_text)].text)].id

In [48]:
len(tweets_to_go_ids3)   #ok

35

In [49]:
print(len(tweets_to_go_ids1))
print(len(tweets_to_go_ids2))

9032
183


### Let's remove from the dataset all the uncommented retweets that are duplicate of original tweets already in the dataset 

In [50]:
print(len(set(tweets_to_go_ids1)))
print(len(set(tweets_to_go_ids2)))
print(len(set(tweets_to_go_ids3)))
# GOOD :-), each is unique

9032
183
35


In [51]:
tweets_to_go_ids_all = list(tweets_to_go_ids1) + list(tweets_to_go_ids2) + list(tweets_to_go_ids3)

In [52]:
len(tweets_to_go_ids_all)

9250

In [53]:
len(tweets_raw.index) 

14611

In [54]:
tweets_original = tweets_raw[~tweets_raw.id.isin(tweets_to_go_ids_all)]

In [55]:
len(tweets_original.index)

5361

### English vs. non-English tweets

Non-English tweets will be dropped as they are not part of our population of interest.


In [56]:
count_by_lang = pd.DataFrame(tweets_original.groupby('lang').id.count()) 

In [57]:
count_by_lang.rename(columns={'id': 'count_ids'}, inplace=True)

In [58]:
count_by_lang['prop'] = round(count_by_lang['count_ids'] / sum(count_by_lang['count_ids']),3)

In [59]:
count_by_lang

Unnamed: 0_level_0,count_ids,prop
lang,Unnamed: 1_level_1,Unnamed: 2_level_1
ar,2,0.0
ca,4,0.001
cs,1,0.0
cy,1,0.0
da,1,0.0
de,23,0.004
en,4224,0.788
es,64,0.012
fa,2,0.0
fi,3,0.001


What are the "und" (undefined) ones?

In [60]:
tweets_original[tweets_original.lang == "und"]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone
1,Wed Apr 22 10:36:10 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics NWO RevolutionNow Censorship Stasi endthelockdown NoVaccineForMe,0,1252909051886919681,,,und,,False,1,#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #NWO #RevolutionNow #Censorship #Stasi #endthelockdown #NoVaccineForMe https://t.co/FnT77plTkD,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
24,Sun Apr 05 08:10:20 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics CoronaVirusHOAX,0,1246711757059641345,,,und,,False,1,#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance #BehavioralScience #behavioraleconomics #CoronaVirusHOAX https://t.co/R5OiyvPEaf,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
27,Tue Apr 21 06:44:58 +0000 2020,NoVaccineForMe,0,1252488479650414593,,,und,,False,0,#NoVaccineForMe https://t.co/U449CaEKYv,GrahamPudek,584,,Graham Pudek,GrahamPudek,
44,Tue Apr 21 16:07:47 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics NWO RevolutionNow Censorship Stasi endthelockdown NoVaccineForMe,0,1252630116594843651,,,und,,False,0,#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #NWO #RevolutionNow #Censorship #Stasi #endthelockdown #NoVaccineForMe https://t.co/dE16zGcKyd,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
55,Tue Mar 31 11:13:36 +0000 2020,,1,1244945938453389317,,,und,,False,0,@ThunstromLinda https://t.co/kNR5A3aspN,JBirdShogren,525,WY,JBird Shogren,JBirdShogren,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14471,Mon Apr 06 09:14:54 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics,0,1247090393386225664,,,und,,False,0,#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance #BehavioralScience #behavioraleconomics https://t.co/N2rKVktR7k,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
14501,Fri Apr 10 09:02:39 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics NWO RevolutionNow Censorship Stasi endthelockdown NoVaccineForMe,0,1248536861922004992,,,und,,False,0,#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #NWO #RevolutionNow #Censorship #Stasi #endthelockdown #NoVaccineForMe https://t.co/XmJzy0eLkN,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
14504,Fri Apr 10 08:56:19 +0000 2020,COVID19 Coronavirus ToriesOut PoliceState PoliceStateUK MassSurveillance BehavioralScience behavioraleconomics NWO RevolutionNow Censorship Stasi endthelockdown NoVaccineForMe,0,1248535270259077120,,,und,,False,0,#COVID19 #Coronavirus #ToriesOut #PoliceState #PoliceStateUK #MassSurveillance\n#BehavioralScience #behavioraleconomics #NWO #RevolutionNow #Censorship #Stasi #endthelockdown #NoVaccineForMe https://t.co/iW2KEarVBz,DavidIHodgson,830,https://thereluctantdiarist.blogspot.co.uk/,David Hodgson,DavidIHodgson,
14593,Fri Apr 10 06:32:47 +0000 2020,,0,1248499146970955776,,,und,,False,0,@CassSunstein https://t.co/LNbrBpreGj,HeidiLDN,21,"London, England",HeidiLDN,HeidiLDN,


They seem to be purely list of hashtags and duplicates of the same one. So let's exclude them from further investigation.

### Exclude all non-English tweets

In [61]:
tweets_original_en = tweets_original[tweets_original.lang == 'en']

In [62]:
len(tweets_original_en.index)

4224

### Geolocation

We'll look at the geolocation of users.

In [63]:
tweets_original_en.place.value_counts(dropna=False)

NaN                                                      4098
Geelong, Victoria                                           6
Carlisle, PA                                                5
Kampala, Uganda                                             3
Tottenham, London                                           3
                                                         ... 
Bandar Kuala Lumpur, Wilayah Persekutuan Kuala Lumpur       1
Fairview, NY                                                1
Cleveland, OH                                               1
Chestfield, England                                         1
Frankfurt am Main, Deutschland                              1
Name: place, Length: 96, dtype: int64

The large majority are not geolocated.

### Save the data

In [64]:
# define file name and file path
output_name = "tweets_original_en"
output_filepath = os.path.join(OUTPUT_DIR, output_name + ".csv")

In [65]:
tweets_original_en.to_csv(output_filepath)