## Set up

In [1]:
import os
import re
import string
import pandas as pd
import numpy as np

In [2]:
%load_ext autoreload

In [3]:
%reload_ext autoreload

In [4]:
os.getcwd()

'/Users/alessiatosi/DS_projects/behavioural-sci-perception/notebooks'

In [5]:
pd.options.display.max_seq_items = 10000
pd.set_option('display.max_colwidth', None)

Load environment variables

In [6]:
DATA_PATH = os.environ.get("DIR_DATA_RAW")

In [7]:
DATA_PATH

'/Users/alessiatosi/DS_projects/behavioural-sci-perception/data/raw'

In [8]:
OUTPUT_DIR = os.environ.get("DIR_DATA_INTERIM")

Constants

In [9]:
TWEETS_FLNM = "tweet-ids-British-Oct2020" 

In [10]:
VARS_TO_KEEP = ['created_at', 'hashtags',
       'favorite_count', 'id', 'reweet_id', 'retweet_screen_name', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'text',
       'user_screen_name', 'user_followers_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_time_zone']

## Get data

In [11]:
tweets_raw = pd.read_csv(os.path.join(DATA_PATH, TWEETS_FLNM + '.csv'))

In [12]:
tweets_raw.shape

# Missing tweets: were they not hydrated because the tweet was not found or because of rate limit of the Twitter API?

(12161, 34)

In [13]:
tweets_raw.columns

Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'reweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

In [14]:
# filter only relevant columns
tweets_raw = tweets_raw[VARS_TO_KEEP]

In [15]:
tweets_raw[:3]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone
0,Sun Apr 19 11:29:34 +0000 2020,,0,1251835324801613825,1.251829e+18,Jeremy11223344,en,,,3,"RT @Jeremy11223344: @tjc1_tom @THemingford @dsue1441_re Imperial College modellers in collaboration with the Downing St ""nudge"" unit cannot…",snoopdogtanian,2192,,Snoopdogtanian #UBI🌹,snoopdogtanian,
1,Wed Mar 04 04:35:46 +0000 2020,,0,1235061349685719047,1.234871e+18,bmj_latest,en,,,194,"RT @bmj_latest: Human behaviour will determine how quickly covid-19 spreads, say @SusanMichie @DrRichardAmlot and @robertjwest What needs t…",MAHandley,335,,Margaret Handley,MAHandley,
2,Wed Apr 08 17:21:54 +0000 2020,COVID19 coronavirus,0,1247937727699484673,1.247901e+18,jayvanbavel,en,,,89,RT @jayvanbavel: We posted a revised version of our paper on how social &amp; behavioural science might support #COVID19 #coronavirus pandemic.…,steverathje2,1204,University of Cambridge,Steve Rathje,steverathje2,


## Explore

- number/% of geolocated tweets
- number/% of english language tweets (other languages to be dropped)
- number/% of uncommented retweets (to be dropped as simply duplicates)

In [16]:
# general look
tweets_raw.nunique()

created_at              12124
hashtags                 1630
favorite_count            118
id                      12161
reweet_id                1177
retweet_screen_name       861
lang                       22
place                      92
possibly_sensitive          2
retweet_count              90
text                     4480
user_screen_name         9381
user_followers_count     3994
user_location            3501
user_name                9291
user_screen_name.1       9381
user_time_zone              0
dtype: int64

There are 4,480 unique texts.

And 12,124 unique tweets.

### Uncommented retweets

To be dropped as they are exact duplicates of other tweets already in the dataset.

How to identify them?

- If original tweets was longer than 140 characters: (1) the tweet is a retweet (`retweet_id` is not `NaN`), and (2) the tweet ends in an ellipsis.
    **Rationale**: for retweets that do not include a comment where the original tweet is longer than 140 characters, the Twitter API returns a truncated full_text.  
    
- If original tweets was not longer than 140 characters: (1) the tweet is a retweet, (2) it starts with RT, and (3) its text is the exact duplication of an existing tweet

In [17]:
uncommented_above140_retweets = tweets_raw[tweets_raw.text.str.endswith("\u2026")]

In [18]:
uncommented_above140_retweets.shape

(8575, 17)

In [19]:
# how many also have a null reweet_id? Should be none
uncommented_above140_retweets.isnull().reweet_id.value_counts()    

False    8562
True       13
Name: reweet_id, dtype: int64

In [20]:
# let's see who these 13 are
uncommented_above140_retweets[uncommented_above140_retweets.isnull().reweet_id]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone
1871,Tue May 05 16:45:17 +0000 2020,BehavioralScience education BehavioralScience COVID19,0,1257712986577350659,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @EconUdesa: RT @IDB_Research: Join us 05/06 for a conversation with @CassSunstein on how to leverage #BehavioralScience to turn the tide on #COVID19 in…,ScalarHumanity,4770,,Scalar Humanity,ScalarHumanity,
2349,Fri May 22 00:45:08 +0000 2020,BehavioralScience education behaviouralscience behavioralscience socmar covid19,0,1263631948536320006,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @ScalarHumanity: RT @footesea: Wow this is a great compilation of resources! Thank you @isma_org \n#behaviouralscience #behavioralscience #socmar #covid19 ht…,ScalarHumanity,4770,,Scalar Humanity,ScalarHumanity,
2529,Thu May 14 09:15:27 +0000 2020,Covid19 covid19,0,1260861271781650432,,,en,,False,0,@nickyc7475 Waking people is so hard - Gov't use behavioural psychology via all media to keep people asleep. See my page re: #Covid19\n\nFollow the science? What science? #covid19\n\nBehavioural science?\n\nAre you being played?\nWatch\n\nhttps://t.co/bhu4eEOanQ…,ZigmanSara,3762,Quiddity,Sara Zigman,ZigmanSara,
3709,Fri May 08 21:45:33 +0000 2020,BehavioralScience education COVID19,0,1258875711609323522,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @LillyDerby: RT @BusaraCenter: Join Busara's @nicholas_owsley for an in-depth discussion on behaviour change in the context of #COVID19.\n\n13/05/2020 fro…,ScalarHumanity,4770,,Scalar Humanity,ScalarHumanity,
5088,Sat Apr 25 08:30:06 +0000 2020,COVID19,3,1253964490837811200,,,en,,False,2,What will you do differently beyond #COVID19 to protect our planet? Our Co-founder @ged_savva writes about the behaviour change processes we are taking in this blog. \nhttps://t.co/yvZE2SOP4G…,we_are_magpie,1743,Leeds,magpie,we_are_magpie,
6007,Mon Mar 09 07:10:15 +0000 2020,COVID19 Coronavirus,0,1236912163677241344,,,en,,,0,"bmj_latest: Human behaviour will determine how quickly covid-19 spreads, say SusanMichie DrRichardAmlot and robertjwest What needs to happen to ensure that people have the capability, opportunity and motivation to enact key behaviours #COVID19 #Coronavirus UCL_BSH …",aruberutou,48,"Sapporo, Hokkaido",GAYLEアルベルトウ,aruberutou,
6228,Fri May 08 10:45:08 +0000 2020,BehavioralScience education COVID19,0,1258709513454129152,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @jhaushofer: RT @BusaraCenter: Join Busara's @nicholas_owsley for an in-depth discussion on behaviour change in the context of #COVID19.\n\n13/05/2020 fro…,ScalarHumanity,4770,,Scalar Humanity,ScalarHumanity,
7256,Mon May 04 16:45:27 +0000 2020,BehavioralScience education BehavioralScience COVID19,0,1257350639916638210,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @gcaballeroo: RT @IDB_Research: Join us 05/06 for a conversation with @CassSunstein on how to leverage #BehavioralScience to turn the tide on #COVID19 in…,ScalarHumanity,4770,,Scalar Humanity,ScalarHumanity,
8040,Sat May 09 19:45:43 +0000 2020,BehavioralScience education COVID19,0,1259207943754526720,,,en,,,0,Ask leaders to make #BehavioralScience core #education RT @MariaOkumu: RT @BusaraCenter: Join Busara's @nicholas_owsley for an in-depth discussion on behaviour change in the context of #COVID19.\n\n13/05/2020 fro…,ScalarHumanity,4770,,Scalar Humanity,ScalarHumanity,
8128,Wed Mar 04 08:24:17 +0000 2020,COVID19 Coronavirus,0,1235118855648768000,,,en,,,0,"bmj_latest: Human behaviour will determine how quickly covid-19 spreads, say SusanMichie DrRichardAmlot and robertjwest What needs to happen to ensure that people have the capability, opportunity and motivation to enact key behaviours #COVID19 #Coronavirus UCL_BSH …",aruberutou,48,"Sapporo, Hokkaido",GAYLEアルベルトウ,aruberutou,


Note that many are variations of the same core message. They will probably be discounted as duplicates when we remove duplicates after pre-processing the text of the tweets.

In [21]:
# let's remove these 13 from the uncommented retweets as that's not what they are
uncommented_above140_retweets = uncommented_above140_retweets[uncommented_above140_retweets.notnull().reweet_id]

In [22]:
uncommented_above140_retweets.shape

(8562, 17)

In [23]:
tweets_to_go_ids1 = uncommented_above140_retweets.id

**Important NOTE**

The original tweets of these uncommented >140cha retweets may not be present in our dataset as original tweets. However, as for Twitter's policy, uncommented retweets of tweets longer than 140 character are truncated when accessed via the Stream API, so there is no way we can get access to the full text, we will have to exclude them anyway. We cannot, in fact, analyse the sentiment of incomplete text in a reliable way. 

Let's try to identify any other (i.e., up to 140 character) uncommented retweet

In [24]:
retweets = tweets_raw[(tweets_raw.text.str.startswith("RT @"))]

In [25]:
retweets.reweet_id.isnull().value_counts()

False    8829
True        3
Name: reweet_id, dtype: int64

In [26]:
# How many of these retweets are not in the above-140 uncommented rewteets already identified?
retweets[~retweets.id.isin(uncommented_above140_retweets.id)].shape


(270, 17)

In [27]:
# We need to understand which ones of these retweets are also uncommented dulicates of exisiting tweets (so to be dropped)
extra_retweets = retweets[~retweets.id.isin(uncommented_above140_retweets.id)]

In [28]:
# Let's clean the text by removing the "RT @name-original-sender " initial part
extra_retweets = extra_retweets.copy()
extra_retweets['cleaned_retweet'] = [re.sub(r"^RT @.*: ", "", t) for t in extra_retweets.text]

In [29]:
extra_retweets[['text', 'cleaned_retweet']]

Unnamed: 0,text,cleaned_retweet
35,RT @shayonislynn: Using #behaviouralscience to improve behaviours #nudgesinthewild #COVID19 https://t.co/OyIDElZwR7,Using #behaviouralscience to improve behaviours #nudgesinthewild #COVID19 https://t.co/OyIDElZwR7
70,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
106,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
135,"RT @TheLancet: NEW—Three lessons for #COVID19 response from pandemic #HIV \n\n1. Anticipate health inequalities\n2. Support behaviour change\n3. Multidisciplinary effort\n\nComment @TheLancetHIV by J Hargreaves, C Davey, et al @LSHTM\nhttps://t.co/8mPuOHL2tu https://t.co/zYROZ0epQE","NEW—Three lessons for #COVID19 response from pandemic #HIV \n\n1. Anticipate health inequalities\n2. Support behaviour change\n3. Multidisciplinary effort\n\nComment @TheLancetHIV by J Hargreaves, C Davey, et al @LSHTM\nhttps://t.co/8mPuOHL2tu https://t.co/zYROZ0epQE"
170,RT @MDRC_News: Behavioural science and the response to #COVID19: A missed opportunity?\n\nvia @LSEImpactBlog https://t.co/6Ybq1Wgb7N,A missed opportunity?\n\nvia @LSEImpactBlog https://t.co/6Ybq1Wgb7N
...,...,...
12002,RT @Christi55143759: Behaviour change and mass testing in areas with large nos who have tested positive will help contain the virus.,Behaviour change and mass testing in areas with large nos who have tested positive will help contain the virus.
12052,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
12101,RT @socratext: How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam,How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam
12142,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -


In [30]:
pd.DataFrame(extra_retweets.cleaned_retweet.value_counts())

Unnamed: 0,cleaned_retweet
Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,97
Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,35
Paging Nudge Unit:\nTry bundling your infection control messages.\n\nhttps://t.co/2pJeBihyge,16
drivers plan to walk for cleaner air #greenrecovery https://t.co/6K3C8cvYFP,6
"Thank god for real experts on Pandemics and Healthcare, rather Nudge Unit behavioural scientists.",4
...,...
And another member of the behavioural science advisory group speaks out 👇\n\n#COVID19 #pandemic https://t.co/j0ENdOSlJq,1
Calculate your #covid-19 risk and get some behaviour change advice too https://t.co/7zhQ4M91Cg https://t.co/id8V5Jy2e8,1
This looks like the triumph of 'nudge theory' over medical science.,1
Using #behaviouralscience to improve behaviours #nudgesinthewild #COVID19 https://t.co/OyIDElZwR7,1


Some, those whose count > 1, are definitely uncommented retweets of indentical original tweets - to be removed.

In [38]:
# let's keep one of each, in case the original tweet is not present in the dataset
extra_retweets.drop_duplicates(subset ="cleaned_retweet", keep = "first")

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone,cleaned_retweet
71,Sun Mar 08 17:00:22 +0000 2020,,0,1236698282736472065,1.236694e+18,normonics,fr,,False,2,RT @normonics: .@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1,LuisOje30542321,17,Madrid,Luis Ojeda,LuisOje30542321,,.@casssunstein nudge in action. Hyper dangerous. https://t.co/iYnfuV1ED1
95,Tue Apr 07 20:37:48 +0000 2020,transportation BehavioralScience COVID19,0,1247624638588387331,1.247618e+18,brianne_eby,en,,False,4,RT @brianne_eby: I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W,multimenon,746,"Tampa, FL",Nikhil Menon,multimenon,,I wrote about the nexus between #transportation &amp; #BehavioralScience in the context of #COVID19 👇\n\nhttps://t.co/aVDKN3UY3W
177,Sat Mar 14 19:22:54 +0000 2020,,0,1238908479718465538,1.238906e+18,peterjukes,en,,,90,RT @peterjukes: Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,AndreaAndreamax,541,Devon,andrea #holdgovernmenttoaccount 🇪🇺✊🐟#SardinesUK,AndreaAndreamax,,Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element
227,Thu Mar 12 22:52:22 +0000 2020,,0,1238236419124088833,1.238092e+18,faisalislam,en,,,191,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,StFilansDream,6474,Scotland: a European country,Stephen Dedalus 🏴󠁧󠁢󠁳󠁣󠁴󠁿🇮🇪🇪🇺,StFilansDream,,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
403,Sat Apr 18 12:50:57 +0000 2020,uspol regulation COVID19,0,1251493421367218178,1.251492e+18,DrRimmer,en,,False,3,RT @DrRimmer: Why is Trump gutting regulations that save lives? @Casssunstein https://t.co/ycGP9rwBN6 #uspol #regulation #COVID19,Littlesparrow9,745,,Little sparrow,Littlesparrow9,,Why is Trump gutting regulations that save lives? @Casssunstein https://t.co/ycGP9rwBN6 #uspol #regulation #COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13964,Fri Apr 17 22:28:46 +0000 2020,covid19,0,1251276443469770753,1.251105e+18,JulieLeask,en,,,3,RT @JulieLeask: This is a good review of behavioural science relevant to #covid19,_HealthyComms,1816,"Melbourne, Australia",HealthyCommunicators,_HealthyComms,,This is a good review of behavioural science relevant to #covid19
14090,Fri Apr 24 09:55:03 +0000 2020,COVID19 BehavioralEconomics,3,1253623480718970880,,,en,,False,1,"RT @frmreyes: RT @behave4_: Asocial is the New Prosocial (Part 1)\n \n 👉 https://t.co/KzbZmpOQMc\n\n#COVID19 can inconspicuously hitch-hike through our social network until it finds its prey, which has marked the end to the most basic human need — being social.\n#BehavioralEconomics",davdittrich,7854,"Berlin, Germany",Dennis A V Dittrich,davdittrich,,"Asocial is the New Prosocial (Part 1)\n \n 👉 https://t.co/KzbZmpOQMc\n\n#COVID19 can inconspicuously hitch-hike through our social network until it finds its prey, which has marked the end to the most basic human need — being social.\n#BehavioralEconomics"
14350,Tue Apr 28 11:02:20 +0000 2020,stayathome coronavirus behaviourchange newmr,0,1255089963521019904,1.255089e+18,neilgains,en,,False,1,RT @neilgains: How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr,tapestryworks,370,Singapore,Neil Gains,tapestryworks,,How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr
14505,Thu Apr 09 19:35:53 +0000 2020,COVID19 coronavirus,0,1248333832937517062,1.248245e+18,socratext,en,,,2,RT @socratext: How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam,DrEstherCT,1530,"North East, England",Esther,DrEstherCT,,How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam


In [31]:
# ids of duplicate retweets (to be removed)
tweets_to_go_ids2 = extra_retweets[extra_retweets.duplicated('cleaned_retweet')].id

In [32]:
tweets_to_go_ids2

106      1245224046196690945
328      1238100680587579394
574      1238913289058484225
688      1238100459505758210
993      1266325354911801345
                ...         
11866    1264813914208026626
11868    1247085494103674881
11932    1238400099795042304
12052    1238400129142579201
12142    1238128929422131201
Name: id, Length: 197, dtype: int64

In [33]:
# What's left?
extra_retweets[~extra_retweets.id.isin(tweets_to_go_ids2)]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone,cleaned_retweet
35,Thu Mar 19 19:58:53 +0000 2020,behaviouralscience nudgesinthewild COVID19,0,1240729478088790016,1.240444e+18,shayonislynn,en,,False,4,RT @shayonislynn: Using #behaviouralscience to improve behaviours #nudgesinthewild #COVID19 https://t.co/OyIDElZwR7,DonSmith_ca,300,"Toronto, ON",Don Smith 🇨🇦,DonSmith_ca,,Using #behaviouralscience to improve behaviours #nudgesinthewild #COVID19 https://t.co/OyIDElZwR7
70,Thu Mar 12 16:35:45 +0000 2020,,0,1238141642005254148,1.238092e+18,faisalislam,en,,,189,RT @faisalislam: Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -,PaulDuffy1192,2440,London,🇪🇺🖤Paul Duffy #FBPE EU Citizen 3.5%,PaulDuffy1192,,Mark interviews the nudge unit behavioural scientist whose advice has driven the epidemic response so far -
135,Fri Apr 17 14:21:45 +0000 2020,COVID19 HIV,1,1251153881960046595,,,en,,False,0,"RT @TheLancet: NEW—Three lessons for #COVID19 response from pandemic #HIV \n\n1. Anticipate health inequalities\n2. Support behaviour change\n3. Multidisciplinary effort\n\nComment @TheLancetHIV by J Hargreaves, C Davey, et al @LSHTM\nhttps://t.co/8mPuOHL2tu https://t.co/zYROZ0epQE",equityhealthdev,3300,Washington DC USA,Equity and Health,equityhealthdev,,"NEW—Three lessons for #COVID19 response from pandemic #HIV \n\n1. Anticipate health inequalities\n2. Support behaviour change\n3. Multidisciplinary effort\n\nComment @TheLancetHIV by J Hargreaves, C Davey, et al @LSHTM\nhttps://t.co/8mPuOHL2tu https://t.co/zYROZ0epQE"
170,Sat May 30 16:45:55 +0000 2020,COVID19,0,1266772841967620096,1.266325e+18,MDRC_News,en,,False,2,RT @MDRC_News: Behavioural science and the response to #COVID19: A missed opportunity?\n\nvia @LSEImpactBlog https://t.co/6Ybq1Wgb7N,MDRC_News,20946,NYC/Bay Area/LA/DC,MDRC,MDRC_News,,A missed opportunity?\n\nvia @LSEImpactBlog https://t.co/6Ybq1Wgb7N
277,Sat Apr 04 06:52:45 +0000 2020,COVID19,0,1246329847472848897,1.246162e+18,bgibson49,en,,,1,RT @bgibson49: These were honestly so good. Please check out these talks if you're interested in the behavioural science around #COVID19,ADMakdani,234,"Liverpool, England",Adarsh D Makdani,ADMakdani,,These were honestly so good. Please check out these talks if you're interested in the behavioural science around #COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11561,Sun Apr 05 09:09:33 +0000 2020,BigData Google COVID19 Anroid,0,1246726661866848256,1.246719e+18,mirko_ross,en,,False,1,RT @mirko_ross: 👀 #BigData by #Google on #COVID19 behaviour change of #Anroid users...\n\nhttps://t.co/ohgOyt92a5,chidambara09,7732,Mysore and BERLIN,Chidambara .ML.,chidambara09,,👀 #BigData by #Google on #COVID19 behaviour change of #Anroid users...\n\nhttps://t.co/ohgOyt92a5
11943,Tue Apr 28 11:02:20 +0000 2020,stayathome coronavirus behaviourchange newmr,0,1255089963521019904,1.255089e+18,neilgains,en,,False,1,RT @neilgains: How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr,tapestryworks,372,Singapore,Neil Gains,tapestryworks,,How are Indonesians coping with #stayathome? https://t.co/lP2Mgr6w6v #coronavirus #behaviourchange #newmr
12002,Wed May 06 18:08:30 +0000 2020,,0,1258096316116279296,1.257669e+18,Christi55143759,en,,,1,RT @Christi55143759: Behaviour change and mass testing in areas with large nos who have tested positive will help contain the virus.,KINGPINDANNY,1529,"Nairobi, or any place near you",Danny Marley-Jr,KINGPINDANNY,,Behaviour change and mass testing in areas with large nos who have tested positive will help contain the virus.
12101,Thu Apr 09 19:35:53 +0000 2020,COVID19 coronavirus,0,1248333832937517062,1.248245e+18,socratext,en,,,2,RT @socratext: How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam,DrEstherCT,1634,"North East, England",Esther,DrEstherCT,,How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam


In [34]:
# are they maybe duplicates of tweets of original tweets in the dataset?
tweets_to_investigate = extra_retweets[~extra_retweets.id.isin(tweets_to_go_ids2)]

In [35]:
tweets_to_investigate_text = tweets_to_investigate.cleaned_retweet     

In [36]:
len(tweets_to_investigate_text)

73

In [37]:
tweets_raw[tweets_raw.text.isin(tweets_to_investigate_text)].count()   # 35 are "copies" of original tweets in the dataset

created_at              26
hashtags                20
favorite_count          26
id                      26
reweet_id                0
retweet_screen_name      0
lang                    26
place                    0
possibly_sensitive      23
retweet_count           26
text                    26
user_screen_name        26
user_followers_count    26
user_location           23
user_name               26
user_screen_name.1      26
user_time_zone           0
dtype: int64

In [38]:
pd.DataFrame(tweets_to_investigate_text.value_counts())

Unnamed: 0,cleaned_retweet
Paging Nudge Unit:\nTry bundling your infection control messages.\n\nhttps://t.co/2pJeBihyge,1
Forget all behavioural science stuff. Look at the number of people on respirators - that’s the life saving element,1
It will be very important to get good evidence on this @PHE_uk @NCSCT @UCL_BSH @SusanMichie,1
👀 #BigData by #Google on #COVID19 behaviour change of #Anroid users...\n\nhttps://t.co/ohgOyt92a5,1
Such a pleasure to be part of this fantastic group of authors. Now in press at Nature Behavioural Science.,1
...,...
How do #behaviouralscience and testing aid responses to #coronavirus ? https://t.co/FnMnT0NFNO https://t.co/QSvahdf0vu,1
How social &amp; behavioural science might support the mitigation of #COVID19 #coronavirus pandemic @UKDCP @alexanderhaslam,1
#SocialDistance on #QandA tonight. Appropriate behaviour change for #COVID19 https://t.co/Fkz9CWOA0v,1
a missed opportunity?\n\n#COVID19 https://t.co/vvijU3DEU2,1


In [39]:
tweets_to_go_ids3 = tweets_to_investigate[tweets_to_investigate.cleaned_retweet.isin(
    tweets_raw[tweets_raw.text.isin(tweets_to_investigate_text)].text)].id

In [40]:
len(tweets_to_go_ids3)   #ok

26

In [41]:
print(len(tweets_to_go_ids1))
print(len(tweets_to_go_ids2))

8562
197


### Let's remove from the dataset all the uncommented retweets that are duplicate of original tweets already in the dataset 

In [42]:
print(len(set(tweets_to_go_ids1)))
print(len(set(tweets_to_go_ids2)))
print(len(set(tweets_to_go_ids3)))
# GOOD :-), each is unique

8562
197
26


In [43]:
tweets_to_go_ids_all = list(tweets_to_go_ids1) + list(tweets_to_go_ids2) + list(tweets_to_go_ids3)

In [44]:
len(tweets_to_go_ids_all)

8785

In [45]:
len(tweets_raw.index) 

12161

In [46]:
tweets_original = tweets_raw[~tweets_raw.id.isin(tweets_to_go_ids_all)]

In [47]:
len(tweets_original.index)

3376

### English vs. non-English tweets

Non-English tweets will be dropped as they are not part of our population of interest.


In [48]:
count_by_lang = pd.DataFrame(tweets_original.groupby('lang').id.count()) 

In [49]:
count_by_lang.rename(columns={'id': 'count_ids'}, inplace=True)

In [50]:
count_by_lang['prop'] = round(count_by_lang['count_ids'] / sum(count_by_lang['count_ids']),3)

In [51]:
count_by_lang

Unnamed: 0_level_0,count_ids,prop
lang,Unnamed: 1_level_1,Unnamed: 2_level_1
ar,1,0.0
cy,1,0.0
da,2,0.001
de,5,0.001
en,3107,0.92
es,54,0.016
fa,1,0.0
fi,2,0.001
fr,9,0.003
hi,3,0.001


What are the "und" (undefined) ones?

In [52]:
tweets_original[tweets_original.lang == "und"]

Unnamed: 0,created_at,hashtags,favorite_count,id,reweet_id,retweet_screen_name,lang,place,possibly_sensitive,retweet_count,text,user_screen_name,user_followers_count,user_location,user_name,user_screen_name.1,user_time_zone
99,Tue Apr 21 22:09:59 +0000 2020,,0,1252721267339190272,,,und,,False,0,@BotPutins https://t.co/SMWlZCiAm9,owhy3,2173,France 29y -US 21y- JPN 27y,owhy3,owhy3,
236,Fri Mar 13 12:11:49 +0000 2020,Coronavirus BehaviouralScience MoneyBeforeLives CullTheVulnerable TorySocialCleransing ToryEugenics HelpUs MurderingToryScum BorisOut GTTO,3,1238437609749258241,,,und,,False,2,#Coronavirus #BehaviouralScience #MoneyBeforeLives #CullTheVulnerable #TorySocialCleransing #ToryEugenics #HelpUs #MurderingToryScum #BorisOut #GTTO https://t.co/XORuB9oC9W,DebbieC14766711,719,,Debbie Carr,DebbieC14766711,
339,Wed May 27 16:15:56 +0000 2020,,0,1265678130896867328,,,und,,False,0,@RobertoBoffi https://t.co/aMCHpkwukn,Simondarc,862,,Simona D'Arcangeli,Simondarc,
449,Sun Jun 07 10:43:06 +0000 2020,marr MarrShow Dispatches cummingstogetya HerdImmunity dreddjustice,0,1269580639105355776,,,und,,True,0,#marr #MarrShow\n\n#Dispatches \n\n#cummingstogetya #HerdImmunity #dreddjustice https://t.co/b27DTN9gEy,GalacTR0N,1470,Orbiting Ͼ_Ͽ,Sir Muppet of Smegg,GalacTR0N,
454,Mon Mar 09 22:17:05 +0000 2020,,2,1237140377569759233,,,und,,False,0,@MahiTuna https://t.co/Sv2z044Mit,masarat,6357,"London, Rajasthan, Dubai",Masarat Daud,masarat,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11926,Mon May 04 15:51:34 +0000 2020,ApnoKaDhyan,5,1257337079203315719,,,und,,False,0,#ApnoKaDhyan https://t.co/q6s2ZRpuQ4,ShwetaChaubey9,127,"Bihar, India",Shweta Chaubey,ShwetaChaubey9,
11953,Thu Apr 02 08:11:22 +0000 2020,BorisResign BorisOut HerdimmunityIsMurder ukcoronavirus COVIDー19,0,1245624852179902469,,,und,,False,0,#BorisResign #BorisOut #HerdimmunityIsMurder #ukcoronavirus #COVIDー19 https://t.co/3JvZHQRu9b,Soc_Sciences,309,,The Social Sciences,Soc_Sciences,
12056,Fri Mar 13 09:42:52 +0000 2020,,2,1238400124394627073,,,und,,False,0,https://t.co/VKvBQ9t5g5 https://t.co/N9ogUiu7MR,MattRudd,9652,,Matt Rudd,MattRudd,
12092,Fri Mar 13 09:42:54 +0000 2020,CoronaOutbreak coronavirus,0,1238400131516575744,,,und,,False,0,#CoronaOutbreak #coronavirus https://t.co/J1v0lbhJtD,POD198,55,Scotland,POD,POD198,


They seem to be purely list of hashtags and duplicates of the same one. So let's exclude them from further investigation.

### Exclude all non-English tweets

In [53]:
tweets_original_en = tweets_original[tweets_original.lang == 'en']

In [54]:
len(tweets_original_en.index)

3107

### Geolocation

We'll look at the geolocation of users.

In [55]:
tweets_original_en.place.value_counts(dropna=False)

NaN                     2982
Geelong, Victoria          9
Dunstable, England         7
Blackpool, England         6
North East, England        3
                        ... 
Croydon, London            1
Chelsea, Québec            1
Halifax, Nova Scotia       1
Bampton, England           1
Richmond, London           1
Name: place, Length: 87, dtype: int64

The large majority are not geolocated.

### Save the data

In [56]:
# define file name and file path
output_name = "tweets_original_en"
output_filepath = os.path.join(OUTPUT_DIR, output_name + ".csv")

In [57]:
tweets_original_en.to_csv(output_filepath)