## 1. Import packages

In [1]:
#!pip install tweepy

In [1]:
import os
import tweepy as tw
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1) #To be set as None depending on python version you are using
pd.set_option('display.width', None)

## 2. Twitter API authentication

In [None]:
consumer_api_key = os.environ["TWITTER_CONSUMER_API_KEY"]
consumer_api_secret = os.environ["TWITTER_CONSUMER_API_SECRET"]

In [22]:
auth = tw.OAuthHandler(consumer_api_key, consumer_api_secret)

In [23]:
api = tw.API(auth, wait_on_rate_limit=True)

## 3. Tweets query

### 3.1. Define the query

In [15]:
search_words = "#covid19 -filter:retweets"
date_since = "2020-03-01"
# Collect tweets
tweets = tw.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(12452)

### 3.2. Retreive the tweets

In [24]:
tweets_copy = []
for tweet in tqdm(tweets):
     tweets_copy.append(tweet)




0it [00:00, ?it/s][A[A[A


1it [00:00,  3.32it/s][A[A[A


16it [00:00,  4.69it/s][A[A[A


31it [00:00,  6.58it/s][A[A[A


46it [00:00,  9.18it/s][A[A[A


61it [00:01, 12.02it/s][A[A[A


75it [00:01, 16.41it/s][A[A[A


89it [00:01, 22.02it/s][A[A[A


103it [00:01, 29.12it/s][A[A[A


118it [00:01, 34.70it/s][A[A[A


133it [00:01, 43.90it/s][A[A[A


148it [00:01, 52.64it/s][A[A[A


163it [00:02, 64.22it/s][A[A[A


178it [00:02, 71.80it/s][A[A[A


193it [00:02, 79.61it/s][A[A[A


208it [00:02, 87.99it/s][A[A[A


223it [00:02, 91.64it/s][A[A[A


238it [00:04, 27.05it/s][A[A[A


253it [00:04, 34.88it/s][A[A[A


268it [00:04, 44.35it/s][A[A[A


283it [00:04, 42.63it/s][A[A[A


296it [00:04, 49.58it/s][A[A[A


311it [00:05, 60.17it/s][A[A[A


326it [00:05, 69.34it/s][A[A[A


341it [00:05, 79.45it/s][A[A[A


356it [00:05, 62.04it/s][A[A[A


371it [00:05, 72.61it/s][A[A[A


386it [00:05, 78.52it/s][A[A[A


401

In [25]:
print(f"new tweets retrieved: {len(tweets_copy)}")

new tweets retrieved: 1603


## 4. Populate the dataset

In [26]:
tweets_df = pd.DataFrame()
for tweet in tqdm(tweets_copy):
    hashtags = []
    try:
        for hashtag in tweet.entities["hashtags"]:
            hashtags.append(hashtag["text"])
        text = api.get_status(id=tweet.id, tweet_mode='extended').full_text
    except:
        pass
    tweets_df = tweets_df.append(pd.DataFrame({'user_name': tweet.user.name, 
                                               'user_location': tweet.user.location,\
                                               'user_description': tweet.user.description,
                                               'user_created': tweet.user.created_at,
                                               'user_followers': tweet.user.followers_count,
                                               'user_friends': tweet.user.friends_count,
                                               'user_favourites': tweet.user.favourites_count,
                                               'user_verified': tweet.user.verified,
                                               'date': tweet.created_at,
                                               'text': text, 
                                               'hashtags': [hashtags if hashtags else None],
                                               'source': tweet.source,
                                               'is_retweet': tweet.retweeted}, index=[0]))




  0%|          | 0/1603 [00:00<?, ?it/s][A[A[A


  0%|          | 1/1603 [00:00<07:40,  3.48it/s][A[A[A


  0%|          | 2/1603 [00:00<06:51,  3.89it/s][A[A[A


  0%|          | 3/1603 [00:00<06:17,  4.24it/s][A[A[A


  0%|          | 4/1603 [00:00<06:15,  4.25it/s][A[A[A


  0%|          | 5/1603 [00:01<05:42,  4.67it/s][A[A[A


  0%|          | 6/1603 [00:01<05:32,  4.80it/s][A[A[A


  0%|          | 7/1603 [00:01<05:31,  4.82it/s][A[A[A


  0%|          | 8/1603 [00:01<06:28,  4.11it/s][A[A[A


  1%|          | 9/1603 [00:01<05:55,  4.49it/s][A[A[A


  1%|          | 10/1603 [00:02<06:14,  4.25it/s][A[A[A


  1%|          | 11/1603 [00:02<05:47,  4.58it/s][A[A[A


  1%|          | 12/1603 [00:02<05:17,  5.01it/s][A[A[A


  1%|          | 13/1603 [00:02<04:55,  5.38it/s][A[A[A


  1%|          | 14/1603 [00:02<04:55,  5.39it/s][A[A[A


  1%|          | 15/1603 [00:03<04:42,  5.63it/s][A[A[A


  1%|          | 16/1603 [00:03<04:35, 

In [38]:
tweets_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,Dr. Ken Milne - EBM and Rural,Goderich/London CANADA,"Remember to be skeptical of anything you learn, even if you heard it on the Skeptics’ Guide to EM.",2012-07-11 22:10:32,18876,437,46325,False,2020-08-26 12:09:30,Majority of people got the correct answer for #SGEMHOP\nSingle PCR test for #COVID19 has a sensitivity of 60-78%\nhttps://t.co/x1rfVskz4e https://t.co/xWN6940EGU,"[SGEMHOP, COVID19]",Twitter Web App,False
0,VUMC OAP,"Nashville, TN","Office of Advanced Practice, Vanderbilt University Medical Center\nhttps://t.co/VSh2N6rqLh",2017-03-16 20:22:32,284,96,831,False,2020-08-26 12:09:25,"VUMC has been awarded a one-year, $34-million grant by the National Center for Advancing Translational Sciences, part of the National Institutes of Health, to conduct a nationwide study of “convalescent plasma” as a treatment for #COVID19 . https://t.co/HqicEjMOGQ",,Twitter Web App,False
0,😷Matt Oduor,"Nairobi, Kenya",Black-ish 😷😷😷,2011-05-23 17:55:19,510,69,10705,False,2020-08-26 12:09:25,"RUMORS:\n@LeFigaro reports the #Covid19 pandemic has created space for merger between accommodation giants @Accor &amp; @IHGCorporate. Both companies to consider the deal, though the @FinancialTimes in the UK considers any move less likely.\n\nhttps://t.co/c5hO3Z82sM",[Covid19],Twitter Web App,False
0,Kirstin Manges PhD RN,"Philadelphia, PA","In the weeds of quality, policy, nursing, aging, & equity| @NCSP_Penn |@AAN_Nursing Scholar, Luther, UIowa, VAQS alum| she/they |🐶💕 | My views (& typos) only.",2015-08-05 04:39:14,1993,1994,16224,False,2020-08-26 12:09:21,Happy birthday to an amazing nurse &amp; my best friend! Last night this pediatric RN was floated to the adult MICU to care for #COVID19 patients — something a year ago we all never would have imagined. Please wear a mask &amp; social distance so nurses can stop doing “the unimaginable.” https://t.co/e6ucCUUp7D,,Twitter Web App,False
0,ISF,New Delhi,"Indian Staffing Federation, ISF has been created with one common goal - Staffing India’s Growth.",2011-04-27 05:33:22,2069,3957,41,False,2020-08-26 12:09:19,"@farhanazmiINC , Vice President- India Staffing Federation, shared his thoughts about How unlock 4.0 will boost the hiring process of temporary jobs in India. Read More At https://t.co/1UXBSoPPzo\n#staffingindustry #solutions #Covid199 #business\n#employee #HR #jobseekers #covid19 https://t.co/978FHiXSQt",,Twitter Web App,False


## 5. Save the data

### 5.1. Read past data

In [39]:
tweets_old_df = pd.read_csv("covid19_tweets.csv")
print(f"past tweets: {tweets_old_df.shape}")

past tweets: (166656, 13)


### 5.2. Merge past and present data

In [40]:
tweets_all_df = pd.concat([tweets_old_df, tweets_df], axis=0)
print(f"new tweets: {tweets_df.shape[0]} past tweets: {tweets_old_df.shape[0]} all tweets: {tweets_all_df.shape[0]}")

new tweets: 1603 past tweets: 166656 all tweets: 168259


### 5.3. Drop duplicates

In [41]:
tweets_all_df.drop_duplicates(subset = ["user_name", "date", "text"], inplace=True)
print(f"all tweets: {tweets_all_df.shape}")

all tweets: (168259, 13)


### 5.4. Export the updated data

In [42]:
tweets_all_df.to_csv("covid19_tweets.csv", index=False)