# Collect Twitter Data for Final Project
- Name: Congxin (David) Xu
- Computing ID: cx2rx

In [1]:
import tweepy as tw
import pandas as pd
from dotenv import load_dotenv
import os 
from datetime import datetime

In [2]:
# Load the Twitter API credentials from a local env file so the secrets stay
# out of the notebook itself.
# NOTE(review): 'API_Key.env' is a relative path, so it is presumably resolved
# against the kernel's working directory -- start the notebook from the folder
# that contains the file.
load_dotenv('API_Key.env')

apikey = os.getenv("apikey")
apisecret = os.getenv("apisecret")
accesstoken = os.getenv("accesstoken")
accesssecret = os.getenv("accesssecret")

# Fail fast with a readable message: os.getenv returns None for anything that
# is not set, and continuing with None credentials only surfaces later as an
# obscure authentication failure (or an empty result set).
missing_credentials = [
    name
    for name in ("apikey", "apisecret", "accesstoken", "accesssecret")
    if not os.getenv(name)
]
if missing_credentials:
    raise RuntimeError(
        "Missing Twitter API credential(s): "
        + ", ".join(missing_credentials)
        + ". Define them in API_Key.env or in the environment."
    )

In [3]:
# Earliest date (YYYY-MM-DD) handed to the search call as its `since` value.
date_since = "2021-03-01"
# Upper bound on the number of tweets pulled for each search term.
how_many_tweets = 2000

# Search terms, one per line. A line containing a space holds several words
# that are sent together as a single query. Every term is listed exactly once
# so that no query is issued (and no tweet stored) twice.
hashtaglist = """
vaccine
vaccination
vaccinate
pfizerbiontech
pfizercovidvaccine
pfizervaccine
covid pfizer
covid19 pfizer
covid-19 pfizer
covid_19 pfizer
corona pfizer
covidvaccine pfizer
coronavirus pfizer
coronavirusupdates pfizer
modernavaccine
modernacovidvaccine
covid moderna
covid19 moderna
covid-19 moderna
covid_19 moderna
corona moderna
coronavirus moderna
coronavirusupdates moderna
biontechvaccine
biontechcovidvaccine
covid biontech
covid19 biontech
covid-19 biontech
covid_19 biontech
corona biontech
coronavirus biontech
coronavirusupdates biontech
azvaccine
astrazenecacovidvaccine
astrazenecavaccine
covid astrazeneca
covid19 astrazeneca
covid-19 astrazeneca
covid_19 astrazeneca
corona astrazeneca
coronavirus astrazeneca
coronavirusupdates astrazeneca
cepi
coronavaccine
coronavirusvaccine
covax
covid19vaccine
covidvaccine
gavi
glyphosate
mrna
nvic
oxfordvaccine
pharmagreed
rna
sputnikv
vaccinessavelives
vax
vaxx
vaxxx
covidiots
getvaccinated
iwillgetvaccinated
thisisourshot
vaccineworks
depopulation
eugenics
greatreset
notocoronavirusvaccines
mybodymychoice
peoplesbodyyourchoice
iwillnotcomply
endthelockdown
kungflu
plandemic
"""

In [4]:
# connect to twitter using creds above
auth = tw.OAuthHandler(apikey, apisecret)
auth.set_access_token(accesstoken, accesssecret)
# NOTE(review): per the Tweepy docs, wait_on_rate_limit=True makes the client
# wait for the rate-limit window to reset instead of raising -- confirm for the
# installed version.
api = tw.API(auth, wait_on_rate_limit=True)

# Turn the newline-delimited block of search terms into a clean list.
# Accept an already-cleaned list as well, so re-running this cell does not
# fail once `hashtaglist` has been rebound from str to list.
raw_terms = hashtaglist.splitlines() if isinstance(hashtaglist, str) else hashtaglist

# term.split() with no argument drops leading/trailing whitespace and collapses
# inner runs of whitespace; dict.fromkeys removes repeated terms while keeping
# the original order, so no query is sent twice.
hashtaglist = list(dict.fromkeys(
    ' '.join(term.split()) for term in raw_terms if term.strip()
))
hashtaglist

['vaccine',
 'vaccination',
 'vaccinate',
 'pfizerbiontech',
 'pfizercovidvaccine',
 'pfizervaccine',
 'covid pfizer',
 'covid19 pfizer',
 'covid-19 pfizer',
 'covid_19 pfizer',
 'corona pfizer',
 'covidvaccine pfizer',
 'coronavirus pfizer',
 'coronavirusupdates pfizer',
 'modernavaccine',
 'modernacovidvaccine',
 'covid moderna',
 'covid19 moderna',
 'covid-19 moderna',
 'covid_19 moderna',
 'corona moderna',
 'coronavirus moderna',
 'coronavirusupdates moderna',
 'biontechvaccine',
 'biontechcovidvaccine',
 'covid biontech',
 'covid19 biontech',
 'covid-19 biontech',
 'covid_19 biontech',
 'corona biontech',
 'coronavirus biontech',
 'coronavirusupdates biontech',
 'azvaccine',
 'astrazenecacovidvaccine',
 'astrazenecavaccine',
 'covid astrazeneca',
 'covid19 astrazeneca',
 'covid-19 astrazeneca',
 'covid_19 astrazeneca',
 'corona astrazeneca',
 'coronavirus astrazeneca',
 'coronavirusupdates astrazeneca',
 'cepi',
 'coronavaccine',
 'coronavirusvaccine',
 'covax',
 'covid19vaccine'

In [5]:
# iterate through words, collect tweets, save as dicts in a list

# Top-level tweet fields copied into every row unchanged (built once, not per tweet).
keepers = ['created_at', 'id_str', 'text', 'truncated',
           'in_reply_to_screen_name',
           'retweet_count', 'favorite_count', 'lang']


def _flatten_tweet(thed, search_words, counter):
    """Reduce one raw tweet payload to the flat row stored in ``dict_list``.

    Parameters
    ----------
    thed : dict
        The tweet's JSON payload (``tweet._json``).
    search_words : str
        The search term that returned this tweet; stored stripped as
        ``scraped_hashtag``.
    counter : int
        Running index of the search term; stored as ``scraped_order``.

    Returns
    -------
    dict
        The selected tweet, user and entity fields. ``og_tweet_by`` and
        ``og_tweet_truncated`` are only present when the payload has a
        ``retweeted_status`` entry.

    Raises
    ------
    KeyError
        If the payload lacks one of the fields read here.
    """
    cleand = {'scraped_hashtag': search_words.strip(),
              'scraped_order': counter}

    for k in keepers:
        cleand[k] = thed[k]

    user = thed['user']
    cleand['screen_name'] = user['screen_name']
    cleand['user_name'] = user['name']
    cleand['user_description'] = user['description']
    cleand['user_verified'] = user['verified']
    cleand['user_followers_count'] = user['followers_count']

    entities = thed['entities']
    cleand['hashtags'] = [x['text'] for x in entities['hashtags']]
    cleand['symbols'] = entities['symbols']

    if 'retweeted_status' in thed:
        original = thed['retweeted_status']
        cleand['og_tweet_by'] = original['user']['screen_name'] + "; " + original['user']['name']
        cleand['og_tweet_truncated'] = original['truncated']

    return cleand


dict_list = []
counter = 0
for search_words in hashtaglist:
    print(search_words)
    skipped = 0
    try:
        # Collect tweets
        tweets = tw.Cursor(api.search,
                    q=search_words,
                    lang="en",
                    since=date_since).items(how_many_tweets)

        for tweet in tweets:
            try:
                dict_list.append(_flatten_tweet(dict(tweet._json), search_words, counter))
            except Exception:
                # Best effort: a malformed payload must not end the whole run,
                # but it is counted so the loss is visible afterwards.
                skipped += 1
    except Exception as err:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long collection run. Rows
        # gathered for this term before the failure are kept.
        print(f"  collection for {search_words!r} stopped early: {err!r}")
    if skipped:
        print(f"  skipped {skipped} malformed tweet(s) for {search_words!r}")
    # Always advance, even after a failure, so each search term keeps its own
    # scraped_order value.
    counter += 1

vaccine
vaccination
vaccinate
pfizerbiontech
pfizercovidvaccine
pfizervaccine
covid pfizer
covid19 pfizer
covid-19 pfizer
covid_19 pfizer
corona pfizer
covidvaccine pfizer
coronavirus pfizer
coronavirusupdates pfizer
modernavaccine
modernacovidvaccine
covid moderna
covid19 moderna
covid-19 moderna
covid_19 moderna
corona moderna
coronavirus moderna
coronavirusupdates moderna
biontechvaccine
biontechcovidvaccine
covid biontech
covid19 biontech
covid-19 biontech
covid_19 biontech
corona biontech
coronavirus biontech
coronavirusupdates biontech
azvaccine
astrazenecacovidvaccine
astrazenecavaccine
covid astrazeneca
covid19 astrazeneca
covid-19 astrazeneca
covid_19 astrazeneca
corona astrazeneca
coronavirus astrazeneca
coronavirusupdates astrazeneca
cepi
coronavaccine
coronavirusvaccine
covax
covid19vaccine
covidvaccine
gavi
glyphosate
mrna
nvic
oxfordvaccine
pharmagreed
rna
sputnikv
vaccinessavelives
vax
vaxx
vaxxx
covidiots
getvaccinated
iwillgetvaccinated
thisisourshot
vaccineworks
vacci

In [6]:
# Build the results table from the collected rows and preview its first rows.
df = pd.DataFrame(data=dict_list)
# Run timestamp formatted as yymmdd_HHMM.
dt = f"{datetime.now():%y%m%d_%H%M}"
df.head()

Unnamed: 0,scraped_hashtag,scraped_order,created_at,id_str,text,truncated,in_reply_to_screen_name,retweet_count,favorite_count,lang,screen_name,user_name,user_description,user_verified,user_followers_count,hashtags,symbols,og_tweet_by,og_tweet_truncated
0,vaccine,0,Sun Mar 21 04:09:39 +0000 2021,1373486939433627649,RT @oni_blackstock: Unpopular opinion:\n\nHavi...,False,,270,0,en,spacejessss,evil sorceress & her sentient vibrator | she/her,"definitely depressed, mostly cat, horny jail r...",False,2626,[],[],oni_blackstock; Dr. Oni Blackstock,True
1,vaccine,0,Sun Mar 21 04:09:38 +0000 2021,1373486935558082568,"RT @TheRickyDavila: Oh and by the way, both my...",False,,71,0,en,Kathlee14861239,Kathleen Wood,,False,24,[],[],TheRickyDavila; Ricky Davila,True
2,vaccine,0,Sun Mar 21 04:09:38 +0000 2021,1373486935147171844,"@Covid19Joker Have not had Covid, had to take...",False,Covid19Joker,0,0,en,HercheLisa,Lisa,,False,74,[],[],,
3,vaccine,0,Sun Mar 21 04:09:38 +0000 2021,1373486934757101570,RT @ger_trends: From the idea to produce a #va...,False,,33,0,en,alexandra2018,Alexia,You can write me here https://t.co/xqtE7wcicJ ...,False,341,"[vaccine, mRNA]",[],ger_trends; Insights into Germany | deutschlan...,True
4,vaccine,0,Sun Mar 21 04:09:37 +0000 2021,1373486933159067649,RT @AmandiOnAir: The feeling they must have ex...,False,,32,0,en,RBRB1516,dunbar1942,,False,664,[],[],AmandiOnAir; Fernand R. Amandi,True


In [7]:
# Turn the collected row dicts into a DataFrame and write it to a CSV whose
# file name carries the current timestamp (yymmdd_HHMM). The path is relative,
# so the file is created in the current working directory.
df = pd.DataFrame(data=dict_list)
dt = format(datetime.now(), "%y%m%d_%H%M")
df.to_csv(path_or_buf=f'hashtag_output_{dt}.csv', encoding="utf-8")