In [43]:
## https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

import requests
import os
import json
import pandas as pd
import numpy as np
import csv
import datetime
import dateutil.parser
import unicodedata
import time
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
def auth():
    """Return the API bearer token read from the environment.

    The token is looked up in the ``BEARER_TOKEN`` environment variable so the
    credential never has to be written into the notebook itself.

    Returns:
        The value of ``BEARER_TOKEN``, or ``None`` when the variable is unset.
    """
    return os.getenv('BEARER_TOKEN')

def create_headers(bearer_token):
    """Build the request headers carrying the bearer-token authorization.

    Args:
        bearer_token: value inserted after ``Bearer `` in the header.

    Returns:
        dict: a one-entry mapping holding the ``Authorization`` header.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

def create_url(keyword, start_date, end_date, max_results = 10):
    """Return the search endpoint together with its query parameters.

    Args:
        keyword: search query string, sent as ``query``.
        start_date: lower time bound, sent as ``start_time``.
        end_date: upper time bound, sent as ``end_time``.
        max_results: page size requested from the endpoint (default 10).

    Returns:
        tuple: ``(search_url, query_params)``. ``query_params['next_token']``
        starts as an empty dict placeholder.
    """
    # Change to the endpoint you want to collect data from
    search_url = "https://api.twitter.com/2/tweets/search/all"

    # Change the requested fields based on the endpoint you are using
    expansions = 'author_id,in_reply_to_user_id,geo.place_id'
    tweet_fields = 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source'
    user_fields = 'id,name,username,created_at,description,public_metrics,verified'
    place_fields = 'full_name,id,country,country_code,geo,name,place_type'

    query_params = {
        'query': keyword,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,
        'expansions': expansions,
        'tweet.fields': tweet_fields,
        'user.fields': user_fields,
        'place.fields': place_fields,
        'next_token': {},
    }
    return (search_url, query_params)

def connect_to_endpoint(url, headers, params, next_token = None):
    """Send a GET request and return the decoded JSON body.

    ``params`` is modified in place: its ``'next_token'`` entry is overwritten
    with ``next_token`` before the request is sent.

    Args:
        url: endpoint to query.
        headers: HTTP headers passed to the request.
        params: query parameters (the object received from ``create_url``).
        next_token: pagination token stored into ``params``; ``None`` by default.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Exception: with ``(status_code, text)`` when the status is not 200.
    """
    params.update({'next_token': next_token})
    response = requests.request(method="GET", url=url, headers=headers, params=params)

    status = response.status_code
    print("Endpoint Response Code: " + str(status))
    if status == 200:
        return response.json()
    raise Exception(status, response.text)

def append_tweet_to_csv(json_response, fileName):
    """Append one CSV row per tweet contained in a search response.

    Each row is written in this column order: tweet_id, conversation_id,
    author_id, created_at, geo, lang, like_count, quote_count, reply_count,
    retweet_count, source, text.

    Args:
        json_response: decoded response; tweets are read from its ``'data'``
            list. A response without ``'data'`` writes no rows.
        fileName: path of the CSV file, opened in append mode (created when
            it does not exist yet).

    Returns:
        None. The number of rows written is printed.
    """
    # Number of rows written for this response
    counter = 0

    # Open OR create the target CSV file. The with-block closes the handle even
    # if a malformed tweet raises part-way through the loop.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # NOTE(review): a page with zero matches presumably has no 'data' key;
        # treat that as "nothing to write" instead of failing with KeyError.
        for tweet in json_response.get('data', []):

            # Some keys might not exist for some tweets, so optional ones fall
            # back to the " " placeholder this file already uses for geo.

            # 1. Author ID
            author_id = tweet['author_id']

            # 2. Time created
            created_at = dateutil.parser.parse(tweet['created_at'])

            # 3. Geolocation: 'geo' may be absent, or present without a
            #    'place_id' (presumably coordinates only -- confirm against API docs)
            geo = (tweet.get('geo') or {}).get('place_id', " ")

            # 4. Tweet and Conversation ID
            tweet_id = tweet['id']
            conversation_id = tweet['conversation_id']

            # 5. Language
            lang = tweet['lang']

            # 6. Tweet metrics
            metrics = tweet['public_metrics']
            retweet_count = metrics['retweet_count']
            reply_count = metrics['reply_count']
            like_count = metrics['like_count']
            quote_count = metrics['quote_count']

            # 7. Source: not guaranteed to be part of every payload
            source = tweet.get('source', " ")

            # 8. Tweet text
            text = tweet['text']

            # Append the assembled row to the CSV file
            csvWriter.writerow([tweet_id, conversation_id, author_id, created_at, geo, lang,
                                like_count, quote_count, reply_count, retweet_count, source, text])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)
    
def append_user_to_csv(json_response, fileName):
    """Append one CSV row per user object included in a search response.

    Each row is written in this column order: author_id, username,
    display_name, description, verified, created_at, followers_count,
    following_count, tweet_count, listed_count.

    Args:
        json_response: decoded response; users are read from
            ``json_response['includes']['users']``. When either key is
            missing no rows are written.
        fileName: path of the CSV file, opened in append mode (created when
            it does not exist yet).

    Returns:
        None. The number of rows written is printed.
    """
    # Number of rows written for this response
    counter = 0

    # Open OR create the target CSV file. The with-block closes the handle even
    # if a malformed user object raises part-way through the loop.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each user; a response without expansions has nothing to write
        for user in json_response.get('includes', {}).get('users', []):

            # 1. Author ID, Username and DisplayName
            author_id = user['id']
            username = user['username']
            display_name = user['name']

            # 2. Description
            description = user['description']

            # 3. Verified
            verified = user['verified']

            # 4. Time user created
            created_at = dateutil.parser.parse(user['created_at'])

            # 5. User metrics
            metrics = user['public_metrics']
            followers_count = metrics['followers_count']
            following_count = metrics['following_count']
            tweet_count = metrics['tweet_count']
            listed_count = metrics['listed_count']

            # Append the assembled row to the CSV file
            csvWriter.writerow([author_id, username, display_name, description, verified, created_at,
                                followers_count, following_count, tweet_count, listed_count])
            counter += 1

    # Print the number of users for this iteration
    print("# of users added from this response: ", counter)
    
def append_place_to_csv(json_response, fileName):
    """Append one CSV row per place object included in a search response.

    Each row is written in this column order: geo (the place id), full_name,
    place_type, name, country_code.

    Args:
        json_response: decoded response; places are read from
            ``json_response['includes']['places']``. When either key is
            missing no rows are written.
        fileName: path of the CSV file, opened in append mode (created when
            it does not exist yet).

    Returns:
        None. The number of rows written is printed.
    """
    # Number of rows written for this response
    counter = 0

    # Open OR create the target CSV file. The with-block closes the handle even
    # if a malformed place object raises part-way through the loop.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each place; responses without tagged places carry no
        # 'places' entry, which simply means there is nothing to write.
        for place in json_response.get('includes', {}).get('places', []):
            # Append the place features to the CSV file
            csvWriter.writerow([
                place['id'],
                place['full_name'],
                place['place_type'],
                place['name'],
                place['country_code'],
            ])
            counter += 1

    # Print the number of places for this iteration
    print("# of Places added from this response: ", counter)


In [11]:
# Scratch check: does the current response include any expanded place objects?
'places' in json_response['includes']

False

In [3]:
# Create the tweet, user and place CSV files and write their header rows.
# The files are opened in append mode so rows collected earlier are kept.
# The header is written only when a file is new or still empty, so running
# this cell again does not insert another header line after existing data.
csv_headers = {
    "./../data/processed/covid_tweets.csv": ['tweet_id', 'conversation_id', 'author_id', 'created_at', 'geo', 'lang', 'like_count', 'quote_count', 'reply_count', 'retweet_count', 'source', 'text'],
    "./../data/processed/covid_users.csv": ['author_id', 'username', 'display_name', 'description', 'verified', 'created_at', 'followers_count', 'following_count', 'tweet_count', 'listed_count'],
    "./../data/processed/covid_places.csv": ['geo', 'full_name', 'place_type', 'name', 'country_code'],
}
for csv_path, csv_header in csv_headers.items():
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    # The with-block guarantees the handle is closed even if the write fails
    with open(csv_path, "a", newline="", encoding='utf-8') as csvFile:
        if needs_header:
            csv.writer(csvFile).writerow(csv_header)

# Inputs for tweets: credentials, search query and the time span to cover
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = "context:123.1220701888179359745 is:verified lang:en -is:retweet -is:reply"
start_date, end_date = '2020-12-01T00:00:00.000Z', '2020-12-31T23:59:59.000Z'
max_results = 500

# Loop inputs
# total_tweets: running total of tweets; count: tweets per time period;
# n_requests: request counter
total_tweets = count = n_requests = 0
max_count = 1000000   # Max tweets per time period
# flag keeps the collection loop going; next_token is None before the first page
flag, next_token = True, None

# Reference timestamp taken right before the collection starts
start_time = time.time()

In [15]:
# Page through the search results until the last page (no 'next_token') or
# until max_count tweets have been collected, appending every page to the CSVs.
while flag:
    # Check if max_count reached
    if count >= max_count:
        break
    print("-------------------")
    print("Request #: ", n_requests+1)
    print("Requesting Time: ", int(time.time() - start_time))
    print("Token: ", next_token)
    url = create_url(keyword, start_date, end_date, max_results)
    json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
    # Count every request that was actually sent, whether or not it returned
    # tweets, so the request-cap bookkeeping below reflects real traffic.
    n_requests += 1
    result_count = json_response['meta']['result_count']

    # Save the token to use for the next call; a page without one is the last
    next_token = json_response['meta'].get('next_token')
    if next_token is not None:
        print("Next Token: ", next_token)
    else:
        # Since this is the final request, turn flag to false to stop paging
        flag = False

    # Same handling for intermediate and final pages: store whatever came back
    if result_count is not None and result_count > 0:
        append_tweet_to_csv(json_response, "./../data/processed/covid_tweets.csv")
        append_user_to_csv(json_response, "./../data/processed/covid_users.csv")
        if 'places' in json_response.get('includes', {}):
            append_place_to_csv(json_response, "./../data/processed/covid_places.csv")
        count += result_count
        total_tweets += result_count
        print("Total # of Tweets added: ", total_tweets)
        print("-------------------")
        time.sleep(1)

    # Pause between consecutive requests
    time.sleep(1)

    # If the requests cap is reached, wait out the rest of the 900-second
    # window and start a new one. Skipped after the final page (nothing more
    # will be requested); n_requests is at least 1 here, so the modulo test
    # cannot fire before any request has been made.
    if flag and n_requests % 295 == 0:
        elapsed = time.time() - start_time
        if elapsed < 900:
            time.sleep(900 - elapsed)
        start_time = time.time()
print("Total number of results: ", total_tweets)

-------------------
Request #:  592
Requesting Time:  861
Token:  b26v89c19zqg8o3foshs43sql1wutqt6tg7kud2j0ymbh
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3foshs43sfvo7o2x52vujyb5frutb3x
# of Tweets added from this response:  500
# of users added from this response:  432
# of Places added from this response:  5
Total # of Tweets added:  295468
-------------------
-------------------
Request #:  593
Requesting Time:  865
Token:  b26v89c19zqg8o3foshs43sfvo7o2x52vujyb5frutb3x
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3foshs43s56bczuz0otq63co3mev3i5
# of Tweets added from this response:  500
# of users added from this response:  435
# of Places added from this response:  5
Total # of Tweets added:  295968
-------------------
-------------------
Request #:  594
Requesting Time:  869
Token:  b26v89c19zqg8o3foshs43s56bczuz0otq63co3mev3i5
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3foshs43rufe6ircn2k5215pn5pc1od
# of Tweets added from this response:  500
# 

In [19]:
# GET USERS
# Usernames of the news accounts to look up. One string per account; the list
# is wrapped only between items so that no string literal spans a line break.
userlist=[
    'SkyNews','nypost','CNN','nytimes','thehill','Reuters','washingtonpost','ABC',
    'MSNBC','AP','NBCNews','inquirerdotnet','CBSNews','BBCNews','cnnbrk','CP24',
    'ANI','FoxNews','NPR','ABSCBNNews','CNNPolitics','CTVNews','nowthisnews','SkyNewsBreak',
    'dwnews','GMB','TMZ','LBC','ABC7','BBCBreaking','talkRADIO','BNODesk',
    'latimes','newsmax','WSJ','BBCWorld','business','Independent','guardian','CNBC',
    'gmanews','News24','ClayTravis','thedailybeast','politico','SMTOWNGLOBAL','rapplerdotcom','THR',
    'BreitbartNews','Variety','DailyMailUK','therecount','itvnews','TIME','eNCA','SkySportsNews',
    'cnnphilippines','globalnews','cnni','CBCAlerts','CNBCnow','rtenews','CBCNews','MobilePunch',
    'KTLA','MailOnline','ndtv','SportsCenter','TorontoStar','Complex','SkyNewsAust','AFP',
    'HuffPost','USATODAY','RT_com','theblaze','Public_Citizen','Newsweek','TheEconomist','axios',
    'abcnews','NYDailyNews','DailyMail','TheSun','PhilippineStar','nprpolitics','OANN','BusinessInsider',
    'BBCPolitics','ANCALERTS','CBCToronto','globaltimesnews','EpochTimes','japantimes','dailystar','TheLeadCNN',
    'Daily_Express','DailyMirror','VibeMagazine','Forbes','Quicktake','guardiannews','BW','BuzzFeedNews',
    'propublica','BBCScotlandNews','FOXLA','TheTorontoSun','BBCBreakfast','TODAYshow','WalesOnline','CTVToronto',
    'ABCPolitics','newscomauHQ','nationalpost','STVNews','SunSentinel','Independent_ie','BBCWalesNews','NBCLA',
    'StarTribune','AP_Politics','PhilstarNews','bbc5live','KUSINews','GMA','9NewsAUS','CityNewsTO',
    'people','AJEnglish','theage','MENnewsdesk','smh','citizentvkenya','abc7newsbayarea','TimesLIVE',
    'ntvkenya','ChannelNewsAsia','NewsHour','globeandmail','CBSEveningNews','KTVU','freep','NBCPolitics',
    'KDKA','Channel4News','SABCNews','htTweets','fox7austin','nytpolitics','FinancialTimes','YahooNews',
    'PTI_News','CBSLA','LBCNews','ABC7NY','WashTimes','denverpost','IrishTimes','Reuters_Health',
    'chicagotribune','MiamiHerald','TheView','orlandosentinel','bopinion','sfchronicle','BostonGlobe','thetimes',
    'ABCWorldNews','BBCSport','KFOX14','abc3340','XHNews','TheDailyShow','SBSNews','680NEWS',
    'timesofindia','CBSMornings','9NewsSyd','MetroUK','ajplus','GlobalEdmonton','cbcnewsbc','TimesNow',
    'TheOnion','CTVNationalNews','FOX10Phoenix','PremiumTimesng','WIONews','vanguardngrnews','KATUNews','IndiaToday',
    '9NewsMelb','ctvottawa','theheraldsun','FOX9','ScottishSun','komonews','CityNewsVAN','standardnews',
    'seattletimes','Jerusalem_Post','GlobalBC','ESPNCFB','8NewsNow','ntvuganda','statnews','SCMPNews',
    'MeetThePress','BelTel','MirrorPolitics','FOX5Vegas','XXL','TB_Times','CGTNOfficial','detroitnews',
    'kron4news','republic','NewYorker','BreakingNews','NationAfrica','ABC7Chicago','calgaryherald','DailyPostNGR',
    'azcentral','OpIndia_com','IOL','SkySportsPL','FOX2News','TexasTribune','ewnupdates','bpolitics',
    'cspanwj','ctvedmonton','NBCPhiladelphia','WCCO','manilabulletin','6abc','CTVCalgary','fox5dc',
    'FOX13News','BuzzFeed','BBCNewsNI','Oregonian','CBCOttawa','voxdotcom','PulseNigeria247','TPostMillennial',
    'ajc','RebelNewsOnline','newscientist','VanityFair','thenewsoncnbc','abc13houston','abc15','KING5Seattle',
    'wbz','DEADLINE','FOX29philly','ABC30','CityAM','RNCResearch','IGN','MLive',
    'straits_times','CTVVancouver','thejournal_ie','Investingcom','WCVB','bmj_latest','ABC11_WTVD','TheNationNews',
    'HuffPostPol','TheRoot','ChinaDaily','WRAL','fox13seattle','azfamily','KHOU','CNNHeroes',
    'ReutersUK','NewstalkFM','nytimesworld','latimeshealth','NEWSTALK1010','5_News','nbstv','malaysiakini',
    'tes','newsbusters','wsvn','wsbtv','WGRZ','FT','wxyzdetroit','njdotcom',
    'WFLA','democracynow','globalnewsto','FOX4','nbcbayarea','euronews','NigeriaNewsdesk','ndtvfeed',
    'Local4News','BBCNewsnight','CBCPolitics','the_hindu','10TV','NY1','wjz','Saudi_Gazette',
    'nytopinion','KIRO7Seattle','blogTO','CBCCalgary','WPXI','ctvqp','FOX5Atlanta','TPM',
    'PolitiFact','AlArabiya_Eng','7News','ComplexSports','AP_Europe','WSMV','YahooFinance','CNNnewsroom',
    'CBSMiami','news4buffalo','TheAtlantic','NewDay','nbcchicago','9NEWS','kare11','NBCDFW',
    'NewshubNZ','RealSkipBayless','usatodaysports','DenverChannel','GuardianAus','CTVMontreal','7NewsMelbourne','SputnikInt',
    'FoxBusiness','starsandstripes','OttawaCitizen','nzherald','ksatnews','edmontonjournal','qikipedia','77WABCradio',
    'ITVWales','NBCNightlyNews','kcranews','FOXNashville','irishexaminer','BET','BBCNWT','GlobalNational',
    'CBCTheNational','enews','7NewsSydney','FaceTheNation','fox32news','BloombergAsia','ReutersBiz','fox13',
    'amazonnews','MPRnews','bostonherald','fox5sandiego','IrishTimesWorld','TheAVClub','dallasnews','BBCr4today',
    'CBSDenver','WSJopinion','nbcsandiego','fox12oregon','KVUE','KXAN_News','NBCNewYork','12News',
    'TimesofIsrael','hkfp','ewnreporter','Tennessean','aldotcom','11thHour','NEJM','Local12',
    'daily_trust','SFGate','staronline','DailyMonitor','clevelanddotcom','DailyMailCeleb',
]
def create_user_url(userlist):
    """Build the user-lookup URL for a batch of usernames.

    Args:
        userlist: usernames to look up; they are joined with commas.
            You can enter up to 100 comma-separated values.

    Returns:
        str: the lookup URL requesting the ``location`` user field.
    """
    # User fields are adjustable, options include:
    # created_at, description, entities, id, location, name,
    # pinned_tweet_id, profile_image_url, protected,
    # public_metrics, url, username, verified, and withheld
    query_parts = (
        "usernames={}".format(','.join(userlist)),
        "user.fields=location",
    )
    return "https://api.twitter.com/2/users/by?{}&{}".format(*query_parts)


def bearer_oauth(r):
    """
    Method required by bearer token authentication.

    Sets the ``Authorization`` header (built from the module-level
    ``bearer_token``) and the ``User-Agent`` header on ``r``, then returns ``r``.
    """
    new_headers = (
        ("Authorization", f"Bearer {bearer_token}"),
        ("User-Agent", "v2UserLookupPython"),
    )
    for header_name, header_value in new_headers:
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url, headers=None, params=None, next_token=None):
    """Send a GET request and return the decoded JSON body.

    This definition replaces the earlier four-argument ``connect_to_endpoint``
    used by the tweet-search loop. The extra parameters are optional so both
    call styles keep working, whichever cell was executed last:

    * ``connect_to_endpoint(url)`` authenticates through ``bearer_oauth``
      (the user-lookup usage).
    * ``connect_to_endpoint(url, headers, params, next_token)`` sends the given
      headers and query parameters (the tweet-search usage).

    Args:
        url: endpoint to query.
        headers: optional HTTP headers; when omitted ``bearer_oauth`` is used.
        params: optional query parameters. They are copied, so the caller's
            dict is left untouched.
        next_token: pagination token added to the copied ``params``.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Exception: when the response status is not 200.
    """
    request_kwargs = {}
    if headers is None:
        request_kwargs["auth"] = bearer_oauth
    else:
        request_kwargs["headers"] = headers
    if params is not None:
        request_kwargs["params"] = {**params, "next_token": next_token}

    response = requests.request("GET", url, **request_kwargs)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()

# Look up the first ten accounts and pretty-print the raw response
url = create_user_url(userlist[0:10])
json_response = connect_to_endpoint(url)
print(json.dumps(json_response, sort_keys=True, indent=4))

200
{
    "data": [
        {
            "id": "7587032",
            "location": "London, UK",
            "name": "Sky News",
            "username": "SkyNews"
        },
        {
            "id": "17469289",
            "location": "New York, NY",
            "name": "New York Post",
            "username": "nypost"
        },
        {
            "id": "759251",
            "name": "CNN",
            "username": "CNN"
        },
        {
            "id": "807095",
            "location": "New York City",
            "name": "The New York Times",
            "username": "nytimes"
        },
        {
            "id": "1917731",
            "location": "Washington, DC",
            "name": "The Hill",
            "username": "thehill"
        },
        {
            "id": "1652541",
            "location": "Around the world",
            "name": "Reuters",
            "username": "Reuters"
        },
        {
            "id": "2467791",
            "location": "Washington, 

In [41]:
# Look up every account in batches of 100 usernames (the per-request maximum
# noted in create_user_url) and collect username/location pairs.
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end, instead of growing the frame one .loc assignment at a time.
user_records = []
for batch_start in range(0, len(userlist), 100):
    url = create_user_url(userlist[batch_start:batch_start + 100])
    json_response = connect_to_endpoint(url)
    # NOTE(review): 'data' is presumably missing when none of the requested
    # usernames resolve; treat that batch as empty rather than raising KeyError.
    for user in json_response.get('data', []):
        user_records.append({
            'username': user['username'],
            # Accounts without a profile location get an empty string
            'location': user.get('location', ''),
        })
news_accounts = pd.DataFrame(user_records, columns=['username', 'location'])

200
200
200
200


In [44]:
# Geocode the free-text profile locations and derive a country from the result.
geolocator = Nominatim(user_agent="my_request")
# Rate-limited wrapper around the geocoder (min_delay_seconds=1 between calls)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Query each distinct, non-empty location only once: empty strings cannot be
# resolved and repeated locations would just repeat the same slow lookup.
unique_locations = [
    loc for loc in news_accounts['location'].unique()
    if isinstance(loc, str) and loc.strip()
]
geotag_by_location = {loc: geocode(loc) for loc in unique_locations}

# Locations that were skipped or could not be resolved map to None
news_accounts['geotag'] = news_accounts['location'].apply(lambda loc: geotag_by_location.get(loc))
# The country is taken as the last comma-separated component of the address
news_accounts['country'] = news_accounts['geotag'].apply(lambda x: x.address.split(',')[-1].strip() if x else None)

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Arlington, VA | New York, NY',), **{}).
Traceback (most recent call last):
  File "C:\Users\f.braulio\Anaconda3\envs\ugpn-covid-emotions\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Users\f.braulio\Anaconda3\envs\ugpn-covid-emotions\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Users\f.braulio\Anaconda3\envs\ugpn-covid-emotions\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Users\f.braulio\Anaconda3\envs\ugpn-covid-emotions\lib\http\client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "C:\Users\f.braulio\Anaconda3\envs\ugpn-covid-emotions\lib\http\client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\f.brauli

In [49]:
# Export username, location and country as a semicolon-separated CSV without the index
news_accounts[['username', 'location', 'country']].to_csv(
    "./../data/processed/covid_usern_places.csv",
    sep=';',
    index=False,
)

In [39]:
# Build the accounts table from the users in the current json_response.
# All rows are collected first and the DataFrame is created in one step,
# instead of growing it one .loc assignment at a time.
news_accounts = pd.DataFrame(
    [
        {
            'username': user['username'],
            # Accounts without a profile location get an empty string
            'location': user.get('location', ''),
        }
        for user in json_response['data']
    ],
    columns=['username', 'location'],
)


In [40]:
# Show the accounts table as the cell output
news_accounts

Unnamed: 0,username,location
0,SkyNews,"London, UK"
1,nypost,"New York, NY"
2,CNN,
3,nytimes,New York City
4,thehill,"Washington, DC"
5,Reuters,Around the world
6,washingtonpost,"Washington, DC"
7,ABC,New York City / Worldwide
8,MSNBC,
9,AP,Global
