### Social Media Analytics
# Twitter data collection
## Full archive search
### Requires Twitter research account

(c) Nuno António 2021-2022 - Version 1.03 (for API 2.0)

### Load packages and do the initializations

In [1]:
# Packages
import pandas as pd 
import numpy as np
import requests
import os
import json
from collections import namedtuple
import time
from datetime import datetime, timedelta

In [2]:
# Twitter account settings
# Verify details on https://developer.twitter.com/en/portal/dashboard

bearer_token = ""

In [3]:
# Endpoint
search_url = "https://api.twitter.com/2/tweets/search/all"

### Functions

In [4]:
# Get index of user details
def searchAuthorID(aID,usersObj):
    index = -1
    for user in usersObj:
        index = index + 1
        if user.id==aID:
            break
    return index

In [5]:
# Get index of place
def searchPlacesID(pID,placesObj):
    index = -1
    for place in placesObj:
        index = index + 1
        if place.id==pID:
            break
    return index

In [6]:
# Connection functions - from Twitter sample code: https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py

def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def connect_to_endpoint(url, headers, params):
    errCounter=0
    while errCounter<10:
        try:
            response = requests.request("GET", url, headers=headers, params=params)
            print(response.status_code)
            if response.status_code != 200:
                raise Exception(response.status_code, response.text)
            break
        except:
            errCounter = errCounter+1
            time.sleep(1)
    return response.json()

In [7]:
# Query the next page
def query_page(url, header, query_params):

    # Connects to endpoint and collects the data
    json_response = connect_to_endpoint(url, header, query_params)

    # Dumps the json object into an element
    json_str = json.dumps(json_response)

    # Loads the json to an object
    resp = json.loads(json_str, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))

    # Returns
    return resp

### Search definitions
For details on how to build seach queries and filters check:
- https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
- https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all
- https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet
- https://developer.twitter.com/en/support/twitter-api/error-troubleshooting

In [8]:
# Dates
#search_text = '(from:Booker4KY OR from:JohnKennedyLA OR from:GaryChambersJr OR from:LukeMixonLA OR from:SupportSyrita OR from:VanHollenForMD OR from:ChaffeeUSSenate OR from:Schmitt4Senate OR from:buschvalentine OR from:JonathanDine OR from:CortezMasto OR from:AdamLaxalt OR from:Maggie_Hassan OR from:GenDonBolduc OR from:jeremykauffman OR from:chuckschumer OR from:JosephPinion OR from:TedBuddNC OR from:CheriBeasleyNC OR from:ShannonBrayNC OR from:hoeven4senate OR from:katrina_senate OR from:JDVance1 OR from:TimRyan OR from:jameslankford OR from:madisonhornok OR from:mchdlny OR from:LibertyCowboyKB OR from:MarkwayneMullin OR from:VoteKendraOK OR from:WydenForOregon OR from:PerkinsPerspect OR from:JohnFetterman OR from:DrOz OR from:Erik4Senate OR from:votetimscott OR from:johnthune OR from:BrianBengs OR from:Lesnar4USSENATE OR from:MikeLeeforUtah OR from:EvanMcMullin OR from:JamesArtHansen OR from:WelchForVT OR from:MurrayCampaign OR from:SmileyForWA OR from:RonJohnsonWI OR from:TheOtherMandela) is:retweet'
search_text = '(from:KatieBrittforAL OR from:willboydforAL OR from:LisaForSenate OR from:KellyForAlaska OR from:CaptMarkKelly OR from:bgmasters OR from:Victor4Senate OR from:Boozman4AR OR from:JamesForAR OR from:SenAlexPadilla OR from:MarkMeuser OR from:MichaelBennet OR from:ODeaForColorado OR from:DickBlumenthal OR from:LeoraLevyCT OR from:TeamMarco OR from:valdemings OR from:ReverendWarnock OR from:HerschelWalker OR from:SchatzforHawaii OR from:repbobmcdermott OR from:crapoforsenate OR from:rothforIdaho OR from:TammyDuckworth OR from:salviforsenate OR from:BillRedpath OR from:ToddYoungIN OR from:gomcdermott OR from:sceniak4senate OR from:GrassleyWorks OR from:FrankenforIowa OR from:moranforkansas OR from:Holland4Kansas OR from:RandPaul) has:media'

beginDate = '2022-03-01T00:00:00.000Z'
endDate = '2022-11-07T23:59:59.999Z'

In [9]:
# Full query
full_query = {'query': search_text,
              'end_time': endDate,
              'start_time':beginDate,
              'max_results':500,
              'tweet.fields': 'author_id,created_at,public_metrics,text,geo',
              'expansions':'author_id,geo.place_id',
              'user.fields':'created_at,username,public_metrics,url,verified',
              'place.fields':'country,country_code,id,name,place_type'
              }

### Main Loop

In [10]:
# Create headers for authentication
headers = create_headers(bearer_token)

In [11]:
# Loop initializations

# Next_token
next_token=''

# Create an empty list for tweets
tws = []

# Counters
pageCounter = 1
collected = 0

# Variables to control time between requests because of limits
requestsPer15Min = 300
requestsPerSec = 1
secsBetweenRequests = int(15*60/requestsPer15Min)+1

# Last request time
lastRequest = datetime.now() + timedelta(seconds=-secsBetweenRequests)

In [12]:
# Loop until there are no more "next_token"
while True:

    # Wait the time between requests to avoid being over limits
    now = datetime.now()
    secsSinceLastRequest = (lastRequest-lastRequest).total_seconds()
    if secsSinceLastRequest<secsBetweenRequests:
        secsToWait = secsBetweenRequests-secsSinceLastRequest
        print(now.strftime("%H:%M:%S"),f"waiting {secsToWait} seconds")
        time.sleep(secsToWait)

    # Give feedback
    lastRequest = datetime.now()
    current_time = lastRequest.strftime("%H:%M:%S")
    print("Page:", pageCounter,'- start:',current_time)

    # Build query and get results for page
    query_params = full_query
    if next_token!='':
        query_params["next_token"] = next_token
    json_response = connect_to_endpoint(search_url, headers, query_params)
    
    # Transform json to object
    json_str = json.dumps(json_response)
    resp = json.loads(json_str, object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))

    # If there are tweets in the response
    if resp.meta.result_count>0:

        # Add results to list
        for tweet in resp.data:
            userIndex = searchAuthorID(tweet.author_id,resp.includes.users)
            try:
                placeIndex = searchPlacesID(tweet.geo.place_id, resp.includes.places)
            except:
                placeIndex = -1
            tw=[tweet.id,
                tweet.text,
                tweet.author_id,
                tweet.created_at,
                tweet.public_metrics.retweet_count,
                tweet.public_metrics.reply_count,
                tweet.public_metrics.like_count,
                tweet.public_metrics.quote_count,
                resp.includes.users[userIndex].id,
                resp.includes.users[userIndex].username,
                resp.includes.users[userIndex].created_at,
                resp.includes.users[userIndex].public_metrics.followers_count,
                resp.includes.users[userIndex].public_metrics.following_count,
                resp.includes.users[userIndex].public_metrics.tweet_count,
                resp.includes.users[userIndex].public_metrics.listed_count,
                resp.includes.users[userIndex].verified,
                resp.includes.places[placeIndex].country_code if placeIndex!=-1 else '',
                resp.includes.places[placeIndex].country if placeIndex!=-1 else '',
                resp.includes.places[placeIndex].full_name if placeIndex!=-1 else '',
                resp.includes.places[placeIndex].place_type if placeIndex!=-1 else ''
            ]
            tws.append(tw)
            collected = collected + 1

        # Give feedback of the page processing
        ended = datetime.now()
        current_time = ended.strftime("%H:%M:%S")
        print("Page:", pageCounter,'- end:',current_time,'\n','Last tweet date:',tweet.created_at, 'Collected:',collected)

    # Test if there is a next_token to continue retrieval or if it should stop
    try:
        next_token = resp.meta.next_token
        pageCounter = pageCounter + 1
    except:
        break

10:40:14 waiting 4.0 seconds
Page: 1 - start: 10:40:18
200
Page: 1 - end: 10:40:21 
 Last tweet date: 2022-11-04T03:02:07.000Z Collected: 498
10:40:21 waiting 4.0 seconds
Page: 2 - start: 10:40:25
200
Page: 2 - end: 10:40:27 
 Last tweet date: 2022-10-30T13:02:01.000Z Collected: 995
10:40:27 waiting 4.0 seconds
Page: 3 - start: 10:40:31
200
Page: 3 - end: 10:40:33 
 Last tweet date: 2022-10-24T23:14:47.000Z Collected: 1492
10:40:33 waiting 4.0 seconds
Page: 4 - start: 10:40:37
200
Page: 4 - end: 10:40:40 
 Last tweet date: 2022-10-18T11:52:34.000Z Collected: 1991
10:40:40 waiting 4.0 seconds
Page: 5 - start: 10:40:44
200
Page: 5 - end: 10:40:46 
 Last tweet date: 2022-10-10T23:44:54.000Z Collected: 2489
10:40:46 waiting 4.0 seconds
Page: 6 - start: 10:40:50
200
Page: 6 - end: 10:40:52 
 Last tweet date: 2022-09-29T11:21:23.000Z Collected: 2986
10:40:52 waiting 4.0 seconds
Page: 7 - start: 10:40:56
200
Page: 7 - end: 10:40:58 
 Last tweet date: 2022-09-17T23:16:14.000Z Collected: 3485
1

### Wrap up

In [13]:
# Create the dataframe
tweetsDF = pd.DataFrame(tws, columns=['id',
            'text',
            'author_id',
            'created_at',
            'public_metrics.retweet_count',
            'public_metrics.reply_count',
            'public_metrics.like_count',
            'public_metrics.quote_count',
            'user.id',
            'user.username',
            'user.created_at',
            'user.public_metrics.followers_count',
            'user.public_metrics.following_count',
            'user.public_metrics.tweet_count',
            'user.public_metrics.listed_count',
            'user.verified',
            'place.country_code',
            'place.country',
            'place.full_name',
            'place.place_type'])

In [15]:
tweetsDF.to_csv("Tweets.csv", index=False, sep="\t")