In [2]:
import requests
import os
import json
import pandas as pd
import numpy as np
import csv
import datetime
import dateutil.parser
import unicodedata
import time
import random
from dotenv import load_dotenv, find_dotenv
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter

In [3]:
# Locate the .env file by walking up the directory tree until one is found
dotenv_path = find_dotenv()

# Load its entries as environment variables (auth() reads BEARER_TOKEN from the environment)
load_dotenv(dotenv_path)

True

In [4]:
def split(a, n):
    """Partition ``a`` into ``n`` consecutive slices whose lengths differ by at most one.

    The first ``len(a) % n`` slices each receive one extra element. The result
    is a lazy generator of slices of ``a``, not a list.
    """
    base, extra = divmod(len(a), n)

    def offset(idx):
        # Start of slice ``idx``: every earlier slice took ``base`` items and
        # the first ``extra`` of them took one more.
        return idx * base + min(idx, extra)

    return (a[offset(idx):offset(idx + 1)] for idx in range(n))

def auth():
    """Return the bearer token stored in the ``BEARER_TOKEN`` environment variable.

    The result is ``None`` when the variable is not set.
    """
    bearer_token = os.getenv("BEARER_TOKEN")
    return bearer_token

def create_headers(bearer_token):
    """Build the request headers dict that carries ``bearer_token`` in ``Authorization``."""
    authorization = "Bearer {}".format(bearer_token)
    return {"Authorization": authorization}

def create_url(keyword, start_date, end_date, max_results = 10):
    """Return ``(search_url, query_params)`` for a request to ``tweets/search/all``.

    Parameters:
        keyword: value sent as the ``query`` parameter.
        start_date: value sent as ``start_time``.
        end_date: value sent as ``end_time``.
        max_results: value sent as ``max_results`` (default 10).

    The ``next_token`` entry is only a placeholder (an empty dict);
    ``connect_to_endpoint`` overwrites it before the request is sent.
    """
    # Change to the endpoint you want to collect data from
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # Expansions and per-object fields to request; change these based on the endpoint you are using
    expansions = 'author_id,in_reply_to_user_id,geo.place_id'
    tweet_fields = 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source'
    user_fields = 'id,name,username,created_at,description,public_metrics,verified'
    place_fields = 'full_name,id,country,country_code,geo,name,place_type'

    # Insertion order is kept the same as the order the parameters are listed here
    params = {}
    params['query'] = keyword
    params['start_time'] = start_date
    params['end_time'] = end_date
    params['max_results'] = max_results
    params['expansions'] = expansions
    params['tweet.fields'] = tweet_fields
    params['user.fields'] = user_fields
    params['place.fields'] = place_fields
    params['next_token'] = {}
    return (endpoint, params)

def connect_to_endpoint(url, headers, params, next_token = None, timeout = 60):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Parameters:
        url: endpoint to call.
        headers: HTTP headers dict (as built by ``create_headers``).
        params: query parameters dict (as built by ``create_url``); it is
            copied, so the caller's dict is left untouched.
        next_token: pagination token sent as the ``next_token`` parameter.
        timeout: seconds passed to ``requests`` as the request timeout, so a
            stalled connection raises instead of blocking forever.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: with ``(status_code, response_text)`` when the response
            status is not 200. Errors raised by ``requests`` itself (including
            timeouts) propagate unchanged.
    """
    # Copy first so setting the token does not modify the caller's dict
    query = dict(params)
    query['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = query, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def append_tweet_to_csv(json_response, fileName):
    """Append one CSV row per tweet in ``json_response['data']`` to ``fileName``.

    Column order: tweet_id, conversation_id, author_id, in_reply_to_user_id,
    created_at, geo, lang, like_count, quote_count, reply_count,
    retweet_count, source, text.

    ``geo`` (the place id), ``in_reply_to_user_id`` and ``source`` are written
    as a single space when the tweet does not carry them. A response without a
    ``data`` key writes no rows.

    Parameters:
        json_response: decoded search response (a dict).
        fileName: path of the CSV file; opened in append mode and created if
            it does not exist.
    """
    # Placeholder written for fields a tweet does not carry
    missing = " "

    # A counter variable
    counter = 0

    # Open OR create the target CSV file; the ``with`` block closes it even if a row fails
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each tweet
        for tweet in json_response.get('data', []):
            # Time created, parsed into a datetime
            created_at = dateutil.parser.parse(tweet['created_at'])

            # Geolocation: 'geo' may be absent, empty, or lack a place_id
            geo = (tweet.get('geo') or {}).get('place_id', missing)

            metrics = tweet['public_metrics']

            # Assemble all data in a list and append it to the CSV file
            csvWriter.writerow([
                tweet['id'],
                tweet['conversation_id'],
                tweet['author_id'],
                tweet.get('in_reply_to_user_id', missing),
                created_at,
                geo,
                tweet['lang'],
                metrics['like_count'],
                metrics['quote_count'],
                metrics['reply_count'],
                metrics['retweet_count'],
                tweet.get('source', missing),
                tweet['text'],
            ])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)
    
def append_user_to_csv(json_response, fileName):
    """Append one CSV row per user in ``json_response['includes']['users']`` to ``fileName``.

    Column order: author_id, username, display_name, description, verified,
    created_at, followers_count, following_count, tweet_count, listed_count.

    A response without ``includes`` or without ``users`` writes no rows.

    Parameters:
        json_response: decoded search response (a dict).
        fileName: path of the CSV file; opened in append mode and created if
            it does not exist.
    """
    # A counter variable
    counter = 0

    users = json_response.get('includes', {}).get('users', [])

    # Open OR create the target CSV file; the ``with`` block closes it even if a row fails
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each user
        for user in users:
            # Time the user account was created, parsed into a datetime
            created_at = dateutil.parser.parse(user['created_at'])

            metrics = user['public_metrics']

            # Assemble all data in a list and append it to the CSV file
            csvWriter.writerow([
                user['id'],
                user['username'],
                user['name'],
                user['description'],
                user['verified'],
                created_at,
                metrics['followers_count'],
                metrics['following_count'],
                metrics['tweet_count'],
                metrics['listed_count'],
            ])
            counter += 1

    # Print the number of users for this iteration
    print("# of Users added from this response: ", counter)
    
def append_place_to_csv(json_response, fileName):
    """Append one CSV row per place in ``json_response['includes']['places']`` to ``fileName``.

    Column order: geo (the place id), full_name, place_type, name, country_code.

    A response without ``includes`` or without ``places`` writes no rows.

    Parameters:
        json_response: decoded search response (a dict).
        fileName: path of the CSV file; opened in append mode and created if
            it does not exist.
    """
    # A counter variable
    counter = 0

    places = json_response.get('includes', {}).get('places', [])

    # Open OR create the target CSV file; the ``with`` block closes it even if a row fails
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each place
        for place in places:
            # Assemble all data in a list and append it to the CSV file
            csvWriter.writerow([
                place['id'],
                place['full_name'],
                place['place_type'],
                place['name'],
                place['country_code'],
            ])
            counter += 1

    # Print the number of places for this iteration
    print("# of Places added from this response: ", counter)

In [None]:
# Usernames of the news accounts whose conversations are selected below
priority_news = [
    'CNN','MSNBC','NBCNews','FoxNews','BBCNews','itvnews','SkyNews','CTVNews','CBCNews','globalnews','inquirerdotnet','ABSCBNNews','gmanews','ndtv','timesofindia','TimesNow','republic',
    '7NEWS','9NewsAUS','abcnews','News24','eNCA','SABCNews','rtenews','Independent_ie','thejournal_ie','MobilePunch','vanguardngrnews','PulseNigeria247','citizentvkenya','ntvkenya',
    'NationAfrica','ntvuganda','nbstv','DailyMonitor','malaysiakini','staronline','NewshubNZ','nzherald'
]

# Load the tweets, the accounts table, and the replies already collected
df = pd.read_csv('./../data/raw/news_tweets.csv')
news_accounts = pd.read_csv('./../data/raw/covid_users.csv')
# Number of rows in df per author_id
tmp = df.author_id.value_counts().rename('amount').rename_axis('author_id').reset_index()
# Keep accounts that appear in df (one row per author), attach their counts, largest first
news_accounts = news_accounts[news_accounts.author_id.isin(tmp.author_id)].drop_duplicates('author_id').merge(tmp).sort_values('amount', ascending=False)
tweets_replies = pd.read_csv('./../data/raw/tweets_replies.csv')

# Get selected conversations: authored by a priority account, not yet present
# in tweets_replies, and with at least one reply
conversations = list(df[
    (df.author_id.isin(news_accounts[news_accounts.username.isin(priority_news)].author_id.values)) &
    ~(df.conversation_id.isin(tweets_replies.conversation_id.unique())) &
    (df.reply_count>0)
].conversation_id)
random.shuffle(conversations)

# Split into batches of at most 24 conversation IDs each. Ceiling division is
# used so that short lists never produce zero batches (which made split()
# divide by zero) nor batches larger than 24.
# NOTE(review): 24 presumably keeps the search query under the API's length limit - confirm.
max_ids_per_batch = 24
if conversations:
    n_chunks = -(-len(conversations) // max_ids_per_batch)  # ceil(len / 24)
    batches = list(split(conversations, n_chunks))
else:
    batches = []

# Inputs for tweets
bearer_token = auth()
if not bearer_token:
    # Without a token every request fails and the download loop would retry forever
    raise RuntimeError("BEARER_TOKEN is not set; add it to the environment or the .env file")
headers = create_headers(bearer_token)
start_date = '2020-01-01T00:00:00.000Z'
end_date = '2021-11-14T23:59:59.000Z'
max_results = 500

# Loop Inputs
total_tweets = 0
count = 0 # Counting tweets per time period
max_count = 10000000 # Max tweets per time period
flag = True
next_token = None
n_requests = 0
n_batches = 0
valid = False
errCount = 0

In [None]:
# Download the replies for every batch of conversations: page through each
# batch's search results and append them to the CSV files as they arrive.
start_time = time.time()

# Output files
tweets_csv = "./../data/raw/tweets_replies.csv"
users_csv = "./../data/raw/users_replies.csv"
places_csv = "./../data/raw/places_replies.csv"

# Request pacing: pause once 300 requests were made within a 900-second window
# NOTE(review): presumably mirrors the endpoint's rate limit - confirm.
requests_per_window = 300
window_seconds = 900
# Upper bound (seconds) for the exponential retry delay
max_backoff = 900

for batch in batches:
    # Stop entirely once the overall tweet cap is reached
    if count >= max_count:
        break

    search = "conversation_id:" + " OR conversation_id:".join(str(s) for s in batch) + " lang:en is:reply -is:retweet"
    print("--------------------------------------")
    print("Batch #: ", n_batches)
    total_loop_tweets = 0
    # A pagination token is only valid for the query that produced it
    next_token = None
    flag = True
    while flag:
        # Check if max_count reached
        if count >= max_count:
            break
        print("Request #: ", n_requests+1, " | Time cap: ", int(time.time() - start_time))
        print("Token: ", next_token)

        # Retry the request until it succeeds, waiting longer after each failure
        while not valid:
            try:
                url = create_url(search, start_date, end_date, max_results)
                json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
                result_count = json_response['meta']['result_count']
                n_requests += 1
                valid = True
                errCount = 0
            except Exception as err:
                # ``except Exception`` lets KeyboardInterrupt through, so the cell can be stopped
                errCount += 1
                print("-------------------------------------- Request error #", errCount, "|", repr(err))
                # Exponential backoff (``**`` is power; ``^`` would be XOR), capped
                time.sleep(min(2 ** errCount, max_backoff))
        valid = False

        # Save the token to use for next call (None when this was the last page)
        next_token = json_response['meta'].get('next_token')
        has_more = next_token is not None
        if has_more:
            print("Next Token: ", next_token)

        # Store whatever this page returned
        if result_count is not None and result_count > 0:
            includes = json_response.get('includes', {})
            append_tweet_to_csv(json_response, tweets_csv)
            if 'users' in includes:
                append_user_to_csv(json_response, users_csv)
            if 'places' in includes:
                append_place_to_csv(json_response, places_csv)
            count += result_count
            total_loop_tweets += result_count
            print("Cumulative # of Tweets in this batch: ", total_loop_tweets)
            if has_more:
                print("-------------------")
            time.sleep(1)

        # If no next token exists this was the final request of the batch
        if not has_more:
            flag = False
            next_token = None
        time.sleep(1)

        # If the request cap is reached, wait for the rest of the window and start a new one
        if n_requests >= requests_per_window:
            t = time.time() - start_time
            if t < window_seconds:
                time.sleep(window_seconds - t)
            start_time = time.time()
            n_requests = 0
    n_batches += 1
    total_tweets += total_loop_tweets
    print("Total number of batch results: ", total_loop_tweets)
    print("Total # of Tweets added: ", total_tweets)
print("Total number of results: ", total_tweets)