In [5]:
import requests
import os
import json
import pandas as pd
import numpy as np
import csv
import datetime
import dateutil.parser
import unicodedata
import time
import random
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
def split(a, n):
    """Cut sequence ``a`` into ``n`` contiguous slices of near-equal length.

    The first ``len(a) % n`` slices are one element longer than the rest.
    Returns a lazy generator that yields the slices in order.
    """
    base_size, leftover = divmod(len(a), n)
    indices = range(n)

    def _slices():
        begin = 0
        for idx in indices:
            # The leading `leftover` slices absorb the remainder, one element each.
            end = begin + base_size + (1 if idx < leftover else 0)
            yield a[begin:end]
            begin = end

    return _slices()

def auth():
    """Return the Twitter API bearer token from the ``BEARER_TOKEN`` environment variable.

    Returns:
        str: the bearer token.

    Raises:
        RuntimeError: if ``BEARER_TOKEN`` is unset or empty.
    """
    # SECURITY: a token used to be hard-coded here. Any secret that was committed
    # in a notebook must be treated as leaked: revoke it and export a fresh one.
    token = os.getenv('BEARER_TOKEN')
    if not token:
        raise RuntimeError(
            "BEARER_TOKEN environment variable is not set; "
            "export your Twitter API bearer token before running this notebook."
        )
    return token

def create_headers(bearer_token):
    """Build the HTTP header dict that carries ``bearer_token`` as a Bearer credential."""
    return {"Authorization": f"Bearer {bearer_token}"}

def create_url(keyword, start_date, end_date, max_results = 10):
    """Return ``(search_url, query_params)`` for a full-archive tweet search.

    ``keyword`` becomes the ``query`` parameter; ``start_date`` / ``end_date`` bound
    the search window and ``max_results`` is the requested page size. The
    ``next_token`` entry is only a placeholder here: connect_to_endpoint overwrites it.
    """
    # Change to the endpoint you want to collect data from
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # Requested expansions / fields; change these based on the endpoint you are using
    expansions = ('author_id', 'in_reply_to_user_id', 'geo.place_id')
    tweet_fields = ('id', 'text', 'author_id', 'in_reply_to_user_id', 'geo', 'conversation_id',
                    'created_at', 'lang', 'public_metrics', 'referenced_tweets', 'reply_settings', 'source')
    user_fields = ('id', 'name', 'username', 'created_at', 'description', 'public_metrics', 'verified')
    place_fields = ('full_name', 'id', 'country', 'country_code', 'geo', 'name', 'place_type')

    params = {}
    params['query'] = keyword
    params['start_time'] = start_date
    params['end_time'] = end_date
    params['max_results'] = max_results
    params['expansions'] = ','.join(expansions)
    params['tweet.fields'] = ','.join(tweet_fields)
    params['user.fields'] = ','.join(user_fields)
    params['place.fields'] = ','.join(place_fields)
    params['next_token'] = {}
    return (endpoint, params)

def connect_to_endpoint(url, headers, params, next_token = None):
    """Issue a GET request for one page of results and return the decoded JSON body.

    ``params`` is the dict produced by create_url; its ``'next_token'`` entry is
    overwritten in place with ``next_token`` before the request is sent.
    Prints the HTTP status code, and raises ``Exception(status_code, text)`` for
    any status other than 200.
    """
    # The pagination cursor is written into the caller's dict, not a copy.
    params['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = params)
    status = response.status_code
    print(f"Endpoint Response Code: {status}")
    if status == 200:
        return response.json()
    raise Exception(status, response.text)

def append_tweet_to_csv(json_response, fileName):
    """Append one CSV row per tweet of a search response to ``fileName``.

    Args:
        json_response: decoded search response (dict). Tweets are read from its
            ``'data'`` list; a response with no ``'data'`` entry is treated as
            containing zero tweets.
        fileName: path of the target CSV, opened in append mode (created if missing).

    Each row is written in this column order: tweet_id, conversation_id,
    author_id, in_reply_to_user_id, created_at, geo, lang, like_count,
    quote_count, reply_count, retweet_count, source, text.
    Prints the number of rows appended.
    """
    tweets = json_response.get('data') or []

    # A counter variable
    counter = 0

    # Open OR create the target CSV file. `with` guarantees the handle is closed
    # even when a malformed tweet raises part-way through the loop.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each tweet
        for tweet in tweets:
            # Some keys might not exist for some tweets, so optional ones are read
            # with .get(): a missing value becomes an empty CSV cell (None).

            # Time created, parsed into a datetime
            created_at = dateutil.parser.parse(tweet['created_at'])

            # Geolocation: 'geo' may be absent, or present without a 'place_id'.
            # A single space is kept as the "no place" marker, as before.
            geo = (tweet.get('geo') or {}).get('place_id', " ")

            # Tweet metrics
            metrics = tweet['public_metrics']

            # Assemble all data in a list (order must match the CSV header)
            res = [
                tweet['id'],
                tweet['conversation_id'],
                tweet['author_id'],
                tweet.get('in_reply_to_user_id'),
                created_at,
                geo,
                tweet.get('lang'),
                metrics['like_count'],
                metrics['quote_count'],
                metrics['reply_count'],
                metrics['retweet_count'],
                tweet.get('source'),
                tweet['text'],
            ]

            # Append the result to the CSV file
            csvWriter.writerow(res)
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)
    
def append_user_to_csv(json_response, fileName):
    """Append one CSV row per user found in a search response to ``fileName``.

    Args:
        json_response: decoded search response (dict). Users are read from
            ``json_response['includes']['users']``; if either level is missing
            the response is treated as containing zero users.
        fileName: path of the target CSV, opened in append mode (created if missing).

    Each row is written in this column order: author_id, username,
    display_name, description, verified, created_at, followers_count,
    following_count, tweet_count, listed_count.
    Prints the number of rows appended.
    """
    users = (json_response.get('includes') or {}).get('users') or []

    # A counter variable
    counter = 0

    # Open OR create the target CSV file. `with` guarantees the handle is closed
    # even when a malformed user record raises part-way through the loop.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each user
        for user in users:
            # Time the account was created, parsed into a datetime
            created_at = dateutil.parser.parse(user['created_at'])

            # User metrics
            metrics = user['public_metrics']

            # Assemble all data in a list (order must match the CSV header)
            res = [
                user['id'],
                user['username'],
                user['name'],
                user.get('description'),   # optional: missing -> empty CSV cell
                user['verified'],
                created_at,
                metrics['followers_count'],
                metrics['following_count'],
                metrics['tweet_count'],
                metrics['listed_count'],
            ]

            # Append the result to the CSV file
            csvWriter.writerow(res)
            counter += 1

    # Print the number of users for this iteration
    print("# of Users added from this response: ", counter) 
    
def append_place_to_csv(json_response, fileName):
    """Append one CSV row per place found in a search response to ``fileName``.

    Args:
        json_response: decoded search response (dict). Places are read from
            ``json_response['includes']['places']``; if either level is missing
            the response is treated as containing zero places.
        fileName: path of the target CSV, opened in append mode (created if missing).

    Each row is written in this column order: geo (the place id), full_name,
    place_type, name, country_code.
    Prints the number of rows appended.
    """
    places = (json_response.get('includes') or {}).get('places') or []

    # A counter variable
    counter = 0

    # Open OR create the target CSV file. `with` guarantees the handle is closed
    # even when a malformed place record raises part-way through the loop.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each place
        for place in places:
            # Assemble all data in a list (order must match the CSV header)
            res = [
                place['id'],
                place['full_name'],
                place['place_type'],
                place['name'],
                place['country_code'],
            ]

            # Append the result to the CSV file
            csvWriter.writerow(res)
            counter += 1

    # Print the number of places for this iteration
    print("# of Places added from this response: ", counter) 

In [17]:
# Get selected conversations: source tweets that received at least one reply
df = pd.read_parquet('./../data/raw/tmp/news_tweets_skynews.parquet')
df = df[df.replyCount>0]
conversations = list(df.conversationId.unique())
# NOTE(review): the shuffle is unseeded, so batch composition differs between runs.
random.shuffle(conversations)
# Aim for roughly 24 conversation ids per batch. Always ask for at least one
# batch (the original int(len/24) was 0 for fewer than 24 conversations, which
# made split() divide by zero) and drop empty batches so no empty query is built.
batches = [b for b in split(conversations, max(1, len(conversations) // 24)) if len(b) > 0]

# Create the tweet, user and place CSV files and write their header rows.
# The header is only written when the file is new or empty: the files are
# opened in append mode, so re-running this cell would otherwise inject a
# second header row in the middle of the already collected data.
for path, header in (
    ("./../data/processed/tmp/tweets_replies_skynews.csv",
     ['tweet_id', 'conversation_id', 'author_id', 'in_reply_to_user_id', 'created_at', 'geo', 'lang', 'like_count', 'quote_count', 'reply_count', 'retweet_count', 'source', 'text']),
    ("./../data/processed/tmp/users_replies_skynews.csv",
     ['author_id', 'username', 'display_name', 'description', 'verified', 'created_at', 'followers_count', 'following_count', 'tweet_count', 'listed_count']),
    ("./../data/processed/tmp/places_replies_skynews.csv",
     ['geo', 'full_name', 'place_type', 'name', 'country_code']),
):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        with open(path, "a", newline="", encoding='utf-8') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(header)

# Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
start_date = '2020-01-01T00:00:00.000Z'
end_date = '2021-05-14T23:59:59.000Z'
max_results = 500

# Loop Inputs
total_tweets = 0
count = 0 # Counting tweets per time period
max_count = 10000000 # Max tweets per time period
flag = True
next_token = None
n_requests = 0
n_batches = 0
valid = False
errCount = 0

In [18]:
# Page through the search results for every batch of conversation ids and
# append the returned tweets / users / places to the three CSV files.
start_time = time.time()

for batch in batches:
    
    # One OR-joined query per batch, restricted by the trailing operators
    search = "conversation_id:" + " OR conversation_id:".join( [str(s) for s in batch]) + " to:7587032 lang:en is:reply -is:retweet"
    print("--------------------------------------")
    print("Batch #: ", n_batches)
    total_loop_tweets = 0
    flag = True
    while flag:
        # Check if max_count reached
        if count >= max_count:
            break
        print("Request #: ", n_requests+1, " | Time cap: ", int(time.time() - start_time))
        print("Token: ", next_token)

        # Retry the same page until it succeeds, backing off exponentially.
        while not valid:
            try:
                url = create_url(search, start_date,end_date, max_results)
                json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
                result_count = json_response['meta']['result_count']
                n_requests += 1
                valid = True; errCount = 0
            except Exception as err:
                # `except Exception` (not a bare except) so Ctrl-C / kernel
                # interrupt can still stop the cell.
                errCount += 1
                print("-------------------------------------- Request error #", errCount)
                print(err)
                # 2 ** errCount, not 2 ^ errCount: `^` is bitwise XOR in Python and
                # produced waits of 3, 0, 1, 6, ... seconds. Capped at 900 s, the
                # same window length the request cap below uses.
                time.sleep(min(2 ** errCount, 900))
        valid = False

        meta = json_response['meta']
        is_last_page = 'next_token' not in meta

        # Save the token to use for next call
        if not is_last_page:
            next_token = meta['next_token']
            print("Next Token: ", next_token)

        # Persist this page (same rule on intermediate and final pages)
        if result_count is not None and result_count > 0 and (is_last_page or next_token is not None):
            append_tweet_to_csv(json_response, "./../data/processed/tmp/tweets_replies_skynews.csv")
            append_user_to_csv(json_response, "./../data/processed/tmp/users_replies_skynews.csv")
            if 'places' in json_response.get('includes', {}):
                append_place_to_csv(json_response, "./../data/processed/tmp/places_replies_skynews.csv")
            count += result_count
            total_loop_tweets += result_count
            print("Cumulative # of Tweets in this batch: ", total_loop_tweets)
            if not is_last_page:
                print("-------------------")
            time.sleep(1)

        # If no next token exists this was the final request for the batch:
        # turn flag to false to move on, and reset the cursor for the next batch.
        if is_last_page:
            flag = False
            next_token = None
        time.sleep(1)

        # If the request cap is reached, wait out the rest of the 900 s window
        t = time.time()-start_time
        if n_requests==300:
            if t<900:
                time.sleep(900 - t)
            start_time=time.time()
            n_requests = 0
    n_batches += 1
    total_tweets += total_loop_tweets
    print("Total number of batch results: ", total_loop_tweets)
    print("Total # of Tweets added: ", total_tweets)
    # max_count reached: stop the whole collection instead of cycling through
    # the remaining batches without issuing any request.
    if count >= max_count:
        break
print("Total number of results: ", total_tweets)

--------------------------------------
Batch #:  0
Request #:  1  | Time cap:  0
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fosbski485qg2rp6cj1oxksab32bnh
# of Tweets added from this response:  470
# of Users added from this response:  406
# of Places added from this response:  6
Cumulative # of Tweets in this batch:  470
-------------------
Request #:  2  | Time cap:  4
Token:  b26v89c19zqg8o3fosbski485qg2rp6cj1oxksab32bnh
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fosbskfziazzn58n3c3i5i2y1djbst
# of Tweets added from this response:  456
# of Users added from this response:  344
# of Places added from this response:  7
Cumulative # of Tweets in this batch:  926
-------------------
Request #:  3  | Time cap:  7
Token:  b26v89c19zqg8o3fosbskfziazzn58n3c3i5i2y1djbst
Endpoint Response Code: 200
# of Tweets added from this response:  285
# of Users added from this response:  187
# of Places added from this response:  3
Cumulative # of Tweets in this b

# Fix the data format

In [84]:
# Reshape the collected reply tweets so they carry the same column names as the
# legacy comments table (only legacy_tweets.columns is used from that frame).
legacy_tweets = pd.read_parquet("./../data/raw/comments.parquet")
tweets = pd.read_csv("./../data/processed/tmp/tweets_replies_skynews.csv")
places = pd.read_csv("./../data/processed/tmp/places_replies_skynews.csv")
# Keep English rows whose in_reply_to_user_id is 7587032, one row per tweet_id,
# then left-join each place's full_name. The merge has no `on=`, so pandas joins
# on the columns both frames share (presumably just 'geo', given the CSV headers
# written by this notebook; verify if the headers change).
skynews_tweets = tweets[(tweets.in_reply_to_user_id==7587032) & (tweets.lang=='en')].drop_duplicates('tweet_id').merge(places[['geo', 'full_name']].drop_duplicates(), how='left')
# latitude / longitude are added as all-NaN placeholder columns.
skynews_tweets['latitude'] = np.nan; skynews_tweets['longitude'] = np.nan
# Positional reorder before the rename below. NOTE(review): the indices assume the
# tweet CSV column order written by this notebook followed by full_name, latitude,
# longitude (positions 13-15); in_reply_to_user_id (3) and geo (5) are dropped.
skynews_tweets = skynews_tweets.iloc[:,[0,1,2,4,12,6,11,9,10,7,8,15,14,13]]
skynews_tweets.columns = legacy_tweets.columns

# Same treatment for users: match the column names of the legacy users table.
legacy_users = pd.read_parquet("./../data/raw/users.parquet")
users = pd.read_csv("./../data/processed/tmp/users_replies_skynews.csv")
# Eight extra columns are appended (in the order written, left to right), all set to None.
# Presumably these are legacy columns the collected CSV has no data for; confirm against users.parquet.
users['favouritesCount'] = users['mediaCount'] = users['location'] = users['protected'] = users['linkUrl'] = users['linkTcourl'] = users['profileImageUrl'] = users['profileBannerUrl'] = None
# Positional reorder to 19 columns; position 3 is selected twice, so that column
# (description, per the CSV header written by this notebook) fills two output columns.
users = users.iloc[:,[1,2,0,3,3,4,5,6,7,8,10,9,11,12,13,14,15,16,17]]
# One row per author_id, then adopt the legacy column names.
skynews_users = users.drop_duplicates('author_id')
skynews_users.columns = legacy_users.columns

In [86]:
# Display `df` for comparison with the reshaped replies.
# NOTE(review): presumably still the frame read from news_tweets_skynews.parquet and
# filtered to replyCount > 0 in the batching cell; confirm it was not reassigned since.
df

Unnamed: 0,tweetId,conversationId,userId,date,content,lang,sourceLabel,replyCount,retweetCount,likeCount,quoteCount,longitude,latitude,place
0,1387908717480448004,1387908717480448004,7587032,2021-04-29 23:16:39+00:00,All over-40s in England can now book COVID vac...,en,SkyNews Alerts - Latest,21,52,155,13,,,
1,1387905838241697797,1387905838241697797,7587032,2021-04-29 23:05:12+00:00,COVID-19: Which coronavirus variants have been...,en,SkyNews Alerts - Latest,40,11,40,0,,,
2,1387878104136396803,1387878104136396803,7587032,2021-04-29 21:15:00+00:00,Vaccine minister Nadhim Zahawi says he hopes t...,en,Twitter Media Studio,36,8,57,3,,,
3,1387875659448475651,1387875659448475651,7587032,2021-04-29 21:05:17+00:00,Three stories tonight:\n\n🔴 Ireland announces ...,en,Twitter Web App,8,4,24,0,,,
4,1387862127793352713,1387862127793352713,7587032,2021-04-29 20:11:31+00:00,Some lockdown restrictions in Ireland are to b...,en,SocialFlow,7,7,29,2,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29711,1214975841790300160,1214975841790300160,7587032,2020-01-08 18:23:08+00:00,Measles has killed nearly three times as many ...,en,SocialFlow,1,36,45,9,,,
29712,1214870102308511744,1214870102308511744,7587032,2020-01-08 11:22:58+00:00,Measles has killed nearly three times as many ...,en,SocialFlow,6,42,47,3,,,
29713,1214750469039497216,1214750469039497216,7587032,2020-01-08 03:27:35+00:00,Measles outbreak: Death toll in Democratic Rep...,en,SkyNews Alerts - Latest,9,28,28,2,,,
29714,1214066701458968577,1214066701458968577,7587032,2020-01-06 06:10:32+00:00,Healthcare workers in the Wuhan region are sti...,en,SocialFlow,5,24,32,9,,,


In [85]:
# Display the reshaped reply tweets (columns renamed to legacy_tweets.columns).
skynews_tweets

Unnamed: 0,tweetId,conversationId,userId,date,content,lang,sourceLabel,replyCount,retweetCount,likeCount,quoteCount,longitude,latitude,place
0,1383659647609884672,1383334925483446274,2780851836,2021-04-18 05:52:21+00:00,"@SkyNews No it’s not, better than housing in h...",en,Twitter for iPhone,0,0,0,0,,,
1,1383431071019143180,1383334925483446274,171898033,2021-04-17 14:44:04+00:00,@SkyNews The virus wasn't the problem it was t...,en,Twitter for Android,0,0,1,0,,,
2,1383399358050181123,1383334925483446274,1369513495,2021-04-17 12:38:03+00:00,@SkyNews @Mariebe10098426 It certainly was the...,en,Twitter for iPad,0,1,3,0,,,
3,1383394492338696201,1383334925483446274,329634252,2021-04-17 12:18:43+00:00,@SkyNews Yes and who do you think brought it i...,en,Twitter for iPhone,0,0,1,0,,,
4,1383392021553893386,1383334925483446274,8283902,2021-04-17 12:08:54+00:00,@SkyNews https://t.co/LQId1yjlWB\n\nPatel has ...,en,Twitter for Android,0,1,2,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015514,1223007718186856449,1223007243311951873,1081613910514511874,2020-01-30 22:18:57+00:00,@SkyNews Pray for short pipo,en,Twitter Web App,0,0,0,0,,,
1015515,1223007683659419649,1223007243311951873,467589704,2020-01-30 22:18:48+00:00,@SkyNews We need to stop people coming in and ...,en,Twitter for Android,1,0,1,0,,,
1015516,1223007637740183553,1223007243311951873,244541073,2020-01-30 22:18:37+00:00,@SkyNews The Winchester anyone?,en,Twitter for Android,0,0,13,0,,,
1015517,1223007627724148737,1223007243311951873,61525109,2020-01-30 22:18:35+00:00,"@SkyNews Yeh fair enough, but do you have any ...",en,Twitter for iPhone,1,0,3,0,,,


In [87]:
# Persist the reshaped reply tweets and users as parquet files.
for frame, target in (
    (skynews_tweets, './../data/processed/tmp/comments_skynews.parquet'),
    (skynews_users, './../data/processed/tmp/users_skynews.parquet'),
):
    frame.to_parquet(target)