# Scraper for Anti-Vaxx Related Tweets from Twitter

The objective of this notebook is to scrape tweets under 6 hashtags commonly used by anti-vaxxers on Twitter. The scraped information will be cleaned and analyzed in separated notebooks.

# Load Libraries

In [1]:
import pandas as pd
import numpy as np
import json
import datetime
import tweepy
import time
import requests
import os

# Set User Defined Functions

Twitter API v2 has the ability of searching for entire conversations (including replies, quotes and retweets) by conversation ID. Since this was unavailable on Twitter API v1.1, which all of the Twitter python libraries currently use, we had to define our own functions in order to use Twitter API v2. Twitter API v1.1 will still be used through tweepy for querys unavailable on Twitter API v2 at the time of this project.

## Set Twitter Authorization Codes

In [2]:
tokens_to_use = "deborah"

In [3]:
if (tokens_to_use == "albert") :
    ACCESS_TOKEN = '1277460825599508480-7j6rqWR0NZBCPRZ71pUVyZTKwuaBwf'
    ACCESS_SECRET = 'rZ1kGLrc18GJOEBncEA4GCKwUEoQMfBpaVn8qjSXp3rxH'
    CONSUMER_KEY = 'Mvmui6zu16TEK6Q1NnJp3mWOg'
    CONSUMER_SECRET = 'UsK7AvhtSp0oBEV46Pm4sNzRYc3MAidhauTIX1K5tJKYRllgrc'
    #bearer token
    token = 'AAAAAAAAAAAAAAAAAAAAAA2AJwEAAAAAgBoQOaUAxzTzGKEE9nOMssJ01o4%3DZVpQV4BIPqPuLFlUsmjnkxOi8kv0IOgpOa8NgytniNFW0K36q3'
elif (tokens_to_use == "deborah") :
    ACCESS_TOKEN = '168323323-6hftUbBs0WYhDNmtn8c7X2sH8PvvWdri1ABKecBI'
    ACCESS_SECRET = '4cOcQpqjwKorL0jssomGTUy5nW7d81bVz6TWJp8dwlzEO'
    CONSUMER_KEY = 'WcUcc7EOc5Bw4NuHGI5cGgJZ8'
    CONSUMER_SECRET = 'xrhvRXPhaJZEbtBo9kshs3pgYYnWabPdNO5rqJy5toT7JPDY5S'
    #bearer token
    token = 'AAAAAAAAAAAAAAAAAAAAAIFxJwEAAAAAkL0GXCPub0I%2BMW9v3p1CsNBZMrQ%3DWbaCrcrY6miB8TD7E5L5Xrz6ItfTJoxUy3EPDIksC2of4OVySs'

In [4]:
# Setup access to API
def connect_to_twitter_OAuth():
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True, retry_count = 10, retry_delay = 10)
    return api


# Create API object
api = connect_to_twitter_OAuth()

In [5]:
# define sleep countdown to avoid maxing out Twitter query window
def sleep_countdown(sleep_time) :
    while sleep_time >= 0 :
        m, s = divmod(sleep_time, 60)
        h, m = divmod(m, 60)
        time_left = str(h).zfill(2) + ":" + str(m).zfill(2) + ":" + str(s).zfill(2)
        print(time_left + "\r", end = "")
        time.sleep(1)
        sleep_time -= 1

In [6]:
def create_url_query_hashtag (hashtag, max_return) :
    """URL query for main conversation only, no retweets and no quotes."""
    query = " -is:retweet -is:quote lang:en"
    tweet_fields = "tweet.fields=conversation_id,created_at,in_reply_to_user_id,public_metrics"
    expansions = "expansions=author_id,referenced_tweets.id"
    max_results = "max_results=" + str(max_return)
    # start_time = "start_time=" + datetime.datetime(year = 2020, month = 11, day = 1).isoformat() + "Z"
    # end_time = "end_time=" + datetime.datetime(year = 2020, month = 11, day = 19).isoformat() + "Z"
    url = "https://api.twitter.com/2/tweets/search/recent?query={}{}&{}&{}&{}".format(
        hashtag, query, tweet_fields, max_results, expansions#, start_time#, end_time
    )
    return url

def create_url_query_by_tweet_id (tweet_id) :
    """URL query for specific tweets only, no retweets and no quotes."""
    tweet_fields = "tweet.fields=conversation_id"
    expansions = "expansions=author_id"
#     max_results = "max_results=" + str(max_return)
#     start_time = "start_time=" + datetime.datetime(year = 2020, month = 11, day = 1).isoformat() + "Z"
#     end_time = "end_time=" + datetime.datetime(year = 2020, month = 11, day = 19).isoformat() + "Z"
    url = "https://api.twitter.com/2/tweets?ids={}&{}&{}".format(
        tweet_id, tweet_fields, expansions
    )
    return url

def create_url_conversation_id_query (id, max_return, search_type) :
    """URL query for specific conversations"""
    if (search_type == "reply") :
        search_type = " -is:retweet -is:quote"
    if (search_type == "retweet") :
        search_type = " is:retweet"
    if (search_type == "quote") :
        search_type = " is:quote"
    query = "conversation_id:" + str(id) + search_type + " lang:en"
    tweet_fields = "tweet.fields=conversation_id,author_id,created_at,in_reply_to_user_id,public_metrics,entities,referenced_tweets"
    expansions = "expansions=author_id,referenced_tweets.id"
    user_fields = "user.fields=created_at"
    max_results = "max_results=" + str(max_return)
    url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}&{}".format(
        query, tweet_fields, expansions, user_fields, max_results
    )
    return url

def create_user_lookup_url (user_id) :
    """URL query for specific tweets only, no retweets and no quotes."""
    user_fields = "user.fields=created_at,description,location,name,username,pinned_tweet_id,protected,verified,public_metrics"
    url = "https://api.twitter.com/2/users?ids={}&{}".format(
        user_id, user_fields
    )
    return url

def create_headers (bearer_token) :
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def connect_to_endpoint (url, headers) :
    response = requests.request("GET", url, headers = headers)
    if (response.status_code != 200) :
        raise Exception(response.status_code, response.text)
    return response.json()

def main (url) :
    bearer_token = token
    headers = create_headers(bearer_token)
    json_response = connect_to_endpoint(url, headers)
    return (json_response)

In [7]:
def tweet_id_to_conversation_id (tweet_id) :
    """Query to find conversation id by tweet id"""
    if (type(tweet_id) == list) :
        tweet_id = str(tweet_id).strip("[]").replace(" ", "").replace("'", "")
    else :
        tweet_id = tweet_id.tolist()
        tweet_id = str(tweet_id).strip("[]").replace(" ", "").replace("'", "")
    url = create_url_query_by_tweet_id(tweet_id)
    response = main(url)
    response = pd.json_normalize(response["data"])[["author_id", "conversation_id", "id"]]
    return (response)

def search_connections_by_conversation_id (id, max_return, search_type): #user to pass through a list of id's
    """Search replies, quotes and retweets by conversation id"""
    try :
        type(id) == list
    except :
        print("Please pass through a list.")
    else :
        id = str(id).strip('[]').replace(" ", "")
        url = create_url_conversation_id_query(id, max_return, search_type)
        return (main(url))

def search_by_hashtag (hashtag, max_return):
    """Query tweets by hashtag"""
    url = create_url_query_hashtag(hashtag, max_return)
    return (main(url))

def user_lookup (user_id) :
    """Lookup user information by user id"""
    if (type(user_id) == list) :
        user_id = str(user_id).strip("[]").replace(" ", "").replace("'", "")
    else :
        user_id = user_id.tolist()
        user_id = str(user_id).strip("[]").replace(" ", "").replace("'", "")
    url = create_user_lookup_url(user_id)
    response = main(url)
    response = pd.json_normalize(response["data"])
    return (response)

In [8]:
def get_replies(conversation_ids) :
    """Get replies associated specified conversation id"""
    count = 0
    sleep_countdown(3)
    replies = pd.DataFrame()
    r_users = pd.DataFrame()
    for i in range(len(conversation_ids)) :
        count += 1
        sleep_countdown(3)
        id = conversation_ids["conversation_id"][i]
        result = search_connections_by_conversation_id(id, max_request, "reply")
        json_keys = result.keys()
        if ("data" in json_keys) :
            d = pd.json_normalize(result["data"])
            replies = pd.concat([replies, d], ignore_index = True)
            print("Query #:", count, "Replies retrieved:", d.shape[0])
        if ("includes" in json_keys) :
            if ("users" in list(result["includes"].keys())) :
                u = pd.json_normalize(result["includes"]["users"])
                r_users = pd.concat([r_users, u], ignore_index = True)

    if (r_users.shape[0] > 0) :
        r_users = r_users[["username", "created_at", "id"]]
        r_users.columns = ["user_screen_name", "user_created_at", "author_id"]
        replies_df = replies.join(r_users.set_index("author_id"), on = "author_id")
    else :
        replies_df = pd.DataFrame(columns = ['conversation_id', 'id', 'text', 'in_reply_to_user_id', 'created_at',
                                             'author_id', 'referenced_tweets', 'public_metrics.retweet_count',
                                             'public_metrics.reply_count', 'public_metrics.like_count',
                                             'public_metrics.quote_count', 'entities.mentions',
                                             'entities.annotations', 'user_screen_name', 'user_created_at',
                                              'user_screen_name', 'user_created_at'])
        replies_df = pd.concat([replies_df, replies], ignore_index = True)
    replies_df = replies_df.rename(columns = {"in_reply_to_user_id" : "source_user_id"})
    print("Total replies retrieved:", replies_df.shape[0])
    return (replies_df[["source_user_id", "author_id", "id", "conversation_id", "referenced_tweets", "created_at", "text"]])

In [9]:
def get_quotes(conversation_ids) :
    """Get quotes associated specified conversation id"""
    count = 0
    sleep_countdown(3)
    quotes = pd.DataFrame()
    q_users = pd.DataFrame()
    for i in range(len(conversation_ids)) :
        count += 1
        sleep_countdown(3)
        id = conversation_ids["conversation_id"][i]
        result = search_connections_by_conversation_id(id, max_request, "quote")
        json_keys = result.keys()
        if ("data" in json_keys) :
            d = pd.json_normalize(result["data"])
            quotes = pd.concat([quotes, d], ignore_index = True)
            print("Query #:", count, "Quotes retrieved:", d.shape[0])
        if ("includes" in json_keys) :
            if ("users" in list(result["includes"].keys())) :
                u = pd.json_normalize(result["includes"]["users"])
                q_users = pd.concat([q_users, u], ignore_index = True)

    if (q_users.shape[0] > 0) :
        q_users = q_users[["username", "created_at", "id"]]
        q_users.columns = ["user_screen_name", "user_created_at", "author_id"]
        quotes_df = quotes.join(q_users.set_index("author_id"), on = "author_id")
    else :
        quotes_df = pd.DataFrame(columns = ['conversation_id', 'id', 'text', 'in_reply_to_user_id', 'created_at',
                                             'author_id', 'referenced_tweets', 'public_metrics.retweet_count',
                                             'public_metrics.reply_count', 'public_metrics.like_count',
                                             'public_metrics.quote_count', 'entities.mentions',
                                             'entities.annotations', 'user_screen_name', 'user_created_at',
                                              'user_screen_name', 'user_created_at'])
        quotes_df = pd.concat([quotes_df, quotes], ignore_index = True)
    quotes_df = quotes_df.rename(columns = {"in_reply_to_user_id" : "source_user_id"})
    print("Total quotes retrieved:", quotes_df.shape[0])
    return (quotes_df[["source_user_id", "author_id", "id", "conversation_id", "referenced_tweets", "created_at", "text"]])

In [10]:
#retweets search not working in API v2, so go back to API v1.1 to fetch retweets
def get_retweets (conversation_ids) :
    """Get retweets associated specified conversation id"""
    count = 0
    retweets_df = pd.DataFrame()
    convo_ids = []
    source_user_id = []
    for i in range(len(conversation_ids)) :
        sleep_countdown(3)
        count += 1
        id = conversation_ids["conversation_id"][i]
        try :
            source_user = api.get_status(id).user.id
            result = api.retweets(id, max_request)
            tweets_list = [[tweet.text, tweet.created_at, tweet.id_str, tweet.user.screen_name, tweet.user.id_str, tweet.user.location, tweet.user.url, tweet.user.verified, tweet.user.followers_count, tweet.user.friends_count, tweet.user.listed_count, tweet.user.created_at, tweet.in_reply_to_status_id_str, tweet.in_reply_to_screen_name, tweet.entities] for tweet in result]
            tweets_df = pd.DataFrame(tweets_list)
            print("Query #:", count, "Retweets retrieved:", tweets_df.shape[0])
            retweets_df = pd.concat([retweets_df, tweets_df], ignore_index = True)
            l = tweets_df.shape[0]
            if (l > 0) :
                c = [id] * l
                convo_ids.extend(c)
                u = [source_user] * l
                source_user_id.extend(u)
        except :
            print("Error occurred when fetching conversation id:", id)
            pass
    if (retweets_df.shape[0] > 0) :
        retweets_df.columns = ["text", "created_at", "id", "user_screen_name", "author_id", "user_location", "user_url", "user_verified", "user_followers_count", "user_friends_count", "user_listed_count", "user_created_at", "in_reply_to_status_id_str", "in_reply_to_screen_name", "entities"]
        #twitter API v1.1 does not support conversation ID, manually include in results
        retweets_df["conversation_id"] = convo_ids
        retweets_df["source_user_id"] = source_user_id
    else :
        retweets_df = pd.DataFrame(columns = ["source_user_id", "author_id", "id", "conversation_id", "created_at", "text"])
    return(retweets_df[["source_user_id", "author_id", "id", "conversation_id", "created_at", "text"]])

In [11]:
# #fetch referenced tweets
# def get_ref_tweets (conversation_ids) :
#    """Get reference tweets associated specified conversation id"""
#     count = 0
#     ref_tweets_df = pd.DataFrame()
#     convo_ids = []
#     source_user_id = []
#     for i in range(len(conversation_ids)) :
#         sleep_countdown(3)
#         count += 1
#         id = conversation_ids["id"][i]
# #         print("id:", id)
#         source_user = conversation_ids["author_id"][i]
#         tweet = api.get_status(id, max_request)
#         tweets_list = [tweet.text, tweet.created_at, tweet.id_str, tweet.user.screen_name, tweet.user.id_str, tweet.user.location, tweet.user.url, tweet.user.verified, tweet.user.followers_count, tweet.user.friends_count, tweet.user.listed_count, tweet.user.created_at, tweet.in_reply_to_status_id_str, tweet.in_reply_to_screen_name, tweet.entities]
#         tweets_df = pd.DataFrame(tweets_list).transpose()
# #         print("Query #:", count, "Referenced tweets retrieved:", tweets_df.shape[0])
#         ref_tweets_df = pd.concat([ref_tweets_df, tweets_df], ignore_index = True)
#         l = tweets_df.shape[0]
#         if (l > 0) :
#             c = [id] * l
#             convo_ids.extend(c)
#             u = [source_user] * l
#             source_user_id.extend(u)
#     if (ref_tweets_df.shape[0] > 0) :
#         ref_tweets_df.columns = ["text", "created_at", "id", "user_screen_name", "author_id", "user_location", "user_url", "user_verified", "user_followers_count", "user_friends_count", "user_listed_count", "user_created_at", "in_reply_to_status_id_str", "in_reply_to_screen_name", "entities"]
#         #twitter API v1.1 does not support conversation ID, manually include in results
#         ref_tweets_df["conversation_id"] = convo_ids
#         ref_tweets_df["source_user_id"] = source_user_id
#     else :
#         ref_tweets_df = pd.DataFrame(columns = ["source_user_id", "author_id", "id", "conversation_id", "created_at", "text"])
#     return(ref_tweets_df[["source_user_id", "author_id", "id", "conversation_id", "created_at", "text"]])

In [12]:
def get_activity (df) :
    """Separate tweets by action of reply, quote or retweet"""
    result = pd.DataFrame()
    for i in df.index :
        tweet_id = np.nan
        replied_to_id = np.nan
        quoted_id = np.nan
        row = df.iloc[i]
        ref = row["referenced_tweets"]
        l = len(ref)
        tweet_id = row["id"]
        retweeted_id = np.nan
        for j in range(l) :
            act = ref[j]["type"]
            if (act == "replied_to") :
                replied_to_id = ref[j]["id"]
            elif (act == "quoted") :
                quoted_id = (ref[j]["id"])
        l = [tweet_id, replied_to_id, quoted_id, retweeted_id]
        result = result.append([l], ignore_index = True)
    result.columns = ["id", "replied_to_tweet_id", "quoted_tweet_id", "retweeted_id"]
    result.set_index("id", inplace = True)
    return (result)

def get_activity2 (df) :
    """Separate tweets by action of reply, quote or retweet"""
    result = pd.DataFrame()
    for i in df.index :
        tweet_id = np.nan
        replied_to_id = np.nan
        quoted_id = np.nan
        row = df.iloc[i]
        ref = row["referenced_tweets"]
        tweet_id = row["id"]
        retweeted_id = np.nan
        if (pd.isna(ref)) :
            replied_to_id = np.nan
            quoted_id = np.nan
        else :
            l = len(ref)
            for j in range(l) :
                act = ref[j]["type"]
                if (act == "replied_to") :
                    replied_to_id = ref[j]["id"]
                elif (act == "quoted") :
                    quoted_id = (ref[j]["id"])
        act_list = [tweet_id, replied_to_id, quoted_id, retweeted_id]
        result = result.append([act_list], ignore_index = True)
    result.columns = ["id", "replied_to_tweet_id", "quoted_tweet_id", "retweeted_id"]
    result.set_index("id", inplace = True)
    return (result)

In [13]:
def get_engaged_user_ids(var_list) :
    """Get user ids"""
    result = pd.DataFrame()
    activity = []
    for var in var_list :
        l = len(globals()[var])
        if (l > 0) :
            activity.extend([var] * l)
            result = pd.concat([result, globals()[var]], ignore_index = True)
    result["activity_type"] = activity
    return (result)

# Data Scraping

The following searches where done everyday from Dec 18th to 24th, 2020 to scrape for anti-vaxx related tweets.

## Get the Most Recent Tweets by Common Anti-Vaxx Hashtags

In [None]:
#define hashtags to pull
hashtags_to_query = ["%23novax", "%23antivax", "%23CDCwhistleblower", "%23vaccineinjury", "%23vaxxed", "%23cdcfraud"]

#set number of maximum number of results for each request
max_request = 100

var_list = []
for i in range(len(hashtags_to_query)) :
    scraped = "json_tweets_" + (hashtags_to_query[i][3:]).replace(" ", "")
    globals()[scraped] = search_by_hashtag(hashtags_to_query[i], max_request)
    var_list.append(scraped)
    print(hashtags_to_query[i][3:], "tweets: ", globals()[scraped]["meta"]["result_count"])
print(var_list)

In [None]:
#concat results into a single dataframe
results_data = pd.DataFrame()
results_user = pd.DataFrame()

for var in var_list :
    results_data = pd.concat([results_data, pd.json_normalize(globals()[var]["data"])], ignore_index = False)
    results_user = pd.concat([results_user, pd.json_normalize(globals()[var]["includes"]["users"])], ignore_index = False)
    
results = results_data.join(results_user.set_index("id"), on = "author_id").drop_duplicates("id")

In [None]:
#filter for top tweets by different users
# results_filtered = results.sort_values("public_metrics.reply_count", ascending = False).drop_duplicates("author_id", keep = "first")
# results_filtered = results[(results["public_metrics.retweet_count"] > 0) & (results["public_metrics.reply_count"] > 0)]
results_filtered = results.sort_values("public_metrics.reply_count", ascending = False)
results_filtered = results_filtered.reset_index(drop = True)
print("Total tweets after filtering:", results_filtered.shape[0])

In [None]:
#get activities and clean up dataframe
results_filtered = results_filtered.join(get_activity2(results_filtered), on = "id")
results_filtered = results_filtered.drop("referenced_tweets", axis = 1)
results_filtered = results_filtered.drop_duplicates()

In [None]:
# results_filtered.head()

In [None]:
#export
results_filtered.to_csv("output/2020-12-24 Tweets/initial_results_filtered.csv", index = False)

## Get Replies and Retweets for Top Users' Most Recent Tweets

### Get Conversation IDs for Recent Tweets

In [None]:
#get a list of tweet ids
top_recent_tweet_ids = results_filtered["id"].tolist() + results_filtered["replied_to_tweet_id"].tolist() + results_filtered["quoted_tweet_id"].tolist() + results_filtered["retweeted_id"].tolist()
top_recent_tweet_ids = pd.Series(top_recent_tweet_ids).drop_duplicates().dropna().tolist()

In [None]:
#fetch conversations ids of recent tweets by top user
conversation_id_recent_tweets = pd.DataFrame()
for i in list(range(len(top_recent_tweet_ids)))[::100] :
    start = 1
    if (start + 100 < len(top_recent_tweet_ids)) :
        end = start + 100
    else :
        end = len(top_recent_tweet_ids)
    results = tweet_id_to_conversation_id(top_recent_tweet_ids[start:end])
    conversation_id_recent_tweets = pd.concat([conversation_id_recent_tweets, results], ignore_index = True)

In [None]:
conversation_id_recent_tweets = conversation_id_recent_tweets.drop_duplicates("conversation_id").reset_index(drop = True)
print("Conversations to retrieve:", conversation_id_recent_tweets.shape[0])

In [None]:
# # #get tweets and related information on replied to, quoted from or retweeted from tweets
# references = get_ref_tweets(conversation_id_recent_tweets)

In [None]:
#fetch replies from conversation ids
replies = get_replies(conversation_id_recent_tweets)

In [None]:
#fetch quotes from conversation ids
quotes = get_quotes(conversation_id_recent_tweets)

In [None]:
#fetch retweets from conversation ids
retweets = get_retweets(conversation_id_recent_tweets)

In [None]:
#reset indices
replies = replies.reset_index(drop = True)
quotes = quotes.reset_index(drop = True)
retweets = retweets.reset_index(drop = True)

In [None]:
#get activity connections and clean up
replies_activity = get_activity(replies)
replies = replies.join(replies_activity, on = "id")
replies = replies.drop("referenced_tweets", axis = 1)
replies = replies.drop_duplicates()

quotes_activity = get_activity(quotes)
quotes = quotes.join(quotes_activity, on = "id")
quotes = quotes.drop("referenced_tweets", axis = 1)
quotes = quotes.drop_duplicates()

l = retweets.shape[0]
retweets["replied_to_tweet_id"] = [np.nan] * l
retweets["quoted_tweet_id"] = [np.nan] * l
retweets["retweeted_id"] = retweets["conversation_id"]
retweets = retweets.drop_duplicates()

# l = references.shape[0]
# references["replied_to_tweet_id"] = [np.nan] * l
# references["quoted_tweet_id"] = [np.nan] * l
# references["retweeted_id"] = references["conversation_id"]
# references = references.drop_duplicates()

In [None]:
#export files
# references.to_csv("output/2020-12-24 Tweets/references.csv", index = False)
replies.to_csv("output/2020-12-24 Tweets/replies.csv", index = False)
quotes.to_csv("output/2020-12-24 Tweets/quotes.csv", index = False)
retweets.to_csv("output/2020-12-24 Tweets/retweets.csv", index = False)

In [None]:
#concat results and export to csv
total_results = pd.concat([replies, quotes, retweets], ignore_index = True)
total_results.to_csv("output/2020-12-24 Tweets/total_results.csv", float_format = str, index = False)

In [None]:
total_results.head()

In [None]:
#get all replied to, quoted and retweeted tweets ids
engagement_tweet_ids = total_results["replied_to_tweet_id"].tolist() + total_results["quoted_tweet_id"].tolist() + total_results["retweeted_id"].tolist()

#drop NAs and duplicates
engagement_tweet_ids = pd.Series(engagement_tweet_ids).dropna().drop_duplicates()

#get list of fetched tweets
fetched_tweets = np.union1d(total_results["id"], total_results["conversation_id"])

#get a list if tweet ids not yet seen in previous fetches
more_tweet_ids = engagement_tweet_ids[~engagement_tweet_ids.isin(fetched_tweets)]

In [None]:
more_tweet_ids

### Get 2nd Degree Tweets

In [None]:
#fetch conversations ids of newly found referenced tweets
conversation_id_recent_tweets = pd.DataFrame()
for i in list(range(len(more_tweet_ids)))[::100] :
    start = 1
    if (start + 100 < len(more_tweet_ids)) :
        end = start + 100
    else :
        end = len(more_tweet_ids)
    results = tweet_id_to_conversation_id(more_tweet_ids[start:end])
    conversation_id_recent_tweets = pd.concat([conversation_id_recent_tweets, results], ignore_index = True)

In [None]:
conversation_id_recent_tweets = conversation_id_recent_tweets.drop_duplicates("conversation_id").reset_index(drop = True)

In [None]:
# conversation_id_recent_tweets

In [None]:
#get tweets and related information on replied to, quoted from or retweeted from tweets
# references = get_ref_tweets(conversation_id_recent_tweets)

In [None]:
#fetch replies from conversation ids
replies = get_replies(conversation_id_recent_tweets)

In [None]:
#fetch quotes from conversation ids
quotes = get_quotes(conversation_id_recent_tweets)

In [None]:
#fetch retweets from conversation ids
retweets = get_retweets(conversation_id_recent_tweets)

In [None]:
#clean up
# references = references.reset_index(drop = True)
replies = replies.reset_index(drop = True)
quotes = quotes.reset_index(drop = True)
retweets = retweets.reset_index(drop = True)

In [None]:
#get activity connections and clean up
replies_activity = get_activity(replies)
replies = replies.join(replies_activity, on = "id")
replies = replies.drop("referenced_tweets", axis = 1)
replies = replies.drop_duplicates()

quotes_activity = get_activity(quotes)
quotes = quotes.join(quotes_activity, on = "id")
quotes = quotes.drop("referenced_tweets", axis = 1)
quotes = quotes.drop_duplicates()

l = retweets.shape[0]
retweets["replied_to_tweet_id"] = [np.nan] * l
retweets["quoted_tweet_id"] = [np.nan] * l
retweets["retweeted_id"] = retweets["conversation_id"]
retweets = retweets.drop_duplicates()

# l = references.shape[0]
# references["replied_to_tweet_id"] = [np.nan] * l
# references["quoted_tweet_id"] = [np.nan] * l
# references["retweeted_id"] = references["conversation_id"]
# references = references.drop_duplicates()

In [None]:
#export files
# references.to_csv("output/2020-12-24 Tweets/extended_references1.csv", index = False)
replies.to_csv("output/2020-12-24 Tweets/extended_replies1.csv", index = False)
quotes.to_csv("output/2020-12-24 Tweets/extended_quotes1.csv", index = False)
retweets.to_csv("output/2020-12-24 Tweets/extended_retweets1.csv", index = False)

In [None]:
#concat extended results and export to csv
extended_results1 = pd.concat([replies, quotes, retweets], ignore_index = True) #, references
extended_results1.to_csv("output/2020-12-24 Tweets/extended_results1.csv", float_format = str, index = False)

In [None]:
extended_results1.head()

In [None]:
#get all replied to, quoted and retweeted tweets ids
engagement_tweet_ids = extended_results1["replied_to_tweet_id"].tolist() + extended_results1["quoted_tweet_id"].tolist() + extended_results1["retweeted_id"].tolist()

#drop NAs and duplicates
engagement_tweet_ids = pd.Series(engagement_tweet_ids).dropna().drop_duplicates()

#get list of fetched tweets
fetched_tweets = np.union1d(np.union1d(fetched_tweets, more_tweet_ids), np.union1d(extended_results1["id"], extended_results1["conversation_id"]))

#get a list if tweet ids not yet seen in previous fetches
more_tweet_ids = engagement_tweet_ids[~engagement_tweet_ids.isin(fetched_tweets)]

In [None]:
more_tweet_ids

### Get 3rd Degree Tweets

In [None]:
#fetch conversations ids of newly found referenced tweets
conversation_id_recent_tweets = pd.DataFrame()
for i in list(range(len(more_tweet_ids)))[::100] :
    start = 1
    if (start + 100 < len(more_tweet_ids)) :
        end = start + 100
    else :
        end = len(more_tweet_ids)
    results = tweet_id_to_conversation_id(more_tweet_ids[start:end])
    conversation_id_recent_tweets = pd.concat([conversation_id_recent_tweets, results], ignore_index = True)

In [None]:
conversation_id_recent_tweets = conversation_id_recent_tweets.drop_duplicates("conversation_id").reset_index(drop = True)

In [None]:
# conversation_id_recent_tweets

In [None]:
# #get tweets and related information on replied to, quoted from or retweeted from tweets
# references = get_ref_tweets(conversation_id_recent_tweets)

In [None]:
#fetch replies from conversation ids
replies = get_replies(conversation_id_recent_tweets)

In [None]:
#fetch quotes from conversation ids
quotes = get_quotes(conversation_id_recent_tweets)

In [None]:
#fetch retweets from conversation ids
retweets = get_retweets(conversation_id_recent_tweets)

In [None]:
#clean up
# references = references.reset_index(drop = True)
replies = replies.reset_index(drop = True)
quotes = quotes.reset_index(drop = True)
retweets = retweets.reset_index(drop = True)

In [None]:
#get activity connections and clean up
replies_activity = get_activity(replies)
replies = replies.join(replies_activity, on = "id")
replies = replies.drop("referenced_tweets", axis = 1)
replies = replies.drop_duplicates()

quotes_activity = get_activity(quotes)
quotes = quotes.join(quotes_activity, on = "id")
quotes = quotes.drop("referenced_tweets", axis = 1)
quotes = quotes.drop_duplicates()

l = retweets.shape[0]
retweets["replied_to_tweet_id"] = [np.nan] * l
retweets["quoted_tweet_id"] = [np.nan] * l
retweets["retweeted_id"] = retweets["conversation_id"]
retweets = retweets.drop_duplicates()

# l = references.shape[0]
# references["replied_to_tweet_id"] = [np.nan] * l
# references["quoted_tweet_id"] = [np.nan] * l
# references["retweeted_id"] = references["conversation_id"]
# references = references.drop_duplicates()

In [None]:
#export files
# references.to_csv("output/2020-12-24 Tweets/extended_references2.csv", index = False)
replies.to_csv("output/2020-12-24 Tweets/extended_replies2.csv", index = False)
quotes.to_csv("output/2020-12-24 Tweets/extended_quotes2.csv", index = False)
retweets.to_csv("output/2020-12-24 Tweets/extended_retweets2.csv", index = False)

In [None]:
#concat extended results and export to csv
extended_results2 = pd.concat([replies, quotes, retweets], ignore_index = True) #, references
extended_results2.to_csv("output/2020-12-24 Tweets/extended_results2.csv", float_format = str, index = False)

In [None]:
extended_results2.head()

In [None]:
#get all replied to, quoted and retweeted tweets ids
engagement_tweet_ids = extended_results2["replied_to_tweet_id"].tolist() + extended_results2["quoted_tweet_id"].tolist() + extended_results2["retweeted_id"].tolist()

#drop NAs and duplicates
engagement_tweet_ids = pd.Series(engagement_tweet_ids).dropna().drop_duplicates()

#get list of fetched tweets
fetched_tweets = np.union1d(np.union1d(fetched_tweets, more_tweet_ids), np.union1d(extended_results2["id"], extended_results2["conversation_id"]))

#get a list if tweet ids not yet seen in previous fetches
more_tweet_ids = engagement_tweet_ids[~engagement_tweet_ids.isin(fetched_tweets)]

In [None]:
more_tweet_ids

From our experience, after the 3rd degree fetch, there are no more extended replies, quotes or retweets. If there are, the above process will be repeated until no further extended tweets are gathered.

# Putting Connections Together

In [None]:
#adjust the initial top user recent tweets in preparation to consolidate the dataframe with all retrieved results
results_filtered["source_user_id"] = results_filtered["replied_to_tweet_id"]

In [None]:
#consolidate all results and clean up
final_results = pd.concat([results_filtered[total_results.columns], total_results, extended_results1, extended_results2, extended_results3], ignore_index = True)
final_results = final_results.drop_duplicates()

In [None]:
#display issue with replied_to_tweet_id's tweet id being used as a user_id in the source_user_id column
#this seemed to be an issue with twitter
errors = final_results[final_results["replied_to_tweet_id"] == final_results["source_user_id"]]
errors

In [None]:
for i in errors.index :
    final_results.loc[i, "source_user_id"] = final_results.loc[i, "author_id"]

In [None]:
#export final results to csv
final_results.to_csv("output/2020-12-24 Tweets/final_results.csv", float_format = str, index = False)

# Creating Edges

In [None]:
edges = final_results[["source_user_id", "author_id", "text"]]
edges.columns = ["source", "target", "weight"]
edges = edges.dropna()
edges = edges.groupby(["source", "target"]).count().reset_index()
edges = edges.sort_values("weight", ascending = False).reset_index(drop = True)

In [None]:
edges

In [None]:
#export edges
edges.to_csv("output/2020-12-24 Tweets/edges.csv", index = False)

# Retrieve User Information

In [None]:
all_users = final_results["author_id"].tolist() + final_results["source_user_id"].tolist()
all_users = pd.Series(all_users).drop_duplicates().tolist()

In [None]:
len(all_users)

In [None]:
nodes = pd.DataFrame()

for i in list(range(len(all_users)))[::100] :
    sleep_countdown(3)
    start = i
    if (start + 100 < len(all_users)) :
        end = start + 100
    else :
        end = len(all_users)
    try :
        results = user_lookup(all_users[start:end])
    except :
        print("Error in batch:", start, "to", end)
        pass
    nodes = pd.concat([nodes, results], ignore_index = True)

nodes = nodes.reset_index(drop = True)

In [None]:
nodes.shape

In [None]:
#export edges
nodes.to_csv("output/2020-12-24 Tweets/nodes.csv", index = False)

# Additional Tweet Scraping For All Referenced Tweets

In the process of our analysis, we realized that there are some missing tweets. They are either referenced tweets (tweet ids being replied to, quoted or retweeted). The following is an attempt to retrieve them. Unfortunately, some of these tweets and users were already deleted/banned by Twitter by the time we tried to retrieve them.

## User Defined Functions to Retrieve Additional Tweets

In [14]:
def create_url_query_by_tweet_id2 (tweet_id) :
    """URL query for specific tweets only, no retweets and no quotes."""
    tweet_fields = "tweet.fields=conversation_id,author_id,created_at,in_reply_to_user_id,public_metrics,entities,referenced_tweets"
    expansions = "expansions=author_id,referenced_tweets.id"
    user_fields = "user.fields=created_at"
#     max_results = "max_results=" + str(max_return)
#     start_time = "start_time=" + datetime.datetime(year = 2020, month = 11, day = 1).isoformat() + "Z"
#     end_time = "end_time=" + datetime.datetime(year = 2020, month = 11, day = 19).isoformat() + "Z"
    url = "https://api.twitter.com/2/tweets?ids={}&{}&{}&{}".format(
        tweet_id, tweet_fields, expansions, user_fields
    )
    return url

In [1]:
def search_by_tweet_id (tweet_id) :
    """Get tweets by tweet id"""
    if (type(tweet_id) == list) :
        tweet_id = str(tweet_id).strip("[]").replace(" ", "").replace("'", "")
    else :
        tweet_id = tweet_id.tolist()
        tweet_id = str(tweet_id).strip("[]").replace(" ", "").replace("'", "")
    url = create_url_query_by_tweet_id2(tweet_id)
    response = main(url)
    response = pd.json_normalize(response["data"])
    return (response)

## Load Combined Data

Load previously fetched data to determine which tweets are missing.

In [16]:
edges_tweet = pd.read_csv("output/Combined Tweets/2020-12-24 edges_tweet.csv", dtype = str)
nodes_tweet = pd.read_csv("output/Combined Tweets/2020-12-24 nodes_tweet.csv", dtype = str)

In [17]:
# get a list of all tweet ids included in the data
tweet_ids = edges_tweet["Source"].append(edges_tweet["Target"])
tweet_ids = tweet_ids.drop_duplicates()
tweet_ids = pd.DataFrame(tweet_ids, columns = ["id"])

In [18]:
# get a list of tweet ids without details
missing_tweets = tweet_ids[~tweet_ids["id"].isin(nodes_tweet["Name"].tolist())]
missing_tweets = missing_tweets["id"].tolist()

## Get Missing Tweets

In [19]:
max_results = 100

missing_tweets_results = pd.DataFrame()
for i in list(range(len(missing_tweets)))[::100] :
    start = 1
    if (start + 100 < len(missing_tweets)) :
        end = start + 100
    else :
        end = len(missing_tweets)
    results = search_by_tweet_id(missing_tweets[start:end])
    missing_tweets_results = pd.concat([missing_tweets_results, results], ignore_index = True)

In [20]:
# rename columns to match existing data
missing_tweets_results = missing_tweets_results.rename(columns = {"in_reply_to_user_id" : "source_user_id"})
missing_tweets_results = missing_tweets_results[["source_user_id", "author_id", "id", "conversation_id", "text"]]

In [21]:
missing_tweets_results.head()

Unnamed: 0,source_user_id,author_id,id,conversation_id,text
0,,19098792,1338829441112428544,1338829441112428544,‘It's a lot better than getting sick ’: Trial ...
1,,1041258824147976192,1339553106284212224,1339553106284212224,I think that its hilarious that people believe...
2,1.2717694417480745e+18,1269662880762343424,1339655590138798081,1339646731559067656,@sheepskindee @AndyInLondon1 @Nigel_Farage Per...
3,,42836999,1338838777616363520,1338838777616363520,"There are more than 10,000 active COVID-19 cas..."
4,,1249607420508286977,1338881028908326914,1338881028908326914,#Canada creates a Vaccine Injury Compensation ...


## Retrieving User IDs

In [23]:
# get a list of author ids to fetch user informations
all_users = missing_tweets_results["author_id"]
all_users = pd.Series(all_users).drop_duplicates().tolist()

In [24]:
# get a user information from missing tweets list
nodes = pd.DataFrame()

for i in list(range(len(all_users)))[::100] :
    sleep_countdown(3)
    start = i
    if (start + 100 < len(all_users)) :
        end = start + 100
    else :
        end = len(all_users)
    try :
        results = user_lookup(all_users[start:end])
    except :
        print("Error in batch:", start, "to", end)
        pass
    nodes = pd.concat([nodes, results], ignore_index = True)

nodes = nodes.reset_index(drop = True)

00:00:00

In [25]:
nodes.shape

(70, 13)

In [27]:
#export missing tweet results
missing_tweets_results.to_csv("output/2021-01-09 Missing Tweets/missing_tweet_results.csv", index = False)
nodes.to_csv("output/2021-01-09 Missing Tweets/missing_nodes.csv", index = False)