In [None]:
### Code used to collect Tweets via the Twitter API

import pathlib
import tweepy
import pandas as pd
import time


def extract_entities(records):
    """Split the hashtag and URL entities of each tweet into two tables.

    Args:
        records: Iterable of tweet objects exposing ``id`` and ``entities``
            attributes, where ``entities`` is either ``None`` or a dict that
            may hold ``"urls"`` and ``"hashtags"`` lists. ``None`` (e.g. the
            data of an empty result page) is treated as "no tweets".

    Returns:
        A ``(hashtags_df, urls_df)`` tuple of DataFrames with one row per
        entity; the columns are ``["tweet_id", "hashtag"]`` and
        ``["tweet_id", "url"]`` respectively.
    """
    hashtags = []
    urls = []

    for tweet in records or []:
        # Tweets without entities report None; read through a local fallback
        #  instead of overwriting the attribute on the caller's object.
        entities = tweet.entities or {}

        for url_json in entities.get("urls", []):
            urls.append((tweet.id, url_json["expanded_url"]))

        for hashtag_json in entities.get("hashtags", []):
            hashtags.append((tweet.id, hashtag_json["tag"]))

    return (
        pd.DataFrame(hashtags, columns=["tweet_id", "hashtag"]),
        # The second column holds expanded URLs, so label it as such
        pd.DataFrame(urls, columns=["tweet_id", "url"]),
    )


def extract_referenced(records):
    """Build a table linking each tweet to the tweets it references.

    Args:
        records: Iterable of tweet objects exposing an ``id`` attribute and a
            ``data`` dict; the optional ``"referenced_tweets"`` entry of that
            dict is a list of dicts with ``"type"`` and ``"id"`` keys.

    Returns:
        A DataFrame with columns ``tweet_id``, ``reference_type`` and
        ``reference_id``, one row per reference. Tweets without a
        ``"referenced_tweets"`` entry contribute no rows.
    """
    column_names = ["tweet_id", "reference_type", "reference_id"]
    rows = [
        (tweet.id, ref["type"], ref["id"])
        for tweet in records
        for ref in tweet.data.get("referenced_tweets", [])
    ]
    return pd.DataFrame(rows, columns=column_names)


# Replace the placeholder with a real bearer token before running
BEARER_TOKEN = "YOUR BEARER TOKEN HERE"
# wait_on_rate_limit=True makes tweepy wait for the rate-limit window to reset
#  instead of raising TooManyRequests, so a long pull is not aborted part-way
#  through (nothing is written to disk until every page has been fetched).
client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)

# Arguments for the request
# Details for building query parameters can be found here:
#  https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all
# Change the query parameters for each API pull.
# This query matches English tweets tagged #vaccinated, leaves out the listed
#  flu/HPV/smallpox keywords, and excludes retweets, quote tweets and
#  nullcast tweets.
query = """(#vaccinated) -flu -flushot -fluvaccine -hpv -influenza -smallpox -is:retweet -is:quote -is:nullcast lang:en"""

# Time window of the search, formatted as YYYY-MM-DDTHH:mm:ssZ
start_time = "2021-12-14T00:00:00Z"
end_time = "2022-01-01T00:00:00Z"

# Related objects (authors, places, media) to include alongside each tweet
# https://developer.twitter.com/en/docs/twitter-api/expansions
expansions = ["author_id", "geo.place_id", "attachments.media_keys"]

# Fields requested for each object type
# https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model
place_fields = ["country", "country_code"]
media_fields = ["media_key", "type", "preview_image_url", "public_metrics"]
user_fields = [
    "id",
    "name",
    "username",
    "created_at",
    "description",
    "profile_image_url",
    "public_metrics",
    "url",
    "verified",
]
tweet_fields = [
    "attachments",
    "author_id",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "id",
    "in_reply_to_user_id",
    "lang",
    "public_metrics",
    "possibly_sensitive",
    "referenced_tweets",
    "reply_settings",
    "source",
    "text",
]

# Iterating the paginator yields one response (page of results) at a time
paginator = tweepy.Paginator(
    client.search_all_tweets,
    query=query,
    start_time=start_time,
    end_time=end_time,
    expansions=expansions,
    place_fields=place_fields,
    user_fields=user_fields,
    media_fields=media_fields,
    tweet_fields=tweet_fields,
    # Tweets requested per page; see the API reference above for the allowed range
    max_results=500,
)

# Per-table accumulators: every page of results contributes one DataFrame to
#  each list it has data for; the lists are concatenated once the pull is done.
frames = {
    "tweet": [],
    "user": [],
    "media": [],
    "place": [],
    "referenced": [],
    "hashtag": [],
    "url": []
}

# Column pd.json_normalize creates for the nested attachments.media_keys list.
# It is only present when at least one tweet on the page has attachments.
MEDIA_KEYS_COLUMN = "attachments.media_keys"

for response in paginator:
    # sleep for 1 second to avoid TooManyRequests error.
    # The next request is only sent when the loop asks the paginator for the
    #  next response, so pausing here still spaces consecutive requests out.
    time.sleep(1)

    # A page without any tweets has nothing to extract
    if not response.data:
        continue

    # Extract entities (hashtags, urls), and referenced_tweets as their own dataframes
    hashtags_df, urls_df = extract_entities(response.data)
    referenced_df = extract_referenced(response.data)

    # Remove entities and referenced tweets from the response data
    #  we have already "saved" it in the above 2 lines
    tweet_records = [tweet.data for tweet in response.data]
    for tweet_data in tweet_records:
        tweet_data.pop("entities", None)
        tweet_data.pop("referenced_tweets", None)

    # Convert the json data for dataframe
    tweet_df = pd.json_normalize(tweet_records)

    # Get the many to many relationship between id and media keys.
    # When no tweet on this page has attachments the column is missing, so
    #  fall back to an empty link table instead of raising a KeyError.
    if MEDIA_KEYS_COLUMN in tweet_df.columns:
        tweet_media_links = (
            tweet_df[["id", MEDIA_KEYS_COLUMN]]
            .explode(MEDIA_KEYS_COLUMN)
            .rename(columns={"id": "tweet_id", MEDIA_KEYS_COLUMN: "media_key"})
        )
        # Drop media_keys from tweet_df
        tweet_df = tweet_df.drop(MEDIA_KEYS_COLUMN, axis=1)
    else:
        tweet_media_links = pd.DataFrame(columns=["tweet_id", "media_key"])

    # Sometimes the response may not contain any user objects
    users_df = None
    if "users" in response.includes:
        users_df = pd.json_normalize([user.data for user in response.includes["users"]])

    # Sometimes the response doesn't contain any media objects:
    media_df = None
    if "media" in response.includes:
        # Store the media_keys along with their corresponding tweet_ids into
        #  the media_df.
        # This means tweet_df will NOT have a media_key column, and instead
        #  media_df will contain the mappings to tweet_df ids.
        media_df = (
            pd.json_normalize([media.data for media in response.includes["media"]])
            .merge(tweet_media_links, on="media_key")
        )

    # Sometimes the response doesn't contain any place objects
    place_df = None
    if "places" in response.includes:
        place_df = pd.json_normalize([place.data for place in response.includes["places"]])

    # Append the frames from the current response to our dictionary,
    #  so we can combine everything later
    frames["tweet"].append(tweet_df)
    frames["hashtag"].append(hashtags_df)
    frames["url"].append(urls_df)
    frames["referenced"].append(referenced_df)
    if users_df is not None:
        frames["user"].append(users_df)
    if media_df is not None:
        frames["media"].append(media_df)
    if place_df is not None:
        frames["place"].append(place_df)

# Create an output data folder next to this script.
# __file__ is not defined when the code runs interactively (e.g. in a notebook
#  cell), so fall back to the current working directory in that case.
try:
    base_path = pathlib.Path(__file__).parent
except NameError:
    base_path = pathlib.Path.cwd()
data_path = base_path / "data"
data_path.mkdir(exist_ok=True)

# Empty out old results from data folder
for out_file in data_path.glob("*.csv"):
    out_file.unlink()

# Combine the dataframe lists from our frames dictionary,
#  then save those dataframes inside of the data folder.
# Tables that collected no frames are skipped, so no CSV is written for them.
for key, frame_list in frames.items():
    if frame_list:
        df = pd.concat(frame_list)
        df.to_csv(data_path / f"{key}.csv", index=False)
