## Set up the environment and import key libraries:

In [1]:
import requests
import os
import json
import time
import re
import jsonlines
from datetime import datetime

# Define API Endpoint & OAuth Bearer Token:
The API endpoint is the url the script will send its requests to, that Twitter's servers are listening on (defined by "endpoint"). It is currently set to access Version 2 of their Academic API.

"bearer_token" defines the confidential security token generated by Twitter, which it uses to keep track of and authorize API requests - it is what lets Twitter know that the requests are coming from Dr. Murphy's account.

In [2]:
endpoint = "https://api.twitter.com/2/tweets/search/all"
bearer_token = "AAAAAAAAAAAAAAAAAAAAACC9lQEAAAAAyYkInf7JgCsCy8W%2BqMHo8oV65Ms%3DoWBU0XvLnHMd8Pg9ZJprIsbY3gulAVazGQ55pikR1DhmNUsNY0"

# Defining our query:
"Query" can be any combination of search operators accepted by twitter's Full-Archive Search API (as found at https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query).

"start_time" and "end_time" define the time period from which you want to pull your search results. Must be formatted in UTC (ie. 2023-01-01T00\:00:00Z).

"max_tweets" sets a cap on the maximum number of results that will be returned. Can be set to whatever you want, bearing in mind the 10,000,000 tweet/month limit of the Academic API and that relatively narrow/obscure searches may not even reach the value you set.

"max_results" defines the maximum number of tweets that will be returned per individual request of the API. Best left to 500, as pagination is on and all returned tweets will be concatenated into the same "dataset.jsonl" file upon output.

In [3]:
query = '("Spot" ("Boston Dynamics" OR "Robot")) OR ("Atlas" ("Boston Dynamics" OR "Robot")) OR "BigDog" OR "Roomba" OR "iRobot" OR "ASIMO" OR "iCub" OR ("valkyrie" ("NASA" OR "robot")) OR ("Spirit" ("Mars" OR "NASA" OR "science" OR "rover")) OR ("Opportunity" ("Mars" OR "NASA" OR "science" OR "rover")) OR ("Curiosity" ("Mars" OR "NASA" OR "science" OR "rover")) OR ("Voyager" ("1" OR "2" OR "PROBE" OR "NASA" OR "space" OR "probe"))'

start_time = "2006-03-22T00:00:00Z"
end_time = "2023-04-01T00:00:00Z"

max_tweets = 100000
max_results = 500

filepath = "I:\\Nextcloud\\Twitter Scraping Pilot Project\\Jupyter Notebooks\\2wttr\\Production\\Broad Scope Topic Search"

# Defining API parameters.
We define the different types of parameters we might send to the API, and the parameters themselves. These tell the API what types of data to send back to us, and how those data fields are categorized.

The type "query" only has one type of parameter - "query" - which we defined in the last cell. It forms the backbone of our request.

The type "expansions" has several parameters:
   - "author_id": the Twitter User ID of the tweet's author.
   - "referenced_tweets.id": the Twitter ID of any referenced/quoted tweets from the tweet.
   - "geo.place_id": the Twitter ID of any places/locations specifically named in the tweet.
   - "in_reply_to_user_id": the Twitter User ID of the author of the tweet being replied to.
   - "referenced_tweets.id.author_id":
    
The type "tweet.fields" has several parameters:
   - "created_at": date/time the tweet was created.
   - "author_id": same as "author_id" above.
   - "lang": language of the tweet.
   - "entities":
   - "geo":
   - "referenced_tweets": text/content of the tweet being replied to.
   - "in_reply_to_user_id": same as "in_reply_to_user_id" above.
   - "public_metrics": numbers of likes, retweets & replies.

The type "user.fields" only has one parameter, "username" - the username of the account that created the tweet.

The types "start_time", "end_time", and "max_results" each only have one parameter, named the same as their types.

In [4]:
params = {
    'query': query,
    'expansions': 'author_id,referenced_tweets.id,geo.place_id,in_reply_to_user_id,referenced_tweets.id.author_id',
    'tweet.fields': 'created_at,author_id,lang,entities,geo,referenced_tweets,in_reply_to_user_id,public_metrics', 
    'user.fields': 'username',
    'start_time': start_time,
    'end_time': end_time,
    'max_results': max_results,
}

# Define API request header & attaches bearer token authorization to it.

In [5]:
headers = {"Authorization": "Bearer {}".format(bearer_token)}

# Initialize empty list in which to store returned tweets, until query is completed and they are written out to "dataset.jsonl".

In [6]:
tweets = [] # initialize list of tweets

# Define a function to fetch tweets

In [7]:
def fetch_tweets():
    response = requests.request("GET", endpoint, headers=headers, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    json_response = response.json()
    return json_response

# Define a function to parse the incoming tweets from the Twitter API's JSON response

In [8]:
def parse_tweets(json_response):
    if len(json_response['data']) == 0:
        return None

    user_dict = extract_user_data(json_response)
    tweets = []
    for tweet_dict in json_response['data']:
        tweet_dict = add_user_data_to_tweet(user_dict, tweet_dict)
        tweets.append(tweet_dict)
        if len(tweets) == max_tweets:  # Use max_tweets from outside the function
            break

    return tweets

# Define a function to extract the data about each user

In [9]:
def extract_user_data(json_response):
    user_dict = {}
    for user_data in json_response['includes']['users']:
        user_id = user_data['id']
        user_dict[user_id] = {'username': user_data['username'], 'name': user_data['name']}
    return user_dict

# Define a function to add the extracted data to the appropriate tweet object in the temporary dictionary

In [10]:
def add_user_data_to_tweet(user_dict, tweet_dict):
    user_id = tweet_dict['author_id']
    if user_id in user_dict:
        user_data = user_dict[user_id]
        tweet_dict['username'] = user_data['username']
        tweet_dict['name'] = user_data['name']
    return tweet_dict

# Define a function to paginate tweets

In [11]:
def paginate_tweets(json_response):
    if 'next_token' not in json_response['meta']:
        return None
    else:
        next_token = json_response['meta']['next_token'] # get next_token
        params['pagination_token'] = next_token # add pagination key to query dict
        return True

# Define a function to create an appropriate file name and strip out any illegal windows characters

In [12]:
def create_filename(query):
    # Create a filename based on the query and current date/time
    now = datetime.now().strftime('%Y-%m-%d, %H_%M')
    filename = f"{query} - {now}.jsonl"
    
    # Replace invalid characters in filename with [symbol name]
    invalid_chars = {
        "<": "LT",
        ">": "GT",
        ":": "COL",
        "\"": "QOUT",
        "/": "FS",
        "\\": "BS",
        "|": "VB",
        "?": "QUES",
        "*": "AS"
    }
    for char, name in invalid_chars.items():
        filename = filename.replace(char, f"[{name}]")
    
    return filename

# Define a function to write each tweet out into a .jsonl file

In [13]:
def save_tweets_to_file(tweets, filepath, query):
    # Create a filename based on the query and current date/time
    #filename = create_filename(query)
    filename = f"realspecific.jsonl"
    
    # Check if filepath is valid, otherwise fall back to default directory
    if not os.path.isdir(filepath):
        print(f"Invalid directory path {filepath}. Defaulting to current directory.")
        filepath = "."
    
    # Write tweets to file
    file = os.path.join(filepath, filename)
    with jsonlines.open(file, 'w') as writer:
        for tweet in tweets:
            writer.write(tweet)
    
    print(f"Saved {len(tweets)} tweets to {file}")

# Fetch and parse tweets until max_tweets reached or no more tweets to fetch

In [14]:
while len(tweets) < max_tweets:
    print("Getting tweets")
    json_response = fetch_tweets()

    parsed_tweets = parse_tweets(json_response)
    if parsed_tweets is None:
        break # end loop if no more tweets returned

    tweets += parsed_tweets

    if len(tweets) == max_tweets:
        break # end loop if max number of tweets have been added

    paginated = paginate_tweets(json_response)
    if paginated is None:
        break # end loop if no more

Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tweets
Getting tw

Exception: (429, '{"title":"Too Many Requests","detail":"Too Many Requests","type":"about:blank","status":429}')

# We print "done" to the console to indicate that the requests have finished, & save the .jsonl file of tweets to a specified location

In [None]:
print("Done")
save_tweets_to_file(tweets, filepath, query)