Purpose: Collect historical tweets with the full-archive search endpoint of the Twitter API (Academic Research access). For a given year, collect at most 100,000 tweets per month and write them to a CSV file.

In [None]:
import requests
import os
import json
import csv
import dateutil.parser
import time 
import pandas as pd 
from datetime import datetime

In [None]:
def auth():
    """Return the Twitter API bearer token taken from the environment.

    Looks up the ``TWITTER_BEARER_TOKEN`` environment variable. The result is
    ``None`` when the variable is not set.
    """
    token = os.environ.get('TWITTER_BEARER_TOKEN')
    return token


In [None]:
def create_headers(bearer_token):
    """Build the request headers dict holding the bearer-token Authorization.

    Args:
        bearer_token: Token inserted after the ``Bearer `` prefix.

    Returns:
        A dict with a single ``Authorization`` entry.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

    

In [None]:
#Define parameters 

def create_params(start_date, end_date, max_results):
    """Build the query parameters for one tweet-search request.

    Args:
        start_date: Value sent as ``start_time`` (the callers in this file
            pass ISO 8601 UTC strings such as ``'2014-01-01T00:00:00Z'``).
        end_date: Value sent as ``end_time``, same format as ``start_date``.
        max_results: Page size to request. It is sent as a string, so passing
            500 yields the same ``'500'`` that used to be hard-coded here.

    Returns:
        A dict of query parameters. ``next_token`` is a ``None`` placeholder
        that the request function overwrites for each page.
    """
    query_params = {
        'query': 'bitcoin -is:retweet -is:quote -has:cashtags -has:media -has:links -has:videos -has:images -is:nullcast lang:en',
        'start_time': start_date,
        'end_time': end_date,
        # Honour the caller's page size instead of a hard-coded value.
        'max_results': str(max_results),
        'tweet.fields': 'text,created_at,public_metrics,in_reply_to_user_id',
        'user.fields': 'verified',
        # None (rather than an empty dict) is the value requests documents as
        # "leave this parameter out of the query string".
        'next_token': None,
    }
    return query_params

In [None]:
def connect_to_endpoint(url, headers, params, next_token = None):
    """Send one GET request to ``url`` and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        headers: HTTP headers for the request (expected to carry the
            ``Authorization`` bearer header). May be ``None`` or empty.
        params: Query parameters. The dict is copied, not modified.
        next_token: Pagination token for the page to fetch, or ``None`` for
            the first page.

    Returns:
        The response body parsed as JSON.

    Raises:
        Exception: If the response status code is not 200; the status code
            and response text are the exception arguments.
    """
    # Copy so that setting the page token does not alter the caller's dict.
    request_params = dict(params)
    request_params['next_token'] = next_token

    # Use the headers that were passed in rather than module-level globals.
    # The User-Agent this function used to send is kept as a default.
    request_headers = dict(headers or {})
    request_headers.setdefault("User-Agent", "v2FullArchiveTweetCountsPython")

    response = requests.request("GET", url, headers=request_headers, params=request_params)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [None]:
def bearer_oauth(r):
    """
    Add the bearer-token and User-Agent headers to request ``r``.

    The token is read from the module-level ``bearer_token``. The same
    object ``r`` is returned after its headers have been set.
    """
    r.headers.update({
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2FullArchiveTweetCountsPython",
    })
    return r


bearer_token = auth()
headers = create_headers(bearer_token)

# Create the output file if needed and write the column header row.
# The file is opened in append mode so existing rows are kept. The header is
# written only when the file is new or empty, so re-running this cell does not
# insert another header line in the middle of the collected data.
output_is_empty = (not os.path.isfile("2014Data.csv")
                   or os.path.getsize("2014Data.csv") == 0)

with open("2014Data.csv", "a", newline="", encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)
    if output_is_empty:
        # Create headers for the data
        csvWriter.writerow(['created_at', 'text', 'like_count', 'retweet_count', 'quote_count', 'reply_count'])


In [None]:
#Function that appends the json_response to a csv file 

def append_to_csv(json_response, fileName):
    """Append the tweets of one search response to a CSV file.

    Every tweet in ``json_response['data']`` becomes one row with the columns
    created_at, text, like_count, retweet_count, quote_count, reply_count.
    That is the order of the header row this script writes to the output
    file, so each value lands under its own column name.

    Args:
        json_response: Decoded search response. A response without a
            ``data`` key is treated as containing no tweets.
        fileName: Path of the CSV file to append to (created if missing).

    Returns:
        None. The number of rows written is printed.
    """
    tweets = json_response.get('data', [])

    # A counter variable
    counter = 0

    # Open OR create the target CSV file; the with-block closes it even if a
    # tweet is missing an expected key and the loop raises.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each tweet
        for tweet in tweets:
            # Time created, parsed into a datetime before being written
            created_at = dateutil.parser.parse(tweet['created_at'])
            metrics = tweet['public_metrics']

            # Assemble the row in header order and append it to the CSV file
            csvWriter.writerow([
                created_at,
                tweet['text'],
                metrics['like_count'],
                metrics['retweet_count'],
                metrics['quote_count'],
                metrics['reply_count'],
            ])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)
    


In [None]:
#Loop through each month for a specific year and collect as many tweets as possible (with a max of 100,000) per month, that meet the query defined above 

search_url = "https://api.twitter.com/2/tweets/search/all"


# Example: COUNT FOR 2014
# start_list[k] and end_list[k] together delimit the search window of month k.
start_list = ['2014-01-01T00:00:00Z', '2014-02-01T00:00:00Z', '2014-03-01T00:00:00Z', '2014-04-01T00:00:00Z', '2014-05-01T00:00:00Z', '2014-06-01T00:00:00Z', '2014-07-01T00:00:00Z', '2014-08-01T00:00:00Z', '2014-09-01T00:00:00Z', '2014-10-01T00:00:00Z', '2014-11-01T00:00:00Z', '2014-12-01T00:00:00Z']
# August now ends on the 31st like the other 31-day months (it was the 30th).
# NOTE(review): the search endpoint documents end_time as exclusive, so these
# windows appear to stop at 00:00 on the last day of each month and leave that
# day out. Confirm whether each end should be the first instant of the next month.
end_list = ['2014-01-31T00:00:00Z', '2014-02-28T00:00:00Z', '2014-03-31T00:00:00Z', '2014-04-30T00:00:00Z', '2014-05-31T00:00:00Z', '2014-06-30T00:00:00Z', '2014-07-31T00:00:00Z', '2014-08-31T00:00:00Z', '2014-09-30T00:00:00Z', '2014-10-31T00:00:00Z', '2014-11-30T00:00:00Z', '2014-12-31T00:00:00Z']

max_results = 500

# Running total of tweets written across all months
total_tweets = 0


for month_start, month_end in zip(start_list, end_list):

    count = 0            # tweets written for the current month
    max_count = 100000   # per-month cap
    next_token = None    # None requests the first page

    # Fetch page after page until the month is exhausted or the cap is reached
    while count < max_count:
        print("--------------------------")
        print("Token: ", next_token)
        query_params = create_params(month_start, month_end, max_results)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        meta = json_response['meta']
        result_count = meta['result_count']

        # A missing next_token means this was the last page of the month
        next_token = meta.get('next_token')
        if next_token is not None:
            print("Next Token: ", next_token)

        if result_count is not None and result_count > 0:
            print("Start Date: ", month_start)
            append_to_csv(json_response, "2014Data.csv")
            count += result_count
            total_tweets += result_count
            print("Total # of Tweets added: ", total_tweets)
            print("-------------------")

        # Pause between requests (also before the next month's first request)
        time.sleep(5)

        if next_token is None:
            break
    #print("Total number of results: ", total_tweets)  
