# Data Collection

In this notebook, tweets containing the keyword "Tesla" are collected using Twitter's API. Two .csv files (`before.csv` and `after.csv`) are saved by the end of the notebook.

In [None]:
# Twitter API credentials.
# The real values were deleted before uploading the notebook; each `...` is a
# placeholder that has to be replaced with your own credentials before running.
# Only bearer_token is referenced by the cells visible in this notebook.
api_key = ...
api_key_secret = ...
bearer_token = ...

The tweet-collection code is based on the Twitter API documentation and on the guide linked below.

https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

## Imports

In [None]:
# For sending GET requests to the API
import requests
# For file management when creating and adding to the dataset
import os
# For dealing with json responses from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time

## Defining useful functions

In [None]:
def create_headers(bearer_token):
    """Build the HTTP headers that authenticate a request with a bearer token.

    Args:
        bearer_token: The token placed after "Bearer " in the header value.

    Returns:
        A dict with a single "Authorization" entry.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

In [None]:
def create_url(keyword, start_date, end_date, max_results=100):
    """Return the recent-search endpoint URL and the query parameters for it.

    Args:
        keyword: Search query string, sent as the 'query' parameter.
        start_date: Sent unchanged as the 'start_time' parameter.
        end_date: Sent unchanged as the 'end_time' parameter.
        max_results: Sent as the 'max_results' parameter (default 100).

    Returns:
        A (search_url, query_params) tuple. 'next_token' is present in
        query_params as a placeholder for pagination and starts out as None.
    """
    search_url = "https://api.twitter.com/2/tweets/search/recent"

    query_params = {
        'query': keyword,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,
        'tweet.fields': 'created_at',
        # None is the value requests documents as "leave this parameter out";
        # the previous empty-dict placeholder was not a valid token value.
        'next_token': None,
    }
    return (search_url, query_params)

In [None]:
def connect_to_endpoint(url, headers, params, next_token=None, timeout=30):
    """Send a GET request to the endpoint and return the decoded JSON body.

    Args:
        url: Endpoint URL.
        headers: HTTP headers for the request.
        params: Query parameters. The dict is copied, not modified.
        next_token: Pagination token sent as the 'next_token' parameter;
            None requests the first page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: With (status_code, response_text) as arguments when the
            response status is not 200.
    """
    # Copy so the caller's parameter dict is left untouched between pages
    request_params = dict(params)
    request_params['next_token'] = next_token

    response = requests.get(url, headers=headers, params=request_params, timeout=timeout)

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    return response.json()

In [None]:
def append_to_csv(json_response, fileName):
    """Append the tweets of one API response page to a CSV file.

    Each tweet is written as one row: id, text, created_at.

    Args:
        json_response: Decoded response dict. Tweets are read from its 'data'
            list; a response without a 'data' key (a page that may carry no
            tweets) appends nothing.
        fileName: Path of the CSV file; it is created if it does not exist.

    Raises:
        KeyError: If a tweet lacks 'id', 'text' or 'created_at'. Rows are
            assembled before the file is opened, so nothing is written then.
    """
    tweets = json_response.get('data', [])
    rows = [[tweet['id'], tweet['text'], tweet['created_at']] for tweet in tweets]

    # The with-block closes the file even if writing fails part-way
    with open(fileName, "a", newline="", encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(rows)

## Collecting data before the Q4 earnings announcement

In [None]:
# Inputs for the request
headers = create_headers(bearer_token)
# The OR group is parenthesised so that lang:en applies to every term.
# Without the parentheses the space (AND) binds tighter than OR, and the
# language filter would only restrict the last term.
keyword = "(Tesla OR tesla OR TSLA OR tsla) lang:en"
start_time = "2022-01-21T00:00:00.000Z"
end_time = "2022-01-22T00:00:00.000Z" # 24 hours after start
# Number of tweets requested per page
max_results = 100

In [None]:
# Build the endpoint URL and query parameters, then request the first page.
# url[0] is the search URL and url[1] the query-parameter dict from create_url.
url = create_url(keyword, start_time,end_time, max_results)
json_response = connect_to_endpoint(url[0], headers, url[1])

In [None]:
# Pretty-print the first page of the response to inspect its structure
print(json.dumps(json_response, indent=4, sort_keys=True))

In [None]:
# Create the output file and write the column names.
# The header is written only when the file is missing or empty, so re-running
# this cell does not insert a second header row into data already collected.
needs_header = not os.path.exists("before.csv") or os.path.getsize("before.csv") == 0

# The with-block closes the file even if the write fails
with open("before.csv", "a", newline="", encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)
    if needs_header:
        csvWriter.writerow(['id', 'text', 'created_at'])

In [None]:
# Append the tweets from the first response page to before.csv
append_to_csv(json_response, "before.csv")

In [None]:
# Append tweets from the following pages.
# The request budget assumed here is 400 requests per 15 minutes. The first
# page was already requested above, so at most 399 further requests are made.

# The URL and query parameters are the same for every page, so build them once
url = create_url(keyword, start_time, end_time, max_results)

for _ in range(399):
    next_token = json_response['meta'].get('next_token')
    if not next_token:
        # The last response had no continuation token: no pages are left
        break

    json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
    append_to_csv(json_response, "before.csv")

# Report whether pagination really finished or the request cap cut it short
if json_response['meta'].get('next_token'):
    print('Request limit reached before all tweets were collected')
else:
    print('All tweets collected')

## Collecting data after the Q4 earnings announcement

In [None]:
# Inputs for the request
headers = create_headers(bearer_token)
# NOTE(review): this query differs from the one used for the "before"
# collection, which also lists TSLA/tsla. Confirm the difference is intended,
# since it affects how comparable the two datasets are.
keyword = "Tesla lang:en"
start_time = "2022-01-26T18:00:00.000Z" # Note that the time is set for 6:00 PM
end_time = "2022-01-27T18:00:00.000Z" # 24 hours after start
# Number of tweets requested per page
max_results = 100

In [None]:
# Build the endpoint URL and query parameters, then request the first page.
# url[0] is the search URL and url[1] the query-parameter dict from create_url.
url = create_url(keyword, start_time,end_time, max_results)
json_response = connect_to_endpoint(url[0], headers, url[1])

In [None]:
# Pretty-print the first page of the response to inspect its structure
print(json.dumps(json_response, indent=4, sort_keys=True))

In [None]:
# Create the output file and write the column names.
# The header is written only when the file is missing or empty, so re-running
# this cell does not insert a second header row into data already collected.
needs_header = not os.path.exists("after.csv") or os.path.getsize("after.csv") == 0

# The with-block closes the file even if the write fails
with open("after.csv", "a", newline="", encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)
    if needs_header:
        csvWriter.writerow(['id', 'text', 'created_at'])

In [None]:
# Append the tweets from the first response page to after.csv
append_to_csv(json_response, "after.csv")

In [None]:
# Append tweets from the following pages.
# At most 399 further requests are made, which together with the first request
# above keeps this collection within the 400-requests-per-15-minutes budget
# assumed earlier in this notebook.

# The URL and query parameters are the same for every page, so build them once
url = create_url(keyword, start_time, end_time, max_results)

for _ in range(399):
    next_token = json_response['meta'].get('next_token')
    if not next_token:
        # No continuation token: stop here instead of spinning through the
        # remaining iterations and printing the message on each one
        break

    json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
    append_to_csv(json_response, "after.csv")

# Report whether pagination really finished or the request cap cut it short
if json_response['meta'].get('next_token'):
    print('Request limit reached before all tweets were collected')
else:
    print('All tweets collected')