
## [Mostly based on this article](https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a)

In [2]:
import requests
import os
import json
import pandas as pd
import csv
import datetime
import dateutil.parser
import unicodedata
import time

In [3]:
# Store the bearer token in this process's environment so auth() can read it.
# The rest of the environment is deliberately left intact: os.environ.clear()
# would also remove PATH, HOME and any proxy/certificate variables for the
# whole kernel and for every subprocess it starts.
# NOTE: replace the placeholder locally and never commit a real token.
os.environ['TOKEN'] = 'insert your twitter api token here'

In [4]:
def auth():
    """Return the Twitter API bearer token stored in the ``TOKEN`` environment variable.

    Returns:
        The token string, or ``None`` when ``TOKEN`` is not set.
    """
    token = os.environ.get('TOKEN')
    return token

In [5]:
def create_headers(bearer_token):
    """Build the HTTP headers that authenticate a request with a bearer token.

    Args:
        bearer_token: token value placed after the ``Bearer `` prefix.

    Returns:
        A dict with a single ``Authorization`` entry.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

In [6]:
def create_url(keyword, start_date, end_date, max_results = 10):
    """Build the endpoint URL and query parameters for a full-archive tweet search.

    Args:
        keyword: search query string, sent as the ``query`` parameter.
        start_date: lower bound of the search window, sent as ``start_time``.
        end_date: upper bound of the search window, sent as ``end_time``.
        max_results: number of tweets requested per page (default 10).

    Returns:
        A ``(search_url, query_params)`` tuple. ``query_params['next_token']``
        is ``None`` (no pagination token yet); connect_to_endpoint() overwrites
        it with the token of the page to fetch.
    """
    # Full-archive search endpoint; requires academic research access.
    search_url = "https://api.twitter.com/2/tweets/search/all"

    query_params = {
        # controls returned response
        'query': keyword,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,

        # possible extra fields to include
        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',

        # identifier to access next page of results; None means "first page"
        # (a None-valued parameter is left out of the request's query string)
        'next_token': None
    }

    return (search_url, query_params)

Different API endpoints:
The Twitter API offers several tweet endpoints; this project uses Full-Archive Search (`/2/tweets/search/all`).

more info: https://developer.twitter.com/en/docs/twitter-api/early-access

query params:
https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all

Building Tweet Queries:
https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query

In [6]:
# Terms that are OR-ed together inside the parentheses of the search query.
# The last two entries are deliberate misspellings of the drug name.
search_terms = [
    'ivermectin',
    '"horse paste"',
    '"horse dewormer"',
    '"farm dewormer"',
    '"livestock dewormer"',
    '"ivermetcin"',
    '"ivermecin"',
]
keyword = '(' + ' OR '.join(search_terms) + ') -is:retweet'

# Start of the search window; timestamp helper: https://www.timestamp-converter.com/
start_time = '2020-03-01T02:31:00.000Z'

# Number of tweets requested per page.
max_results = 10

In [7]:
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the API and return the decoded JSON body.

    Args:
        url: endpoint URL (first element of the tuple returned by create_url).
        headers: HTTP headers, including the Authorization header.
        params: query parameters from create_url. This dict is NOT modified;
            the pagination token is set on a copy.
        next_token: pagination token of the page to fetch, or ``None`` for the
            first page.
        timeout: seconds to wait for the server before requests raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: with ``(status_code, response_text)`` as arguments when the
            response status is not 200.
    """
    # Work on a copy so the caller's dict can be reused for other pages.
    request_params = dict(params)
    request_params['next_token'] = next_token
    response = requests.request(
        "GET", url, headers = headers, params = request_params, timeout = timeout
    )
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [8]:
# Read the token from the environment and build the Authorization header
# that is passed to connect_to_endpoint() in the connection test below.
bearer_token = auth()
headers=create_headers(bearer_token)
# Upper bound of the search window, passed to create_url() as end_date.
# The trailing comment keeps an earlier value of this setting.
end_time= '2022-09-11T02:31:00.000Z' #'2022-03-01T02:31:00.000Z'

Testing Connection to API

In [71]:
# Build one request from the settings above and fetch only the first page of
# results; `url` holds the (endpoint, query parameters) pair.
url = create_url(keyword, start_time, end_time, max_results)
search_url, query_params = url
json_response = connect_to_endpoint(search_url, headers, query_params)

Endpoint Response Code: 200


In [26]:
# Show the test response as indented JSON with keys in alphabetical order.
formatted_response = json.dumps(json_response, sort_keys=True, indent=4)
print(formatted_response)

{
    "data": [
        {
            "author_id": "919107441182121984",
            "conversation_id": "1568302558971830273",
            "created_at": "2022-09-11T02:29:20.000Z",
            "id": "1568788755586203648",
            "in_reply_to_user_id": "1544828574007541765",
            "lang": "en",
            "public_metrics": {
                "like_count": 1,
                "quote_count": 0,
                "reply_count": 0,
                "retweet_count": 0
            },
            "referenced_tweets": [
                {
                    "id": "1568302558971830273",
                    "type": "replied_to"
                }
            ],
            "reply_settings": "everyone",
            "source": "Twitter for Android",
            "text": "@Sdj_neuro Take your deworming ivermectin, ma'am. We don't want the rona spreading to your husband-son."
        },
        {
            "author_id": "1518930570163765248",
            "conversation_id": "1565576520982216707",

In [30]:
# Save the test response to data.json in the current working directory.
serialized_response = json.dumps(json_response)
with open('data.json', 'w') as out_file:
    out_file.write(serialized_response)

# Uploading to MongoDB

In [8]:
import pymongo
from pymongo import MongoClient, InsertOne
from bson.json_util import dumps

In [10]:
# Connection handles for the upload test. Replace the three placeholder
# strings with your own values before running this cell.
client = pymongo.MongoClient('insert your mongodb database address here')
# Subscript access keeps the cell valid Python while the names are placeholders
# (a bare `client.` followed by a comment is a syntax error).
db = client['database_name_here']  # database name here
collection = db['collection_name_here']  # collection name here
requesting = []

In [None]:
# insert_one() adds an "_id" key to the dict it is given. Insert a shallow copy
# so json_response itself stays unchanged and this cell can be re-run without
# a DuplicateKeyError caused by a leftover "_id".
collection.insert_one(dict(json_response))

In [12]:
# Close the MongoDB connection opened for the upload test above.
client.close()

# Automating the data pull

In [9]:
# Read the bearer token from the environment and build the Authorization
# header that the collection loop below passes to connect_to_endpoint().
bearer_token = auth()
headers = create_headers(bearer_token)

In [12]:
# Terms that are OR-ed together inside the parentheses of the search query.
query_terms = [
    'ivermectin',
    '"horse paste"',
    '"horse dewormer"',
    '"farm dewormer"',
    '"livestock dewormer"',
    '"ivermetcin"',
    '"ivermecin"',
    # 1 / ! / 3 character substitutions
    '"1vermectin"',
    '"iv3rmectin"',
    '"iverm3ctin"',
    '"1v3rmectin"',
    '"1verm3ctin"',
    '"1v3rm3ctin"',
    '"!vermectin"',
    '"!v3rmectin"',
    '"!verm3ctin"',
    '"!v3rm3ctin"',
]
keyword = '(' + ' OR '.join(query_terms) + ') -is:retweet'

# Collection log:
#   starting from 3/20/20
#   added 1, !, and 3 variations starting 7/1/2021
#   08/21/21 peak
#   8/27/21
#   8/28/21 30,000+
#   9/2 60k+
#
# Errors seen while collecting:
#   503 error on 8/28/21, reloading, will show up twice; delete mongodb _id
#       '634330be49ff30046bb28bd3' after verification
#   503 error on 9/5/21
#   503 on 9/8/21, but accidentally repeated 9/5 and part of 9/6
#   503 error on 9/21

# First and last day of the collection window, and tweets requested per page.
start_list = '2021-09-03T00:00:00Z'
end_list = '2022-10-15T00:00:00Z'
max_results = 200

In [13]:
import datetime as dt

# Sanity check: adding a one-day timedelta to the parsed start date gives the
# following day, which is how the collection loop steps through the window.
one_day = dt.timedelta(days=1)
pd.to_datetime(start_list) + one_day

Timestamp('2021-09-04 00:00:00+0000', tz='UTC')

In [43]:
import gc

# Collect tweets one UTC day at a time from start_list through end_list
# (inclusive), following pagination tokens within each day, and store every
# non-empty page of each day in MongoDB once that day has been fully fetched.

day = pd.to_datetime(start_list, utc=True)
end_date = pd.to_datetime(end_list, utc=True)
max_results = 200 # if you run into errors, you can restart with the previous token, so keep this in consideration, max is 500
max_per_day = 100000 # max number of tweets per day
pause_seconds = 10 # wait between requests, to prevent contacting the Twitter server too frequently and getting locked out
total_tweets = 0

# [day, next_token] pairs for days that reached max_per_day before the last
# page; the stored token is the page that was not fetched.
extras = []


def _api_timestamp(ts):
    """Format a UTC pandas Timestamp as the 'YYYY-MM-DDTHH:MM:SSZ' string sent to the API."""
    return ts.strftime('%Y-%m-%dT%H:%M:%SZ')


# Open the connection once for the whole run. Replace the three placeholder
# strings with your own values before running this cell.
client = pymongo.MongoClient('mongodb database address here')
try:
    db = client['database_name_here'] # database name here
    collection = db['collection_name_here'] # collection name here

    while day <= end_date:
        count = 0
        next_token = None
        eod = day + dt.timedelta(days = 1)
        # Every non-empty response page of this day. Each page is kept as its
        # own document so no page overwrites another.
        day_pages = []

        while True:
            if count >= max_per_day:
                extras.append([day, next_token])
                break
            print("---------------------------------------")
            print("Token: ", next_token)
            url = create_url(keyword, _api_timestamp(day), _api_timestamp(eod), max_results)
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
            result_count = json_response['meta']['result_count']

            # The last page of a day carries no 'next_token' in its meta block.
            next_token = json_response['meta'].get('next_token')
            if next_token is not None:
                print("Next Token: ", next_token)

            if result_count is not None and result_count > 0:
                day_pages.append(json_response)
                count += result_count
                total_tweets += result_count
                print("Start Date: ", day)
                print("Daily Tweets added: ", count)
                print("Total Tweets added: ", total_tweets)
                print("---------------------------------------")

            # Pause after every request, including the last page of a day,
            # because the next day's first request follows immediately.
            time.sleep(pause_seconds)
            if next_token is None:
                break

        # insert_many() rejects an empty list, so skip days without tweets.
        if day_pages:
            collection.insert_many(day_pages)

        day += dt.timedelta(days=1)
except Exception:
    # Nothing from the interrupted day has been written; restart from it.
    print("Stopped while collecting: ", day)
    raise
finally:
    client.close()

---------------------------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpds7p0d14z961ruu4pvh0ez534sql
Start Date: 2021-10-01 00:00:00+00:00
Daily Tweets added:  452
Total Tweets added:  452
---------------------------------------
---------------------------------------
Token:  b26v89c19zqg8o3fpds7p0d14z961ruu4pvh0ez534sql
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpds7p0d0rbfwbm0s9ft6bliraa5j1
Start Date: 2021-10-01 00:00:00+00:00
Daily Tweets added:  886
Total Tweets added:  886
---------------------------------------
---------------------------------------
Token:  b26v89c19zqg8o3fpds7p0d0rbfwbm0s9ft6bliraa5j1
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpds7p0cq1xb3vxdafuiotrs9ahrp9
Start Date: 2021-10-01 00:00:00+00:00
Daily Tweets added:  1327
Total Tweets added:  1327
---------------------------------------
---------------------------------------
Token:  b26v89c19zqg8o3fpds7p0cq1xb3vxdafuiotrs9ahrp9
Endpoint Response 