In [None]:
from twikit import Client
from twikit import TwitterException 
from twikit import TooManyRequests
from twikit.utils import Endpoint
from translate import Translator
from math import ceil
import time
import json
import requests

In [None]:
# this API requires authentication
# authentication.txt is read as three lines, in this order: username, email, password
# the context manager closes the file even if the read fails
with open('authentication.txt', 'r') as f:
    auth = f.read()

# splitlines() (rather than split("\n")) so a Windows '\r' never ends up inside a credential
auth_token = auth.splitlines()
if len(auth_token) < 3:
    raise ValueError(
        'authentication.txt must contain username, email and password on separate lines'
    )

# don't hardcode your email and password into something!!!
# the auth is in gitignore so I won't get hacked
USERNAME = auth_token[0]
EMAIL = auth_token[1]
PASSWORD = auth_token[2]

# Initialize client
client = Client(language='en-US', http2=True)

# Login to the service with provided user credentials
# NOTE(review): this logs in on every run of the cell, even when IGNOREcookies.json
# already exists -- consider loading the saved cookies instead to avoid repeated logins
client.login(
    auth_info_1=USERNAME,
    auth_info_2=EMAIL,
    password=PASSWORD
)

In [None]:
# Twitter LOVES to ban people when they log in repeatedly
# saving the cookies makes sure I don't get banned (often)

# NOTE(review): the return value of get_cookies() is discarded here, so this line
# looks like a no-op -- confirm it is not needed for a side effect
client.get_cookies()
# write the current session cookies to disk so later cells can reuse them
client.save_cookies('IGNOREcookies.json')
# read the file straight back and hand the parsed JSON to the client
with open('IGNOREcookies.json', 'r', encoding='UTF8') as f:
    client.set_cookies(json.load(f))

In [None]:
# housekeeping function
# each different method uses a different API endpoint
# each different API endpoint has a rate limit
# you can hit it a certain number of times per a time period (usually 15 minutes)
# this tells me how much time I have left if I've hit the rate limit

def get_limit_reset_time(endpoint: str) -> int:
    """Return how many whole seconds remain until the rate limit of ``endpoint`` resets.

    Sends a GET request to ``endpoint`` using the client's headers and cookies and
    reads the ``x-rate-limit-reset`` response header, which this code treats as an
    epoch timestamp in seconds.

    Args:
        endpoint: URL of the API endpoint to query.

    Returns:
        Seconds until the reset, rounded up; 0 if the reset moment has already passed.

    Raises:
        KeyError: if the response carries no ``x-rate-limit-reset`` header.
    """
    res = requests.get(
        endpoint,
        headers=client._base_headers,
        cookies=client.get_cookies()
    )
    seconds_left = ceil(int(res.headers['x-rate-limit-reset']) - time.time())
    # never report a negative wait: callers pass this value to time.sleep(),
    # which raises ValueError for negative arguments
    return max(0, seconds_left)

In [None]:
# timeout check for scraping tweet IDs
# runs one small search; if it is rate limited, prints how long until the limit resets
# NOTE(review): the reset time is looked up for Endpoint.USER_TWEETS although the
# request above is a search_tweet call -- confirm both share the same rate-limit bucket
try:
    print(client.search_tweet(
        f'from:JoeBiden since:2020-01-01 until:2021-03-01', 'Latest', count=40
    ))
except TooManyRequests:
    reset_time = get_limit_reset_time(Endpoint.USER_TWEETS)
    print(f'rate limit is reset after {reset_time} seconds.')

In [None]:
# timeout check for processing tweets
# fetches one known tweet by ID; if it is rate limited, prints how long until the limit resets
# NOTE(review): the reset time is looked up for Endpoint.USER_TWEETS although the
# request above is a get_tweet_by_id call -- confirm this is the right endpoint to query
try:
    print(client.get_tweet_by_id(1351951465674276869))
except TooManyRequests:
    reset_time = get_limit_reset_time(Endpoint.USER_TWEETS)
    print(f'rate limit is reset after {reset_time} seconds.')

In [None]:
# another housekeeping function
# if I'm suddenly getting 403 errors, I can use this to check if I've been banned
# sometimes I just have to go on the browser and reauthenticate

def check_user_status(user_id):
    """
    Report whether the account with the given ID can be looked up.

    Returns True when the lookup succeeds and False when it fails with an
    'Invalid user id' error (not exists or suspended). Any other
    TwitterException is passed on to the caller.
    """
    try:
        client.get_user_by_id(user_id)
    except TwitterException as exc:
        is_invalid_id = str(exc).startswith('Invalid user id')
        if not is_invalid_id:
            # not the error we know how to interpret: let it surface
            raise exc
        return False
    return True

check_user_status(1547081484695216130)

In [None]:
# INPUT: the user handle, plus the first and last year of the date range
# OUTPUT: a list of the scraped tweet IDs (empty if no tweets were found)
def get_all_tweets(handle, since, until, max_tweets=100):
    """Collect the IDs of tweets posted by ``handle`` within a range of years.

    Searches the 'Top' tweets from ``{since}-01-01`` until ``{until}-06-30`` and
    keeps paging through the results until at least ``max_tweets`` IDs have been
    gathered or a page comes back empty. Whole pages are appended, so the result
    can be somewhat longer than ``max_tweets``.

    Args:
        handle: account handle, inserted into the ``from:`` search operator.
        since: first year of the range.
        until: last year of the range.
        max_tweets: stop paging once this many IDs have been collected.

    Returns:
        A list of tweet IDs; empty if the search found nothing.
    """
    # load the cookies so you don't login a million times and get banned
    client.load_cookies('IGNOREcookies.json')

    # June has 30 days, so the range ends on 06-30 ('06-31' is not a real date)
    # NOTE(review): Twitter's `until:` operator is believed to be exclusive; use
    # '-07-01' instead if tweets from June 30 itself must be included -- confirm
    since = f'{since}-01-01'
    until = f'{until}-06-30'

    # this pulls the first page of results (no count is passed, so the page size
    # is whatever search_tweet uses by default)
    tweets = client.search_tweet(
        f'from:{handle} since:{since} until:{until}', 'Top'
    )

    # if it returns an empty list, the user had no available tweets during the date range
    if len(tweets) == 0:
        print('No tweets available')
        return []

    # initialize the list we will store our data in
    mass_tweets = []

    while True:
        # this API provides a 'tweet' object, but we only want the id when we return
        mass_tweets.extend(tweet.id for tweet in tweets)
        print(len(mass_tweets))

        # stop BEFORE asking for another page once we have enough:
        # this prevents us from pinging the API for no reason
        if len(mass_tweets) >= max_tweets:
            break

        time.sleep(1)  # cooldown so we don't get banned

        # keep pulling tweets until number is hit or there are none left
        tweets = tweets.next()
        if len(tweets) == 0:
            print("No more tweets")
            break

    return mass_tweets

In [None]:
def process_tweets(handle, user_id, name, tweet_ids):
    """Fetch every tweet in ``tweet_ids`` and build one row tuple per tweet.

    Each row has nine fields, in this order:
    (tweet id, user id, name, handle, text, original language,
    'True'/'False' translated flag, translated language or 'null', created datetime).
    Tweets whose language is not 'en' are run through the translator and stored
    with the translated text.

    Args:
        handle: account handle stored in each row.
        user_id: account ID stored in each row (converted with ``int``).
        name: display name stored in each row.
        tweet_ids: iterable of tweet IDs to fetch.

    Returns:
        A list of row tuples; tweets that raise IndexError while being fetched
        or processed are skipped.
    """
    # load the cookies so you don't login a million times and get banned
    client.load_cookies('IGNOREcookies.json')

    # one translator is enough for the whole batch
    # NOTE(review): no from_lang is passed, so the library's default source language
    # is used -- confirm it matches/auto-detects the tweet language, otherwise
    # non-English text may come back untranslated
    translator = Translator(to_lang='en')

    # initialize a list to store all tuples
    tweets = []

    for tweet_id in tweet_ids:
        try:
            # using the IDs we pulled from above
            tweet = client.get_tweet_by_id(tweet_id)

            # we have international data
            # this will translate it and record that it was translated
            if tweet.lang != 'en':
                # keep the tweet object intact: its id, lang and date are still needed
                text = translator.translate(tweet.text)
                translated, translated_lang = 'True', 'en'

            # otherwise we just move on
            else:
                text = str(tweet.text)
                translated, translated_lang = 'False', 'null'

            tweets.append((
                int(tweet.id),
                int(user_id),
                name,
                handle,
                text,
                str(tweet.lang),
                translated,
                translated_lang,
                tweet.created_at_datetime,
            ))

        # it throws an Index Error if the tweet has been deleted/ is not available
        except IndexError:
            # report the ID: the tweet object may not exist if the fetch itself failed
            print(f'Index Error: {tweet_id}')

    return tweets

In [None]:
def format_tweets(active_user_list, min_wait=60):
    """Scrape and process tweets for every entry of ``active_user_list``.

    Each entry is indexed as (user id, name, handle, election year). Tweets are
    collected from the election year through the following year. When the API
    answers with TooManyRequests the function sleeps and then retries the SAME
    entry from the start, so nothing is skipped.

    The input list is left untouched, so the call can be repeated.

    Args:
        active_user_list: sequence of (user_id, name, handle, year, ...) rows.
        min_wait: minimum number of seconds to sleep after a rate-limit error,
            so a missing or already-expired reset time never turns into a tight
            retry loop.

    Returns:
        A tuple ``(final_list, no_tweets)``: the processed tweet rows, and the
        (user_id, name, handle, year) entries for which no tweets were found.
    """
    final_list = []
    no_tweets = []

    position = 0
    while position < len(active_user_list):
        user_id, name, handle, since = active_user_list[position][:4]
        until = since + 1

        try:
            tweet_ids = get_all_tweets(handle, since=since, until=until)
            print(f'{len(tweet_ids)} tweets collected for {name} for {since} election')

            # if they had no tweets for that election year
            # add to a separate list to keep track
            if len(tweet_ids) == 0:
                print(f'{name} for election {since} had no tweets')
                no_tweets.append((user_id, name, handle, since))

            # if they had tweets, process them and add to the data list
            else:
                final_list += process_tweets(handle, user_id, name, tweet_ids)

        # cool down when we hit too many requests
        # this works well with the while loop
        # otherwise we would have to stop and start or batch the entire thing
        except TooManyRequests:
            # NOTE(review): the reset time is read for Endpoint.USER_TWEETS although the
            # calls above are search / tweet lookups -- confirm this is the right endpoint
            reset_time = get_limit_reset_time(Endpoint.USER_TWEETS)
            print(f"Too many requests. Rate limit reset after {reset_time}")
            # time.sleep() rejects negative values, and retrying instantly would just
            # hit the limit again, so always wait at least min_wait seconds
            time.sleep(max(reset_time, min_wait, 0))
            continue  # position is not advanced: retry the same entry

        # only move on once the entry was handled without a rate-limit error
        position += 1

    return final_list, no_tweets

In [None]:
import sqlite3
# open (or create) the local SQLite database file and get a cursor for queries
conn = sqlite3.connect('tweets.db')
c = conn.cursor()

# select the four columns format_tweets reads by position:
# user id, name, handle, election year -- only for rows flagged as active
c.execute(""" SELECT twitter_user_id, politician_name, twitter_handle, election_year
            FROM coordinates
            WHERE twitter_active_during_election = 'True'
            """)
active_user_list = c.fetchall()
# quick sanity check of the first row
# NOTE(review): this raises IndexError if the query returned no rows
print(active_user_list[0])

In [None]:
# scrape and process tweets for every active user selected above
# final_list: the processed tweet rows; no_tweets: the users with no tweets found
final_list, no_tweets = format_tweets(active_user_list)

In [None]:
import sqlite3
conn = sqlite3.connect('tweets.db')
c = conn.cursor()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises
# once the table is already there
# text columns are declared TEXT, not STRING: SQLite gives a column declared
# STRING numeric affinity, so text that looks like a number would be stored as a number
c.execute("""CREATE TABLE IF NOT EXISTS tweets (
          tweet_id INTEGER primary key,
          user_id INTEGER,
          user_name TEXT,
          user_handle TEXT,
          tweet_text TEXT,
          tweet_original_lang TEXT,
          tweet_translated TEXT,
          tweet_translated_lang TEXT,
          created_date DATETIME
          )""")

In [None]:
# maintenance cell: throws away the tweets table so it can be rebuilt from scratch
# it sits between the cell that creates the table and the cell that fills it, so it
# is guarded by a flag -- otherwise running the notebook top to bottom would drop
# the table right before the insert
# set RESET_TWEETS_TABLE = True and run this cell by hand when a reset is really wanted
RESET_TWEETS_TABLE = False
if RESET_TWEETS_TABLE:
    # IF EXISTS: no error when the table is already gone
    conn.execute('DROP TABLE IF EXISTS tweets')

In [None]:
# upload data in SQLite table
# storing this data is important since this is an unofficial API
# every time I access it, I am risking not being able to access it again

# the tweets table has nine columns and every row in final_list has nine fields,
# so the statement needs nine placeholders
# final_list is the result returned by format_tweets (tweets2 only exists inside that function)
# NOTE(review): tweet_id is the primary key, so re-running this cell with the same
# rows raises an IntegrityError -- switch to INSERT OR IGNORE if re-runs should be silent
c.executemany("INSERT INTO tweets VALUES (?,?,?,?,?,?,?,?,?)", final_list)
conn.commit()
