In [80]:
from twikit import Client
import time
import json
from twikit import TwitterException 

f = open('authentication.txt', 'r')
auth = f.read()
f.close()
auth_token = auth.split("\n")

# this API requires authentication
USERNAME = str(auth_token[0])
EMAIL = str(auth_token[1])
PASSWORD = str(auth_token[2])

# Initialize client
client = Client(language = 'en-US', http2=True)

# Login to the service with provided user credentials
client.login(
    auth_info_1=USERNAME ,
    auth_info_2=EMAIL,
    password=PASSWORD
)

{'flow_token': 'g;171357003447534547:-1713570034541:EBohOd25XzxiuOtY8kM45IL2:14',
 'status': 'success',
 'subtasks': [{'subtask_id': 'LoginSuccessSubtask',
   'open_account': {'user': {'id': 1547081484695216130,
     'id_str': '1547081484695216130',
     'name': 'Eloragh Espie',
     'screen_name': 'EloraghEspie'},
    'next_link': {'link_type': 'subtask',
     'link_id': 'next_link',
     'subtask_id': 'SuccessExit'},
    'attribution_event': 'login'}},
  {'subtask_id': 'SuccessExit',
   'open_link': {'link': {'link_type': 'subtask',
     'link_id': 'next_link',
     'subtask_id': 'LoginOpenHomeTimeline'}}},
  {'subtask_id': 'LoginOpenHomeTimeline',
   'open_home_timeline': {'next_link': {'link_type': 'abort',
     'link_id': 'next_link'}}}]}

In [2]:
# Twitter LOVES to ban people when they log in repeatedly
# saving the cookies makes sure I don't get banned (often)

client.get_cookies()
client.save_cookies('cookies.json')
with open('cookies.json', 'r', encoding='UTF8') as f:
    client.set_cookies(json.load(f))

In [48]:
# INPUT: the user handle, a beginning and end of a date range
# OUTPUT: the user handle, the user_ID, and the scraped tweets
def get_all_tweets(handle, since, until):

    # load the cookies so you don't login a million times and get banned
    client.load_cookies('cookies.json')
    user_id = client.get_user_by_screen_name(handle)

    # initialize the list we will store our data in
    mass_tweets = []

    # this will pull the first forty tweets
    tweets = client.search_tweet(
        f'from:{handle} since:{since} until:{until}', 'Top', count=40
    )

    # if it returns an empty list, the user had no available tweets during the date time range
    if len(tweets) == 0:
        print('No tweets available')
        return(handle, user_id, [])
    
    # this will keep looking for tweets until a certain number of them has been reached
    else:
        while len(mass_tweets) >= 0 and len(mass_tweets) < 100:

            # this API provides a 'tweet' object, but we only want the id when we return
            tweets1 = [tweet.id for tweet in tweets]
            mass_tweets += tweets1
            time.sleep(1)  # cooldown so we don't get banned

            # keep pulling tweets until number is hit or there are none left
            tweets = tweets.next()

            # we need to make a check in case we've hit the max number of tweets we can scrape
            # this prevents us from pinging the API for no reason
            if len(tweets) == 0:
                break
            else:
                print(len(mass_tweets))
                continue

    return(handle, user_id, mass_tweets)

In [56]:
def process_tweets(handle, name, user_id, tweet_ids):
    # load the cookies so you don't login a million times and get banned
    client.load_cookies('cookies.json')

    tweets = []
    for id in tweet_ids:
        try:
            tweet = client.get_tweet_by_id(id)
            print(tweet)
            tweets.append((int(user_id.id), name, handle, int(tweet.id), str(tweet.text), tweet.created_at_datetime))
        except IndexError:
            print(f'Index Error: {tweet}')
    
    return tweets

In [52]:
import time
from math import ceil
import requests
from twikit import TooManyRequests
from twikit.utils import Endpoint

# this is a housekeeping function
# twitter API can throw rate limits
# they're kind of like timeouts
# this function just shows me how much longer I will be in timeout for

def get_limit_reset_time(endpoint: str):
    res = requests.get(
        endpoint,
        headers=client._base_headers,
        cookies=client.get_cookies()
    )
    return ceil(int(res.headers['x-rate-limit-reset']) - time.time())

In [50]:
# timeout check for scraping tweet IDs

try:
    print(client.search_tweet(
        f'from:JoeBiden since:2020-01-01 until:2021-03-01', 'Latest', count=40
    ))
except TooManyRequests:
    reset_time = get_limit_reset_time(Endpoint.USER_TWEETS)
    print(f'rate limit is reset after {reset_time} seconds.')

[<Tweet id="1351951465674276869">, <Tweet id="1351918910199631872">, <Tweet id="1351906918667677696">, <Tweet id="1351897267666608129">, <Tweet id="1351731172989050882">, <Tweet id="1351711040933830659">, <Tweet id="1351653131248041984">, <Tweet id="1351630258114502656">, <Tweet id="1351599720012021761">, <Tweet id="1351367275094310912">, <Tweet id="1351333542547001344">, <Tweet id="1351265605840633858">, <Tweet id="1351228360123318272">, <Tweet id="1350981483062706177">, <Tweet id="1350926118409289730">, <Tweet id="1350878051710750725">, <Tweet id="1350634446475694080">, <Tweet id="1350593782832500737">, <Tweet id="1350562220367884289">, <Tweet id="1350515133034819584">]


In [54]:
# timeout check for processing tweets
try:
    print(tweet = client.get_tweet_by_id(1351951465674276869))
except TooManyRequests:
    reset_time = get_limit_reset_time(Endpoint.USER_TWEETS)
    print(f'rate limit is reset after {reset_time} seconds.')

IndexError: list index out of range

In [39]:
# another housekeeping function
# if I'm suddenly getting 403 errors, I can use this to check if I've been banned
# sometimes I just have to go on the browser and reauthenticate

def check_user_status(user_id):
    """
    True if the user is active, otherwise false (not exists or suspended).
    """
    try:
        client.get_user_by_id(user_id)
    except TwitterException as e:
        if str(e).startswith('Invalid user id'):
            return False
        raise e
    else:
        return True

In [9]:
check_user_status(1547081484695216130)

True

In [43]:
# this scrapes the tweet IDs
handle, user_id, tweets = get_all_tweets('JoeBiden', since='2020-01-01', until='2021-03-01')
print(handle, user_id, tweets)


20
40
60
80
100
JoeBiden <User id="939091"> ['1351951465674276869', '1351918910199631872', '1351906918667677696', '1351897267666608129', '1351731172989050882', '1351711040933830659', '1351653131248041984', '1351630258114502656', '1351599720012021761', '1351367275094310912', '1351333542547001344', '1351265605840633858', '1351228360123318272', '1350981483062706177', '1350926118409289730', '1350878051710750725', '1350634446475694080', '1350593782832500737', '1350562220367884289', '1350515133034819584', '1350480432341520384', '1350256455748882434', '1350210150481555457', '1350156296012656647', '1350077778310270978', '1349894571094810626', '1349872996920430595', '1349849776012087296', '1349563892087869442', '1349556594237792256', '1349375651883724804', '1349090628790067200', '1349068634413436929', '1348994390690246657', '1348800612742402048', '1348763039160532992', '1348636095458717696', '1348430675238678528', '1348351402624524288', '1348299813163474944', '1348064513170567168', '13479474913

In [57]:
# this cleans them and puts them in the format I need for uploading to the SQLite table
tweets2 = process_tweets(handle, 'Joe Biden', user_id, tweets)

<Tweet id="1351951465674276869">
<Tweet id="1351918910199631872">
Index Error: <Tweet id="1351918910199631872">
<Tweet id="1351897267666608129">


KeyboardInterrupt: 

In [45]:
import sqlite3
conn = sqlite3.connect('tweets.db')
c = conn.cursor()

c.execute("""CREATE TABLE tweets (
          user_id INTEGER,
          user_name TEXT,
          user_handle TEXT,
          tweet_id INTEGER,
          tweet_text TEXT,
          created_date DATETIME
          )""")

<sqlite3.Cursor at 0x22e839d46c0>

In [41]:
conn.execute('DROP TABLE tweets')

<sqlite3.Cursor at 0x22e84a18940>

In [46]:
# upload data in SQLite table
# storing this data is important since this is an unofficial API
# every time I access it, I am risking not being able to access it again

c.executemany("INSERT INTO tweets VALUES (?,?,?,?,?,?)", tweets2)
conn.commit()
