In [1]:
from twikit import Client
import time
import json
from twikit import TwitterException 

f = open('authentication.txt', 'r')
auth = f.read()
f.close()
auth_token = auth.split("\n")

# this API requires authentication
USERNAME = str(auth_token[0])
EMAIL = str(auth_token[1])
PASSWORD = str(auth_token[2])

# Initialize client
client = Client(language = 'en-US', http2=True)

# Login to the service with provided user credentials
client.login(
    auth_info_1=USERNAME ,
    auth_info_2=EMAIL,
    password=PASSWORD
)

{'flow_token': 'g;171357469271098860:-1713574692775:2blU7Qx8woiVwA0w72l1Og8w:14',
 'status': 'success',
 'subtasks': [{'subtask_id': 'LoginSuccessSubtask',
   'open_account': {'user': {'id': 1547081484695216130,
     'id_str': '1547081484695216130',
     'name': 'Eloragh Espie',
     'screen_name': 'EloraghEspie'},
    'next_link': {'link_type': 'subtask',
     'link_id': 'next_link',
     'subtask_id': 'SuccessExit'},
    'attribution_event': 'login'}},
  {'subtask_id': 'SuccessExit',
   'open_link': {'link': {'link_type': 'subtask',
     'link_id': 'next_link',
     'subtask_id': 'LoginOpenHomeTimeline'}}},
  {'subtask_id': 'LoginOpenHomeTimeline',
   'open_home_timeline': {'next_link': {'link_type': 'abort',
     'link_id': 'next_link'}}}]}

In [2]:
# Twitter LOVES to ban people when they log in repeatedly
# saving the cookies makes sure I don't get banned (often)

client.get_cookies()
client.save_cookies('cookies.json')
with open('cookies.json', 'r', encoding='UTF8') as f:
    client.set_cookies(json.load(f))

In [3]:
# INPUT: the user handle, a beginning and end of a date range
# OUTPUT: the user handle, the user_ID, and the scraped tweets
def get_all_tweets(handle, since, until):

    # load the cookies so you don't login a million times and get banned
    client.load_cookies('cookies.json')
    user_id = client.get_user_by_screen_name(handle)

    # initialize the list we will store our data in
    mass_tweets = []

    # this will pull the first forty tweets
    tweets = client.search_tweet(
        f'from:{handle} since:{since} until:{until}', 'Top', count=20
    )

    # if it returns an empty list, the user had no available tweets during the date time range
    if len(tweets) == 0:
        print('No tweets available')
        return(handle, user_id, [])
    
    # this will keep looking for tweets until a certain number of them has been reached
    else:
        while len(mass_tweets) >= 0 and len(mass_tweets) < 20:

            # this API provides a 'tweet' object, but we only want the id when we return
            tweets1 = [tweet.id for tweet in tweets]
            mass_tweets += tweets1
            time.sleep(1)  # cooldown so we don't get banned

            # keep pulling tweets until number is hit or there are none left
            tweets = tweets.next()

            # we need to make a check in case we've hit the max number of tweets we can scrape
            # this prevents us from pinging the API for no reason
            if len(tweets) == 0:
                print("No more tweets")
                break
            else:
                print(len(mass_tweets))
                continue

    return(handle, user_id, mass_tweets)

In [4]:
def process_tweets(handle, name, user_id, tweet_ids):
    # load the cookies so you don't login a million times and get banned
    client.load_cookies('cookies.json')

    # initialize a list to store all tuples
    tweets = []
    for id in tweet_ids:
        # it throws an Index Error if the tweet has been deleted
        try:
            # using the IDs we pulled from above
            tweet = client.get_tweet_by_id(id)
            # get the user id, name, handle, tweet id, tweet text, and date
            tweets.append((int(tweet.id), int(user_id.id), name, handle, str(tweet.text), tweet.created_at_datetime))
        except IndexError:
            print(f'Index Error: {tweet}')
    
    return tweets

In [5]:
import time
from math import ceil
import requests
from twikit import TooManyRequests
from twikit.utils import Endpoint

# this is a housekeeping function
# twitter API can throw rate limits
# they're kind of like timeouts
# this function just shows me how much longer I will be in timeout for

def get_limit_reset_time(endpoint: str):
    res = requests.get(
        endpoint,
        headers=client._base_headers,
        cookies=client.get_cookies()
    )
    return ceil(int(res.headers['x-rate-limit-reset']) - time.time())

In [6]:
# timeout check for scraping tweet IDs
try:
    print(client.search_tweet(
        f'from:JoeBiden since:2020-01-01 until:2021-03-01', 'Latest', count=40
    ))
except TooManyRequests:
    reset_time = get_limit_reset_time(Endpoint.USER_TWEETS)
    print(f'rate limit is reset after {reset_time} seconds.')

[<Tweet id="1351951465674276869">, <Tweet id="1351918910199631872">, <Tweet id="1351906918667677696">, <Tweet id="1351897267666608129">, <Tweet id="1351731172989050882">, <Tweet id="1351711040933830659">, <Tweet id="1351653131248041984">, <Tweet id="1351630258114502656">, <Tweet id="1351599720012021761">, <Tweet id="1351367275094310912">, <Tweet id="1351333542547001344">, <Tweet id="1351265605840633858">, <Tweet id="1351228360123318272">, <Tweet id="1350981483062706177">, <Tweet id="1350926118409289730">, <Tweet id="1350878051710750725">, <Tweet id="1350634446475694080">, <Tweet id="1350593782832500737">, <Tweet id="1350562220367884289">, <Tweet id="1350515133034819584">]


In [7]:
# timeout check for processing tweets
try:
    print(tweet = client.get_tweet_by_id(1351951465674276869))
except TooManyRequests:
    reset_time = get_limit_reset_time(Endpoint.USER_TWEETS)
    print(f'rate limit is reset after {reset_time} seconds.')

TypeError: 'tweet' is an invalid keyword argument for print()

In [None]:
# another housekeeping function
# if I'm suddenly getting 403 errors, I can use this to check if I've been banned
# sometimes I just have to go on the browser and reauthenticate

def check_user_status(user_id):
    """
    True if the user is active, otherwise false (not exists or suspended).
    """
    try:
        client.get_user_by_id(user_id)
    except TwitterException as e:
        if str(e).startswith('Invalid user id'):
            return False
        raise e
    else:
        return True

In [None]:
check_user_status(1547081484695216130)

True

In [8]:
# this scrapes the tweet IDs
handle, user_id, tweets = get_all_tweets('theresa_may', since='2017-01-01', until='2018-03-01')
print(handle, user_id, tweets)


19
39
theresa_may <User id="747807250819981312"> ['953739668876652544', '959056044155686913', '948893617355198464', '968926339276341248', '960910001031311360', '951378583175524352', '950688770659176448', '952108358668038146', '967489616386412545', '954256428809834499', '948606757437427713', '957163969634062338', '951519218482196480', '966417214424436736', '960924738444038144', '953577288905449472', '952210453375127552', '960845788187447296', '962625372415242240', '966667665552171010', '961561086926491648', '960502229223530497', '956211947321679875', '951404450253598720', '949939659546791936', '967096115186274305', '951503194940694528', '949952384775675904', '951800059221430272', '953574859136733184', '963480679890718720', '963348184566325250', '959000338492743680', '968180906618998785', '960946124931633152', '968157897275437056', '965687831371943936', '968119407833681921', '957970508485746688']


In [9]:
# this cleans them and puts them in the format I need for uploading to the SQLite table
tweets2 = process_tweets(handle, 'Theresa May', user_id, tweets)

In [10]:
for tweet in tweets2:
    print(tweet)

(953739668876652544, 747807250819981312, 'Theresa May', 'theresa_may', 'Labour has turned its back on investment, growth, jobs. A Labour Party that will always put politics before people. https://t.co/Hnua5uIAo5', datetime.datetime(2018, 1, 17, 21, 23, 51, tzinfo=datetime.timezone.utc))
(959056044155686913, 747807250819981312, 'Theresa May', 'theresa_may', "Too many people with mental ill health still face appalling injustices and stigma every day. Tackling those injustices is a priority for me. Let's work together to break the silence around mental health. #TimetoTalk https://t.co/zM8svV1bkf", datetime.datetime(2018, 2, 1, 13, 29, 13, tzinfo=datetime.timezone.utc))
(948893617355198464, 747807250819981312, 'Theresa May', 'theresa_may', "Thank you to the NHS staff who work hard and do a fantastic job for us day in and day out all year round. Their dedication ensures people get the treatment they need - we've put in an additional £437 million to help, so the NHS has been better prepared 

In [11]:
import sqlite3
conn = sqlite3.connect('tweets.db')
c = conn.cursor()

c.execute("""CREATE TABLE tweets (
          tweet_id INTEGER primary key,
          user_id INTEGER,
          user_name TEXT,
          user_handle TEXT,
          tweet_text TEXT,
          created_date DATETIME
          )""")

<sqlite3.Cursor at 0x222cb2088c0>

In [None]:
conn.execute('DROP TABLE tweets')

<sqlite3.Cursor at 0x22e8637a040>

In [12]:
# upload data in SQLite table
# storing this data is important since this is an unofficial API
# every time I access it, I am risking not being able to access it again

c.executemany("INSERT INTO tweets VALUES (?,?,?,?,?,?)", tweets2)
conn.commit()
