In [1]:
import twint
import pandas as pd
import nest_asyncio
import time
# Patch asyncio so an event loop can be started while another one is already
# running (the notebook kernel has its own loop).
# NOTE(review): presumably required because twint drives its own asyncio loop — confirm.
nest_asyncio.apply()
# Let pandas display up to 40 columns before truncating a DataFrame's output.
pd.options.display.max_columns = 40


In [2]:
# candidate = 'Biden'

# c = twint.Config()
# c.Search = candidate
# c.Lang = 'en'
# c.Since = '2020-10-01'
# c.Until = '2020-10-05'
# c.Store_csv = True
# c.Output = '../data/biden_tweets_all.csv'
# c.Hide_output = True
# twint.run.Search(c)

## Scrape Tweets

This is the workbook I used to create the below function.

Note: this takes a while. To prevent Twint's `client payload error`, I had to add randomized wait times between Twitter queries. I suggest running the cells below overnight to generate the files.

In [3]:
import datetime as dt
import time
import random

def arrange_dates(start_date, end_date):
    """Return every day from start_date through end_date, both ends included.

    Helper for get_election_tweets. The result is a list with one entry per
    day, built by adding whole-day offsets to start_date; it is empty when
    end_date falls before start_date.
    """
    span_days = (end_date - start_date).days
    return [
        start_date + dt.timedelta(days=offset)
        for offset in range(span_days + 1)
    ]

def get_election_tweets(tweets_per_day=15000, start_date="2020-10-01", end_date="2020-11-03"):
    """Scrape tweets about each candidate for every day in a date range.

    For each day from `start_date` through `end_date` (both included) and for
    each of the candidates 'Biden' and 'Trump', runs one Twint search and
    stores the result in ``../data/tweets/{candidate}_{YYYY-MM-DD}.csv``
    (one CSV per day/candidate pair).

    Parameters
    ----------
    tweets_per_day : int
        Value assigned to the Twint config's ``Limit`` for every query.
    start_date, end_date : str
        First and last day to scrape, formatted 'YYYY-MM-DD'.

    Notes
    -----
    Sleeps 60 seconds before the first query and a randomized 120-175 seconds
    after each successful one, to avoid Twint's 'client payload' error. When a
    query raises, the error is reported, the function pauses 90 seconds and
    moves on to the next day/candidate pair. KeyboardInterrupt is not caught,
    so the run can be stopped manually.
    """
    date_format = "%Y-%m-%d"
    first_day = dt.datetime.strptime(start_date, date_format)
    last_day = dt.datetime.strptime(end_date, date_format)
    dates = arrange_dates(first_day, last_day)
    candidates = ['Biden', 'Trump']

    # fetch tweets & save CSVs (each CSV contains one day/candidate)
    wait_time = 50  # seconds; base of the sleep timer used to avoid 'client payload' error
    print("""Waiting 60 seconds...\n""")
    time.sleep(60)
    for date in dates:
        d = date.strftime(date_format)
        next_day = (date + dt.timedelta(days=1)).strftime(date_format)

        for candidate in candidates:
            print(f"""\n Begin scraping {candidate}'s tweets on {d}\n""")
            outfile = f"../data/tweets/{candidate}_{d}.csv"

            # twint search query details
            c = twint.Config()
            c.Search = candidate
            c.Lang = 'en'
            c.Since = d
            c.Until = next_day
            c.Limit = tweets_per_day
            c.Store_csv = True
            c.Output = outfile
            c.Hide_output = True

            # run the search
            try:
                twint.run.Search(c)
                print(f"Scraping complete. File: {outfile} created.")
            except Exception as err:
                # `Exception` rather than a bare `except`, so Ctrl-C / kernel
                # interrupt still stops the run instead of being swallowed.
                print(f"Scraping {candidate} on {d} failed: {err!r}. Pausing 90 seconds, then moving on.")
                time.sleep(90)
                continue

            # wait some time so script does not receive 'client payload' error
            delay = wait_time + random.choice([70, 95, 120]) + (5 * random.uniform(0, 1))
            print(f"Waiting {delay:.0f} seconds...")
            time.sleep(delay)
                       

In [4]:
# get_election_tweets(start_date="2020-10-11", end_date="2020-11-02")

Alright! We now have most of our tweets, but the CSVs are incomplete on certain days. Below, I rewrote the above function to scrape only specific days for a given candidate (to avoid re-scraping sections we've already completed successfully).

In [5]:
def get_candidate_tweets(dates, tweets_per_day=15000, candidate='Trump'):
    """Scrape tweets about one candidate for a specific list of days.

    Runs one Twint search per entry in `dates` and stores the result in
    ``../data/tweets/{candidate}_{YYYY-MM-DD}.csv``.

    Parameters
    ----------
    dates : iterable
        Days to scrape. Each entry may be a 'YYYY-MM-DD' string or a
        date/datetime object.
    tweets_per_day : int
        Value assigned to the Twint config's ``Limit`` for every query.
        Defaults to 15000 so callers may pass only `dates` (and `candidate`).
    candidate : str
        Search term, also used as the prefix of the output file name.

    Notes
    -----
    Sleeps 60 seconds before the first query and a randomized pause after each
    successful one, to avoid Twint's 'client payload' error. When a query
    raises, the error is reported, the function pauses 90 seconds and moves on
    to the next day. KeyboardInterrupt is not caught, so the run can be
    stopped manually.
    """
    f = "%Y-%m-%d"
    wait_time = 51.57848  # seconds; base of the sleep timer used to avoid 'client payload' error
    print("""Waiting 60 seconds...\n""")
    time.sleep(60)

    for date in dates:
        # Accept plain 'YYYY-MM-DD' strings as well as date/datetime objects.
        day = dt.datetime.strptime(date, f) if isinstance(date, str) else date
        d = day.strftime(f)
        next_day = (day + dt.timedelta(days=1)).strftime(f)

        print(f"""\n Begin scraping {candidate}'s tweets on {d}\n""")
        outfile = f"../data/tweets/{candidate}_{d}.csv"

        # twint search query details
        c = twint.Config()
        c.Search = candidate
        c.Lang = 'en'
        c.Since = d
        c.Until = next_day
        c.Limit = tweets_per_day
        c.Store_csv = True
        c.Output = outfile
        c.Hide_output = True

        # run the search
        try:
            twint.run.Search(c)
            print(f"Scraping complete. File: {outfile} created.")
        except Exception as err:
            # `Exception` rather than a bare `except`, so Ctrl-C / kernel
            # interrupt still stops the run instead of being swallowed.
            print(f"Scraping {candidate} on {d} failed: {err!r}. Pausing 90 seconds, then moving on.")
            time.sleep(90)
            continue

        # wait some time so script does not receive 'client payload' error
        delay = wait_time + random.choice([100.129309, 135]) + (5 * random.uniform(0, 1))
        print(f"Waiting to avoid payload error...")
        time.sleep(delay)

In [6]:
# Days (as 'YYYY-MM-DD' strings) to scrape again for Trump.
# NOTE(review): '2020-10-22' is listed five times — presumably so that day is
# scraped repeatedly; confirm this is intentional.
trump_dates = ['2020-11-01',
               '2020-10-07',
               '2020-10-11', 
               '2020-10-14',
               '2020-10-17', 
               '2020-10-19',
               '2020-10-22',
               '2020-10-22',
               '2020-10-22',
               '2020-10-22',
               '2020-10-22']

# get_candidate_tweets(trump_dates)



In [7]:
# Single day to scrape again for Biden; the call itself is left commented out.
biden_dates = ['2020-11-02']
# get_candidate_tweets(biden_dates, candidate='Biden')

In [8]:
# Replaces the earlier trump_dates list with a single day; the call itself is left commented out.
trump_dates = ['2020-10-11']
# get_candidate_tweets(trump_dates, candidate='Trump')

In [9]:
# Replaces trump_dates again with another single day; the call itself is left commented out.
trump_dates = ['2020-10-17']
# get_candidate_tweets(trump_dates, candidate='Trump')

## Pulling all candidate tweets from one day

This is not as easy as I hoped. `Twint` repeatedly fails when trying to pull so many tweets. To combat this, I've implemented a Try/Except clause within an infinite loop. This way, it will restart the scrape whenever Twint fails, until a run finishes or we interrupt the process. I suggest running this overnight and checking the output file in the morning.

In [10]:
def get_all_tweets_from_day(date, candidate='Trump', tweets_per_day=15000):
    """Scrape tweets about `candidate` for a single day, retrying on failure.

    Runs a Twint search from `date` until the following day with no ``Limit``
    set, storing results in ``../data/tweets/ALL_{candidate}_{date}.csv``.
    Whenever the search raises, the error is printed, the function pauses 90
    seconds and runs the same search again. It returns after the first search
    that completes without raising.

    Parameters
    ----------
    date : str
        Day to scrape, formatted 'YYYY-MM-DD'.
    candidate : str
        Search term, also used in the output file name.
    tweets_per_day : int
        Currently unused (no ``Limit`` is applied to the query); kept so
        existing callers keep working.

    Notes
    -----
    KeyboardInterrupt is deliberately not caught: interrupt the kernel once
    the output file holds enough unique tweets.
    """
    f = "%Y-%m-%d"

    d = dt.datetime.strptime(date, f)
    print(f"""\n Begin scraping {candidate}'s tweets on {d}\n""")
    outfile = f"../data/tweets/ALL_{candidate}_{date}.csv"

    # twint search query details
    c = twint.Config()
    c.Search = candidate
    c.Lang = 'en'
    c.Since = date
    c.Until = (d + dt.timedelta(days=1)).strftime(f)
    c.Store_csv = True
    c.Output = outfile
    c.Hide_output = True

    # run the search (manually interrupt when there are sufficient number of unique tweets in outfile)
    while True:
        try:
            twint.run.Search(c)
        except Exception as err:
            # `Exception` rather than a bare `except`, so the manual interrupt
            # described above actually stops the loop.
            print(f"Error caught ({err!r}). Restarting.")
            time.sleep(90)
        else:
            print(f"Scraping complete. File: {outfile} created.")
            return


In [11]:
# get_all_tweets_from_day('2020-11-02', candidate='Trump')

In [12]:
# get_all_tweets_from_day('2020-11-02', candidate='Biden')

In [None]:
# custom biden
# One-off search for 'biden' (Since 2020-11-02, Until 2020-11-03) with no Limit set.
c = twint.Config()
c.Search = 'biden'
c.Lang = 'en'
c.Since = '2020-11-02'
c.Until = '2020-11-03'
c.Store_csv = True
c.Output = "../data/tweets/all_biden_nov_2.csv"
c.Hide_output = True

twint.run.Search(c)
# Report the path from the config: `outfile` only exists as a local inside the
# scraping functions, so referencing it here raised NameError.
print(f"Scraping complete. File: {c.Output} created.")


In [None]:
# custom trump
# One-off search for 'trump' (Since 2020-11-02, Until 2020-11-03) with no Limit set.
# Pause five minutes before querying.
time.sleep(300)
c = twint.Config()
c.Search = 'trump'
c.Lang = 'en'
c.Since = '2020-11-02'
c.Until = '2020-11-03'
c.Store_csv = True
c.Output = "../data/tweets/all_trump_nov_2.csv"
c.Hide_output = True

twint.run.Search(c)
# Report the path from the config: `outfile` only exists as a local inside the
# scraping functions, so referencing it here raised NameError.
print(f"Scraping complete. File: {c.Output} created.")