In [1]:
from environs import Env
import tweepy
import datetime as dt
import re
import pandas as pd
from tqdm import tqdm

In [2]:
# get our Twitter authentication info from environment variables

env = Env()
env.read_env()
TWITTER_KEY = env.str('TWITTER_KEY')
TWITTER_SECRET = env.str('TWITTER_SECRET')

In [3]:
# create a connection to Twitter's API

auth = tweepy.AppAuthHandler(TWITTER_KEY, TWITTER_SECRET)
api = tweepy.API(auth)

In [4]:
# this class will be used to hold the individual tweets from our search;
# it also does some sprucing up of the tweet and meta data extraction

class MyTweet:
    def __init__(self, tweet):
        
        # all of the features of tweets that we're tracking,
        # and where we're getting them from (store them as a
        # dictionary to make pandas-ifying easier)
        
        is_retweet = self.check_for_retweet(tweet.full_text)
        
        self.features = dict(
            id=tweet.id,
            account=tweet.author.name,
            account_screenname=tweet.user.screen_name,
            account_location=self.get_user_location(tweet.user.location),
            account_followers=tweet.author.followers_count,
            account_following=tweet.author.friends_count,
            account_age_days=self.get_account_age(tweet.author.created_at),
            account_description=self.parse_profile_description(tweet.author.description),
            is_retweet=is_retweet,
            original_author=self.get_original_author(tweet.full_text) if is_retweet else tweet.user.screen_name,
            tweeted_on=tweet.created_at,
            count_retweeted=tweet.retweet_count,
            count_favorited=tweet.favorite_count,
            hashtags=self.get_hashtags(tweet.entities.get('hashtags')),
            tweet_text=self.get_full_text(tweet.full_text) if is_retweet else tweet.full_text
        )
        
        
        
# these methods are used to extract/clean data from the tweet
    
    # use NaN for location if none is provided
    @staticmethod
    def get_user_location(loc):
        return float('nan') if loc == '' else loc
    
    # how old is the user's account (in days)?
    @staticmethod
    def get_account_age(created_at):
        today = dt.datetime.today()
        diff = today - created_at
        return abs(diff).days
    
    # use NaN for the profile description if none is provided
    @staticmethod
    def parse_profile_description(txt):
        return float('nan') if txt == '' else txt
    
    # is this a retweet of someone else (using 1/0 for y/n)?
    @staticmethod
    def check_for_retweet(txt):
        return 1 if txt[:2] == 'RT' else 0
    
    
    
    # extract the original author of a retweet
    @staticmethod
    def get_original_author(txt):
        
        # hopefully this regular expression will work
        try:
            pattern = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z0-9-_]+)'
            return re.findal(pattern, txt)
        
        # nuts, we have to do it by hand
        except:
            return txt.split('RT')[-1].split(':')[0].replace(' ', '')
    
    
    
    # extract all of the #hashtags from the tweet, storing them all
    # in a single comma-separated string (use NaN if there aren't any)
    @staticmethod
    def get_hashtags(htags):
        return float('nan') if len(htags) == 0 else ','.join(t['text'] for t in htags)
    
    # get the full text of a retweet, excluding all the "RT" stuff
    @staticmethod
    def get_full_text(txt):
        return re.sub(r'^RT[^:]*:\w*', '', txt)



# helper for making a pandas dataframe out of this
    def to_dict(self):
        return self.features

In [5]:
# and this class will be the bucket that all of our
# individual tweets are stored in

class MyTweetBucket:
    def __init__(self):
        self.tweets = []
        self.ids = set()
    
    def add_tweet(self, tweet):
        
        # don't add a tweet that we already have
        if tweet.id not in self.ids:
            tweet = MyTweet(tweet)
            self.ids.add(tweet.features['id'])
            self.tweets.append(tweet)
    
    # when the searching & adding is done, turn this
    # giant set of tweets into a pandas dataframe
    def to_pd(self):
        return pd.DataFrame(t.to_dict() for t in self.tweets)

In [6]:
# STEP 1: Get a nice sample of tweets that are about the election

election_tweets = MyTweetBucket()

# each of these queries will be a different call to the Twitter API
search_terms = [
    '#election2020',
    'trump',
    'biden',
    '#maga',
    '#vote',
    '#election',
    '#democrat',
    '#resist',
    '#voteblue',
    '#impotus',
    '#getoutthevote',
    '#gop',
    '#republican',
    '#politics',
    '#liberal',
    '#conservative',
    '#bluewave'
]

for term in tqdm(search_terms):
    
    # our "cursor", ie, the results of this particular search
    curs = tweepy.Cursor(api.search, q=term, tweet_mode='extended')
    
    # only collect up to 200 tweets from this search
    for tweet in curs.items(200):
        election_tweets.add_tweet(tweet)

100%|██████████| 17/17 [01:50<00:00,  6.51s/it]


In [7]:
# STEP 2a: Get a sample of "known" pro-Trump tweets

trump_tweets = MyTweetBucket()

# we assume that these users are consistently pro-Trump
trump_users = [
    'realDonaldTrump',
    'Mike_Pence',
    'Team_Trump45',
    'DiamondandSilk',
    'DonnaWR8',
    'The_Trump_Train',
    'joegooding',
    'paultdove',
    'Filibuster',
    'Bet22325450ste',
    'JerryTravone',
    'Fuctupmind'
]

for username in tqdm(trump_users):
    
    # if the user doesn't exist, then don't worry about it
    try:
        user = api.get_user(username, tweet_mode='extended')
    except:
        continue
    
    # remember to only get 200
    for tweet in user.timeline(count=200, tweet_mode='extended'):
        trump_tweets.add_tweet(tweet)

100%|██████████| 12/12 [00:10<00:00,  1.16it/s]


In [8]:
# STEP 2b: Get a sample of "known" pro-Biden tweets

biden_tweets = MyTweetBucket()

# again, just assume these are all always pro-Biden
biden_users = [
    'JoeBiden',
    'KamalaHarris',
    'benbrown',
    'biden4pres',
    'RepsForBiden',
    'joncoopertweets',
    'AndrewBatesNC',
    'TeamJoe',
    'RealKHiveQueenB',
    'YAFBiden',
    'ProfSybill'
    'JoeKamalaTicket',
]

for username in tqdm(biden_users):
    
    # skip non-existent users
    try:
        user = api.get_user(username, tweet_mode='extended')
    except:
        continue
    
    # only get 200 tweets
    for tweet in user.timeline(count=200, tweet_mode='extended'):
        biden_tweets.add_tweet(tweet)

100%|██████████| 11/11 [00:11<00:00,  1.01s/it]


In [9]:
# save the results of our searches
election_tweets.to_pd().to_csv('../data/raw/election-tweets-initial.csv', index=False)
trump_tweets.to_pd().to_csv('../data/raw/pro-trump-tweets.csv', index=False)
biden_tweets.to_pd().to_csv('../data/raw/pro-biden-tweets.csv', index=False)