In [8]:
# Get Twitter keys

from keys import consumer_key, consumer_secret, access_token, access_token_secret

In [99]:
import re
import tweepy
from tweepy import OAuthHandler 
from textblob import TextBlob
import pandas as pd
import time

In [26]:
# Load the per-state coordinates table from the local CSV file

coords = pd.read_csv(filepath_or_buffer='state_coords.csv')

## A quick word on the notebook

The purpose of this notebook is to extract the relevant Twitter data and to get baseline sentiment classifications using the TextBlob module. The code I am going to use for the TwitterClient class is not my own and is taken from https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/

### Creating the TwitterClient class

In [109]:
class TwitterClient(object):
    '''
    Generic Twitter client for keyword search with baseline sentiment labels.

    Wraps a tweepy API handle (``self.api``) and uses TextBlob polarity to
    classify tweet text as 'positive', 'neutral' or 'negative'.
    '''

    # Matches, in order: @mentions, any character that is not alphanumeric,
    # space or tab, and URLs of the form scheme://... . Written as a raw
    # string so the regex escapes are not treated as (invalid) string escapes.
    _CLEAN_PATTERN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    def __init__(self):
        '''
        Build the OAuth handler and the tweepy API object.

        On failure the error is printed (not raised) and ``self.api`` is left
        as None, so a later get_tweets() call fails with AttributeError.
        '''
        # Define the attributes up front so they exist even if setup fails
        self.auth = None
        self.api = None
        try:
            # create OAuthHandler object and attach the access token/secret
            self.auth = OAuthHandler(consumer_key, consumer_secret)
            self.auth.set_access_token(access_token, access_token_secret)
            # create tweepy API object to fetch tweets; it sleeps (and prints
            # a notice) instead of failing when the rate limit is reached
            self.api = tweepy.API(self.auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
        except Exception as e:
            # `Exception` rather than a bare except: KeyboardInterrupt and
            # SystemExit still propagate, and the real cause is reported
            self.api = None
            print("Error: Authentication Failed: " + str(e))

    def clean_tweet(self, tweet):
        '''
        Return the tweet text with @mentions, links and special characters
        replaced by spaces, and runs of whitespace collapsed to single spaces.
        '''
        return ' '.join(self._CLEAN_PATTERN.sub(" ", tweet).split())

    def get_tweet_sentiment(self, tweet):
        '''
        Classify the cleaned tweet text with TextBlob.

        Returns 'positive' if the polarity is above zero, 'neutral' if it is
        exactly zero, and 'negative' otherwise.
        '''
        # compute the polarity once instead of once per comparison
        polarity = TextBlob(self.clean_tweet(tweet)).sentiment.polarity
        if polarity > 0:
            return 'positive'
        elif polarity == 0:
            return 'neutral'
        else:
            return 'negative'

    def get_tweets(self, query, geocode, count=10):
        '''
        Search for tweets matching `query` near `geocode` and parse them.

        Parameters:
            query:   search string; also stored as each result's 'candidate'.
            geocode: geocode string passed through to the search call.
            count:   number of tweets requested from the search call.

        Returns a list of dicts with keys 'candidate', 'text' and
        'sentiment'. A tweepy error is printed and an empty list is returned
        (never None), so callers can always concatenate the result.
        '''
        # list to store parsed tweets
        tweets = []

        try:
            # call twitter api to fetch tweets
            fetched_tweets = self.api.search(q=query, count=count, geocode=geocode)

            # parsing tweets one by one
            for tweet in fetched_tweets:
                parsed_tweet = {
                    'candidate': query,
                    'text': tweet.text,
                    'sentiment': self.get_tweet_sentiment(tweet.text),
                }

                # a tweet that has retweets is kept only once per call;
                # tweets without retweets are always appended
                if tweet.retweet_count > 0 and parsed_tweet in tweets:
                    continue
                tweets.append(parsed_tweet)

        except tweepy.TweepError as e:
            # print error (if any) and fall through to return the list
            print("Error : " + str(e))

        return tweets

### The method

Now we will instantiate the TwitterClient class, generate a query, hit the Twitter API, save the tweets in a dataframe, and calculate summary stats to save in a separate dataframe.

* Loop through 51 states (including DC) for each candidate, using the approximate coordinate center of each state with a 100-mile radius

### The problems with the method

I will be searching for the current top 3 Democratic candidates as well as the current US president. Additionally, I will use geolocation to get tweets from each state. Because I am using a standard account, I am limited to 250 queries per month. Therefore I will need to restrict my queries to 1 per state per candidate to be safe. While this will negatively affect the quality of our samples, we should still be able to get some interesting analyses. States like California and Texas will be negatively affected by this method due to the large land area in those states. If everything goes smoothly, I may increase my search queries in those states to address this issue. Additionally, there is the chance that the geocode reaches outside of a state (e.g. Rhode Island).

In [110]:
# One client instance is shared by every query below
api = TwitterClient()

# Names used as the search queries
candidates = [
    'Bernie Sanders',
    'Elizabeth Warren',
    'Joe Biden',
    'Donald Trump',
]

# Start from an empty DataFrame for the collected tweets
df = pd.DataFrame()

In [111]:
tweets = []

# Loop through candidates and geolocations: one search per (candidate, row of coords)

for candidate in candidates:
    # Walk the latitude/longitude columns in parallel instead of indexing by row number
    for lat, lng in zip(coords['Lat'], coords['Lng']):
        # geocode string is "lat,lng,radius"
        geocode = f'{lat},{lng},100mi'

        fetched = api.get_tweets(query=candidate,
                                 count=100,
                                 geocode=geocode)
        # Guard against a None result (e.g. from a failed request) so one bad
        # query does not abort the whole run; extend() also avoids re-copying
        # the accumulated list on every iteration
        tweets.extend(fetched or [])


Rate limit reached. Sleeping for: 191
Rate limit reached. Sleeping for: 782


In [112]:
# Save tweets in a df

# Build the frame in a single call: assigning cell-by-cell with df.loc[i, col]
# enlarges the frame on every new row, which is slow for thousands of tweets.
# Listing the columns explicitly keeps the same three columns (in the same
# order) even when no tweets were fetched.
df = pd.DataFrame(tweets, columns=['candidate', 'text', 'sentiment'])

Let's see how many tweets I have per candidate

In [113]:
# Row count for each distinct value of the 'candidate' column
df.groupby(by='candidate').size()

candidate
Bernie Sanders      2401
Donald Trump        2796
Elizabeth Warren    1919
Joe Biden           2553
dtype: int64

In [114]:
# Persist the collected tweets to disk so later work can reload them

df.to_csv(path_or_buf='pres_tweets.csv')