In [7]:
# Imports
import numpy as np
import pandas as pd
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from datetime import datetime
import re


## Define tweet cleaning function

In [6]:
def preprocess_tweet(tweet):
    """Clean a raw tweet string so it can be passed to a sentiment scorer.

    The tweet is split on whitespace and each word is cleaned in turn:

    * surrounding punctuation (quotes, ``?!,.():;``) is stripped;
    * hyphens and apostrophes are removed;
    * runs of non-ASCII characters are replaced by a space;
    * twitter handles (leading ``@``), words containing a backslash
      (escape sequences) and urls (leading ``http``) are dropped;
    * a few lower-case acronyms (``wtf``, ``wth``, ``wtaf``, ``tf``) are
      expanded.

    Parameters
    ----------
    tweet : str
        Raw tweet text. A leading ``b`` is removed only when it is followed
        by a quote character, i.e. when the text looks like the string form
        of a bytes literal (``b'...'`` or ``b"..."``). Ordinary tweets that
        merely start with the letter "b" are left intact.

    Returns
    -------
    str
        The cleaned words joined by single spaces, with no leading or
        trailing whitespace. Empty or non-string input (e.g. ``None`` or a
        ``NaN`` produced by pandas for a missing value) yields ``''``.
    """
    # Missing values and empty strings have nothing to clean; returning early
    # also avoids indexing into an empty string.
    if not isinstance(tweet, str) or not tweet:
        return ''

    # Remove the bytes-literal marker only when it really is one. Checking the
    # quote that follows keeps words such as "biden" from losing their "b".
    if tweet[:2] in ("b'", 'b"'):
        tweet = tweet[1:]

    # Acronym expansions (matched case-sensitively on the cleaned word)
    acronyms = {
        'wtf': 'what the fuck',
        'wth': 'what the hell',
        'wtaf': 'what the actual fuck',
        'tf': 'the fuck',
    }

    cleaned_words = []
    for word in tweet.split():

        # Remove punctuation from both ends of the word
        word = word.strip('\'"?!,.():;')

        # Remove - & '
        word = re.sub(r'[-\']', '', word)

        # Replace consecutive non-ASCII characters with a space
        word = re.sub(r'[^\x00-\x7F]+', ' ', word)

        # Skip empty words, twitter handles, words containing an escape
        # character ('\') and urls
        if not word or word[0] == '@' or '\\' in word or word[0:4] == 'http':
            continue

        cleaned_words.append(acronyms.get(word, word))

    # Collapse any whitespace introduced by the non-ASCII replacement so the
    # result is single-spaced with no leading/trailing blanks.
    return ' '.join(' '.join(cleaned_words).split())


## Define tweet scoring functions

In [8]:
def compute_sentiment_score(my_tweet):
    """Return the TextBlob polarity score for the given tweet text."""
    return TextBlob(my_tweet).sentiment.polarity

def assess_sentiment(my_sentiment_score):
    """Map a numeric sentiment score onto a sentiment bucket label.

    Scores greater than zero are 'Positive', scores equal to zero are
    'Neutral', and every other score is 'Negative'.
    """
    if my_sentiment_score > 0:
        return 'Positive'
    if my_sentiment_score == 0.0:
        return 'Neutral'
    return 'Negative'
        

## Import tweets and do basic cleaning/parsing

In [35]:
# Define candidate list
candidates = ['bennet', 'biden', 'booker', 'buttigieg', 'castro',
              'delaney', 'gabbard', 'gillibrand', 'gravel',
              'harris', 'hickenlooper', 'inslee', 'klobuchar',
              'messam', 'moulton', 'ryan', 'sanders', 'swalwell',
              'trump', 'warren', 'weld', 'williamson', 'yang']

# Iterate through candidate list loading data and applying parsing.
# Parsed frames are collected in a list and concatenated once after the loop;
# growing a DataFrame with pd.concat on every iteration re-copies all of the
# previously loaded rows each time.
candidate_frames = []
for candidate in candidates:

    # Read in data
    file_name = f'./Resources/{candidate}.csv'
    raw_data = pd.read_csv(file_name)
    print(f'Loaded {file_name}.  {len(raw_data)} records loaded.')

    # Drop unneeded columns
    df = raw_data.drop(columns=['User Name', 'Retweeted'])

    # Add candidate name
    df['Candidate'] = candidate

    # For Date column, convert string to datetime normalized to midnight
    df['Date'] = pd.to_datetime(df['Date']).dt.normalize()

    # Keep the candidate-specific, parsed records for the final concatenation
    candidate_frames.append(df)

# 'data' holds tweets from all candidates (each file's own row index is kept)
data = pd.concat(candidate_frames)

print(f'Completed loading data for all candidates.  {len(data)} total records.')

# Limit to target_date and inspect counts to verify each candidate has tweets on that date.
# .copy() makes the filtered frame independent of the full one, so columns can
# be added to it later without pandas raising SettingWithCopyWarning.
target_date = '2019-05-02'
data = data.loc[data['Date'] == target_date].copy()
print(f'\nLimiting to {target_date}.  {len(data)} records remaining.\n')

print('Inspect counts by candidate and date to validate:')
data.groupby(['Candidate', 'Date']).count()

Loaded ./Resources/bennet.csv.  3013 records loaded.
Loaded ./Resources/biden.csv.  11999 records loaded.
Loaded ./Resources/booker.csv.  6443 records loaded.
Loaded ./Resources/buttigieg.csv.  12000 records loaded.
Loaded ./Resources/castro.csv.  1749 records loaded.
Loaded ./Resources/delaney.csv.  704 records loaded.
Loaded ./Resources/gabbard.csv.  5598 records loaded.
Loaded ./Resources/gillibrand.csv.  1836 records loaded.
Loaded ./Resources/gravel.csv.  1092 records loaded.
Loaded ./Resources/harris.csv.  12000 records loaded.
Loaded ./Resources/hickenlooper.csv.  743 records loaded.
Loaded ./Resources/inslee.csv.  12000 records loaded.
Loaded ./Resources/klobuchar.csv.  1428 records loaded.
Loaded ./Resources/messam.csv.  172 records loaded.
Loaded ./Resources/moulton.csv.  1150 records loaded.
Loaded ./Resources/ryan.csv.  1277 records loaded.
Loaded ./Resources/sanders.csv.  12000 records loaded.
Loaded ./Resources/swalwell.csv.  4128 records loaded.
Loaded ./Resources/trump.

Unnamed: 0_level_0,Unnamed: 1_level_0,Favorite Count,Retweet Count,Tweet Text
Candidate,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bennet,2019-05-02,2066,2066,2066
biden,2019-05-02,11683,11683,11683
booker,2019-05-02,1010,1010,1010
buttigieg,2019-05-02,2302,2302,2302
castro,2019-05-02,147,147,147
delaney,2019-05-02,66,66,66
gabbard,2019-05-02,559,559,559
gillibrand,2019-05-02,350,350,350
gravel,2019-05-02,104,104,104
harris,2019-05-02,5651,5651,5651


## Compute sentiment scores

In [36]:
# Build the scored frame with assign() rather than setting columns on 'data'
# directly: 'data' was produced by a .loc filter, and assigning columns onto
# such a slice raises SettingWithCopyWarning. assign() returns a new frame and
# evaluates the entries in order, so each step can read the column created by
# the previous one.
data = data.assign(**{
    # Apply cleaning function to tweets
    'Cleaned Tweet Text': lambda frame: frame['Tweet Text'].apply(preprocess_tweet),
    # Compute score for each tweet
    'Sentiment Score': lambda frame: frame['Cleaned Tweet Text'].apply(compute_sentiment_score),
    # Compute sentiment bucket for each tweet
    'Sentiment Bucket': lambda frame: frame['Sentiment Score'].apply(assess_sentiment),
})

# Save to file
data.to_csv('all_candidates.csv')

data.head()

Unnamed: 0,Date,Favorite Count,Retweet Count,Tweet Text,Candidate,Cleaned Tweet Text,Sentiment Score,Sentiment Bucket
792,2019-05-02,1,1,b'@tedcruz Michael Bennet molly whopped you on...,bennet,Michael Bennet molly whopped you on the Senate...,0.0,Neutral
793,2019-05-02,0,0,b'Sen. Michael Bennet announces a 2020 preside...,bennet,Sen Michael Bennet announces a 2020 presidenti...,0.0,Neutral
794,2019-05-02,0,0,b'Senator Michael Bennet Announces Presidentia...,bennet,Senator Michael Bennet Announces Presidential ...,0.0,Neutral
795,2019-05-02,4,1,b'Colorado Sen. Michael Bennet announces 2020 ...,bennet,Colorado Sen Michael Bennet announces 2020 cam...,0.4,Positive
796,2019-05-02,0,0,b'Colorado Senator Michael Bennet announces 20...,bennet,Colorado Senator Michael Bennet announces 2020...,0.0,Neutral


## Inspect various summary views and statistics to validate

In [37]:
# Compute average sentiment for each candidate.
# The score column is selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 (and does needless work on older versions).
average_sentiment_df = data.groupby(['Candidate'])['Sentiment Score'].mean().to_frame()
average_sentiment_df.sort_values(by = ['Sentiment Score'], ascending = False)

Unnamed: 0_level_0,Sentiment Score
Candidate,Unnamed: 1_level_1
williamson,0.142677
buttigieg,0.132055
messam,0.106749
booker,0.105893
ryan,0.101465
weld,0.101285
castro,0.097506
klobuchar,0.091656
bennet,0.089353
gabbard,0.075205


In [38]:
# Count tweets in each sentiment bucket
snowman_counts = (
    data.groupby(['Candidate', 'Sentiment Bucket'])['Sentiment Score']
    .count()
    .to_frame()
)

# Average the numeric columns per candidate and bucket. numeric_only=True keeps
# the text columns out of the aggregation; without it pandas >= 2.0 raises a
# TypeError when it tries to average strings.
snowman_mean = data.groupby(['Candidate', 'Sentiment Bucket']).mean(numeric_only=True)

# Combine the tweet count and the mean score into a single summary table
snowman_plot = pd.DataFrame({
    'Count of Tweets': snowman_counts['Sentiment Score'],
    'Average Sentiment Score': snowman_mean['Sentiment Score'],
})
snowman_plot


Unnamed: 0_level_0,Unnamed: 1_level_0,Count of Tweets,Average Sentiment Score
Candidate,Sentiment Bucket,Unnamed: 2_level_1,Unnamed: 3_level_1
bennet,Negative,191,-0.198381
bennet,Neutral,1198,0.000000
bennet,Positive,677,0.328646
biden,Negative,2339,-0.296506
biden,Neutral,5818,0.000000
biden,Positive,3526,0.307543
booker,Negative,137,-0.296716
booker,Neutral,442,0.000000
booker,Positive,431,0.342465
buttigieg,Negative,267,-0.271072
