In [None]:
import pandas as pd

## classifying presidential tweets

In [None]:
# Load the merged presidential tweets; later cells read its 'tweet',
# 'tweet_processed', 'candidate' and 'year' columns.
pres_tweets = pd.read_csv('pres_tweets_merged.csv')

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string

def remove_noise(tweet_tokens, stop_words = ()):
    """Normalise a tokenised tweet into lower-cased, lemmatised tokens.

    Each token is POS-tagged, stripped of URLs and @-mentions, lemmatised
    with WordNet (as noun, verb or adjective, chosen from the tag prefix)
    and lower-cased. Tokens that end up empty, that are punctuation, or
    whose lower-cased form is in ``stop_words`` are dropped.

    Args:
        tweet_tokens: list of string tokens for one tweet.
        stop_words: container of lower-case words to discard (default: none).

    Returns:
        list of cleaned, lower-cased tokens in their original order.
    """
    # One lemmatizer for the whole tweet instead of a new one per token.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # Raw strings keep the regex escapes away from Python's own string
        # escape handling, which warns about them on newer interpreters.
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        # Map the Penn Treebank tag prefix onto a WordNet part of speech.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [None]:
import pickle
from nltk import classify
from nltk import NaiveBayesClassifier

# Load the saved sentiment classifier from disk.
# NOTE: unpickling can execute arbitrary code; only load a pickle you trust.
with open('NBsentiment_classifier.pickle', 'rb') as model_file:
    classifier = pickle.load(model_file)

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
from time import perf_counter

# Row label of the single tweet used for this smoke test / timing run.
sample_index = 152227

start = perf_counter()

tweet_tokens = remove_noise(word_tokenize(pres_tweets['tweet'][sample_index]))
# Build the classifier's feature dict once and reuse it for both calls.
features = {token: True for token in tweet_tokens}
sentiment = classifier.classify(features)
print(sentiment)
prob_dist = classifier.prob_classify(features)
print(prob_dist)
print(prob_dist.prob('Positive'))
print(prob_dist.prob('Negative'))

end = perf_counter()
execution_time = (end - start)
# Elapsed seconds for tokenising, cleaning and classifying one tweet.
print(execution_time)

In [None]:
def get_sentiment(x):
    """Return the classifier's sentiment label for the raw tweet text ``x``.

    The text is tokenised, cleaned with ``remove_noise`` and turned into a
    ``{token: True}`` feature dict before being passed to ``classifier``.
    """
    features = {token: True for token in remove_noise(word_tokenize(x))}
    return classifier.classify(features)

In [None]:
# Classify every presidential tweet; one label per row, in row order.
pres_tweets['sentiment'] = [get_sentiment(tweet) for tweet in pres_tweets['tweet']]

In [None]:
# The pre-processed text column is not used from here on.
pres_tweets.drop(columns=['tweet_processed'], inplace=True)

In [None]:
# Export a copy without the raw tweet text so Tableau can read the CSV.
pres_sentiments = pres_tweets.drop(columns=['tweet'])
pres_sentiments.to_csv('pres_sentiments.csv', index=False)

In [None]:
# Save the full tweet table (raw text plus sentiment label), without the index.
pres_tweets.to_csv('pres_tweets_w_sentiment.csv', index=False)

In [None]:
# Number of presidential tweets per sentiment label.
pres_tweets['sentiment'].value_counts()

## classifying gubernatorial tweets

In [None]:
# Load the merged gubernatorial tweets; later cells read its 'tweet',
# 'tweet_processed', 'candidate', 'year' and 'state' columns.
gub_tweets = pd.read_csv('gub_tweets_merged.csv')

In [None]:
# Classify every gubernatorial tweet; one label per row, in row order.
gub_tweets['sentiment'] = [get_sentiment(tweet) for tweet in gub_tweets['tweet']]

In [None]:
# Display the frame to check that the new 'sentiment' column is present.
gub_tweets

In [None]:
# The pre-processed text column is not used from here on.
gub_tweets.drop(columns=['tweet_processed'], inplace=True)

In [None]:
# Export a copy without the raw tweet text so Tableau can read the CSV.
gub_sentiments = gub_tweets.drop(columns=['tweet'])
gub_sentiments.to_csv('gub_sentiments.csv', index=False)

In [None]:
# Save the full tweet table (raw text plus sentiment label), without the index.
gub_tweets.to_csv('gub_tweets_w_sentiment.csv', index=False)

In [None]:
# Number of gubernatorial tweets per sentiment label.
gub_tweets['sentiment'].value_counts()

## adding aggregate sentiment info to df's

In [None]:
# Load the presidential candidate table; the following cells add aggregate
# sentiment columns to it.
pres_candidates_df = pd.read_csv('pres.csv')
pres_candidates_df

In [None]:
# Candidate names as a plain list, for a quick visual check.
list(pres_candidates_df['candidate'].values)

In [None]:
def get_percent_positive(candidate, year):
    """Fraction of a candidate's tweets in ``year`` labelled 'Positive'.

    Reads the module-level ``pres_tweets`` frame ('candidate', 'year' and
    'sentiment' columns).

    Args:
        candidate: value to match against ``pres_tweets['candidate']``.
        year: value to match against ``pres_tweets['year']``.

    Returns:
        Positive tweets divided by all matching tweets, or 0 when no tweet
        matches (same convention as ``get_percent_positive2``) instead of
        raising ZeroDivisionError.
    """
    is_match = (pres_tweets['candidate'] == candidate) & (pres_tweets['year'] == year)
    n_tweets = int(is_match.sum())
    if n_tweets == 0:
        return 0
    n_positive = int((pres_tweets.loc[is_match, 'sentiment'] == 'Positive').sum())
    return n_positive / n_tweets

In [None]:
# One share of positive tweets per candidate row, in row order.
pres_candidates_df['percent_positive'] = [
    get_percent_positive(candidate, year)
    for candidate, year in zip(pres_candidates_df['candidate'], pres_candidates_df['year'])
]


In [None]:
#add each candidate's share of that year's tweets
def get_percent_total_tweets(candidate, year):
    """Fraction of all tweets in ``year`` that belong to ``candidate``.

    Reads the module-level ``pres_tweets`` frame ('candidate' and 'year'
    columns).

    Args:
        candidate: value to match against ``pres_tweets['candidate']``.
        year: value to match against ``pres_tweets['year']``.

    Returns:
        Candidate tweets divided by all tweets of that year, or 0 when the
        year has no tweets at all (instead of raising ZeroDivisionError).
    """
    year_tweets = pres_tweets[pres_tweets['year'] == year]
    n_year = len(year_tweets)
    if n_year == 0:
        return 0
    n_candidate = len(year_tweets[year_tweets['candidate'] == candidate])
    return n_candidate / n_year

In [None]:
# One share of the year's tweets per candidate row, in row order.
pres_candidates_df['percent_total_tweets'] = [
    get_percent_total_tweets(candidate, year)
    for candidate, year in zip(pres_candidates_df['candidate'], pres_candidates_df['year'])
]


In [None]:
# Display the candidate table with the two new share columns.
pres_candidates_df

In [None]:
#convert year column to object type
# (presumably so year is treated as a label rather than a number in the
# correlation below - TODO confirm)
pres_candidates_df['year'] = pres_candidates_df['year'].astype(object)

In [None]:
# Correlate only numeric/boolean columns: text columns (and the object-typed
# 'year') cannot be correlated and make a bare .corr() raise on pandas >= 2.0.
pres_candidates_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
#not much correlation with sentiment, more correlation with % total tweets...

In [None]:
#do same for gubernatorial candidates

In [None]:
# Load the gubernatorial candidate table, using the file's first column as
# the index, and preview the first rows.
gub_candidates_df = pd.read_csv('gub.csv', index_col=0)
gub_candidates_df.head(5)

In [None]:
def get_percent_positive2(candidate, year):
    """Fraction of a candidate's tweets in ``year`` labelled 'Positive'.

    Reads the module-level ``gub_tweets`` frame. Returns 0 when no tweet
    matches the candidate/year pair.
    """
    same_candidate = gub_tweets['candidate'] == candidate
    same_year = gub_tweets['year'] == year
    matching = gub_tweets[same_candidate & same_year]

    total = len(matching)
    if total == 0:
        return 0

    positive = len(matching[matching['sentiment'] == 'Positive'])
    return positive / total

In [None]:
# Iterate rows positionally: gub_candidates_df was loaded with index_col=0, so
# label lookups such as series[i] are not guaranteed to match row positions.
gub_candidates_df['percent_positive'] = [
    get_percent_positive2(candidate, year)
    for candidate, year in zip(gub_candidates_df['candidate'], gub_candidates_df['year'])
]

In [None]:
# For every candidate row, compute the share of that race's tweets (same year
# and state) whose 'candidate' column equals the candidate.
percent_total_tweets = []
for c, y, s in zip(gub_candidates_df['candidate'],
                   gub_candidates_df['year'],
                   gub_candidates_df['state']):
    race_tweets = gub_tweets[(gub_tweets['year'] == y) & (gub_tweets['state'] == s)]
    if len(race_tweets) == 0:
        # Report the race with no tweets, but still record a value so the list
        # stays aligned with gub_candidates_df (0 matches get_percent_positive2).
        print(c, y, s)
        percent_total_tweets.append(0)
        continue
    # Build the mask from the filtered frame itself so it lines up row-for-row.
    candidate_tweets = race_tweets[race_tweets['candidate'] == c]
    percent_total_tweets.append(len(candidate_tweets) / len(race_tweets))

In [None]:
# Attach the per-candidate shares from the percent_total_tweets list; the list
# must contain exactly one entry per candidate row.
gub_candidates_df['percent_total_tweets'] = percent_total_tweets

In [None]:
# Convert year to object dtype (presumably so it is treated as a label rather
# than a number in the correlation below - TODO confirm).
gub_candidates_df['year'] = gub_candidates_df['year'].astype(object)

In [None]:
# Display the candidate table with the two new share columns.
gub_candidates_df

In [None]:
#remove 2006-2008 data as very few tweets
gub_candidates_df = gub_candidates_df[
    (gub_candidates_df['year'] != 2006)
    & (gub_candidates_df['year'] != 2007)
    & (gub_candidates_df['year'] != 2008)
]

In [None]:
# Correlate only numeric/boolean columns: text columns such as 'candidate' and
# 'state' (and the object-typed 'year') cannot be correlated and make a bare
# .corr() raise on pandas >= 2.0.
gub_candidates_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
#little correlation with sentiment, but .67 correlation with % total tweets 

In [None]:
# Save both candidate tables with their new share columns.
# NOTE(review): gub_candidates_df was loaded with index_col=0, so index=False
# leaves that original first column out of the output file - confirm intended.
gub_candidates_df.to_csv('gub_candidates.csv', index=False)
pres_candidates_df.to_csv('pres_candidates.csv', index=False)