In [1]:
import pandas as pd
import codecs
import pickle
from functools import reduce
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import tokenize

In [2]:
with open('../settings/week_dict.pkl', 'rb') as handle:
    week_dict = pickle.load(handle)

In [3]:
#Function to load the 'source' article dictionaries retrieved from the Event Registry API
#returns dataframe composed of all articles and metadata

def load_source_articles(week_dict, pages, keyword):
    df = pd.DataFrame()
    for i in range(len(week_dict)):
        for page_num in range(1, pages+1):
            file_name = keyword+'_'+week_dict[i]['first']+'_page_'+str(page_num)+'_by_Source'
            with open('../data/'+file_name+'.pkl', 'rb') as handle:
                temp_data = pickle.load(handle)
            temp_df = pd.DataFrame.from_dict(reduce(dict.get, ['articles', 'results'], temp_data))
            df = pd.concat([df, temp_df])
    return df

In [4]:
#Function to load the 'share' article dictionaries retrieved from the Event Registry API
#returns dataframe composed of all articles and metadata

def load_share_articles(week_dict, keyword):
    df = pd.DataFrame()
    for i in range(len(week_dict)):
        file_name = keyword+'_'+week_dict[i]['first']+'_'+'by_Shares'
        with open('../data/'+file_name+'.pkl', 'rb') as handle:
            temp_data = pickle.load(handle)
        temp_df = pd.DataFrame.from_dict(reduce(dict.get, ['articles', 'results'], temp_data))
        df = pd.concat([df, temp_df])
    return df

In [19]:
trump_source = load_source_articles(week_dict, 3, 'Trump')
trump_shares = load_share_articles(week_dict, 'Trump')
clinton_source = load_source_articles(week_dict, 3, 'Clinton')
clinton_shares = load_share_articles(week_dict, 'Clinton')

## Remove Articles that Have Opposing Candidate Name in Title

In [6]:
len(trump_source)

13380

In [7]:
len(trump_shares)

4500

In [8]:
len(clinton_source)

13223

In [9]:
len(clinton_shares)

4500

In [20]:
trump_source = trump_source[~trump_source['title'].str.contains('Clinton')]
trump_source = trump_source[~trump_source['title'].str.contains('clinton')]
trump_shares = trump_shares[~trump_shares['title'].str.contains('Clinton')]
trump_shares = trump_shares[~trump_shares['title'].str.contains('clinton')]
clinton_source = clinton_source[~clinton_source['title'].str.contains('Trump')]
clinton_source = clinton_source[~clinton_source['title'].str.contains('trump')]
clinton_shares = clinton_shares[~clinton_shares['title'].str.contains('Trump')]
clinton_shares = clinton_shares[~clinton_shares['title'].str.contains('trump')]

In [21]:
len(trump_source)

11925

In [22]:
len(trump_shares)

4045

In [23]:
len(clinton_source)

9134

In [24]:
len(clinton_shares)

3526

## Sentiment Analysis Using VaderSentiment

In [25]:
#Function to determine overall sentiment score for each article body
#Sentence sentiment scores are weighted based on sentence length before averaging for overall article sentiment

def article_analyzer(article):
    analyzer = SentimentIntensityAnalyzer()
    sentence_list = tokenize.sent_tokenize(article)
    paragraphSentiments = 0.0
    characterCount = 0
    for sentence in sentence_list:
        dec_sent = bytes(sentence, 'utf-8').decode('unicode_escape')
        vs = analyzer.polarity_scores(dec_sent)
        paragraphSentiments += (vs["compound"]*len(dec_sent))
        characterCount += len(dec_sent)
    return paragraphSentiments/characterCount

In [26]:
trump_source['articleSentiment'] = trump_source['body'].apply(article_analyzer)
trump_shares['articleSentiment'] = trump_shares['body'].apply(article_analyzer)
clinton_source['articleSentiment'] = clinton_source['body'].apply(article_analyzer)
clinton_shares['articleSentiment'] = clinton_shares['body'].apply(article_analyzer)

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


In [27]:
#Function to determine sentiment score for each article title

def title_analyzer(article):
    analyzer = SentimentIntensityAnalyzer()
    dec_art = bytes(article, 'utf-8').decode('unicode_escape')
    vs = analyzer.polarity_scores(dec_art)
    return vs['compound']

In [28]:
trump_shares['titleSentiment'] = trump_shares['title'].apply(title_analyzer)
trump_source['titleSentiment'] = trump_source['title'].apply(title_analyzer)
clinton_shares['titleSentiment'] = clinton_shares['title'].apply(title_analyzer)
clinton_source['titleSentiment'] = clinton_source['title'].apply(title_analyzer)

  """


In [29]:
trump_source['origin'] = 'trump_source'
trump_shares['origin'] = 'trump_shares'
clinton_source['origin'] = 'clinton_source'
clinton_shares['origin'] = 'clinton_shares'

In [32]:
#Combine all dataframes - 'origin' column identifies the original source

frames = [trump_source, trump_shares, clinton_source, clinton_shares]
df = pd.concat(frames)

In [33]:
with open('../data/sentiment_df.pkl', 'wb') as f:
        pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)   