In [1]:
%matplotlib inline
import string
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Limit how many rows pandas renders when a DataFrame is displayed.
# The full option name is spelled out: pandas resolves abbreviations such as
# 'display.max_row' by pattern matching, which can stop working if a future
# option name matches the same pattern.
pd.set_option('display.max_rows', 8)

In [3]:
# Read the raw speeches table from the working directory into a DataFrame.
speeches = pd.read_csv(filepath_or_buffer='speeches_raw.csv')

In [4]:
# Show the first five rows as a quick sanity check of the loaded columns.
speeches.head(n=5)

Unnamed: 0,president,title,date,link,transcript
0,Barack Obama,Acceptance Speech at the Democratic National C...,2008-08-28,http://millercenter.org/president/obama/speech...,To Chairman Dean and my great friend Dick Durb...
1,Barack Obama,Remarks on Election Night,2008-11-04,http://millercenter.org/president/obama/speech...,If there is anyone out there who still doubts ...
2,Barack Obama,Inaugural Address,2009-01-20,http://millercenter.org/president/obama/speech...,I stand here today humbled by the task before ...
3,Barack Obama,Remarks on the Lilly Ledbetter Fair Pay Restor...,2009-01-29,http://millercenter.org/president/obama/speech...,It is fitting that with the very first bill I ...
4,Barack Obama,Remarks on the American Recovery and Reinvestm...,2009-02-07,http://millercenter.org/president/obama/speech...,"Thank you, everybody. Please have a seat. Yo..."


In [5]:
# Cast every transcript to a string so the string operations applied to this
# column in the following cells work on every row.
speeches['transcript'] = speeches['transcript'].astype(dtype='str')

In [6]:
# Parse the date strings into datetimes. pd.to_datetime accepts the whole
# column, so it is called once on the Series instead of once per row via apply.
speeches['date'] = pd.to_datetime(speeches['date'])

In [7]:
# Number of whitespace-separated tokens in each transcript.
speeches['word_count'] = speeches['transcript'].str.split().str.len()

In [8]:
# Approximate the number of sentences by splitting on periods.
# Only fragments that contain text are counted: a plain len(x.split('.'))
# also counts the empty piece after a final period (and the empty pieces
# inside an ellipsis), which overstates the total. The count is kept at a
# minimum of 1 so it can safely be used as a divisor.
speeches['sentence_count'] = speeches['transcript'].apply(
    lambda x: max(1, sum(1 for fragment in x.split('.') if fragment.strip()))
)

In [9]:
# Average number of words per sentence for each speech.
speeches['sentence_length'] = speeches['word_count'].div(speeches['sentence_count'])

In [10]:
def get_word_length(text):
    """Return the mean length of the words in ``text``, ignoring punctuation.

    The text is split on whitespace, every ASCII punctuation character
    (``string.punctuation``) is removed from each token, and tokens that are
    empty after stripping (for example a standalone ``--``) are skipped.

    Parameters
    ----------
    text : str
        The text to measure.

    Returns
    -------
    float
        Average number of characters per word, or ``0.0`` when the text
        contains no words.
    """
    # str.replace(string.punctuation, '') would look for the entire
    # punctuation string as one substring and never match; a translation
    # table deletes each punctuation character individually.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped_tokens = (token.translate(strip_punctuation) for token in text.split())
    word_lengths = [len(token) for token in stripped_tokens if token]
    if not word_lengths:
        # Avoid dividing by zero for empty or punctuation-only text.
        return 0.0
    return sum(word_lengths) / len(word_lengths)

In [11]:
# Mean word length of each transcript, computed row by row.
speeches['word_length'] = speeches['transcript'].map(get_word_length)

In [12]:
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import sent_tokenize, word_tokenize

def get_sentiment(word):
    """Return a sentiment score for ``word`` from SentiWordNet.

    The score is the positive score minus the negative score of the first
    synset that ``swn.senti_synsets`` yields for the word.

    Parameters
    ----------
    word : str
        The token to look up.

    Returns
    -------
    float or int
        ``pos_score() - neg_score()`` of the first synset, or ``0`` when the
        word has no synsets or the lookup fails.
    """
    try:
        # Look the word up once and reuse the result for both scores.
        synsets = list(swn.senti_synsets(word))
        if not synsets:
            return 0
        first = synsets[0]
        return first.pos_score() - first.neg_score()
    except Exception:
        # Best effort: a failed lookup counts as neutral. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply can be interrupted.
        # NOTE(review): this also hides lookup errors such as a missing
        # corpus; confirm that is acceptable.
        return 0

def get_sentence_sentiment(text):
    """Score every sentence of ``text`` by summing its word sentiments.

    The text is lower-cased, em dashes are replaced by spaces, and the result
    is split into sentences with ``sent_tokenize`` and into tokens with
    ``word_tokenize``. Punctuation tokens and the token ``'applause'`` are
    dropped, and each remaining sentence is scored as the sum of
    ``get_sentiment`` over its tokens.

    Parameters
    ----------
    text : str
        The transcript to score.

    Returns
    -------
    list
        One summed score per sentence that still has at least one token after
        filtering, in sentence order. Empty when no sentence has any.
    """
    punctuation_chars = set(string.punctuation)
    # Presumably the em dash is replaced so that words joined by it are
    # tokenized as separate words.
    text = text.lower().replace('—', ' ')

    sentiment = []
    for sentence in sent_tokenize(text):
        # A token counts as punctuation when every character is punctuation.
        # The substring test `word in string.punctuation` misses tokens such
        # as '...', '--' and the tokenizer's quote tokens.
        words = [
            token
            for token in word_tokenize(sentence)
            if token != 'applause'
            and not all(char in punctuation_chars for char in token)
        ]
        if words:
            sentiment.append(sum(get_sentiment(token) for token in words))
    return sentiment

In [13]:
# Per-sentence sentiment scores for each speech (one list per row).
speeches['sentence_sentiment'] = speeches['transcript'].map(get_sentence_sentiment)

In [14]:
# Average the per-sentence scores into one value per speech. A transcript
# with no scored sentences has an empty list, so NaN is recorded instead of
# letting sum(x) / len(x) raise ZeroDivisionError.
speeches['speech_sentiment'] = speeches['sentence_sentiment'].apply(
    lambda x: sum(x) / len(x) if x else float('nan')
)

In [16]:
# Write the enriched speeches table to disk without the DataFrame index.
speeches.to_csv(path_or_buf='speeches.csv', index=False)

In [17]:
# Read the candidates table from the working directory into a DataFrame.
candidates = pd.read_csv(filepath_or_buffer='candidates.csv')

In [18]:
# Per-sentence sentiment scores for each candidate transcript.
# get_sentence_sentiment calls text.lower(), so a non-string cell (such as a
# missing value read as a float) would raise AttributeError; the column is
# cast to str first, matching the cast applied to the speeches transcripts.
candidates['sentence_sentiment'] = (
    candidates['transcript'].astype('str').apply(get_sentence_sentiment)
)

In [19]:
# Average the per-sentence scores into one value per transcript. An empty
# score list yields NaN instead of letting sum(x) / len(x) raise
# ZeroDivisionError.
candidates['speech_sentiment'] = candidates['sentence_sentiment'].apply(
    lambda x: sum(x) / len(x) if x else float('nan')
)

In [20]:
# Write the candidates table, now with the sentiment columns, back to
# 'candidates.csv' without the DataFrame index.
candidates.to_csv(path_or_buf='candidates.csv', index=False)