# A1. US Presidential Speeches 

In [1]:
# import needed libraries
import pandas as pd
import re
import contractions
import string
import nltk
from nltk.corpus import inaugural
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Create corpus

In [2]:
# Load the table that supplies the `party` column used when building the corpus
# dictionary (presumably one row per inaugural speech, in corpus order -- verify).
# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists.
df = pd.read_csv("C:/Users/echemochek/Downloads/presidents.csv", header=0)

In [3]:
# Column store for the per-speech results: every column starts as an empty list
# and is filled while iterating over the speeches, except "party", which is
# taken directly from the preloaded dataframe.
# (A "theme" column is planned but not created yet.)
corpus_dict = {
    column: []
    for column in (
        "year",
        "president",
        "party",
        "n_vocab",
        "top5_vocabs",
        "polarity",
    )
}
corpus_dict["party"] = list(df.party)

### Add year of speech and name of president

### Preprocessing steps

In [4]:
def preprocess(file_name):
    """Clean one inaugural speech and return its tokens before and after lemmatization.

    Pipeline: lowercase -> expand contractions -> strip ASCII punctuation ->
    split on whitespace -> drop tokens of one or two characters ->
    remove English stop words -> POS-tag -> lemmatize.

    Args:
        file_name: File id of a speech, as accepted by ``inaugural.raw``.

    Returns:
        A tuple ``(tokens_raw, tokens_lemmatized)``:
        ``tokens_raw`` holds the lowercase tokens longer than two characters
        (stop words still included); ``tokens_lemmatized`` holds the
        stop-word-filtered tokens after lemmatization, one lemma per token.
    """
    # load the speech and convert to lowercase
    speech = inaugural.raw(file_name).lower()

    # expand contractions word by word, then rebuild the text
    speech = ' '.join(contractions.fix(word) for word in speech.split())

    # remove punctuation (only the ASCII characters in string.punctuation)
    speech = speech.translate(str.maketrans('', '', string.punctuation))

    # tokenize on whitespace, keeping only tokens with more than 2 characters
    tokens_raw = [token for token in speech.split() if len(token) > 2]

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens_filtered = [token for token in tokens_raw if token not in stop_words]

    # Tag the whole sequence in ONE call so the tagger can use neighbouring
    # words; tagging each word on its own discards that context and costs one
    # tagger call per token.
    tokens_tagged = pos_tag(tokens_filtered)

    # Map the first letter of a Penn Treebank tag to the POS constant that
    # lemmatize() accepts; anything else falls back to noun.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # One lemmatizer instance is enough for the whole speech.
    lemmatizer = WordNetLemmatizer()
    tokens_lemmatized = [
        lemmatizer.lemmatize(token, tag_dict.get(tag[:1].upper(), wordnet.NOUN))
        for token, tag in tokens_tagged
    ]

    return tokens_raw, tokens_lemmatized

In [5]:

# Fill corpus_dict with one entry per inaugural speech: year, president name,
# token count, five most frequent lemmas and mean word-level polarity.

# Compiled once: captures the text between the first "-" and the next "." of a
# file id, which is where the president's name sits.
name_pattern = re.compile(r"-(.*?)\.")

# Built once and reused; constructing an analyzer per word is needlessly slow.
sentiment_analyzer = SentimentIntensityAnalyzer()

for fileid in inaugural.fileids():
    name_match = name_pattern.search(fileid)
    if name_match is None:
        # Fail before anything is appended so the columns stay the same length.
        raise ValueError(f"cannot extract president name from file id {fileid!r}")

    # preprocess the text using the defined preprocess function
    tokens_uncleaned, tokens_cleaned = preprocess(fileid)

    # five most frequent lemmas (fewer if the speech has fewer distinct lemmas)
    freq = nltk.FreqDist(tokens_cleaned).most_common(5)
    top5words = [word for word, _count in freq]

    # Polarity: mean VADER compound score over the tokens that carry any
    # sentiment (compound != 0); neutral tokens are ignored.
    pol_words = 0
    sum_pol = 0.0
    for w in tokens_uncleaned:
        compound = sentiment_analyzer.polarity_scores(w)["compound"]
        if compound != 0:
            pol_words += 1
            sum_pol += compound

    # A speech without any scored token is treated as neutral instead of
    # raising ZeroDivisionError.
    polarity = sum_pol / pol_words if pol_words else 0.0

    # Append all columns together so they always stay aligned.
    corpus_dict["year"].append(int(fileid[:4]))
    corpus_dict["president"].append(name_match.group(1))
    # NOTE(review): this is the number of cleaned tokens, not the number of
    # distinct words; use len(set(tokens_cleaned)) if vocabulary size is meant.
    corpus_dict["n_vocab"].append(len(tokens_cleaned))
    corpus_dict["top5_vocabs"].append(top5words)
    corpus_dict["polarity"].append(round(polarity, 2))

### Add party affiliation (Democrat/Republican/Other)

##### Which presidents have the most/least vocabulary

##### Which president has the least vocabulary

##### On average, do Democratic, Republican, or Other presidents have higher vocabulary?

##### Create a barplot of presidential vocabulary from the earliest president (Washington) to the latest (Trump) in chronological order. Color code this barplot as blue for Democrat, red for Republican, and gray for Others. (1 point)

##### What are the five most frequently used words (exclusive of stop words) for each president? What are the five words most frequently used collectively by all Democratic presidents versus all Republican presidents? (2 points)

##### What are the key themes (e.g., freedom, liberty, country, etc.) used by each president in their inaugural speech? (3 points)

In [6]:
# top 3 themes
# use LDA

##### Compute a sentiment (positive/negative) for each presidential speech, and draw a barplot of the sentiment of all presidential speeches in chronological order. Again, color code the speeches as blue for Democrat, red for Republican, and gray for Other. Which of these groups has the highest mean sentiment score? Who are the top three presidents with the highest positive sentiment in each group? (2 points)

In [7]:
# Sanity check: report the length of every column; they must all be equal
# before the dictionary can be turned into a DataFrame.
for label, key in (
    ("Year", "year"),
    ("Name", "president"),
    ("Party", "party"),
    ("Vocabularies", "n_vocab"),
    ("Top 5", "top5_vocabs"),
    ("Polarity", "polarity"),
):
    print(f"{label}:", len(corpus_dict[key]))

Year: 59
Name: 59
Party: 59
Vocabularies: 59
Top 5: 59
Polarity: 59


In [8]:
# Sanity check: (rows, columns) of the table loaded from the CSV.
df.shape

(59, 3)

In [9]:
# Sanity check: number of speeches in the inaugural corpus.
len(inaugural.fileids())

59

In [10]:
# Build a DataFrame from the collected columns (dictionary keys become column names).
data = pd.DataFrame.from_dict(corpus_dict)

In [13]:
# Display the assembled table.
data

Unnamed: 0,year,president,party,n_vocab,top5_vocabs,polarity
0,1789,Washington,Other,647,"[every, government, public, may, present]",0.29
1,1793,Washington,Other,60,"[shall, oath, fellow, citizen, call]",0.09
2,1797,Adams,Other,1058,"[people, nation, government, may, state]",0.28
3,1801,Jefferson,Other,805,"[government, principle, may, let, right]",0.25
4,1805,Jefferson,Other,998,"[public, state, citizen, may, fellow]",0.23
5,1809,Madison,Other,518,"[nation, public, country, right, state]",0.23
6,1813,Madison,Other,544,"[war, country, united, nation, state]",0.04
7,1817,Monroe,Other,1541,"[state, great, government, people, every]",0.26
8,1821,Monroe,Other,2022,"[great, state, power, war, make]",0.21
9,1825,Adams,Other,1352,"[government, union, upon, nation, power]",0.25
