# An analysis of the State of the Union speeches - Part 3
# Word analysis

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
import shelve

plt.style.use('seaborn-dark')
plt.rcParams['figure.figsize'] = (10, 6)

Load data we need from previous runs

In [4]:
addresses = pd.read_hdf('results/df2.h5', 'addresses')
addresses.head()

Unnamed: 0,president,title,date,n_sent,n_words_all,n_words,n_uwords,n_swords,n_chars
0,George Washington,State of the Union Address,1790-01-08,24,1178,538,395,356,6753
1,George Washington,State of the Union Address,1790-12-08,40,1515,683,513,463,8455
2,George Washington,State of the Union Address,1791-10-25,60,2487,1136,731,626,14203
3,George Washington,State of the Union Address,1792-11-06,61,2298,1042,682,580,12764
4,George Washington,State of the Union Address,1793-12-03,56,2132,972,714,652,11696


In [5]:
with shelve.open('results/vars2') as db:
    speech_words = db['speech_words']
    speeches_cleaned = db['speeches_cleaned']

Let's make a single set of all unique words across all speeches

In [125]:
# the length of the set of all unique words across all lists in the list of speeches
n_words = len(set(x for l in speech_words for x in l))
n_words  # number of unique words across all speeches

19176

Now we create a word matrix, whose columns are word vectors for each speech. A word vector contains the word counts for each word across the entire document set. 

In [24]:
def word_vector(doc, vocab):
    """Return a word vector for the input document in the context of a given vocabulary.
    
    Parameters
    ----------
    
    doc: iterable of words
       
    vocab : iterable of words
    integer, size of the entire vocabulary across documents.
    
    Return
    ------
    array
        An integer array, of length equal to `len(vocab)`, containing the count for each
        word in `doc` at its corresponding position in `vocab`.
        
    Example
    -------
    
    >>> doc = "b c b c e".split()
    ... vocab = "a b c d e f".split()
    ... word_vector(doc, vocab)
    ... 
    array([0, 2, 2, 0, 1, 0])
    """
    
    # initalizing the matrix of zeros to correct size
    result = [[0] * len(vocab)] * len(vocab)
    
    for speech in doc:
        speech = list(set(speech)) # selecting unique words only
        for i in range(len(speech)):
            for j in range(i + 1, len(speech)):
                index1 = vocab.index(speech[i])
                index2 = vocab.index(speech[j])
                result[index1][index2] += 1
                result[index2][index1] += 1

    return result

In [29]:
def word_vector(doc, vocab):
    """Return a word vector for the input document in the context of a given vocabulary.
    
    Parameters
    ----------
    
    doc: iterable of words
       
    vocab : iterable of words
    integer, size of the entire vocabulary across documents.
    
    Return
    ------
    array
        An integer array, of length equal to `len(vocab)`, containing the count for each
        word in `doc` at its corresponding position in `vocab`.
        
    Example
    -------
    
    >>> doc = "b c b c e".split()
    ... vocab = "a b c d e f".split()
    ... word_vector(doc, vocab)
    ... 
    array([0, 2, 2, 0, 1, 0])
    """
    
    freqs = []
    for word in vocab:
        freqs.append(doc.count(word))

    return freqs

Let's write a simple unit test for this:

In [32]:
def test_word_vector():
    doc = "b c b c e".split()
    vocab = "a b c d e f".split()
    wv = word_vector(doc, vocab)
    np.testing.assert_equal(wv, np.array([0, 2, 2, 0, 1, 0]) )

test_word_vector()

Now let's make the word matrix for our entire set of documents

In [109]:
# list of unique words across all speeches
unique_vocab = list(set(x for l in speech_words for x in l))

# creating first column of the word frequency matrix
wmat = pd.DataFrame(word_vector(list(speech_words[0]), unique_vocab), unique_vocab)

# setting counter for the column we add in each iteration
counter = 1

# looping over each speech to add the vector of word frequencies to the frequency matrix
for speech in speech_words[1:]:
    wmat[counter] = pd.DataFrame(word_vector(list(speech_words[counter]), unique_vocab), unique_vocab)
    counter = counter + 1

wmat[500:510]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,217,218,219,220,221,222,223,224,225,226
hydrogen-pow,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
township,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
precursor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
coast..,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
plianci,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1929.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mainstay,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
admiration..,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
spasmod,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
postpon,0,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


How sparse is this matrix?

In [122]:
# we convert wmat to boolean, True if non-zero value, False if zero, and then sum across columns
# this gives us number of speeches that use that word, so we take the sum of these word uses to get num of non-zeros
num_non_zero   = sum(wmat.astype(bool).sum(axis=1))

# we compute the number of total elements in the matrix
total_elements = wmat.shape[0]*wmat.shape[1]

# compute the sparsity
sparsity = 1 - (num_non_zero / total_elements)

print(f"wmat is comprised of {100*sparsity:.2f}% zeros.")

wmat is comprised of 93.27% zeros.


## Intermediate results storage

We'll need a few results for the next step, so let's store them in a new set of HDF5/shelve stores for this notebook:

In [124]:
wmat.to_hdf('results/df3.h5', 'wmat')
with shelve.open('results/vars3') as db:
    db['unique_words'] = unique_vocab