# Texts as Matrices

In [1]:
import numpy as np
import spacy
from collections import Counter
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Load the spaCy English pipeline; it is used below to tokenize paragraphs and tag entities.
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases expect a full
# package name such as 'en_core_web_sm' — confirm against the installed version.
nlp = spacy.load('en')

In [2]:
def read_paragraphs(fname):
    '''Read a text file and return its non-empty paragraphs.

    Paragraphs are the chunks separated by a blank line (two consecutive
    newlines). Empty chunks are dropped; whitespace-only chunks are kept.
    '''
    with open(fname, 'r') as handle:
        chunks = handle.read().split('\n\n')
    # filter(None, ...) discards only the empty strings
    return list(filter(None, chunks))

# Load both strategy documents as lists of paragraph strings.
trump_par_texts = read_paragraphs('nss/trump_nss.txt')
obama_par_texts = read_paragraphs('nss/obama_nss.txt')
# Trump paragraphs come first, so indices [0, k) are Trump and [k, end) are Obama.
par_texts = trump_par_texts + obama_par_texts
k = len(trump_par_texts)
len(par_texts), len(trump_par_texts), len(obama_par_texts)

(550, 400, 150)

## Tokenization Formula
A common step in many text analysis algorithms is to first convert the raw text into sets of tokens. Spacy does most of the work here; there are just a few decisions that need to be made depending on the application: which tokens to include and how to represent the tokens as strings.

In [3]:
def parse_tok(tok):
    '''Map a spaCy token to the string that represents it in the vocabulary.

    Tokens outside any entity are lower-cased, numeric-style entities are
    replaced by their entity label, and all other entities keep their
    original text.
    '''
    numeric_labels = {'NUMBER', 'MONEY', 'PERCENT', 'QUANTITY', 'CARDINAL', 'ORDINAL'}
    label = tok.ent_type_
    # no entity label: plain word, normalise case
    if label == '':
        return tok.text.lower()
    return label if label in numeric_labels else tok.text
    
def use_tok(tok):
    '''Return True for tokens worth keeping: ASCII, non-space, and non-blank.'''
    if not tok.is_ascii:
        return False
    if tok.is_space:
        return False
    # reject tokens whose text is nothing but whitespace
    return tok.text.strip() != ''
    
def parse_doc(doc):
    '''Turn a parsed spaCy doc into a list of token strings.

    Each entity span is first merged into a single token on ``doc``; the
    resulting tokens are then filtered with ``use_tok`` and rendered with
    ``parse_tok``.
    '''
    # collapse every entity span into one token, keeping the root's tag and entity type
    for entity in doc.ents:
        head = entity.root
        entity.merge(tag=head.tag_, ent_type=head.ent_type_)
    kept = filter(use_tok, doc)
    return list(map(parse_tok, kept))

# Run every paragraph through the spaCy pipeline and convert each resulting doc to a token list.
tokenized_pars = [parse_doc(par) for par in nlp.pipe(par_texts)]

In [4]:
# sanity check: first five tokens of the first paragraph
tokenized_pars[0][:5]

['an', 'America', 'that', 'is', 'safe']

## Bag-of-Words and Document-Term Matrices

In [48]:
# Build the document-term count matrix from the already-tokenized paragraphs.
# NOTE: despite its name, min_tf is passed as min_df below (a document-frequency cutoff).
min_tf = 2
# identity tokenizer/preprocessor: each input document is already a list of tokens
vectorizer = CountVectorizer(tokenizer = lambda x: x, preprocessor=lambda x:x, min_df=min_tf)
corpus = vectorizer.fit_transform(tokenized_pars)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
vocab = vectorizer.get_feature_names()
# dense copy of the sparse matrix (rows = paragraphs, columns = vocab entries)
DOCBOW = corpus.toarray()
len(vocab), type(corpus), corpus.shape, DOCBOW.shape

(2278, scipy.sparse.csr.csr_matrix, (441, 2278), (441, 2278))

Now reduce vocabulary to words which appear at least once in both corpora.

In [49]:
# Keep only vocabulary columns with a nonzero count in both sub-corpora
# (rows [0, k) are Trump paragraphs, rows [k, end) are Obama paragraphs).
valid_cols = (DOCBOW[:k].sum(axis=0) > 0) & (DOCBOW[k:].sum(axis=0) > 0)
rm_wordids = np.argwhere(~valid_cols)[:,0]  # indices of the removed words
# Filter vocab with the boolean mask itself: one lookup per word instead of
# scanning the removed-index array for every word.
vocab = [w for w, keep in zip(vocab, valid_cols) if keep]
DOCBOW = DOCBOW[:,valid_cols]
DOCBOW.shape

(441, 1517)

Now remove documents that contain fewer than `min_wordcount` (10) occurrences of the selected vocab words.

In [50]:
# Drop paragraphs whose total count over the retained vocabulary is below min_wordcount.
min_wordcount = 10
# The token lists, raw texts and matrix rows are parallel; fail loudly if an
# out-of-order re-run has left them misaligned instead of filtering the wrong rows.
assert len(tokenized_pars) == len(par_texts) == DOCBOW.shape[0], 'paragraph lists and DOCBOW rows are out of sync'
zerosel = DOCBOW.sum(axis=1) < min_wordcount  # True for rows to remove
zeroind = np.argwhere(zerosel)[:,0]           # indices of the removed rows
# Filter the parallel lists with the row mask itself: one lookup per paragraph
# instead of scanning the removed-index array for every paragraph.
tokenized_pars = [toks for toks, drop in zip(tokenized_pars, zerosel) if not drop]
par_texts = [par for par, drop in zip(par_texts, zerosel) if not drop]
# Shift the Trump/Obama boundary down by the number of removed Trump rows.
k = k - (zeroind < k).sum()
DOCBOW = DOCBOW[~zerosel]

DOCBOW.shape, k, len(tokenized_pars), len(par_texts)

((441, 1517), 324, 441, 441)

In [51]:
# peek at the first ten vocabulary entries and the matching corner of the count matrix
print(vocab[:10])
DOCBOW[:10,:10]

['(', ')', ',', '-', '.', '70 years', ':', ';', 'ASEAN', 'Afghan']


array([[ 0,  0,  6,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  3,  0,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  6,  1,  4,  0,  0,  0,  0,  0],
       [ 0,  0,  8,  1,  4,  0,  0,  0,  0,  0],
       [ 0,  0,  4,  0,  2,  0,  0,  0,  0,  0],
       [ 0,  0,  5,  0,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  3,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  2,  0,  0,  0,  0,  0],
       [ 0,  0, 13,  0,  5,  0,  0,  0,  0,  0],
       [ 1,  1,  9,  1,  3,  0,  0,  0,  0,  0]])

In [52]:
# number of paragraphs whose row of counts sums to zero
(DOCBOW.sum(axis=1)==0).sum()

0

Using bag of words, we now compare word distributions averaged across the documents from Trump and Obama. Here we present the words that are more likely to be in the Trump NSS compared to Obama's.

In [53]:
# Rank words by the log ratio of their relative frequency in Trump vs. Obama paragraphs.
topn = 30
# per-word totals over each source's rows (rows [0, k) are Trump, the rest Obama)
trump_cts, obama_cts = DOCBOW[:k].sum(axis=0), DOCBOW[k:].sum(axis=0)
# log of (share of Trump counts) / (share of Obama counts); positive values favour Trump
# NOTE(review): a word whose count is zero in either source yields +/-inf here —
# confirm none remain after the paragraph filtering above.
logdiff = np.log( (trump_cts/trump_cts.sum()) / (obama_cts / obama_cts.sum()))
# indices of the topn largest values, largest first
indices = logdiff.argsort()[-topn:][::-1]
print(', '.join(['{} ({:.2f})'.format(vocab[idx],logdiff[idx]) for idx in indices]))

: (3.05), missile (2.30), compete (2.30), under (2.23), conditions (2.15), nation (2.10), immigration (2.06), seeks (2.06), liberty (2.06), industry (2.06), Americans (2.03), sovereign (1.96), minded (1.96), continues (1.96), adversaries (1.93), life (1.91), encourage (1.86), deterrence (1.86), want (1.86), was (1.86), identify (1.86), intellectual (1.86), communications (1.86), principles (1.74), understand (1.74), base (1.74), criminals (1.74), technologies (1.70), practices (1.61), weapon (1.61)


### Topic Modeling
Because topic models are computed directly from document-term matrices, I demonstrate the use of both the NMF and LDA algorithms. After computing each model, I then compute the log ratio of probabilities of subcorpora being associated with each topic. Larger values mean Trump's documents are more closely associated with the topic while more negative values are more closely associated with Obama.

In [54]:
# non-negative matrix factorization (similar to pca but for only positive-entry matrices)
nmf_model = NMF(n_components=10).fit(DOCBOW)
# per-document topic weights (one row per paragraph, one column per topic)
doc_topics = nmf_model.transform(DOCBOW)
# per-topic word weights (one row per topic, one column per vocab entry)
topic_words = nmf_model.components_
topic_words.shape, doc_topics.shape

((10, 1517), (441, 10))

In [55]:
# for nmf compare distributions between sources:
# average the topic weights over each source's paragraphs, then take the log ratio (Trump over Obama)
trump_av = doc_topics[:k].mean(axis=0)
obama_av = doc_topics[k:].mean(axis=0)
logratio = np.log(trump_av/obama_av)
logratio

array([-0.41266799, -0.6190324 , -0.84266868, -0.74243086, -0.41454212,
       -0.475081  , -0.85031269, -0.17978738, -1.80003794, -0.40561345])

In [56]:
# latent Dirichlet allocation: a probabilistic topic model fit on the same count matrix
# random_state is fixed so the fitted topics are reproducible from run to run
lda_model = LatentDirichletAllocation(n_components=10, random_state=0).fit(DOCBOW)
# NOTE: doc_topics and topic_words are rebound here, replacing the NMF results
doc_topics = lda_model.transform(DOCBOW)
topic_words = lda_model.components_
topic_words.shape, doc_topics.shape

((10, 1517), (441, 10))

In [57]:
# for lda compare distributions between sources:
# average the topic weights over each source's paragraphs, then take the log ratio (Trump over Obama)
trump_av = doc_topics[:k].mean(axis=0)
obama_av = doc_topics[k:].mean(axis=0)
logratio = np.log(trump_av/obama_av)
logratio

array([ 0.27012641,  1.63140236, -0.64284031,  0.41799059,  1.42427794,
        1.07441032,  2.27313089,  1.36601186,  0.82974742,  1.72660099])

### Pointwise Mutual Information
The [pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information) calculates the level of association between two variables, document and word in this case (as designated by the 2-dimensional distribution), by controlling for both the frequency of a word and the number of words in a document. Higher values mean that the word is more uniquely associated with the document statistically than not.

Positive pointwise mutual information is a variant of PMI which sets negative values (words which are less associated with documents than expected) to zero. While we lose some information here, this solves the problem of -infinity values caused by taking log(0) and has been shown to still be a robust measure.
Levy, Goldberg, Dagan (2015) _Improving Distributional Similarity with Lessons Learned from Word Embeddings_ ([link](https://levyomer.files.wordpress.com/2015/03/improving-distributional-similarity-tacl-2015.pdf)).

In [58]:
import textfields # from included script

In [59]:
# Apply the calc_ppmi helper from the included textfields script to the document-term counts
# (presumably the positive PMI described above — verify against textfields).
PPMI = textfields.calc_ppmi(DOCBOW)
print(PPMI.shape)
PPMI[:5,:5]

(441, 1517)


array([[0.        , 0.        , 6.77537832, 0.        , 6.4100989 ],
       [0.        , 0.        , 6.7187971 , 0.        , 6.04941893],
       [0.        , 0.        , 6.44253712, 7.03138063, 6.29301937],
       [0.        , 0.        , 6.70936451, 7.08244637, 6.34408511],
       [0.        , 0.        , 6.64264932, 0.        , 6.27736998]])

The power of PMI is that you can go back to the original documents to examine the words most closely associated with them compared to all other docs in the corpus.

In [60]:
# Print the five highest-PPMI words for one document, followed by the document text.
target_docid = 0
# work on a copy so the selected entries can be zeroed without touching PPMI
temp_PPMI = PPMI.copy()
for i in range(5):
    # current largest remaining score in the target row
    idx = temp_PPMI[target_docid,:].argmax()
    print('{} ({:.2f})'.format(vocab[idx], temp_PPMI[target_docid,idx]), end=' ')
    # zero it out so the next iteration picks the next-largest entry
    temp_PPMI[target_docid,idx] = 0
print('\n', par_texts[target_docid])

putting (12.06) uphold (10.39) foundation (10.29) liberty (10.06) enduring (9.98) 
 An America that is safe, prosperous, and free at home is an America with the strength, confidence, and will to lead abroad. It is an America that can preserve peace, uphold liberty , and create enduring advantages for the American people. Putting America first is the duty of our government and the foundation for U.S. leadership in the world.


## Token Co-Occurrence Matrices, Word Embeddings
Now we can consider document co-occurrences. Co-occurrences in general can be broken down at the n-gram, sentence, paragraph, or document level; in this example we will demonstrate using only document frequencies. This is because the co-occurrence matrix can be constructed using only the docbow matrix.

In [61]:
# DOCBOW is the document-term count matrix from which the word co-occurrence matrix is built
DOCBOW.shape # dimensionality of the original co-occurrence matrix (#docs x #vocab)

(441, 1517)

In [62]:
# word-by-word matrix: entry (i, j) is the sum over documents of count_i * count_j
COOC = DOCBOW.T.dot(DOCBOW)
COOC.shape

(1517, 1517)

Each row and each column corresponds to a token, and the entries indicate the number of times two tokens appeared in the same document.

In [63]:
print(vocab[:5])
# NOTE(review): `0:` selects every row, so this displays the whole matrix (abbreviated
# by NumPy) — a `[:5,:5]` corner matching vocab[:5] may have been intended.
COOC[0:,:]

['(', ')', ',', '-', '.']


array([[   42,    43,   211, ...,     6,     0,     0],
       [   43,    47,   237, ...,     6,     0,     0],
       [  211,   237, 15102, ...,   667,    51,    13],
       ...,
       [    6,     6,   667, ...,   153,     1,     0],
       [    0,     0,    51, ...,     1,    10,     1],
       [    0,     0,    13, ...,     0,     1,     2]])

Now by using PPMI and SVD we can collapse the co-occurrence matrix into a smaller number of dimensions and create word vectors based on the co-occurrence counts.

In [64]:
# Build word vectors from the co-occurrence matrix using the textfields helpers.
n_dim = 100
# NOTE: PPMI is rebound here to the word-by-word matrix, replacing the document-term PPMI computed earlier
PPMI = textfields.calc_ppmi(COOC)
# presumably a truncated SVD keeping n_dim dimensions per word — verify against textfields
SVD = textfields.calc_svd(PPMI, n_dim)
SVD = SVD / np.linalg.norm(SVD, axis=1)[:,np.newaxis] # normalize vectors to unit length
print(SVD.shape)
SVD[0,:5]

(1517, 100)


array([-0.8184816 , -0.05947632, -0.17220632,  0.10747321, -0.19501252])

In [65]:
ind = vocab.index('freedom') # index of matrix row corresponding to freedom
# rows were normalized to unit length, so each dot product is a cosine similarity
# (despite the name, dists holds similarities: larger means closer)
dists = SVD.dot(SVD[ind])
dists[:5] # cosine similarity between each word and 'freedom'

array([0.68797475, 0.68542415, 0.6445462 , 0.65569054, 0.64270953])

In [67]:
# rank all words by similarity to the query word, most similar first
order = dists.argsort()[::-1]
# drop the query word by its index rather than assuming it sorts first:
# ties or rounding error can place another word ahead of it
order = order[order != ind]
topwords = [vocab[idx] for idx in order]
topwords[:10]

['individual',
 'between',
 'law',
 'open',
 'who',
 'actors',
 'illegal',
 'society',
 'those',
 'civil']