In [5]:
from __future__ import print_function, unicode_literals

from collections import Counter
import os

import arrow
from IPython.display import Image
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sb

# render matplotlib figures inline in the notebook output
%matplotlib inline
# switch seaborn to its 'talk' plotting context
sb.set_context('talk')

In [6]:
import textacy

ValueError: spacy.syntax.nn_parser.Parser size changed, may indicate binary incompatibility. Expected 72 from C header, got 64 from PyObject

---

## Topic Modeling

- unsupervised statistical models for identifying latent themes ("topics") across many documents
- topics are recurring patterns of co-occurring words 
- documents are mixtures of topics
- useful for clustering, summarizing, exploring

In [None]:
# diagram from yesterday's "Summarizing Documents" talk by Mike Williams, http://mike.place/talks/pygotham/#14
# (loaded from the local assets/ directory and displayed as the cell output)
Image('assets/topic-modeling-diagram.png', width=600)

### Step 0: Get a corpus

#### the Bernie and Hillary corpus

- all(?) speeches given by Bernie Sanders and Hillary Clinton on the floor of Congress between January 1996 and February 2016
- data source: the Sunlight Foundation’s [Capitol Words API](http://sunlightlabs.github.io/Capitol-Words/)

In [None]:
# materialize everything yielded by the Bernie-and-Hillary fetcher into a list
docs = list(textacy.corpora.fetch_bernie_and_hillary())

In [None]:
# peek at the last fetched record
docs[-1]

### Step 1: Process texts, and pair with their metadata

- initial text processing handled by spaCy
- pairing content and metadata facilitates richer understanding, added analysis possibilities, and user convenience

In [None]:
# split the fetched records into paired streams of texts and metadata,
# using each record's 'text' field as the content
text_stream, metadata_stream = textacy.fileio.split_content_and_metadata(
    docs, 'text', itemwise=False)
# instantiate an English ('en') corpus from these streams, using 2 threads
corpus = textacy.TextCorpus.from_texts(
    'en', text_stream, metadata=metadata_stream, n_threads=2)
# save corpus to disk under ./assets with the 'bernie_and_hillary' file prefix
# NOTE(review): the following cell reloads the corpus from these files, so this
# slow cell presumably only needs to be run once -- confirm
corpus.save('./assets', fname_prefix='bernie_and_hillary')

^ this takes about 2 minutes (which is _quite fast_, really)

In [None]:
# load previously-saved corpus from disk
# (same directory and file prefix that `corpus.save` was called with)
corpus = textacy.TextCorpus.load('./assets', fname_prefix='bernie_and_hillary')
corpus

In [None]:
# take the last document in the corpus as the running example and display it
doc = corpus[-1]
doc

In [None]:
# show the metadata that was paired with this document's text
doc.metadata

### Step 2: Tokenize and vectorize the corpus

- split (tokenize) into:
    - words?
    - ngrams?
    - named entities?
    - some combination thereof?
- weight tokens by:
    - term frequency?
    - tf-idf?
- exclude tokens if:
    - stopword?
    - too common?
    - too rare?

In [None]:
# display the bag-of-words illustration from the local assets/ directory
Image('assets/bag-of-words.png')

In [None]:
# One shared layout for the header row and every data row of the table.
row_template = '{0:>7}  {1:<13}  {2:>2}'

# Un-normalized term-frequency bag for the example document.
bag = doc.as_bag_of_terms(weighting='tf', normalized=False)

print(row_template.format('term_id', 'term', 'tf'))
print('-' * 26)
# The bag is keyed by integer term id; look the string form up in the
# document's spaCy string store before printing.
for term_id, term_freq in bag.most_common(10):
    term_str = doc.spacy_stringstore[term_id]
    print(row_template.format(term_id, term_str, term_freq))

The plan:

- filter out short documents
- split each document into words and named entities
- weight by term frequency-inverse document frequency
- filter out most and least common terms

In [None]:
# Lazily represent each sufficiently long document (len > 200) as a list of
# its words and named entities; ngrams are left out.
# NOTE: because short documents are skipped here, the rows of the resulting
# matrix line up with the *filtered* documents, not with positions in `corpus`.
terms_lists = (
    long_doc.as_terms_list(words=True, ngrams=False, named_entities=True)
    for long_doc in corpus
    if len(long_doc) > 200
)

# Build the (# documents) x (# terms) matrix: tf-idf weighting, with the
# rarest and most common terms filtered out and the vocabulary size capped.
doc_term_matrix, vocab = corpus.as_doc_term_matrix(
    terms_lists,
    weighting='tfidf',
    normalize=True,
    smooth_idf=True,
    min_df=3,
    max_df=0.95,
    max_n_terms=10000,
)

In [None]:
# Preview only the first few rows in dense form. Calling .todense() on the
# whole matrix would allocate a full (# documents) x (# terms) array just to
# print a truncated view of it. .tocsr() guarantees row slicing is supported
# whatever sparse format the matrix is stored in.
print(doc_term_matrix.tocsr()[:5].todense())
# the bare expression shows the sparse matrix's own summary repr
doc_term_matrix

In [None]:
# Show what the first ten keys (0..9) of `vocab` map to.
print('vocabulary:\n')
for idx in range(10):
    entry = vocab[idx]
    print('{} => {}'.format(idx, entry))

### Step 3: Choose a topic model, and train it

available algorithms:
- Latent Semantic Analysis ('lsa')
- Latent Dirichlet Allocation ('lda')
- Non-negative Matrix Factorization ('nmf')

other params:
- number of topics
- [hyperparameters]

In [None]:
# which of the algorithms listed above to use ('lsa', 'lda' or 'nmf')
algorithm = 'nmf'

# build a 10-topic model and fit it on the doc-term matrix
# NOTE(review): no random seed is set anywhere in this notebook; if the chosen
# algorithm is stochastic the topics may differ between runs -- confirm
model = textacy.tm.TopicModel(algorithm, n_topics=10)
model.fit(doc_term_matrix)
# display the underlying fitted model object
model.model

### Step 4: Interpret the results

- Which terms are associated with topics?

In [None]:
# One line per topic: its index followed by its ten top terms.
topic_line = 'topic {}:   {}'
for topic_idx, top_terms in model.top_topic_terms(vocab, top_n=10):
    joined_terms = '   '.join(top_terms)
    print(topic_line.format(topic_idx, joined_terms))

^ **Do these topics make sense?**

In [None]:
# termite plot of the 25 selected terms against topics, with terms ranked by
# topic weight and ordered by seriation
# NOTE(review): topics=-1 presumably selects all topics -- confirm
model.termite_plot(doc_term_matrix, vocab, topics=-1,
                   n_terms=25, sort_terms_by='seriation', rank_terms_by='topic_weight',
                   highlight_topics=None)  # alternative value tried here: (6, 8)

- Which documents are associated with topics, and vice-versa?

In [None]:
# project the doc-term matrix through the fitted model
doc_topic_matrix = model.transform(doc_term_matrix)

# print the result followed by its shape
print(doc_topic_matrix)
print('\nrows x cols =', doc_topic_matrix.shape)

In [None]:
# Rows of doc_topic_matrix exist only for the documents that passed the
# `len(doc) > 200` filter applied when the doc-term matrix was built, so a row
# index is NOT a position in `corpus`. Rebuild the filtered list (same order,
# same threshold -- keep the two in sync) and index into that instead, so each
# printed title belongs to the document the row actually represents.
modeled_docs = [kept_doc for kept_doc in corpus if len(kept_doc) > 200]

# for every topic, print the titles of its three top documents
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=3):
    print('\ntopic {}:'.format(topic_idx))
    for j in top_docs:
        print(modeled_docs[j].metadata['title'])

In [None]:
# display the example document again (as assigned earlier in the notebook)
doc

In [None]:
# display the example document's text
doc.text

In [None]:
# Row indices of doc_topic_matrix refer to the documents that passed the
# `len(doc) > 200` filter used to build the doc-term matrix, not to positions
# in `corpus`. In particular row -1 is the last *modeled* document, which is
# only corpus[-1] if that document was long enough to be kept. Look titles up
# in the filtered list (same order, same threshold) so the title matches the row.
modeled_docs = [kept_doc for kept_doc in corpus if len(kept_doc) > 200]

# print the title of the last modeled document with its three top topics;
# `topics` is intentionally left bound for the following cell
for doc_idx, topics in model.top_doc_topics(doc_topic_matrix, docs=[-1], top_n=3):
    print('{}: {}'.format(modeled_docs[doc_idx].metadata['title'], topics))

In [None]:
# Top ten terms for just the topics held in `topics`, which is whatever the
# previous cell's loop left bound on its final iteration.
selected_topic_terms = model.top_topic_terms(vocab, topics=topics, top_n=10)
for topic_idx, top_terms in selected_topic_terms:
    joined_terms = '   '.join(top_terms)
    print('topic {}:   {}'.format(topic_idx, joined_terms))

- What are the proportions of topics in the corpus?

In [None]:
# Print each topic's weight, rounded to four decimal places.
weights = model.topic_weights(doc_topic_matrix)
for i, val in enumerate(weights):
    rounded = round(val, 4)
    print('topic {}: {}'.format(i, rounded))

In [None]:
# display the image stored at assets/hackerman.jpg
Image('assets/hackerman.jpg')

---

---

---

#### stream text, json, and `spaCy` binary data to/from disk

In [None]:
# Location of the January 2015 Reddit comments dump. Built from the current
# user's home directory rather than one specific machine's absolute path;
# adjust if the dataset lives somewhere else locally.
reddit_comments_path = os.path.expanduser(
    '~/Desktop/datasets/2015_reddit_comments_corpus/reddit_data/2015/RC_2015-01.bz2')
rr = textacy.corpora.RedditReader(reddit_comments_path)

# preview the first three comment texts from the reader
for i, text in enumerate(rr.texts(limit=3)):
    print(i, text)

In [None]:
# pass the first 100 comment texts from the reader to write_file_lines,
# targeting 'some_reddit_comments.txt' in the current working directory
textacy.fileio.write_file_lines(rr.texts(limit=100), 'some_reddit_comments.txt')

#### clean and normalize raw text

In [None]:
text = """
This first line is nice and clean.
But â€” not the 2nd &amp; 3rd!
Don’t worry,   go here: http://textacy.readthedocs.io/.
"""
# run the sample text above (mis-encoded characters, an HTML entity, irregular
# spacing and a URL) through preprocessing with the unicode-fixing,
# URL-handling and lowercasing options switched on, and print the result
print(textacy.preprocess_text(text, fix_unicode=True, no_urls=True, lowercase=True))

#### easy access to and filtering of linguistic elements

In [None]:
# Location of the English Wikipedia dump. Built from the current user's home
# directory rather than one specific machine's absolute path; adjust if the
# dataset lives somewhere else locally.
wiki_dump_path = os.path.expanduser(
    '~/Desktop/datasets/enwiki-latest-pages-articles.xml.bz2')
wiki_reader = textacy.corpora.WikiReader(wiki_dump_path)

# read only the first page's text from the dump
first_page_text = list(wiki_reader.texts(limit=1))[0]
# skip the first 10 characters of that text
# NOTE(review): presumably this trims a leading title/markup fragment -- confirm
text = first_page_text[10:]
doc = textacy.TextDoc(text)

In [None]:
# Show a small sample of each kind of linguistic element the doc exposes.
first_sentence = list(doc.sents)[0]
print('sentences:', first_sentence)

# first five words, stop words included
leading_words = list(doc.words(filter_stops=False))[:5]
print('words:', leading_words)

# first three 2-grams, with stop-word filtering enabled
leading_bigrams = list(doc.ngrams(2, filter_stops=True))[:3]
print('ngrams:', leading_bigrams)

# first three noun chunks
leading_noun_chunks = list(doc.noun_chunks())[:3]
print('noun chunks:', leading_noun_chunks)

#### unsupervised information extraction

In [None]:
# first three named entities, with 'PERSON' passed as the only good entity type
list(doc.named_entities(good_ne_types={'PERSON'}))[:3]

In [None]:
# keep only the acronym entries whose value is truthy
{k: v for k, v in doc.acronyms_and_definitions().items() if v}

In [None]:
# first direct quotation found in the document
# (raises IndexError if the document yields none)
list(doc.direct_quotations())[0]

In [None]:
# ask for 3 key terms using the 'textrank' algorithm
doc.key_terms(algorithm='textrank', n=3)

#### string, set, and document distance metrics

In [None]:
# two short single-sentence documents, compared by the cells that follow
doc1 = textacy.TextDoc('She spoke to the assembled journalists.')
doc2 = textacy.TextDoc('He chatted with the gathered press.')

In [None]:
# compare the two documents with textacy's `word_movers` metric
textacy.distance.word_movers(doc1, doc2)

In [None]:
# compare the two documents with textacy's `word2vec` metric
textacy.distance.word2vec(doc1, doc2)

In [None]:
# compare the two documents with textacy's `jaccard` metric, applied to the
# lemmas of each document's tokens
textacy.distance.jaccard([w.lemma_ for w in doc1], [w.lemma_ for w in doc2])

---

## Semantic Network Viz (in slides)

In [None]:
text = """
When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation.

We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation on such principles and organizing its powers in such form, as to them shall seem most likely to effect their Safety and Happiness. Prudence, indeed, will dictate that Governments long established should not be changed for light and transient causes; and accordingly all experience hath shewn, that mankind are more disposed to suffer, while evils are sufferable, than to right themselves by abolishing the forms to which they are accustomed. But when a long train of abuses and usurpations, pursuing invariably the same Object evinces a design to reduce them under absolute Despotism, it is their right, it is their duty, to throw off such Government, and to provide new Guards for their future security.--Such has been the patient sufferance of these Colonies; and such is now the necessity which constrains them to alter their former Systems of Government. The history of the present King of Great Britain is a history of repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a candid world.

He has refused his Assent to Laws, the most wholesome and necessary for the public good.
He has forbidden his Governors to pass Laws of immediate and pressing importance, unless suspended in their operation till his Assent should be obtained; and when so suspended, he has utterly neglected to attend to them.
He has refused to pass other Laws for the accommodation of large districts of people, unless those people would relinquish the right of Representation in the Legislature, a right inestimable to them and formidable to tyrants only. 
He has called together legislative bodies at places unusual, uncomfortable, and distant from the depository of their public Records, for the sole purpose of fatiguing them into compliance with his measures. 
He has dissolved Representative Houses repeatedly, for opposing with manly firmness his invasions on the rights of the people.
He has refused for a long time, after such dissolutions, to cause others to be elected; whereby the Legislative powers, incapable of Annihilation, have returned to the People at large for their exercise; the State remaining in the mean time exposed to all the dangers of invasion from without, and convulsions within.
He has endeavoured to prevent the population of these States; for that purpose obstructing the Laws for Naturalization of Foreigners; refusing to pass others to encourage their migrations hither, and raising the conditions of new Appropriations of Lands.
He has obstructed the Administration of Justice, by refusing his Assent to Laws for establishing Judiciary powers.
He has made Judges dependent on his Will alone, for the tenure of their offices, and the amount and payment of their salaries.
He has erected a multitude of New Offices, and sent hither swarms of Officers to harrass our people, and eat out their substance.
He has kept among us, in times of peace, Standing Armies without the Consent of our legislatures.
He has affected to render the Military independent of and superior to the Civil power.
He has combined with others to subject us to a jurisdiction foreign to our constitution, and unacknowledged by our laws; giving his Assent to their Acts of pretended Legislation:
For Quartering large bodies of armed troops among us:
For protecting them, by a mock Trial, from punishment for any Murders which they should commit on the Inhabitants of these States:
For cutting off our Trade with all parts of the world:
For imposing Taxes on us without our Consent: 
For depriving us in many cases, of the benefits of Trial by Jury:
For transporting us beyond Seas to be tried for pretended offences
For abolishing the free System of English Laws in a neighbouring Province, establishing therein an Arbitrary government, and enlarging its Boundaries so as to render it at once an example and fit instrument for introducing the same absolute rule into these Colonies:
For taking away our Charters, abolishing our most valuable Laws, and altering fundamentally the Forms of our Governments:
For suspending our own Legislatures, and declaring themselves invested with power to legislate for us in all cases whatsoever.
He has abdicated Government here, by declaring us out of his Protection and waging War against us.
He has plundered our seas, ravaged our Coasts, burnt our towns, and destroyed the lives of our people. 
He is at this time transporting large Armies of foreign Mercenaries to compleat the works of death, desolation and tyranny, already begun with circumstances of Cruelty & perfidy scarcely paralleled in the most barbarous ages, and totally unworthy the Head of a civilized nation.
He has constrained our fellow Citizens taken Captive on the high Seas to bear Arms against their Country, to become the executioners of their friends and Brethren, or to fall themselves by their Hands. 
He has excited domestic insurrections amongst us, and has endeavoured to bring on the inhabitants of our frontiers, the merciless Indian Savages, whose known rule of warfare, is an undistinguished destruction of all ages, sexes and conditions.

In every stage of these Oppressions We have Petitioned for Redress in the most humble terms: Our repeated Petitions have been answered only by repeated injury. A Prince whose character is thus marked by every act which may define a Tyrant, is unfit to be the ruler of a free people.

Nor have We been wanting in attentions to our Brittish brethren. We have warned them from time to time of attempts by their legislature to extend an unwarrantable jurisdiction over us. We have reminded them of the circumstances of our emigration and settlement here. We have appealed to their native justice and magnanimity, and we have conjured them by the ties of our common kindred to disavow these usurpations, which, would inevitably interrupt our connections and correspondence. They too have been deaf to the voice of justice and of consanguinity. We must, therefore, acquiesce in the necessity, which denounces our Separation, and hold them, as we hold the rest of mankind, Enemies in War, in Peace Friends.

We, therefore, the Representatives of the united States of America, in General Congress, Assembled, appealing to the Supreme Judge of the world for the rectitude of our intentions, do, in the Name, and by Authority of the good People of these Colonies, solemnly publish and declare, That these United Colonies are, and of Right ought to be Free and Independent States; that they are Absolved from all Allegiance to the British Crown, and that all political connection between them and the State of Great Britain, is and ought to be totally dissolved; and that as Free and Independent States, they have full Power to levy War, conclude Peace, contract Alliances, establish Commerce, and to do all other Acts and Things which Independent States may of right do. And for the support of this Declaration, with a firm reliance on the protection of divine Providence, we mutually pledge to each other our Lives, our Fortunes and our sacred Honor.
"""

# preprocess the text above with the unicode-fixing and accent-handling options on
cleaned_text = textacy.preprocess_text(text, fix_unicode=True, no_accents=True)
# NOTE: this rebinds `doc`, replacing the document used in earlier sections
doc = textacy.TextDoc(cleaned_text, lang='en')
print(doc)

In [None]:
# Build a term network for the document: edges weighted by co-occurrence
# frequency, using a window width of 10.
graph = doc.as_semantic_network(nodes='terms', edge_weighting='cooc_freq', window_width=10)

# Normalized strings of the words returned when 'NOUN' is passed as a bad POS
# tag; these are the nodes to prune from the network.
# NOTE(review): presumably this yields every word that is not a noun, so only
# noun nodes remain afterwards -- confirm
drop_nodes = [textacy.spacy_utils.normalized_str(tok)
              for tok in doc.words(bad_pos_tags={'NOUN'})]

# remove_nodes_from silently skips nodes that are not in the graph (including
# repeats in drop_nodes), so no blanket try/except is needed and genuine
# errors are no longer swallowed.
graph.remove_nodes_from(drop_nodes)

In [None]:
# weight each remaining node by its PageRank score in the term network
# NOTE(review): `pagerank_scipy` is believed to have been deprecated in
# networkx 2.6 and removed in 3.0 in favour of `nx.pagerank` -- confirm against
# the installed networkx version
node_weights = nx.pagerank_scipy(graph)
# draw the network using those node weights; the returned object is kept in
# `ax` for the (commented-out) save step in the next cell
ax = textacy.viz.network.draw_semantic_network(graph, node_weights=node_weights, spread=2.5)

In [None]:
# Optional export, left disabled: uncomment to save the drawn network as a
# 300-dpi PNG in the current working directory.
# fig = ax.get_figure()
# fig.savefig('declaration-of-independence-as-network.png',
#             dpi=300, bbox_inches='tight')