In [None]:
import functools
import re
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
%matplotlib notebook

from IPython.display import display, display_markdown
# NOTE(review): IPython.html is a deprecated location for the widgets package;
# kept because later cells reference `widgets` - confirm it still imports.
from IPython.html import widgets
from ipywidgets import *

from util import *
from text import *

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import make_pipeline

# Output formatting: numpy printing uses a precision of 4, and pandas renders
# floats through the '%.4f' format.
np.set_printoptions(precision=4)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [None]:
# Fetch the NLTK resources named "punkt" and "stopwords".
# NOTE(review): presumably these back the sentence/word tokenizers and the
# stopword removal used in later cells - confirm against util/text helpers.
nltk.download("punkt")
nltk.download("stopwords")

### Breaking a document into sentences

In [None]:
# Text clean-up pipeline: `chain` (presumably provided by the star imports) is
# partially applied to the list of normalization steps, leaving a callable that
# takes the raw document text.
normalize_text = functools.partial(
    chain,
    [
        normalize_symbols,
        remove_diacritics,
        remove_headers,
        remove_emphasis,
    ],
)

# NOTE(review): the variable name mentions 500px, but the file read here is the
# Amazon privacy notice - consider renaming once all uses are known.
tos_500px = normalize_text(read_all('data/amazon_privacy_notice.md'))

# Split the cleaned document into sentences and keep them in a Series.
sents = pd.Series(nltk.sent_tokenize(tos_500px))

display_markdown('#### Examples of sentences:', raw=True)
# Render the sentences at positions 80..99, each followed by a separator line.
for sent in sents.iloc[80:100]:
    display_markdown(sent, raw=True)
    print('---')

### Distribution of sentence lengths

In [None]:
# Tokenize every sentence with `word_tokenize`, then count tokens per sentence.
tokenized_sents = sents.apply(word_tokenize)
sent_lengths = tokenized_sents.apply(len)
# Histogram of the per-sentence token counts using 20 bins; display() is handed
# whatever hist() returns.
display(sent_lengths.hist(bins=20))

### Example of tokenized sentence

In [None]:
# Show sentence 10 as-is, then the same sentence with its tokens joined by '|'
# so the token boundaries are visible.
display(sents[10])
print('---')
print('|'.join(tokenized_sents[10]))

### Examples of small and large sentences

In [None]:
# Partition the sentences by `sent_lengths`: below 10 versus above 100.
small_sents = sents[sent_lengths < 10]
large_sents = sents[sent_lengths > 100]

# Show the first two sentences of each group under its own heading.
for heading, examples in (
    ('#### Small sentences:', small_sents),
    ('#### Large sentences:', large_sents),
):
    display_markdown(heading, raw=True)
    for sent in examples.head(2):
        display_markdown(sent, raw=True)
        print('---')

### Transforming tokens
Removing numbers, converting tokens to lowercase, and removing stopwords.

In [None]:
def compare(seq1, seq2, title1, title2):
    """Build a two-column DataFrame showing two sequences side by side.

    Each sequence is wrapped in a ``pd.Series`` before the frame is built, so
    sequences of different lengths can share one frame; the shorter column is
    padded with NaN.

    Parameters
    ----------
    seq1, seq2 : sequence
        The values to place in the first and second column.
    title1, title2 : str
        Column labels for ``seq1`` and ``seq2`` respectively.

    Returns
    -------
    pandas.DataFrame
        One column per title, indexed by position in the sequences.
    """
    # Built directly instead of looping over dict.iteritems(), which only
    # exists on Python 2 and raises AttributeError on Python 3.
    return pd.DataFrame({title1: pd.Series(seq1), title2: pd.Series(seq2)})


# Token clean-up pipeline: `chain` partially applied to the listed steps
# (number removal, lowercasing, stopword removal, then `list`).
process_tokens = functools.partial(chain, [remove_numbers, lowercase, remove_stopwords, list])
processed_sents = tokenized_sents.apply(process_tokens)
# NOTE(review): this rebinds `sent_lengths` (previously the raw token counts) to
# the post-cleanup counts; cells that filter on `sent_lengths` give different
# results depending on whether this cell has already run.
sent_lengths = processed_sents.apply(len)

# Side-by-side view of sentence 10 before and after token clean-up.
display(compare(tokenized_sents[10], processed_sents[10], 'Before', 'After'))

### Using LSA (TruncatedSVD) for Topic Detection

In [None]:
def tokenizer(text):
    """Tokenize ``text`` with ``word_tokenize`` and run the result through ``process_tokens``.

    Returns whatever ``process_tokens`` returns for the token sequence.
    """
    raw_tokens = word_tokenize(text)
    return process_tokens(raw_tokens)

# TF-IDF representation of the sentences, built on the notebook's own
# `tokenizer`. TfidfVectorizer comes from sklearn.feature_extraction.text and
# must be present in the notebook's import cell, otherwise this cell raises
# NameError.
# lowercase=False: presumably because the token pipeline already lowercases.
vectorizer = TfidfVectorizer(
    lowercase=False,
    tokenizer=tokenizer,
    ngram_range=(1, 1),
    min_df=1,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Only sentences whose `sent_lengths` entry exceeds 5 are vectorized.
# NOTE(review): `sent_lengths` is rebound by the token clean-up cell, so this
# filter depends on that cell having run first.
vectorized_sents = vectorizer.fit_transform(sents[sent_lengths > 5])
print("Number of sentences after cleanup: {}".format(vectorized_sents.shape[0]))

In [None]:
def plot_explained_variance(svd):
    """Plot cumulative explained-variance ratio against the number of components.

    ``svd`` must expose ``explained_variance_ratio_`` and ``n_components``.
    The curve is drawn on the current pyplot figure; nothing is returned and
    ``plt.show()`` is left to the caller.
    """
    running_total = np.cumsum(svd.explained_variance_ratio_)
    component_counts = range(1, svd.n_components + 1)

    plt.plot(component_counts, running_total)
    plt.xlabel("# of Components")
    plt.ylabel("Total Explained Variance")
    plt.grid()

# Exploratory fit with 100 components, used only to see how the cumulative
# explained variance grows with the number of components.
# random_state pins the randomized SVD solver so that re-running the notebook
# reproduces the same curve.
# NOTE(review): the original trailing comment read "120 to 200", which does not
# match the 100 components used here - confirm the intended range.
lsa = TruncatedSVD(n_components=100, random_state=0).fit(vectorized_sents)
plot_explained_variance(lsa)
plt.show()

In [None]:
# Final LSA model: project the vectorized sentences onto 60 components.
# random_state pins the randomized SVD solver so the components (and the term
# rankings inspected below) are the same on every run.
# NOTE(review): the original trailing comment read "120 to 200", which does not
# match the 60 components used here - confirm the intended range.
lsa = TruncatedSVD(n_components=60, random_state=0)
lsa_sents = lsa.fit_transform(vectorized_sents)

def inspect_component(component, order):
    """Return the 15 terms at one end of an LSA component's weight ranking.

    Reads the notebook-level ``lsa`` (fitted decomposition) and ``vectorizer``
    (fitted vectorizer) objects.

    Parameters
    ----------
    component : int
        Row index into ``lsa.components_``.
    order : str
        ``'Desc'`` ranks terms from highest to lowest weight; any other value
        (the widget offers ``'Asc'``) ranks from lowest to highest.

    Returns
    -------
    pandas.DataFrame
        Columns ``Term`` and ``Weight`` with at most 15 rows.
    """
    weights = lsa.components_[component]

    # Newer scikit-learn exposes get_feature_names_out() and has removed
    # get_feature_names(); fall back to the old name for older installs.
    list_terms = getattr(vectorizer, 'get_feature_names_out', None)
    if list_terms is None:
        list_terms = vectorizer.get_feature_names

    ranked = sorted(
        zip(list_terms(), weights),
        key=lambda pair: pair[1],
        reverse=(order == 'Desc'),
    )
    return pd.DataFrame(ranked[:15], columns=['Term', 'Weight'])

# Controls for the inspector: which LSA component to look at, and whether its
# terms are ranked ascending or descending by weight.
dropdown_component = widgets.Dropdown(
    options=range(lsa.n_components),
    value=0,
    description='Comp',
)
select_order = widgets.Select(
    options=['Asc', 'Desc'],
    value='Desc',
    description='Order',
)

# Left as the cell's last expression so the notebook renders the interactive view.
widgets.interactive(
    inspect_component,
    component=dropdown_component,
    order=select_order,
)