In [None]:
# %load utils/imports.py
import pandas as pd
import cufflinks as cf
cf.go_offline()

import utils.styles

# The Shape of Things to Come

## Case Study : Word clouds for Hong Kong News Media

#### Preamble - Sanity Check

In [None]:
# Install (or upgrade) the word-cloud renderer and the HTML-to-text converter imported further down
!pip install --upgrade wordcloud html2text

In [None]:
# Register the spacy conda channel, then install spaCy and Pillow without prompting (-y)
!conda config --add channels spacy
!conda install -y spacy
!conda install -y pillow

In [None]:
%%bash
# IMPORTANT : RUN THIS IN A JUPYTER TERMINAL, MAY TAKE 45 MINUTES TO COMPLETE
# (the %%bash cell magic has to be the very first line of the cell, so this note lives below it)
python -m spacy.en.download

In [None]:
# Check whether the install was successful by loading the English model
from sputnik.package_list import PackageNotFoundException

try:
    import spacy
    spacy.load('en')
except (PackageNotFoundException, RuntimeError):
    # Both failure modes get the same advice
    print('Not OK - Please run "python -m spacy.en.download"')
else:
    print('OK')

In [None]:
# Import necessary libraries
import re
import random
import numpy as np
from os import path
from PIL import Image
from html2text import html2text 
from wordcloud import WordCloud, STOPWORDS
from spacy.en import English
from spacy.attrs import ORTH

#### Scraping

Now that our dependencies are installed, let's scrape the Hong Kong sections of HKFP (Hong Kong Free Press) and SCMP (South China Morning Post).

In [None]:
# Download both section pages; `-O -` makes wget write the page to stdout,
# which the shell redirect then saves under assets/
!wget -O - 'http://www.scmp.com/news/hong-kong' > assets/scmp.html
!wget -O - 'https://www.hongkongfp.com/hong-kong-news/' > assets/hkfp.html

#### NLP

In [None]:
def clean_html(html):
    """
    Copied from NLTK package.
    Remove HTML markup from the given string.

    Script/style elements and HTML comments are dropped entirely, every other
    tag is replaced by a single space, ``&nbsp;`` entities become spaces and
    runs of spaces are collapsed to one.

    :param html: the HTML string to be cleaned
    :type html: str
    :return: the text content with leading/trailing whitespace removed
    :rtype: str
    """

    # First we remove inline JavaScript/CSS (case-insensitive, spanning lines):
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    # Collapse every run of spaces in one pass; replacing pairs twice still
    # leaves double spaces behind for runs of five or more.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()

In [None]:
def html_to_word_count(html_asset):
    """
    Count how often each word occurs in a downloaded HTML page.

    The page is read from ``assets/<html_asset>.html``, stripped of markup with
    ``clean_html`` and ``html2text``, and tokenized with ``English(parser=False)``.
    Only purely alphabetic tokens whose lower-cased form is not a stopword are kept.

    :param html_asset: base name (without ``.html``) of the file in ``assets/``
    :type html_asset: str
    :return: ``(word, count)`` tuples; words keep the casing they had in the text
    :rtype: list
    """

    # Read the whole text; the context manager closes the file handle again.
    with open("assets/{}.html".format(html_asset), 'r') as page_file:
        html = page_file.read()

    # Extract and clean the data
    page = clean_html(html)
    text = html2text(page)

    # Tokenize
    nlp = English(parser=False)
    tokens = nlp(text)

    # Define Stopwords: the wordcloud defaults plus "said"
    sw = set(STOPWORDS)
    sw.add("said")

    # Count the words; count_by(ORTH) is keyed by string id, so each id is
    # mapped back to its text through the vocabulary before filtering.
    word_freq = []
    counts = tokens.count_by(ORTH)

    for word_id, count in counts.items():
        token = nlp.vocab.strings[word_id]
        if token.isalpha() and token.lower() not in sw:
            word_freq.append((token, count))

    return word_freq

In [None]:
# Count the word frequencies of the HKFP page and preview the first five (word, count) pairs
hkfp_count = html_to_word_count('hkfp')
hkfp_count[:5]

In [None]:
# Count the word frequencies of the SCMP page and preview the first five (word, count) pairs
scmp_count = html_to_word_count('scmp')
scmp_count[:5]

In [None]:
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """
    Colour callback that ignores its arguments and returns a random light grey.

    :return: an ``hsl(...)`` string with zero saturation and a lightness
             drawn uniformly from 60..100 (inclusive)
    :rtype: str
    """
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness


def hk_map_word_cloud(word_freq, asset_name):
    """
    Build a grey word cloud masked by ``assets/flag.jpg`` and overlay it on that image.

    The bare cloud is also written to ``assets/<asset_name>_word_cloud.png``.

    :param word_freq: word frequencies handed to ``WordCloud.generate_from_frequencies``
    :param asset_name: prefix of the PNG file written to ``assets/``
    :type asset_name: str
    :return: the flag image with the word cloud pasted on top
    """
    # read the mask image
    flag = Image.open("assets/flag.jpg")
    mask = np.array(flag)

    wc = WordCloud(background_color=None,
                   mask=mask,
                   color_func=grey_color_func,
                   mode="RGBA",
                   min_font_size=20)

    # generate word cloud
    wc.generate_from_frequencies(word_freq)

    # merge the two images: render the cloud once and use it both as the
    # pasted picture and as its own transparency mask
    cloud = wc.to_image()
    flag.paste(cloud, (0, 0), cloud)

    # store to file
    wc.to_file("assets/{}_word_cloud.png".format(asset_name))

    return flag

#### Hong Kong Free Press

In [None]:
# Draw the HKFP word cloud; the returned image is shown as the cell output
hk_map_word_cloud(hkfp_count, 'hkfp')

#### South China Morning Post

In [None]:
# Draw the SCMP word cloud; the returned image is shown as the cell output
hk_map_word_cloud(scmp_count, 'scmp')

### Let's look at the ratios

In [None]:
import pandas as pd
# Turn each list of (word, count) pairs into a frame indexed by the word (column 0);
# the counts remain in column 1
fp = pd.DataFrame(hkfp_count).set_index(0)
mp = pd.DataFrame(scmp_count).set_index(0)

In [None]:
def hk_word_ratio_cloud(f1,f2, asset_name):
    """
    Draw a word cloud of the words that are relatively more frequent in ``f1`` than in ``f2``.

    Both frames are rescaled between their own minimum and maximum, divided
    element-wise, and only ratios above 1 that are finite and non-null are kept.

    :param f1: frequency frame whose over-represented words are drawn (counts in column 1)
    :param f2: frequency frame used as the baseline (counts in column 1)
    :param asset_name: output prefix forwarded to ``hk_map_word_cloud``
    :return: whatever ``hk_map_word_cloud`` returns for the selected words
    """
    def _rescale(freq):
        # Same expression for both inputs: 1 - (x - max) / (min - max)
        lowest, highest = freq.min(), freq.max()
        return 1 - (freq - highest) / (lowest - highest)

    ratio = _rescale(f1) / _rescale(f2)

    # Mask everything that is not above 1, then turn infinities into NaN
    ratio = ratio[ratio > 1].replace([np.inf, -np.inf], np.nan)

    # Keep rows with a usable ratio and flatten them to [word, ratio] rows
    usable = ratio[ratio[1].notnull()]
    return hk_map_word_cloud(usable.reset_index().values, asset_name)

#### Hong Kong Free Press

In [None]:
# Words over-represented in HKFP relative to SCMP
hk_word_ratio_cloud(fp, mp,'xfp')

#### South China Morning Post

In [None]:
# Words over-represented in SCMP relative to HKFP
hk_word_ratio_cloud(mp, fp, 'xmp')

## Case Study : Ping Times for Web Service

In [None]:
%matplotlib notebook

import pandas as pd
import matplotlib.pyplot as plt

from ipywidgets import *
from IPython.display import display

plt.style.use('ggplot')

Why might we want to add interactivity to a notebook? One reason is to use more powerful tools to address traditional business intelligence use cases. Traditional BI tools work great if you are building a dashboard on top of SQL, but if you want to visualize information that is generated by some more sophisticated logic, they typically fall short.

With interactive widgets in a Notebook, you can use the full power of Python to express calculations and generate visualization — while exposing “knobs and dials” to an end user so they can control aspects of the visualization. In this sense, you can use Notebooks as lightweight “apps” for anyone.

### Intro to ipywidgets

Functionality for adding widgets resides in the `ipywidgets` package, so we’ll want to start out by importing that:

In [None]:
from ipywidgets import widgets

Once you’ve imported that, there are various types of UI elements you can add. You can think of a widget as having two parts:

1. The `UI/HTML` element that renders in the output cell (e.g., a textbox)
1. An `event handler` that lets you specify what should happen when the value changes. In most cases, you’ll want to define a Python function that gets called when the user changes the input, so you can update other elements of your notebook (e.g., visualizations) accordingly.

### Basic types of widgets

**Text input** - You can create a text input field with `widgets.Text()`. Its `.on_submit()` method registers a function that is called whenever the user submits the field.

In [None]:
# Create a text box and render it in the cell output
text = widgets.Text()
display(text)

def handle_submit(sender):
    """Print the current value of the global ``text`` widget; ``sender`` is not used."""
    print(text.value)
    
# Call handle_submit whenever the text box is submitted
text.on_submit(handle_submit)

**Buttons** - The button widget works similarly to the text input one.

In [None]:
# Create a button and render it in the cell output
button = widgets.Button(description="Don't Touch Me!")
display(button)

def on_click_handler(b):
    """Print a fixed message each time the button is clicked; ``b`` is not used."""
    print("Oh no you didn't!")
    
# Register the click callback; the trailing semicolon suppresses the cell's output value
button.on_click(on_click_handler);

**Interact**: Apart from the default widgets there is also “interact” which automatically generates a widget based on the arguments that you use.

In [None]:
def f(x):
    """Print the value passed in by the generated widget."""
    print(x)

# Let interact build a widget for the argument x, starting from the value 10
interact(f,x=10);

The first argument is the function that handles the selected value of the second argument. The type of the second argument decides the form of the interaction. As you can see, an integer results in a slider. Passing a boolean (`interact(f, x=True)`) creates a checkbox.

You can store widgets in variables in your notebook just like any other type of value. This lets you bind the input of one widget to the value of another — possibly with some sort of calculation/manipulation in the middle. As a simple example:

In [None]:
# Text box that mirrors the current selection
select = widgets.Text()

def update_selected(sender):
    """Copy the current value of ``watchlist`` into the ``select`` text box."""
    # `watchlist` is a global looked up when the handler runs, not when it is defined
    select.value = watchlist.value

# Last expression of the cell: render the text box
select

In [None]:
# Selector with one toggle button per option
watchlist = widgets.ToggleButtons(
    description='Movie Night :',
    options=['Spotlight','Godzilla'],
)
# Run update_selected whenever the widget reports a change
watchlist.observe(update_selected)
# Last expression of the cell: render the selector
watchlist

In [None]:
# Input box whose submitted text becomes a new watchlist option
text = widgets.Text()

def append_watchlist(sender):
    """Add the submitted text as a new option of ``watchlist``; ``sender`` is not used."""
    # Assign a freshly built list instead of using `+=`: this works whether the
    # widget hands its options back as a list or as a tuple (tuple += list
    # raises TypeError).
    watchlist.options = list(watchlist.options) + [text.value]

text.on_submit(append_watchlist)
text

We create three widgets: an input, a selector, and an output. When a value is submitted in the input widget, we append it to the options of the selector widget; once an option is selected, we update the value of the output. You can create much more sophisticated interactions this way.

### Interactive visualizations

The power of widgets comes from the fact that you can connect your own Python functions to run when a user changes the input’s value. Among other things, that lets you make visualizations that respond dynamically to changes in the user’s input. For example:

In [None]:
# Sample points from 0.0 up to (but excluding) 1.0 in steps of 0.01
t = np.arange(0.0,1.0,0.01)

def plot_sin(f):
    """Plot sin(2*pi*t*f) for the global sample points ``t``."""
    # Pair each t with its sine value, then plot only the sine column (column 1)
    pd.DataFrame(list(zip(t, np.sin(2*np.pi*t*f))))[1].iplot()
    
# Build a widget for f from the tuple (1, 10, 0.1) and re-plot whenever it changes
interact(plot_sin, f=(1,10,0.1));

This core flexibility unlocks tremendous potential for using notebooks as dashboards. For example, you can expose widgets to filter, group, or sort data; your Python code can then query data sources, calculate derived data, use pandas and other great packages to do in-memory manipulation — and then render results using any number of great Python visualization packages. Start with the [tutorial](http://nbviewer.jupyter.org/github/quantopian/ipython/blob/master/examples/Interactive%20Widgets/Index.ipynb) if you're interested in learning more.

### Putting it into Action

To wrap up, let's combine the concepts from the last notebooks (magics, data pipelines) with the interactive widgets described above. The result is a mini “app” in a notebook: a user can provide a domain name, and the notebook will ping the domain and plot response times on a graph.

In [None]:
%%capture

import shlex

from IPython.display import Javascript
from IPython.display import display_javascript

NUMBER_OF_PINGS = 4

# displaying the text widget
domain_field = widgets.Text(description="Domain", width=800)
display(domain_field)

# preparing the plot: one row per ping iteration, one column per pinged domain
x = range(1, NUMBER_OF_PINGS + 1)

data = pd.DataFrame(index=x)

xTitle = 'Iterations'
yTitle = 'ms'

# preparing a container to put in created checkbox per domain
checkboxes = []
cb_container = widgets.HBox(width=800)
display(cb_container)

# add button that updates the graph based on the checkboxes
button = widgets.Button(description="Graph")

# function to deal with the added domain name
def handle_submit(sender):
    """
    Ping the submitted domain and store its response times in ``data``.

    On success the timings become the column ``data[domain]`` and, the first
    time a domain is seen, a ticked checkbox for it is added to
    ``cb_container``. The "Graph" button is displayed once the first checkbox
    exists. ``sender`` is not used.
    """
    domain = domain_field.value.strip()

    # Refuse empty input and anything ping would read as a command-line option.
    if not domain or domain.startswith('-'):
        print("Domain gave errors on pinging")
        return

    # a part of the magic inside python : pinging
    # The value is typed by the user and ends up on a shell command line, so it
    # is quoted to reach ping as one literal argument.
    quoted_domain = shlex.quote(domain)
    res = !ping -c {NUMBER_OF_PINGS} {quoted_domain}

    # Keep the reply lines, take their second-to-last field and strip "time="
    hits = res.grep('64 bytes').fields(-2).s.replace("time=","").split()
    if len(hits) != NUMBER_OF_PINGS:
        print("Domain gave errors on pinging")
        return

    # rebuild plot data based on ping result
    data[domain] = [float(hit) for hit in hits]

    # add a new checkbox for the new domain; a domain that is pinged again only
    # refreshes its column, otherwise it would be listed (and plotted) twice
    if all(cb.description != domain for cb in checkboxes):
        checkboxes.append(widgets.Checkbox(description = domain, value=True, width=90))
        cb_container.children = list(checkboxes)
        if len(checkboxes) == 1:
            display(button)

# function to deal with the checkbox update button
def on_button_clicked(b):
    """Plot the ping times of every ticked domain; ``b`` is not used."""
    title = 'Ping Times for Web Services'
    include_sites = [c.description for c in cb_container.children if c.value]
    display(data[include_sites].iplot(title=title,xTitle=xTitle,yTitle=yTitle));
    # Remove the first output area that contains a plotly chart
    # (presumably the previously drawn one - verify in the target front end)
    display_javascript(Javascript("""$('.output_area').has('.plotly').first().remove()"""))

button.on_click(on_button_clicked)
domain_field.on_submit(handle_submit)