In [1]:
import pandas as pd

from clustergrammer2 import net

import ipywidgets as widgets
import numpy as np
from bqplot import pyplot as plt
import bqplot
from ipywidgets import HBox

from copy import deepcopy
from glob import glob
from scipy.spatial.distance import pdist, squareform
from scipy.spatial import Voronoi

import warnings
# warnings.filterwarnings('ignore')
from IPython.display import display, Markdown

In [2]:
# Import from the public ``IPython.display`` module: importing ``display`` from
# ``IPython.core.display`` is deprecated and emits a DeprecationWarning.
from IPython.display import HTML, display
# Inject CSS that sets elements with class ``container`` to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
from ast import literal_eval as make_tuple

In [4]:
# Render the notebook title as a Markdown heading in the cell output.
display(Markdown('# CITI Bike Clustergrammer2 Visualization'))

# CITI Bike Clustergrammer2 Visualization

In [5]:
def path_in_out(inst_marker, inst_direction):
    """Recolor the station scatter to show ride flow into or out of one station.

    Reads the module-level ``df_gex`` matrix and updates the module-level
    ``fig`` title and the ``scatter`` opacities/colors in place.

    The selected station is drawn black at full opacity. Every other station
    gets an opacity equal to the magnitude of its normalized value (clipped to
    [0, 1]) and is colored red for values >= 0 and blue otherwise.

    Parameters
    ----------
    inst_marker : str
        Station label. Looked up as a row label of ``df_gex`` when
        ``inst_direction`` is ``'in'``, as a column label otherwise.
    inst_direction : str
        ``'in'`` selects the row ``df_gex.loc[inst_marker]``; any other value
        selects the column ``df_gex[inst_marker]``.

    Raises
    ------
    KeyError
        If ``inst_marker`` is not found in ``df_gex``, or a column label of
        ``df_gex`` is missing from the selected series.
    """

    # paths in: the station's row of df_gex
    if inst_direction == 'in':
        ser_opacity = df_gex.loc[inst_marker]
        fig.title = 'Inbound to Station: ' + inst_marker

    # paths out: the station's column of df_gex
    else:
        ser_opacity = df_gex[inst_marker]
        fig.title = 'Outbound of Station: ' + inst_marker

    # Normalize by |max|. If that is zero or not finite (e.g. every value is
    # <= 0 with max 0, or all NaN), fall back to the largest magnitude, and
    # skip scaling entirely when no usable scale exists, so we never divide
    # by zero.
    scale = float(np.abs(ser_opacity.max()))
    if not np.isfinite(scale) or scale == 0:
        scale = float(ser_opacity.abs().max())
    if np.isfinite(scale) and scale > 0:
        ser_opacity = ser_opacity / scale

    # look up opacities and colors, one entry per df_gex column
    list_opacities = []
    list_marker_colors = []

    for inst_name in df_gex.columns.tolist():

        inst_opacity = ser_opacity[inst_name]

        # the selected station itself is always fully opaque and black
        if inst_name == inst_marker:
            list_opacities.append(1.0)
            list_marker_colors.append('black')
            continue

        # An opacity must lie in [0, 1]: treat NaN as fully transparent and
        # clip magnitudes above 1 (possible when |min| exceeds |max|).
        magnitude = float(np.abs(inst_opacity))
        if np.isnan(magnitude):
            magnitude = 0.0
        list_opacities.append(min(magnitude, 1.0))

        list_marker_colors.append('red' if inst_opacity >= 0 else 'blue')

    scatter.default_opacities = list_opacities
    scatter.colors = list_marker_colors


In [6]:
# Load the per-station metadata table; the first CSV column becomes the index.
# Later cells read its 'start station longitude', 'start station latitude',
# 'Neighborhood', 'umap-x' and 'umap-y' columns.
df_meta = pd.read_csv('data/processed/df_meta_v2.csv', index_col=0)

In [7]:
# Load the pre-calculated visualization JSON into a dict and store it on
# ``net.viz``; the next cell reads the column category colors from it.
net.viz = net.load_json_to_dict('data/processed/pre-calc-viz.json')

In [8]:
# Colors of the first column category ('cat-0') as stored in the viz JSON.
# Keys are split on ': ' below, so they presumably look like
# '<category title>: <category value>' — verify against the JSON.
ini_cat_colors = net.viz['cat_colors']['col']['cat-0']

# Re-key the colors by the category value alone. ``maxsplit=1`` keeps values
# that themselves contain ': ' intact instead of truncating them.
cat_colors = {
    inst_key.split(': ', 1)[1]: inst_color
    for inst_key, inst_color in ini_cat_colors.items()
}

In [9]:
# Scale and offsets that map mean-centered longitude/latitude (degrees) into
# the plot coordinate system used by the scatter figure.
PLOT_SCALE = 25000
PLOT_X_OFFSET = 1800
PLOT_Y_OFFSET = 2000

mean_long = df_meta['start station longitude'].mean()
mean_lat = df_meta['start station latitude'].mean()

# Vectorized column arithmetic: same values as a per-element ``apply`` without
# calling a Python lambda once per row.
df_meta['plot-x'] = PLOT_SCALE * (df_meta['start station longitude'] - mean_long) + PLOT_X_OFFSET
df_meta['plot-y'] = PLOT_SCALE * (df_meta['start station latitude'] - mean_lat) + PLOT_Y_OFFSET


In [10]:
def _first_label_element(raw_label):
    """Parse a label with ``make_tuple`` and return its first element."""
    return make_tuple(raw_label)[0]


# Station-by-station matrix; the first CSV column becomes the index.
df_gex = pd.read_csv('data/processed/citibike.csv', index_col=0)

# Row and column labels are stored as Python-literal strings (presumably
# tuples of the station name plus categories); keep only the first element.
row_labels = df_gex.index.tolist()
df_gex.index = list(map(_first_label_element, row_labels))

col_labels = df_gex.columns.tolist()
df_gex.columns = list(map(_first_label_element, col_labels))

In [11]:
# Copy the index (presumably the station labels) into a regular 'station' column.
df_meta['station'] = df_meta.index

In [12]:
# One display name per df_meta row, formed as '<station>-<index label>'.
dot_names = [
    str(station) + '-' + str(row_label)
    for station, row_label in zip(df_meta['station'].tolist(), df_meta.index.tolist())
]

In [13]:
def scatter_observe(scatter, hover_data):
    """Handle a click on a station dot by toggling its in/out ride-flow view.

    Resets the mark's colors to ``cell_type_colors``, resolves the clicked
    station from the dot name, then calls ``path_in_out`` with the direction
    opposite to the one currently named in ``fig.title`` ('out' when the title
    names neither direction).

    Parameters
    ----------
    scatter : bqplot mark
        The mark that fired the event (shadows the module-level ``scatter``).
    hover_data : dict
        Event payload; ``hover_data['data']['name']`` must hold the dot name.
    """
    scatter.colors = cell_type_colors

    # Dot names are built as '<station>-<index label>'. A station label may
    # itself contain '-', so use the shortest '-'-delimited prefix that is a
    # known df_gex column, falling back to the text before the first '-'.
    pieces = hover_data['data']['name'].split('-')
    inst_name = pieces[0]
    for n_pieces in range(1, len(pieces)):
        candidate = '-'.join(pieces[:n_pieces])
        if candidate in df_gex.columns:
            inst_name = candidate
            break

    try:
        # Toggle: an 'Outbound' title switches to inbound; an 'Inbound' title
        # or the initial title switches to outbound.
        if 'Inbound' not in fig.title and 'Outbound' in fig.title:
            inst_direction = 'in'
        else:
            inst_direction = 'out'
        path_in_out(inst_name, inst_direction)
    except Exception as err:
        # Keep the widget responsive on bad input, but report the failure
        # instead of silently discarding it.
        warnings.warn('scatter_observe could not update station {!r}: {!r}'.format(inst_name, err))

In [14]:
# Base extents of the plot; the axis limits set below span 0 .. 2 * dim.
x_dim = 2000
y_dim = 2000

# Create the bqplot figure; the pyplot calls below (scatter, xlim, ylim) act on it.
# NOTE(review): animation_duration is presumably in milliseconds — confirm against the bqplot docs.
fig = plt.figure(animation_duration=2000)

# will be re-used to reset cell type colors
# One color per df_gex column, looked up through the station's 'Neighborhood' in df_meta.
# NOTE(review): colors follow df_gex column order while x/y below follow df_meta row
# order; this assumes both orders match — TODO confirm.
cell_type_colors = [ cat_colors[df_meta.loc[x, 'Neighborhood']] for x in df_gex.columns.tolist()]

# Tooltip showing the 'name' field; currently unused because the tooltip=
# argument in the scatter call below is commented out.
def_tt = bqplot.Tooltip(fields=['name'], formats=[''])
# def_tt.opacity = 0.5

# Station scatter mark; the names are read back in scatter_observe via
# hover_data['data']['name'].
scatter = plt.scatter(df_meta['plot-x'], 
                      df_meta['plot-y'],
                      display_names=False, 
                      default_size=20, 
#                       tooltip=def_tt,
                      names=dot_names,
                      colors=cell_type_colors)



# Overrides the default_size=20 passed to plt.scatter above.
scatter.default_size = 105

# Figure layout: minimum widget size in pixels and the margins around the plot area.
top_margin = 200
inst_width = 800
inst_height = 1000
fig.layout.min_height = str(inst_height) + 'px'
fig.layout.min_width  = str(inst_width) + 'px'

plt.xlim(0, 2.0*x_dim)
plt.ylim(0, 2.0*y_dim)
# Initial title; path_in_out later replaces it with an Inbound/Outbound title.
fig.title = 'CITI Bike'
fig.fig_margin = {'top': top_margin, 'bottom': 5, 'left': 5, 'right': 5}


In [15]:
# Run scatter_observe when a station dot is clicked (the hover variant is disabled).
# scatter.on_hover(callback=scatter_observe)
scatter.on_element_click(callback=scatter_observe)

In [16]:
# Opacity given to every point when cat_highlight is called with 'reset_cats'.
default_opacity = 1

In [17]:
def cat_highlight(inst_value):
    """Highlight the stations that belong to one category value.

    Resets the module-level ``scatter`` colors to ``cell_type_colors`` and
    then sets one opacity per ``df_gex`` column.

    Parameters
    ----------
    inst_value : str
        Either ``'reset_cats'`` (every point gets ``default_opacity``) or
        ``'<df_meta column>: <value>'``; stations whose ``df_meta`` entry in
        that column equals the value get opacity 1, all others 0.15.

    Raises
    ------
    IndexError
        If ``inst_value`` is not ``'reset_cats'`` and contains no ``': '``.
    KeyError
        If the category title is not a ``df_meta`` column or a ``df_gex``
        column label is missing from the ``df_meta`` index.
    """
    scatter.colors = cell_type_colors
    cols = df_gex.columns.tolist()

    if inst_value == 'reset_cats':
        list_opacities = [default_opacity for _ in cols]

    else:
        # Split on the first ': ' only, so category values that themselves
        # contain ': ' are kept whole.
        parts = inst_value.split(': ', 1)
        inst_cat_title = parts[0]
        inst_cat = parts[1]

        list_opacities = [
            1 if df_meta.loc[inst_label, inst_cat_title] == inst_cat else 0.15
            for inst_label in cols
        ]

    # Single assignment covers both branches.
    scatter.default_opacities = list_opacities


In [18]:
def on_value_change(change):
    """React to a change of the Clustergrammer2 widget's ``value`` trait.

    ``change['new']`` is dispatched on its content:

    * ``'null'``: reset the category highlight and the figure title.
    * contains ``'cat-'``: highlight the category named after ``' -> '``.
    * contains ``'row-label'``: show inbound flow for the station named
      after ``' -> '``.
    * contains ``'col-dendro'``: the text after ``' -> '`` is a
      comma-separated list of integer positions into ``df_meta.index``;
      those stations get opacity 1.0, all others 0.1.
    * anything else: reset colors and set every opacity to 1.0.

    Parameters
    ----------
    change : dict
        Change notification; only ``change['new']`` is read.

    Any exception raised while handling the change is reported as a warning
    rather than propagated, so a malformed value cannot break the widget link.
    """
    try:
        new_value = change['new']

        if new_value == 'null':
            # category highlight reset
            cat_highlight('reset_cats')
            fig.title = 'CITI Bike'

        # mousing over category
        elif 'cat-' in new_value:
            inst_cat = new_value.split(' -> ')[1]
            cat_highlight(inst_cat)
            fig.title = 'CITI Bike'

        # mousing over marker
        elif 'row-label' in new_value:
            inst_marker = new_value.split(' -> ')[1]
            path_in_out(inst_marker, 'in')

        elif 'col-dendro' in new_value:
            found_indexes = [int(x) for x in new_value.split(' -> ')[1].split(',')]

            ser_index = pd.Series(df_meta.index.tolist())
            # A set makes the per-column membership test below O(1).
            found_barcodes = set(ser_index.loc[found_indexes].tolist())

            # set to default cell type colors
            scatter.colors = cell_type_colors
            scatter.default_opacities = [
                1.0 if x in found_barcodes else 0.1 for x in df_gex.columns.tolist()
            ]

        else:
            # reset color and opacity
            scatter.colors = cell_type_colors
            scatter.default_opacities = [1.0] * df_meta.shape[0]

    except Exception as err:
        # Best-effort handler: report the problem instead of hiding it, and
        # let KeyboardInterrupt/SystemExit propagate.
        warnings.warn('on_value_change could not handle {!r}: {!r}'.format(change, err))


In [19]:
# Build the Clustergrammer2 widget (presumably this creates net.widget_instance)
# and call on_value_change whenever its 'value' trait changes.
net.widget()
net.widget_instance.observe(on_value_change, names='value')

In [20]:
# NOTE(review): `display` is already imported in the first cell and is not used in this cell.
from IPython.display import display
# Buttons that switch the scatter between station locations and the UMAP layout.
location_button = widgets.Button(description="Station Location")
umap_button = widgets.Button(description="UMAP")

# display(button)

def on_location_button_click(b):
    """Move the scatter points to the 'plot-y' / 'plot-x' columns of df_meta.

    ``b`` is the clicked button; it is not used.
    """
    # y is assigned before x, one trait at a time.
    for axis_attr, meta_column in (('y', 'plot-y'), ('x', 'plot-x')):
        setattr(scatter, axis_attr, df_meta[meta_column])
    
def on_umap_button_click(b):
    """Move the scatter points to the scaled 'umap-y' / 'umap-x' columns of df_meta.

    Each coordinate is multiplied by 125 and shifted (2200 for y, 2500 for x).
    ``b`` is the clicked button; it is not used.
    """
    # y is assigned before x, one trait at a time.
    for axis_attr, meta_column, shift in (('y', 'umap-y', 2200), ('x', 'umap-x', 2500)):
        setattr(scatter, axis_attr, df_meta[meta_column] * 125 + shift)
    

# Register the click handlers defined above on their buttons.
location_button.on_click(on_location_button_click)
umap_button.on_click(on_umap_button_click)

# HBox([location_button, umap_button])

In [21]:
# Left column: the two layout buttons above the station scatter figure.
button_section = widgets.HBox([location_button, umap_button])
left_box = widgets.VBox([button_section, fig])
# Right column: the Clustergrammer2 heatmap widget.
right_box = widgets.VBox([net.widget_instance])
# Final expression of the cell, so the combined layout is displayed as its output.
widgets.HBox([left_box, right_box])
# widgets.HBox([left_box])

HBox(children=(VBox(children=(HBox(children=(Button(description='Station Location', style=ButtonStyle()), Butt…

This notebook visualizes public CITI Bike data from July 2019 covering over 2.18 million rides. We plot CITI Bike station locations (using [bqplot](https://bqplot.readthedocs.io/en/latest/)) and visualize the connections between stations using [Clustergrammer2](https://clustergrammer.readthedocs.io/clustergrammer2.html). The Clustergrammer2 heatmap shows 787 CITI Bike stations as origin (columns) and destination (rows) stations, depicting the high-dimensional destination probability distribution of each origin station (rows were Z-scored to emphasize relative changes over absolute distribution levels). Stations were hierarchically clustered (columns) and manually assigned to 12 broadly data-driven NYC "neighborhoods" based on similar destination station distributions. We see that data-driven station clustering broadly agrees with physical location, and we can identify clusters of stations at varying granularity using the interactive dendrogram. These linked views also enable the interactive exploration of origin station distributions (i.e., where riders left from to arrive at the destination station): clicking on a row (destination station) highlights the station in black and shows where riders came from (red indicates more riders); clicking a station in the map toggles between outbound and inbound distributions. We embedded stations in a dimensionality-reduced UMAP space (reducing the 787-dimensional destination space) and allow animated transitions between physical location and the dimensionality-reduced space - note lower Manhattan twisting around in the animation. Finally, in addition to the broad neighborhood, we added four additional categories for each station: cross street x, cross street y, average age of departing riders (age - 40), and average age of arriving riders. 