In [1]:
import numpy as np
import pandas as pd
import dill

In [75]:
clusters_df = dill.load(open('dills/clusters_df.dill', 'rb'))

In [77]:
for i in range(len(clusters_df)):
    clusters_df.iloc[i] = clusters_df.iloc[i] / clusters_df.iloc[i].sum()

First, I'll learn how to plot the clusters on a map using bokeh.

In [78]:
clusters_df

Unnamed: 0,crime,grocery,libraries,liquor,museums,parks,restaurant,schools,vacancy
0,0.110779,0.123422,0.062883,0.102793,0.061555,0.193634,0.09003,0.229635,0.025269
1,0.088239,0.153412,0.336663,0.068456,0.008707,0.076167,0.062509,0.174465,0.031383
2,0.089163,0.099658,0.059479,0.217178,0.209809,0.0561,0.215908,0.046027,0.006679
3,0.173894,0.100476,0.060957,0.067357,0.018565,0.138856,0.057729,0.207249,0.174919
4,0.125358,0.218059,0.079034,0.074163,0.019256,0.123979,0.0679,0.242681,0.04957
5,0.151643,0.079354,0.032439,0.048971,0.026861,0.276069,0.029673,0.178137,0.176853
6,0.220715,0.067106,0.072621,0.074009,0.030099,0.079367,0.061454,0.137762,0.256866
7,0.092185,0.155353,0.136704,0.14764,0.102987,0.090165,0.152105,0.100118,0.022743


In [4]:
from bokeh.io import output_notebook, output_file, show, save, hplot
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, DataRange1d, PanTool, WheelZoomTool, BoxSelectTool
)
output_notebook()

In [6]:
# we'll use this to plot each cluster on the map
def select_latlon_for_cluster(
    clusters_dataframe, 
    cluster_number, 
    npts, 
    gridpoints
):
    """
    Function to select lat, lon coordinates for each cluster.
    """
    cluster_points = np.argwhere(np.reshape(clusters_dataframe, (npts, npts)).T.flatten() == cluster_number)
    
    lat, lon = ([gridpoints.T[i,1][0] for i in cluster_points], 
                [gridpoints.T[i,0][0] for i in cluster_points])

    return lat, lon

In [7]:
# Boundary conditions for all maps (longitudes as x vals, latitudes as y vals)
lonmin = -76.72
lonmax = -76.52
latmin = 39.19
latmax = 39.38

# load the dilled dataframe
clusters_map = dill.load(open('dills/clusters_map.dill'))

# number of points along each map edge
# (total number of points is npts**2)
npts = np.sqrt(len(clusters_map))

# generate appropriate lon/lat grid
x = np.linspace(lonmin, lonmax, npts)
y = np.linspace(latmin, latmax, npts)

X, Y = np.meshgrid(x, y, indexing='ij')

# grid for heatmap calculation
grid_points = np.vstack([X.ravel(), Y.ravel()])

In [48]:
from bokeh.palettes import Spectral7

In [49]:
map_options = GMapOptions(lat = 39.294631, lng = -76.613419, zoom = 13, map_type="hybrid")

plot1 = GMapPlot(
    x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options, title="Baltimore"
)

for cluster_number in range(1,8):
    lat, lon = select_latlon_for_cluster(
        clusters_map, 
        cluster_number,
        npts,
        grid_points
    )
    
    source = ColumnDataSource(
        data=dict(
            lats=lat,
            lons=lon,
        )
    )
    circle = Circle(x="lons", y="lats", size=5, fill_color=Spectral7[cluster_number-1], fill_alpha=0.55, line_color=None)
    plot1.add_glyph(source, circle)

plot1.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool())
save(plot1)
#show(plot) THIS DOESN'T WORK

In [10]:
from bokeh.charts import Bar

In [11]:
map_df['cluster'] = clusters_map

In [None]:
plot2 = Bar(map_df[::137], agg='sum')
show(plot2)

In [45]:
clusters_df

Unnamed: 0,crime,grocery,libraries,liquor,museums,parks,restaurant,schools,vacancy,cluster number
0,1203.788799,1341.173036,683.320328,1117.004784,668.889754,2104.128705,978.313064,2495.338958,274.589564,0
1,1989.849968,3459.554051,7591.991535,1543.726978,196.347553,1717.618042,1409.627948,3934.319355,707.702828,1
2,1407.216539,1572.848646,938.720802,3427.611299,3311.300495,885.404182,3407.56,726.418146,105.408323,2
3,3926.626835,2268.815742,1376.455827,1520.957626,419.205193,3135.447718,1303.54814,4679.80492,3949.777948,3
4,5006.632131,8708.978776,3156.529754,2961.970008,769.079936,4951.567098,2711.825885,9692.380554,1979.756865,4
5,1794.46974,939.042437,383.867647,579.502482,317.859186,3266.865819,351.13292,2107.993487,2092.797925,5
6,3289.594013,1000.172097,1082.35654,1103.043076,448.606541,1182.912929,915.928067,2053.24508,3828.40012,6
7,2689.011583,4531.609665,3987.622253,4306.628371,3004.118277,2630.0944,4436.882627,2920.409387,663.398892,7


In [35]:
clusters_df['cluster number'] = clusters_df.index.values

In [54]:
for i in range(1,2):
    fig = Bar(clusters_df.iloc[i])#, fill_color=Spectral7[cluster_number-1])
    show(fig)

ValueError: expected an element of List(String), got seq with invalid items [1]

In [65]:
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
clusters_df[['crime','vacancy','restaurant','grocery','parks','liquor']].iloc[1:].plot(kind='bar')
plt.xlabel('Cluster Number')
plt.ylabel('Fraction of Cluster Total')
plt.show()