# Clustering from SafeGraph data

[Index](0-index.ipynb)

## Imports and global variables

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from pathlib import Path
import shutil
import numpy as np
import pandas as pd
import geopandas as gpd
import datetime
import h5py
import scipy
import sklearn.cluster

import matplotlib.pyplot as plt
import matplotlib.colors as mco
import matplotlib.gridspec as mgs
import matplotlib.cm as cm
plt.rcParams['svg.fonttype'] = 'none'
from scipy.cluster.hierarchy import dendrogram
import scipy.cluster.hierarchy as sch


from IPython.display import HTML
from IPython.display import Image

In [None]:
# Directory holding the input data read by this notebook.
datadir = Path('../data')
if not datadir.is_dir():
    # Fail right away instead of at the first read further down.
    raise ValueError("Data dir doesn't exist!")

In [None]:
# Directory receiving the analysis outputs; it is expected to exist already.
resdir = Path('..', 'results')
resdir_found = resdir.is_dir()
if not resdir_found:
    raise ValueError('No results directory!')

In [None]:
# Work on a fresh copy of the initial HDF5 store, overwriting any previous copy.
resfile_init = datadir.joinpath('safegraph_analysis_start.hdf5')
resfile = resdir.joinpath('safegraph_analysis.hdf5')
shutil.copy(resfile_init, resfile)

# Compression settings used whenever the store is opened in this notebook.
complevel = 7
complib = 'zlib'
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    n_entries = len(store.keys())
print(f"File {resfile.stem} has {n_entries} entries.")

In [None]:
# Every figure is saved once per extension listed here.
exts = ['.png', '.svg']
# Resolution passed to fig.savefig().
dpi = 300

## Functions

In [None]:
def make_dict_serializable(mydict):
    """Replace NumPy / pandas / datetime values of a dict by plain Python ones.

    The conversion is done in place and recurses into nested dictionaries.

    Parameters
    ----------
    mydict : dict
        Dictionary to convert. Values of the following kinds are replaced:

        * ``np.ndarray`` and ``pd.Index`` -> ``list`` (via ``tolist()``)
        * NumPy floating scalars of any width -> ``float``
        * NumPy integer scalars of any width -> ``int``
        * ``datetime.datetime`` (and subclasses) -> ``'YYYY-MM-DD'`` string

        Any other value is left untouched.

    Returns
    -------
    dict
        The very same ``mydict`` object, after conversion.
    """
    # Only existing keys are re-assigned, so the dict size never changes while
    # it is being iterated over.
    for k, v in mydict.items():
        if isinstance(v, dict):
            make_dict_serializable(v)
        elif isinstance(v, (np.ndarray, pd.Index)):
            # pd.Index covers the former pd.Int64Index, which no longer exists
            # in pandas >= 2.0.
            mydict[k] = v.tolist()
        elif isinstance(v, np.floating):
            # np.floating covers the former np.float_ alias (removed in
            # NumPy 2.0) as well as float32 / float16.
            mydict[k] = float(v)
        elif isinstance(v, np.integer):
            mydict[k] = int(v)
        elif isinstance(v, datetime.datetime):
            mydict[k] = v.strftime('%Y-%m-%d')
    return mydict

## Construct index

In [None]:
# GeoJSON file with the census-block-group geometries.
geofile = datadir.joinpath('geometry', 'cbg.geojson')
geofile_found = geofile.is_file()
if not geofile_found:
    raise ValueError("Geo file doesn't exist!")

In [None]:
# Read the geometries, then cast the CBG identifier column to int64.
geo = gpd.read_file(geofile)
geo = geo.astype({'CensusBlockGroup': 'int64'})

In [None]:
# Index the geometries by their CBG identifier.
geo = geo.set_index('CensusBlockGroup')
geo

In [None]:
# Output directory for the figures of this notebook; created when missing.
figdir = Path('..', 'figures', '1-clustering')
figdir.mkdir(parents=True, exist_ok=True)

In [None]:
# One representative point per row of `geo`, stored as rows of [x, y].
rep_points = geo.representative_point()
XY = np.array([[pt.x, pt.y] for pt in rep_points.to_numpy()])
X, Y = XY.T
indices = np.arange(len(geo.index))

# Colour each point according to its row position in `geo`.
cmap = cm.rainbow
norm = mco.Normalize(vmin=indices.min(), vmax=indices.max())
colors = cmap(norm(indices))

fig, ax = plt.subplots(figsize=(4, 3), dpi=300)
npts = len(X)
idump = 5  # only every idump-th point is drawn
for i in range(0, npts, idump):
    ax.add_patch(plt.Circle((X[i], Y[i]), 0.5, color=colors[i], alpha=0.5, lw=0))

# Patches do not trigger autoscaling here, so the limits are set from the data.
ax.set_xlim(X.min(), X.max())
ax.set_ylim(Y.min(), Y.max())
ax.set_aspect('equal')
for side in ('left', 'right', 'bottom', 'top'):
    ax.spines[side].set_visible(False)
ax.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)

# Dedicated axes for the colour bar, placed to the right of the map.
cax = fig.add_axes(rect=[0.98, 0.1, 0.02, 0.7])
mappable = cm.ScalarMappable(norm=norm, cmap=cmap)
plt.colorbar(mappable, cax=cax, label='Matrix index', extendfrac='auto')

fname = 'clusters_before_ordering'
for ext in exts:
    filepath = figdir / f"{fname}{ext}"
    fig.savefig(filepath, bbox_inches='tight', pad_inches=0, dpi=dpi)
    print(f"Written file: {filepath!s}")
fig.clf()
plt.close('all')


In [None]:
# Display the PNG version of the figure saved by the previous cell.
fname = 'clusters_before_ordering'
filepath = figdir / f"{fname}.png"
Image(filename=filepath, width=4./3*360)

## Clustering

### K-means clustering -- KMeans()

In [None]:
# UNCOMMENT TO RECOMPUTE
# When enabled, this cell fits a K-means model with n_clusters clusters on the
# representative points XY and stores, for every row of `geo`, the predicted
# cluster label in the 'leaves' column of `data_labels`.
# NOTE(review): recent scikit-learn releases replaced algorithm="full" by
# algorithm="lloyd" -- confirm against the installed version before re-running.
# n_clusters = 2**10
# np.random.seed(123)

# clustering = sklearn.cluster.KMeans(n_clusters=n_clusters, algorithm="full")
# res = clustering.fit(XY)

# cluster_centers = res.cluster_centers_
# data_labels = res.predict(XY)
# data_labels = pd.DataFrame(data_labels, index=geo.index)
# data_labels.rename(columns={0: 'leaves'}, inplace=True)

### Re-order clusters using a hierarchical clustering method

In [None]:
# UNCOMMENT TO RECOMPUTE
# When enabled, this cell builds an average-linkage hierarchy on the K-means
# cluster centres, applies scipy's optimal leaf ordering to it, and writes the
# CBG labels and the linkage matrix to the results store under /clustering.
# Note that XY is rebound to the cluster centres here.
# XY = cluster_centers.copy()
# Z = sch.linkage(XY, method='average')
# Z = sch.optimal_leaf_ordering(Z, XY)

# key_base = Path("/clustering")
# with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
#     key = str(key_base / "cbgs_clusters")
#     store[key] = data_labels
    
#     key = str(key_base / "linkage_matrix")
#     store[key] = pd.DataFrame(data=Z)

In [None]:
# COMMENT TO RECOMPUTE
# Load the previously computed clustering from the results store.
key_base = Path("/clustering")
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    # as_posix() keeps the HDF5 keys '/'-separated on every platform, whereas
    # str() on a Path yields backslashes on Windows.
    key = (key_base / "cbgs_clusters").as_posix()
    data_labels = store[key]

    key = (key_base / "linkage_matrix").as_posix()
    Z = store[key].to_numpy()

    # Cluster centres as an array of [X, Y] rows, sorted by their 'leaves' value.
    key = (key_base / "clusters").as_posix()
    cluster_centers = (
        store[key]
        .set_index('leaves')
        .sort_index(axis=0)
        .loc[:, ['X', 'Y']]
        .to_numpy()
    )

In [None]:
# Leaf ids of the linkage tree, in the order they appear in the dendrogram.
leaves = sch.leaves_list(Z).astype(np.int64)

In [None]:
# Dendrogram of the linkage matrix, truncated with truncate_mode='level', p=6.
fig, ax = plt.subplots(figsize=(20, 4))
sch.dendrogram(Z, p=6, truncate_mode='level', show_leaf_counts=False, ax=ax)
plt.xticks(rotation=90, fontsize='medium')
plt.show()

In [None]:
# Cluster centres re-arranged to follow the dendrogram leaf order.
cluster_centers_ordered = cluster_centers[leaves]
X, Y = cluster_centers_ordered.T
npts = len(X)
indices = np.arange(npts)

# Colour each centre according to its position in the new ordering.
cmap = cm.rainbow
norm = mco.Normalize(vmin=indices.min(), vmax=indices.max())
colors = cmap(norm(indices))

fig, ax = plt.subplots(figsize=(4, 3), dpi=300)
for x, y, color in zip(X, Y, colors):
    ax.add_patch(plt.Circle((x, y), 0.5, color=color, alpha=0.5, lw=0))

# Patches do not trigger autoscaling here, so the limits are set from the data.
ax.set_xlim(X.min(), X.max())
ax.set_ylim(Y.min(), Y.max())
ax.set_aspect('equal')
for side in ('left', 'right', 'bottom', 'top'):
    ax.spines[side].set_visible(False)
ax.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)

# Dedicated axes for the colour bar, placed to the right of the map.
cax = fig.add_axes(rect=[0.98, 0.1, 0.02, 0.7])
mappable = cm.ScalarMappable(norm=norm, cmap=cmap)
plt.colorbar(mappable, cax=cax, label='Matrix index', extendfrac='auto')

fname = 'clusters_after_ordering'
for ext in exts:
    filepath = figdir / f"{fname}{ext}"
    fig.savefig(filepath, bbox_inches='tight', pad_inches=0, dpi=dpi)
    print(f"Written file: {filepath!s}")
fig.clf()
plt.close('all')

In [None]:
# Display the PNG version of the figure saved by the previous cell.
fname = 'clusters_after_ordering'
filepath = figdir / f"{fname}.png"
Image(filename=filepath, width=4./3*360)

In [None]:
# Map every cluster label to the index labels of the CBGs assigned to it.
# Grouping by the scalar key (not the one-element list ['leaves']) keeps the
# keys of `.groups` plain labels; with a one-element list pandas deprecates
# scalar keys in favour of tuples, which would break the integer look-ups below.
label_to_cbgs = data_labels.groupby('leaves').groups
groups = [label_to_cbgs[label].tolist() for label in range(len(cluster_centers))]
# Same groups, re-arranged to follow the dendrogram leaf order.
groups_ordered = [groups[leaf] for leaf in leaves]

# One row per cluster, in leaf order: centre coordinates, member CBGs and the
# original cluster label.
df = pd.DataFrame(data=cluster_centers_ordered, columns=['X', 'Y']).assign(
    cbg_clusters=pd.Series(groups_ordered),
    leaves=leaves,
)

df

### Fill-in total population

In [None]:
# Census column that is used as the population of each CBG further down.
columns = ['B01001e1']
# The path components must all be joined with '/'; the separator after
# `datadir` was missing, which made this cell a syntax error.
census_csv = datadir / 'safegraph_open_census_data' / 'data' / 'cbg_b01.csv'
cbg_data = pd.read_csv(census_csv).set_index('census_block_group').loc[:, columns]
cbg_data

In [None]:
# Census rows selected for the CBGs of data_labels, in the same row order.
population_by_cbg = cbg_data.loc[data_labels.index]
data_labels['population'] = population_by_cbg
data_labels

In [None]:
# Total population per cluster label, as a one-column frame indexed by 'leaves'.
pop = data_labels.groupby('leaves')[['population']].sum()
pop

In [None]:
# Temporarily index the clusters by their label so that the populations can be
# matched by label, then restore the positional index (kept in column 'index').
by_leaf = df.reset_index().set_index('leaves')
by_leaf.loc[pop.index, 'population'] = pop['population']
df = by_leaf.reset_index().set_index('index')
df

In [None]:
# Clusters whose total population is zero.
is_empty = df['population'].eq(0)
df[is_empty]

So there is one empty community once the CBGs are clustered using SafeGraph data.

### Write final clustering information

In [None]:
# Store the final per-cluster table in the results file.
key_base = Path("/clustering")
with pd.HDFStore(resfile, complevel=complevel, complib=complib) as store:
    # as_posix() keeps the HDF5 key '/'-separated on every platform, whereas
    # str() on a Path yields backslashes on Windows.
    key = (key_base / "clusters").as_posix()
    store[key] = df

Export to CSV.

In [None]:
# Sub-directory of the results directory for CSV exports; created when missing.
expdir = resdir.joinpath('csv')
expdir.mkdir(exist_ok=True)

In [None]:
# Write the per-cluster table as CSV into the export directory.
fname = 'clustering.csv'
csv_path = expdir / fname
df.to_csv(csv_path)