## Clustering Attempt 1


In [9]:
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pysal.lib import weights
from sklearn import cluster
from esda.moran import Moran

# Block-group level GeoDataFrame (one row per census block group) read from
# the in-process parquet file.
CBG_PARCEL_TWN_PATH = "data/in_process/shapefiles/cbg_parcel_twn.parquet"
cbg_parcel_twn = gpd.read_parquet(CBG_PARCEL_TWN_PATH)

# Attribute columns used for both the Moran's I checks and the clustering
# models. Order is kept stable because it determines the row order of the
# Moran's I table and the feature order passed to the estimators.
cols_to_use = [
    # sales volume and price distribution
    "total sales",
    "total pins",
    "mean sales price",
    "median sales price",
    "std sales price",
    # environment / accessibility attributes
    "env_airport_noise_dnl",
    "access_cmap_walk_nta_score",
    "access_cmap_walk_total_score",
    "env_flood_fs_factor",
    # sales relative to number of pins
    "ratio sales to pins",
]

In [10]:
# Preview the loaded frame; the notebook repr elides the middle rows and columns.
cbg_parcel_twn

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,AFFGEOID,GEOID,NAME,NAMELSAD,LSAD,ALAND,...,total sales,total pins,mean sales price,median sales price,std sales price,env_airport_noise_dnl,access_cmap_walk_nta_score,access_cmap_walk_total_score,env_flood_fs_factor,ratio sales to pins
0,17,031,251500,1,1500000US170312515001,170312515001,1,Block Group 1,BG,128798,...,99,238,153890.956835,51000.0,7.474817e+05,54.848284,78.500000,87.500000,4.810924,0.415966
1,17,031,242400,1,1500000US170312424001,170312424001,1,Block Group 1,BG,164031,...,140,316,376397.562500,287500.0,3.480964e+05,52.500000,110.500000,119.500000,1.070064,0.443038
2,17,031,842100,2,1500000US170318421002,170318421002,2,Block Group 2,BG,166772,...,162,286,118607.193548,109000.0,9.922959e+04,52.500000,84.288462,93.288462,1.000000,0.566434
3,17,031,243200,2,1500000US170312432002,170312432002,2,Block Group 2,BG,201069,...,267,544,446148.339350,377500.0,3.553244e+05,52.500000,128.000000,137.000000,1.000000,0.490809
4,17,031,231500,5,1500000US170312315005,170312315005,5,Block Group 5,BG,252000,...,160,307,120954.551020,68000.0,1.688528e+05,52.500000,90.789902,100.118893,1.000000,0.521173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,17,031,843000,2,1500000US170318430002,170318430002,2,Block Group 2,BG,578744,...,409,603,112475.479381,79000.0,1.221769e+05,52.497039,84.128524,93.128524,1.000000,0.678275
564,17,031,081300,2,1500000US170310813002,170310813002,2,Block Group 2,BG,18995,...,170,523,264209.475921,247500.0,1.701234e+05,52.500000,125.500000,134.500000,1.000000,0.325048
565,17,031,081500,3,1500000US170310815003,170310815003,3,Block Group 3,BG,105341,...,2290,3528,498081.214055,173000.0,5.300220e+06,52.500000,116.105442,126.105442,5.948067,0.649093
566,17,031,330102,4,1500000US170313301024,170313301024,4,Block Group 4,BG,104625,...,448,751,454549.491749,448400.0,3.366030e+05,52.500000,95.000000,104.000000,1.000000,0.596538


References:

- https://darribas.org/gds_course/content/bG/lab_G.html
- https://geographicdata.science/book/notebooks/10_clustering_and_regionalization.html


### Calculate Weights


In [11]:
# Rook-contiguity spatial weights built from the block-group GeoDataFrame.
# Reused below as the weights for Moran's I and (via `.sparse`) as the
# connectivity constraint for agglomerative clustering.
weights_cbg_rook = weights.Rook.from_dataframe(cbg_parcel_twn)

## Metrics & Exploration


### Moran's I


In [17]:
# Moran's pseudo p-value (`p_sim`) comes from random permutations drawn from
# NumPy's global RNG, so seed it to make the table reproducible on
# Restart & Run All. Same seed value as the K-Means `random_state`.
np.random.seed(4242)

# One global Moran's I test per clustering variable, using the rook weights.
# NOTE(review): missing values are replaced with a -1 sentinel before testing;
# confirm that -1 is an acceptable stand-in for every column in `cols_to_use`.
moran_tests = [
    Moran(cbg_parcel_twn[variable].fillna(-1), weights_cbg_rook)
    for variable in cols_to_use
]

# Keep only the statistic and its pseudo p-value, paired with the column name.
# (Kept under a separate name from `moran_tests` so a name never changes type.)
mi_results = [
    (variable, res.I, res.p_sim) for variable, res in zip(cols_to_use, moran_tests)
]

# Display on table
pd.DataFrame(mi_results, columns=["Variable", "Moran's I", "P-value"]).set_index(
    "Variable"
)

Unnamed: 0_level_0,Moran's I,P-value
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
total sales,0.556496,0.001
total pins,0.574427,0.001
mean sales price,0.248474,0.001
median sales price,0.265946,0.001
std sales price,0.325483,0.001
env_airport_noise_dnl,0.946212,0.001
access_cmap_walk_nta_score,0.775062,0.001
access_cmap_walk_total_score,0.777566,0.001
env_flood_fs_factor,0.716892,0.001
ratio sales to pins,0.479688,0.001


## Non Spatial Clustering

### Naive K-Means


In [14]:
# Feature matrix for the naive (non-spatial) baseline; NaNs become a -1 sentinel.
# NOTE(review): the columns are on very different scales (sale prices in the
# 1e5-1e6 range vs. ratios below 1), so Euclidean distances are dominated by
# the price columns. Consider scaling the features if that is not intended.
km_features = cbg_parcel_twn.loc[:, cols_to_use].fillna(-1)

# `n_init` is spelled out instead of relying on the library default: 10 is the
# default that was in effect when this cell last ran (scikit-learn warned that
# it is changing), so the labels are unchanged, the FutureWarning goes away,
# and results will not shift when the default changes.
km = cluster.KMeans(n_clusters=8, n_init=10, random_state=4242)
km_fit = km.fit(km_features)
cbg_parcel_twn.loc[:, "clus_labels"] = km_fit.labels_

# Interactive map of the cluster assignment (last expression -> rendered output).
cbg_parcel_twn.explore("clus_labels", categorical=True)

  super()._check_params_vs_input(X, default_n_init=10)


## Spatial Clustering

### Agglomerative Clustering - Rook Contiguity


In [15]:
# Same feature matrix as the K-Means baseline (NaNs replaced by -1).
agg_features = cbg_parcel_twn.loc[:, cols_to_use].fillna(-1)

# Passing the rook weights as a sparse connectivity matrix restricts the
# clustering to the neighbour structure encoded in `weights_cbg_rook`.
rook_connectivity = weights_cbg_rook.sparse
agg = cluster.AgglomerativeClustering(n_clusters=8, connectivity=rook_connectivity)
agg_fit = agg.fit(agg_features)

# NOTE: this writes to the same "clus_labels" column the K-Means cell uses,
# so the earlier labels are replaced.
cbg_parcel_twn.loc[:, "clus_labels"] = agg_fit.labels_

# Interactive map of the spatially-constrained clusters.
cbg_parcel_twn.explore(column="clus_labels", categorical=True)