# Profiling Poverty and Land Use in Three U.S. Cities
Authors: Clayton Coffman, Chris Joyce, Adam Miner

In [3]:
#Imports
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN, KMeans

#import geopandas #Adam can't install it right now unfortunately - placeholder for geo eda
%matplotlib inline

In [4]:
# Load the three per-city CSVs (Seattle, NYC, DC) into their own DataFrames.
# The order of the file names matches the order of the names being unpacked.
seattle_df, nyc_df, dc_df = map(
    pd.read_csv,
    ("seattle_ltdb_nanda.csv", "nyc_ltdb_nanda.csv", "dc_ltdb_nanda.csv"),
)

The way to use a clustering algorithm to create clusters of poverty and land use is to:
- Segment the cities (done)
- Separate the years and standard scale them (done)
- Combine everything together (done)
- Cluster (in this notebook)
- Analyze and evaluate clusters (in this notebook)
- EDA (maybe in this notebook?)

In [8]:
# Print the Seattle frame's column names, dtypes and non-null counts
seattle_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2151 entries, 0 to 2150
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          2151 non-null   int64  
 1   tractid             2151 non-null   int64  
 2   tract               2151 non-null   object 
 3   county              2151 non-null   object 
 4   state               2151 non-null   object 
 5   year                2151 non-null   int64  
 6   GEOID10             2151 non-null   int64  
 7   NAMELSAD10          2151 non-null   object 
 8   tractid_nanda       2151 non-null   int64  
 9   year_nanda          2151 non-null   int64  
 10  pop                 2151 non-null   float64
 11  incpc               2151 non-null   float64
 12  ppov                2151 non-null   float64
 13  hinc                2151 non-null   float64
 14  phs                 2151 non-null   float64
 15  mrent               2151 non-null   float64
 16  mhmval

### What are we looking for

We will be searching for clusters that show indicators of abject poverty in the socioeconomic, census-based variables. These will be clusters whose centroids sit well below the mean on the income and wealth variables (and, correspondingly, well above the mean on ppov); population is the exception, since it is unclear what that one needs to be. We'll only be focusing on those clusters. The EDA portion will analyze the differences between cities in the data in these clusters, and then how the land cover proportion of census tracts is different in these clusters between these cities.

In [16]:
##potential values to pull out to tune the model more - 'pop', 'incpc', 'hinc', 'phs', 'mrent', 'mhmval', 'pown', 'pmulti'

# Identifier / bookkeeping columns that must not be fed to the clustering model.
ID_COLS = ['Unnamed: 0', 'tractid', 'tract', 'county', 'state', 'year', 'GEOID10', 'NAMELSAD10', 'tractid_nanda', 'year_nanda']


def _feature_matrix(city_df):
    """Return the clustering features of one city frame.

    Drops the identifier columns in ``ID_COLS`` (a missing one still raises
    ``KeyError``, as before) and then drops a ``cluster`` column if one is
    present. The second drop keeps this cell idempotent: the city frames get a
    ``cluster`` column attached further down the notebook, and re-running this
    cell afterwards would otherwise feed the labels back in as a feature.
    """
    return city_df.drop(columns=ID_COLS).drop(columns=['cluster'], errors='ignore')


X_sea = _feature_matrix(seattle_df)
X_nyc = _feature_matrix(nyc_df)
X_dc = _feature_matrix(dc_df)

In [17]:
# Sweep candidate cluster counts (k = 2..11) for Seattle and record, for each k,
# the KMeans inertia and the silhouette score of the resulting labels.
#Seattle first
scores = []

for k in range(2, 12):
    cl = KMeans(n_clusters=k, random_state=42).fit(X_sea)
    scores.append([k, cl.inertia_, silhouette_score(X_sea, cl.labels_)])

# One row per k, ready to be displayed in the next cell
score_df = pd.DataFrame(scores, columns=['k', 'inertia', 'silhouette_score'])



In [19]:
# Eight clusters seems to fit Seattle the best: in the recorded output k=8 has the highest
# silhouette score (~0.190), only marginally ahead of k=2. All scores are below 0.2, so the
# separation is not fantastic (not bad though, just meh).
score_df

Unnamed: 0,k,inertia,silhouette_score
0,2,27414.885878,0.189567
1,3,24085.129138,0.167118
2,4,21296.343885,0.177965
3,5,19425.127481,0.171628
4,6,17996.506238,0.176522
5,7,16865.81927,0.186788
6,8,15920.616271,0.18982
7,9,15163.591102,0.155965
8,10,14556.23037,0.152528
9,11,14021.19238,0.153721


In [20]:
# Fit the 8-cluster model on the Seattle features, then attach each row's
# predicted cluster label to the original (un-dropped) Seattle frame.
km = KMeans(n_clusters=8, random_state=42).fit(X_sea)
seattle_df['cluster'] = km.predict(X_sea)



In [21]:
# Show the first five rows to confirm the new 'cluster' column was attached
seattle_df.head()

Unnamed: 0.1,Unnamed: 0,tractid,tract,county,state,year,GEOID10,NAMELSAD10,tractid_nanda,year_nanda,...,pown,pmulti,open_urban_space,low_development,medium_development,high_development,unclassified,forested,cultivated,cluster
0,0,53053062000,Census Tract 620,Pierce County,WA,2000,53053062000,Census Tract 620,53053062000,2000,...,-0.399685,-0.081674,-0.115412,-0.114837,1.93673,-0.303959,-0.504936,-0.698798,-0.394941,4
1,1,53053062000,Census Tract 620,Pierce County,Washington,2012,53053062000,Census Tract 620,53053062000,2012,...,0.082124,-0.068814,-0.17795,-0.127693,1.968619,-0.290302,-0.568763,-0.669075,-0.372462,4
2,2,53053062000,Census Tract 620,Pierce County,Washington,2019,53053062000,Census Tract 620,53053062000,2019,...,-0.317821,-0.346802,-0.219495,-0.131986,1.979176,-0.285249,-0.565955,-0.665589,-0.369772,4
3,3,53053071601,Census Tract 716.01,Pierce County,WA,2000,53053071601,Census Tract 716.01,53053071601,2000,...,-0.604169,0.367697,2.697094,0.665428,-0.324241,-0.243008,-0.519012,-0.725305,-0.336405,4
4,4,53053071601,Census Tract 716.01,Pierce County,Washington,2012,53053071601,Census Tract 716.01,53053071601,2012,...,-0.716442,0.224066,2.548283,0.619144,-0.2521,-0.231743,-0.494155,-0.710402,-0.310311,4


In [24]:
# Cluster profile for Seattle: mean of every feature column within each cluster,
# after removing the identifier columns that carry no modelling information.
(
    seattle_df
    .drop(columns=[
        'Unnamed: 0', 'tractid', 'tract', 'county', 'state',
        'year', 'GEOID10', 'NAMELSAD10', 'tractid_nanda', 'year_nanda',
    ])
    .groupby('cluster')
    .mean()
)

Unnamed: 0_level_0,pop,incpc,ppov,hinc,phs,mrent,mhmval,pown,pmulti,open_urban_space,low_development,medium_development,high_development,unclassified,forested,cultivated
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,-0.467653,0.16552,1.830175,-1.116742,-0.260143,-0.408301,0.005099,-1.879045,2.24393,-1.306729,-1.577952,0.118349,3.390396,-0.019293,-0.735566,-0.274732
1,0.173396,-0.237888,-0.371952,0.079446,0.092848,0.076797,-0.308753,0.438569,-0.418154,0.742669,0.631007,-0.169765,-0.369356,-0.254659,-0.125612,-0.183005
2,0.138806,0.723727,-0.237689,0.20382,-1.050128,0.286062,0.731495,-0.198562,0.504782,-0.727166,0.303187,1.039335,0.15001,-0.326816,-0.496235,-0.314817
3,-0.41609,-0.060599,-0.489502,0.298761,0.169581,-0.172169,-0.078584,0.308532,-0.897567,-0.158616,-1.333777,-1.317345,-0.613117,0.168365,2.247222,0.232255
4,0.04283,-0.892908,0.909896,-0.995169,1.012027,-0.776861,-0.786124,-0.633923,0.623742,-0.239407,0.125636,0.799963,0.357373,-0.348565,-0.558283,-0.217812
5,-0.241019,1.448563,-0.34355,0.818083,-0.980841,0.558877,1.450454,0.148284,-0.019872,-0.741765,-0.626987,-0.656532,-0.389,3.374155,-0.515406,-0.280869
6,0.092011,1.487282,-0.846744,1.934358,-1.246624,1.675455,1.57311,0.829207,-0.784699,0.61208,0.300553,-0.69432,-0.565541,0.029648,0.53869,-0.22291
7,-0.255791,-0.430354,-0.251497,-0.105881,0.68835,-0.357419,-0.449846,0.421675,-0.687399,-0.267123,-0.946039,-1.042669,-0.536726,0.331954,0.331597,3.889535


Very quick way to find poverty clusters is to see the high values of ppov. There are two clusters that really take the cake for it, cluster 0 and cluster 4. So let's look at these ones to pick out the differences between them.

Cluster 0 is low population, average per capita income, low household income, lower than average high school grads, low rent value, average home values, very low home ownership, very high multi-home living. The census tracts they belong in are very very high developed areas, signifying a city, with very low forested and agricultural land.

Cluster 4 has an average population and very low per capita and household income, but a high number of high school graduates; rent and home values are low, and not a lot of people own homes. A higher-than-average share of people live in multi-family dwellings, but not as many as in cluster 0. Noteworthy is that they are mainly living in medium development areas, so likely lower-class suburbs.

I wanted to point out Cluster 7, which is probably made up of more rural poor tracts. They don't have as many people living in poverty, but they do have low levels of income and high levels of home ownership, so it is a different kind of poverty than that of people living closer in to the core of the MSA.

In [25]:
# Sweep candidate cluster counts (k = 2..11) for NYC and record, for each k,
# the KMeans inertia and the silhouette score of the resulting labels.
#NYC
scores = []

for k in range(2, 12):
    cl = KMeans(n_clusters=k, random_state=42).fit(X_nyc)
    scores.append([k, cl.inertia_, silhouette_score(X_nyc, cl.labels_)])

# One row per k, ready to be displayed in the next cell
score_df = pd.DataFrame(scores, columns=['k', 'inertia', 'silhouette_score'])



In [27]:
# Bit better than Seattle. NOTE(review): in the recorded output the silhouette score is highest
# at k=2 (~0.254) and falls to ~0.225 at k=4, so "4 is optimal" is a judgement call rather than
# the silhouette maximum. I'm going to do 8 to see if it also picks out a rural poor, because 4 didnt.
score_df

Unnamed: 0,k,inertia,silhouette_score
0,2,168292.977489,0.253769
1,3,150488.468499,0.246329
2,4,134777.494947,0.225155
3,5,123023.12046,0.180577
4,6,114306.200275,0.195825
5,7,105891.129621,0.184788
6,8,98769.505446,0.183757
7,9,92583.048692,0.19079
8,10,89145.560335,0.183244
9,11,86274.721287,0.17023


In [36]:
# Fit the 8-cluster model on the NYC features, then attach each row's
# predicted cluster label to the original (un-dropped) NYC frame.
km = KMeans(n_clusters=8, random_state=42).fit(X_nyc)
nyc_df['cluster'] = km.predict(X_nyc)



In [37]:
# Cluster profile for NYC: mean of every feature column within each cluster,
# after removing the identifier columns that carry no modelling information.
(
    nyc_df
    .drop(columns=[
        'Unnamed: 0', 'tractid', 'tract', 'county', 'state',
        'year', 'GEOID10', 'NAMELSAD10', 'tractid_nanda', 'year_nanda',
    ])
    .groupby('cluster')
    .mean()
)

Unnamed: 0_level_0,pop,incpc,ppov,hinc,phs,mrent,mhmval,pown,pmulti,open_urban_space,low_development,medium_development,high_development,unclassified,forested,cultivated
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.325247,2.73491,-0.502377,1.37838,-1.77944,1.929463,2.217575,-0.528674,0.945136,-0.462456,-0.681939,-0.179852,0.656097,0.283176,-0.246826,-0.206459
1,0.124697,-0.085064,-0.468582,0.109851,-0.046061,0.062756,-0.332534,0.427089,-0.489502,0.127005,0.911384,0.704943,-0.604095,-0.237572,-0.300634,-0.198581
2,0.248129,0.851644,-0.796666,1.30057,-1.04628,0.788615,0.424254,0.873444,-1.076371,1.864904,0.953275,-0.82576,-0.937443,0.047095,0.244192,-0.089432
3,-0.220942,0.089276,-0.338325,-0.011728,-0.218414,-0.115416,-0.259742,-0.24714,-0.285571,-0.266505,-0.355901,-0.768706,-0.700061,3.137406,-0.04765,-0.127694
4,0.117568,0.237439,-0.639944,0.497888,-0.412368,-0.097007,-0.338546,0.524822,-1.063783,0.318363,-0.350842,-1.153997,-0.989702,0.480668,2.733322,0.5659
5,-0.415916,-0.395411,0.161578,-0.474695,0.32533,-0.140355,0.210832,-0.267758,0.651953,-0.703568,-0.788105,0.044611,1.141142,-0.442205,-0.457007,-0.236108
6,0.15675,0.445035,-0.724042,0.780516,-0.654658,-0.155424,-0.175272,0.597428,-1.114014,0.12197,-0.511356,-1.254587,-1.00391,0.377084,1.741207,5.614325
7,0.134324,-0.871649,1.560978,-1.115463,1.16766,-0.888578,-0.52002,-0.920394,0.946694,-0.528147,-0.329044,0.51693,0.466596,-0.305691,-0.40541,-0.224082


Cluster 7 seems to be a highly impoverished core in the most developed parts of the New York MSA. Its income, rent, home-value and home-ownership scores are all negative, signifying low values on most wealth variables, while its poverty rate and share of people living in multi-family homes are well above average.

Noteworthy is that the rural clusters (Clusters 2, 4 and 6) do not seem to be impoverished (ppov is low) and, like Seattle's, have really high rates of home ownership. There doesn't seem to be a good rural poor cluster in the NYC MSA.

In [32]:
# Sweep candidate cluster counts (k = 2..11) for DC and record, for each k,
# the KMeans inertia and the silhouette score of the resulting labels.
#DC
scores = []

for k in range(2, 12):
    cl = KMeans(n_clusters=k, random_state=42).fit(X_dc)
    scores.append([k, cl.inertia_, silhouette_score(X_dc, cl.labels_)])

# One row per k, ready to be displayed in the next cell
score_df = pd.DataFrame(scores, columns=['k', 'inertia', 'silhouette_score'])



In [33]:
# DC seems to perform best with four clusters (highest silhouette score, ~0.213, in the recorded output).
# NOTE(review): the model fitted below nevertheless uses n_clusters=8 - confirm that is intentional.
score_df

Unnamed: 0,k,inertia,silhouette_score
0,2,51436.214022,0.208646
1,3,43749.970873,0.207907
2,4,38724.25262,0.213372
3,5,34492.750434,0.19314
4,6,32364.741398,0.196001
5,7,30438.874198,0.176374
6,8,28973.479267,0.178852
7,9,27590.654466,0.178445
8,10,26389.867267,0.161729
9,11,25616.410755,0.157045


In [38]:
# Fit the 8-cluster model on the DC features, then attach each row's
# predicted cluster label to the original (un-dropped) DC frame.
km = KMeans(n_clusters=8, random_state=42).fit(X_dc)
dc_df['cluster'] = km.predict(X_dc)



In [39]:
# Cluster profile for DC: mean of every feature column within each cluster,
# after removing the identifier columns that carry no modelling information.
(
    dc_df
    .drop(columns=[
        'Unnamed: 0', 'tractid', 'tract', 'county', 'state',
        'year', 'GEOID10', 'NAMELSAD10', 'tractid_nanda', 'year_nanda',
    ])
    .groupby('cluster')
    .mean()
)

Unnamed: 0_level_0,pop,incpc,ppov,hinc,phs,mrent,mhmval,pown,pmulti,open_urban_space,low_development,medium_development,high_development,unclassified,forested,cultivated
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.317063,-0.032522,-0.402864,0.241438,-0.30976,0.337858,-0.070496,0.521786,-0.471202,0.680955,0.533844,-0.325812,-0.324522,-0.150154,-0.102079,-0.278321
1,-0.427633,0.823717,0.132275,-0.133402,-0.819058,0.31921,0.365698,-0.940836,1.409772,-1.211325,-0.661311,1.653416,2.045272,-0.372518,-0.952899,-0.418857
2,-0.345459,-1.207638,2.621392,-1.445067,1.548978,-1.174166,-0.73317,-1.707795,1.164233,-0.77416,0.13961,1.424506,0.632865,-0.171644,-0.883895,-0.415557
3,-0.19815,-0.14475,-0.380921,0.083804,0.213848,-0.581293,-0.077782,0.478632,-0.746236,-0.849217,-1.293222,-0.913402,-0.508695,-0.121811,0.592285,2.964261
4,-0.007959,-0.643634,0.322834,-0.782839,0.648224,-0.440765,-0.608617,-0.454442,0.615946,0.046419,0.675119,0.390752,-0.008127,-0.323867,-0.504038,-0.3603
5,-0.018477,-0.291015,-0.350709,0.00729,0.358791,-0.484887,-0.348766,0.397362,-0.73982,-0.618997,-1.122031,-0.856902,-0.47537,0.592066,1.851318,0.290647
6,-0.319226,-0.223561,-0.111746,-0.315688,0.182242,-0.224968,-0.475468,-0.195377,-0.129423,-0.62161,-0.797882,-0.535926,-0.314982,4.35423,-0.031247,-0.153698
7,0.062295,1.65372,-0.623913,1.785558,-1.203724,1.326683,1.844253,0.709561,-0.626336,1.075803,0.126335,-0.562446,-0.394163,-0.096552,0.246513,-0.353206


Like NYC, I cannot pick out a rural poor cluster in the DC MSA. Three clusters are almost distinctly poor tracts in cities, suburbs, and more sparse suburbs (but probably not rural areas). The rural clusters (3 and 5) are almost identical to the other rural census tracts in home ownership, but neither is poor in the way that the Seattle MSA's cluster of poor rural areas is.

In [41]:
# Write each city's frame (now carrying its 'cluster' column) to its own CSV,
# without the DataFrame index.
for frame, out_path in (
    (seattle_df, 'seattle_clusters.csv'),
    (nyc_df, 'nyc_clusters.csv'),
    (dc_df, 'dc_clusters.csv'),
):
    frame.to_csv(out_path, index=False)