# San Diego Zipcode Clustering by Demographic Features

### Setup

In [1]:
import pandas as pd
import numpy as np

### Load & Clean Zipcode Data

In [32]:
# Load the San Diego zipcode table from the CSV file next to the notebook.
df = pd.read_csv('San Diego 2021 Zipcode.csv')
print(f'Number of Rows before Drop NaN: {len(df)}')

# Drop every row that contains at least one missing value.
df = df.dropna()
# This count is taken after the drop, so label it "after" (it previously
# repeated "before", which made the two printed lines indistinguishable).
print(f'Number of Rows after Drop NaN: {len(df)}')

# Use the zipcode as the row index; reassigning instead of inplace=True keeps
# the cell a plain chain of "new frame from old frame" steps.
df = df.set_index('Zipcode')

# Show the first few rows of the dataframe
df.head()

Number of Rows before Drop NaN: 113
Number of Rows before Drop NaN: 96


Unnamed: 0_level_0,Population,Land Area (Sq. Miles),Population Density (People per Square Mile),Median Age,Population By Age % (Under 18 Years),Population By Age % (18 to 34),Population By Age % (35 to 64),Population By Age % (65 and Over),Male %,White %,...,Carpooled %,Public Transit %,Motorcycle %,Bicycle %,Walked %,Other %.1,Worked at Home %,Health Insurance Coverage %,Married %,Median House Value $
Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91902,17759.0,11.22,1583.23,45.8,17.62,20.69,38.98,22.72,48.33,37.37,...,5.1,0.97,0.16,0.37,0.0,1.53,10.71,94.78,52.66,732000.0
91910,76291.0,12.59,6061.25,38.2,21.7,23.58,39.95,14.77,49.92,20.32,...,9.54,3.86,0.52,0.29,0.95,0.57,8.75,91.27,48.09,549100.0
91911,88589.0,12.73,6960.37,35.8,24.01,24.85,38.28,12.86,48.26,12.75,...,11.53,4.16,0.18,0.19,1.23,1.04,5.67,90.78,49.91,485300.0
91913,53725.0,9.1,5906.59,35.0,25.91,24.07,39.45,10.58,50.7,13.64,...,9.89,0.87,0.26,0.0,1.29,2.28,11.35,93.92,53.91,578600.0
91914,17742.0,6.64,2670.34,37.1,25.17,20.57,44.35,9.92,52.68,18.14,...,6.05,1.19,0.08,0.43,7.09,3.51,10.57,93.01,61.59,739300.0


### Clustering by Unsupervised Machine Learning

In [34]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Cluster on the feature columns only. A later cell writes the cluster ids back
# into df as a 'label' column, so on a re-run of this cell that column may
# already exist; drop it (if present) so old labels are never used as a feature.
features = df.drop(columns='label', errors='ignore')

# Standardize features so columns measured in very different units
# (dollars, head counts, percentages) are put on a comparable scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

# Choose the number of clusters (n_clusters)
n_clusters = 8
# n_init is set explicitly to 10, the default this run was already using
# according to the recorded warning (default_n_init=10). Pinning it keeps the
# result stable across scikit-learn versions and avoids that warning.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(scaled_data)

# Get cluster labels (one per row of df) and centroids (in scaled feature space)
cluster_labels = kmeans.labels_
centroids = kmeans.cluster_centers_

  super()._check_params_vs_input(X, default_n_init=10)


In [37]:
# Attach each zipcode's k-means cluster id as a 'label' column
# (added as the last column, or overwritten if it already exists).
df = df.assign(label=cluster_labels)

In [41]:
# Column-wise mean over the zipcodes assigned to cluster 0.
df.loc[df['label'] == 0, :].mean()

Population                                      44953.750000
Land Area (Sq. Miles)                              21.086667
Population Density (People per Square Mile)      4525.615417
Median Age                                         34.850000
Population By Age % (Under 18 Years)               23.892083
Population By Age % (18 to 34)                     26.272500
Population By Age % (35 to 64)                     37.304167
Population By Age % (65 and Over)                  66.073333
Male %                                             50.314167
White %                                            38.187500
Black %                                             7.202917
Asian %                                            10.644167
Hispanic %                                         38.505417
Other %                                             5.460000
Less Than High School %                            13.718333
High School Grad or Higher %                       86.281667
Bachelor's Degree or Hig

In [42]:
# Column-wise mean over the zipcodes assigned to cluster 1.
df.loc[df['label'] == 1, :].mean()

Population                                       20655.400
Land Area (Sq. Miles)                               34.202
Population Density (People per Square Mile)       2596.776
Median Age                                          44.690
Population By Age % (Under 18 Years)                20.578
Population By Age % (18 to 34)                      17.199
Population By Age % (35 to 64)                      40.461
Population By Age % (65 and Over)                   21.762
Male %                                              49.594
White %                                             71.134
Black %                                              1.045
Asian %                                              6.688
Hispanic %                                          15.232
Other %                                              5.901
Less Than High School %                              3.905
High School Grad or Higher %                        96.095
Bachelor's Degree or Higher %                       65.6

In [43]:
# Column-wise mean over the zipcodes assigned to cluster 2.
df.loc[df['label'] == 2, :].mean()

Population                                      40472.842105
Land Area (Sq. Miles)                              16.573158
Population Density (People per Square Mile)      3647.235263
Median Age                                         39.273684
Population By Age % (Under 18 Years)               24.120000
Population By Age % (18 to 34)                     20.485263
Population By Age % (35 to 64)                     40.290000
Population By Age % (65 and Over)                  15.103158
Male %                                             49.704737
White %                                            50.474737
Black %                                             3.132105
Asian %                                            19.297895
Hispanic %                                         20.754737
Other %                                             6.340526
Less Than High School %                             5.381053
High School Grad or Higher %                       94.618947
Bachelor's Degree or Hig

In [44]:
# Column-wise mean over the zipcodes assigned to cluster 3.
df.loc[df['label'] == 3, :].mean()

Population                                      59222.200
Land Area (Sq. Miles)                              12.943
Population Density (People per Square Mile)      7524.322
Median Age                                         34.460
Population By Age % (Under 18 Years)               24.469
Population By Age % (18 to 34)                     26.735
Population By Age % (35 to 64)                     36.940
Population By Age % (65 and Over)                  11.854
Male %                                             49.752
White %                                            16.354
Black %                                             5.260
Asian %                                             8.382
Hispanic %                                         66.828
Other %                                             3.176
Less Than High School %                            26.724
High School Grad or Higher %                       73.276
Bachelor's Degree or Higher %                      18.980
Media Househol

In [45]:
# Column-wise mean over the zipcodes assigned to cluster 4.
df.loc[df['label'] == 4, :].mean()

Population                                      33150.000
Land Area (Sq. Miles)                               5.410
Population Density (People per Square Mile)      7456.766
Median Age                                         34.790
Population By Age % (Under 18 Years)               11.852
Population By Age % (18 to 34)                     38.569
Population By Age % (35 to 64)                     37.510
Population By Age % (65 and Over)                  12.069
Male %                                             52.430
White %                                            59.499
Black %                                             4.038
Asian %                                            11.230
Hispanic %                                         19.529
Other %                                             5.704
Less Than High School %                             3.952
High School Grad or Higher %                       96.048
Bachelor's Degree or Higher %                      61.137
Media Househol

In [46]:
# Column-wise mean over the zipcodes assigned to cluster 5.
df.loc[df['label'] == 5, :].mean()

Population                                       2779.00
Land Area (Sq. Miles)                             914.88
Population Density (People per Square Mile)         3.04
Median Age                                         62.80
Population By Age % (Under 18 Years)               10.69
Population By Age % (18 to 34)                      9.86
Population By Age % (35 to 64)                     32.82
Population By Age % (65 and Over)                  46.64
Male %                                             42.75
White %                                            64.66
Black %                                             0.04
Asian %                                             0.04
Hispanic %                                         35.26
Other %                                             0.00
Less Than High School %                             8.56
High School Grad or Higher %                       91.44
Bachelor's Degree or Higher %                      28.53
Media Household Income $       

In [47]:
# Column-wise mean over the zipcodes assigned to cluster 6.
df.loc[df['label'] == 6, :].mean()

Population                                      22821.105263
Land Area (Sq. Miles)                              91.246316
Population Density (People per Square Mile)       860.720000
Median Age                                         45.273684
Population By Age % (Under 18 Years)               19.425789
Population By Age % (18 to 34)                     19.432105
Population By Age % (35 to 64)                     40.251053
Population By Age % (65 and Over)                  20.894211
Male %                                             50.462632
White %                                            65.915789
Black %                                             1.607895
Asian %                                             3.702632
Hispanic %                                         23.519474
Other %                                             5.254211
Less Than High School %                             8.573684
High School Grad or Higher %                       91.426316
Bachelor's Degree or Hig

In [48]:
# Column-wise mean over the zipcodes assigned to cluster 7.
df.loc[df['label'] == 7, :].mean()

Population                                       1336.000000
Land Area (Sq. Miles)                              63.953333
Population Density (People per Square Mile)        35.893333
Median Age                                         45.666667
Population By Age % (Under 18 Years)               21.556667
Population By Age % (18 to 34)                     17.010000
Population By Age % (35 to 64)                     38.213333
Population By Age % (65 and Over)                  23.220000
Male %                                             52.573333
White %                                            39.370000
Black %                                             0.653333
Asian %                                             2.180000
Hispanic %                                         29.063333
Other %                                            28.733333
Less Than High School %                            18.100000
High School Grad or Higher %                       81.900000
Bachelor's Degree or Hig

In [52]:
# Rank the columns of df by population variance (ddof=0, the same convention
# np.var uses), largest first.
# - df holds the raw, unscaled values, so this ordering is dominated by each
#   column's units (dollar amounts and head counts dwarf percentages); it is
#   not a measure of how much a feature contributes to the clustering.
# - The 'label' column is a cluster id, not a feature, so it is left out
#   when present.
# NOTE(review): a percentage column whose values all lie in 0-100 cannot have a
# variance above 2500. The recorded output shows roughly 16,979 for
# 'Population By Age % (65 and Over)', so the CSV probably contains an
# out-of-range entry in that column - verify the source data.
variances = df.drop(columns='label', errors='ignore').var(ddof=0)
ranked_variances = variances.sort_values(ascending=False)

print("Ranking of features by variance:")
# Iterate by column name; integer-position lookups such as variances[idx] on a
# label-indexed Series are deprecated in recent pandas.
for feature, variance in ranked_variances.items():
    print(f"{feature}, Variance = {variance}")

Ranking of features by variance:
Median House Value $, Variance = 89009508318.17436
Media Household Income $, Variance = 858389028.4513887
Population, Variance = 484701411.3849826
Population Density (People per Square Mile), Variance = 11237392.171130558
Population By Age % (65 and Over), Variance = 16978.75134856771
Land Area (Sq. Miles), Variance = 12319.298671831595
White %, Variance = 427.8097604166667
Bachelor's Degree or Higher %, Variance = 322.14803670789934
Hispanic %, Variance = 310.3349659722221
Asian %, Variance = 90.89699087456599
Married %, Variance = 78.04013502604167
Population By Age % (18 to 34), Variance = 63.11412811414934
High School Grad or Higher %, Variance = 58.399929850260435
Less Than High School %, Variance = 58.399929850260406
Drove Alone %, Variance = 45.07570972222222
Median Age, Variance = 44.74019097222222
Poverty %, Variance = 37.25922134331597
Average Commute to Work (min), Variance = 37.155815972222214
Worked at Home %, Variance = 35.14965169270835
P

In [53]:
# Write df, including its index, to an Excel workbook.
excel_filename = "labeled SD zipcode.xlsx"
df.to_excel(excel_writer=excel_filename, index=True)