### Convert shapefiles from zillow data into geojason for follium.

Zillow data is available [here](https://www.zillow.com/howto/api/neighborhood-boundaries.htm)

```bash
%%bash
cd zillowdata
for d in $(find . -maxdepth 1 -type d -not -path ".")
do
  for f in $(find $d -name "*.shp")
  do
   ogr2ogr -f GeoJSON -t_srs EPSG:4326 -simplify 1000 "$(echo $d | cut -c3-).geojson" $f
  done
  
done
```

In [1]:
import folium
import os
import json
from shapely.geometry import Polygon
import math
from folium.plugins import MarkerCluster
import pandas as pd

labels = [
    'Name',
    'Location',
    'Size'
]
zpath = "./zillowdata/"
neighborhoods = pd.DataFrame(columns=labels)
for filename in [f for f in os.listdir(zpath) if "geojson" in f]:
    with open(zpath + filename) as f:
        data = json.load(f)
        for neighborhood in data["features"]:
#             Neighborhoods with multiple shapes defining their borders are ignored
            if neighborhood["geometry"]["type"] == "MultiPolygon":
                continue
            name = neighborhood["properties"]["Name"].replace("'", "\'")
#             Some neighborhoods with poorly formatted coordinates are ignored
            coords = neighborhood["geometry"]["coordinates"]
            while coords[0][0][0] == list:
                coords = coords[0]
            if len(coords) > 1:
                continue
            coords = coords[0]
#             A polygon is constructed to calculate the size of the neighborhood
            polygon = Polygon(coords)
            area = polygon.area
            xs = [point[0] for point in coords]
            ys = [point[1] for point in coords]
            center = ((sum(ys[1:]))/(len(ys)-1), (sum(xs[1:]))/(len(xs)-1))
            neighborhoods.loc[len(neighborhoods)] = [name, center, area * 100000]
            
neighborhoods.head()

Unnamed: 0,Name,Location,Size
0,West Valley,"(43.63821095330781, -116.31124328349995)",130.632496
1,West Downtown,"(43.617511426057845, -116.21884201299994)",29.782147
2,West Cloverdale,"(43.638623543307816, -116.33952629899993)",10.345858
3,South Boise Village,"(43.59153704305788, -116.20124626249992)",8.317578
4,Quail Ridge,"(43.66947425805778, -116.23805885399996)",12.439082


In [2]:
neighborhoods.describe()

Unnamed: 0,Size
count,16366.0
mean,26.015394
std,84.490901
min,8e-05
25%,3.766563
50%,9.423381
75%,22.869927
max,4059.465044


In [3]:
#### Build map with circles
import json
from shapely.geometry import Polygon
import math
from folium.plugins import MarkerCluster

def map_circles(df):
    m = folium.Map(
        location=[36.64699473120942, -75.99394080442949],
        zoom_start=4.3
    )
    for _, nh in df.iterrows():
        folium.Circle(
            radius=250 * math.sqrt(nh["Size"]/math.pi),
            location=nh["Location"],
            popup=nh["Name"].replace("'", ""),
            clustered_marker = True,
            color='#3186cc',
            fill=True,
            fill_color='#3186cc'
        ).add_to(m)
    m.save("map.html")
    print("Saved map")
    return m

In [4]:
map_circles(neighborhoods)
"Done"

Saved map


'Done'

In [5]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path='./foursquare.env')

CLIENT_ID = os.getenv('FS_ID') # your Foursquare ID
CLIENT_SECRET = os.getenv('FS_SECRET') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [6]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [7]:
import requests

def getNearbyVenues(df, limit=250):
    
    venues_list=[]
    i = df.shape[0]
    for _, (Name, Location, Size) in df.iterrows():
        lat = Location[0]
        lng = Location[1]
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            Size * 10, 
            limit)
            
        # make the GET request
        print("Getting venues for", Name, i)
        i = i - 1
        try:
            response = requests.get(url).json()
            results = response["response"]['groups'][0]['items']
        except KeyError:
            print(response)
            continue
#         print("results: ", results)
        # return only relevant information for each nearby venue
        venues_list.append([(
            Name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

    return(nearby_venues)

In [8]:
df = neighborhoods[0:20]
venues = getNearbyVenues(df)
m = folium.Map(
    location=[36.64699473120942, -75.99394080442949],
    zoom_start=4.3
)
for _, nh in venues.iterrows():
    folium.Marker(
        location=(nh["Venue Latitude"], nh["Venue Longitude"]),
        popup=nh["Neighborhood"].replace("'", ""),
    ).add_to(m)

for _, nh in df.iterrows():
    folium.Circle(
        radius=250 * math.sqrt(nh["Size"]/math.pi),
        location=nh["Location"],
        popup=nh["Name"].replace("'", ""),
        clustered_marker = True,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc'
    ).add_to(m)
m.save("map.html")
print("Saved map")
m

Getting venues for West Valley 20
Getting venues for West Downtown 19
Getting venues for West Cloverdale 18
Getting venues for South Boise Village 17
Getting venues for Quail Ridge 16
Getting venues for East End 15
Getting venues for Downtown 14
Getting venues for Glenwood Rim 13
{'meta': {'code': 500, 'errorType': 'server_error', 'errorDetail': 'Foursquare servers are experiencing problems. Please retry and check status.foursquare.com for updates.'}, 'response': {}}
Getting venues for Central Bench 12
Getting venues for Harris Ranch 11
Getting venues for North End 10
Getting venues for Southeast Boise 9
Getting venues for West Bench 8
Getting venues for Boise Airport 7
Getting venues for Warm Springs Mesa 6
Getting venues for Winstead Park 5
Getting venues for Hillcrest 4
Getting venues for Highlands 3
Getting venues for Vista 2
Getting venues for Borah 1
Saved map


In [9]:
import os

if(not os.path.isfile("./venues.csv")):
    venues = getNearbyVenues(neighborhoods)
    venues.to_csv("./venues.csv")
else:
    venues = pd.read_csv("./venues.csv")
venues.head()

Unnamed: 0.1,Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,West Valley,43.638211,-116.311243,Fiesta Chicken,43.633479,-116.31548,Mexican Restaurant
1,1,West Valley,43.638211,-116.311243,Albertsons,43.634666,-116.316204,Grocery Store
2,2,West Valley,43.638211,-116.311243,SONIC Drive In,43.634453,-116.312822,Fast Food Restaurant
3,3,West Valley,43.638211,-116.311243,Starbucks,43.634488,-116.315901,Coffee Shop
4,4,West Valley,43.638211,-116.311243,Weight Watchers,43.63504,-116.315507,Weight Loss Center


In [10]:
venues[venues["Venue Category"] == "College Tennis Court"]

Unnamed: 0.1,Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
31415,31415,East Memphis-Colonial-Yorkshire,35.116007,-89.930765,The Racquet Club,35.116183,-89.89023,College Tennis Court


In [11]:
print('There are {} uniques categories.'.format(len(venues['Venue Category'].unique())))

There are 596 uniques categories.


In [12]:
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
onehot['Neighborhood'] = venues['Neighborhood']
onehot['Neighborhood Latitude'] = venues['Neighborhood Latitude']
onehot['Neighborhood Longitude'] = venues['Neighborhood Longitude']
onehot['Total'] = onehot.drop(columns=["Neighborhood Latitude", "Neighborhood Longitude"]).sum(axis=1)
onehot.drop(onehot[onehot["Total"] < 1].index, inplace=True)
onehot.drop("Total", axis=1, inplace=True)
onehot.set_index(["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude"], inplace=True)
onehot.loc['East Memphis-Colonial-Yorkshire']
onehot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
West Valley,43.638211,-116.311243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
West Valley,43.638211,-116.311243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
West Valley,43.638211,-116.311243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
West Valley,43.638211,-116.311243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
West Valley,43.638211,-116.311243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
onehot["College Tennis Court"].describe()

count    51399.000000
mean         0.000019
std          0.004411
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: College Tennis Court, dtype: float64

In [14]:
grouped = onehot.groupby(['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude']).mean()
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
19th Ward,43.143592,-77.648521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29 North,38.06221,-78.48769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33rd St. Industrial,28.504929,-81.435576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39th East,39.044526,-94.375892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65th St. West,34.679176,-92.377483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# A lot of neighborhoods don't have any venues. 
# I suspect they're either residential, or in areas that don't have high foursquare use.
neighborhoods.loc[~neighborhoods.index.isin(grouped.index.get_level_values('Neighborhood'))].shape

(16366, 3)

In [16]:
num_top_venues = 5
num_hoods = 25

for hood in grouped.index.get_level_values('Neighborhood')[:num_hoods]:
    print("----"+hood+"----")
    temp = grouped.loc[hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----19th Ward----
                  venue  freq
0  Caribbean Restaurant  0.17
1                  Park  0.17
2                Market  0.17
3         Grocery Store  0.17
4           Pizza Place  0.17


----29 North----
                           venue  freq
0                          Hotel   1.0
1              Accessories Store   0.0
2           Pakistani Restaurant   0.0
3                         Palace   0.0
4  Paper / Office Supplies Store   0.0


----33rd St. Industrial----
                  venue  freq
0                Office   1.0
1     Accessories Store   0.0
2      Pedestrian Plaza   0.0
3  Pakistani Restaurant   0.0
4                Palace   0.0


----39th East----
                           venue  freq
0                      Bookstore  0.50
1                    Fabric Shop  0.25
2                  Shopping Mall  0.25
3              Accessories Store  0.00
4  Paper / Office Supplies Store  0.00


----65th St. West----
                        venue  freq
0  Financial or Legal Ser

In [17]:
# Let's use mean normalization to preprocess the data.
grouped.describe()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
count,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,...,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0
mean,0.000944,0.00093,1.2e-05,0.000271,2.5e-05,0.000208,0.00143,7.5e-05,0.000153,0.000704,...,0.00018,0.002814,0.001044,0.000168,0.001707,0.001195,2.1e-05,0.002863,0.00034,0.001212
std,0.012007,0.021804,0.00048,0.01438,0.001175,0.007804,0.022108,0.003219,0.004048,0.014286,...,0.005392,0.036438,0.018525,0.005136,0.021059,0.019349,0.001396,0.038827,0.008417,0.027804
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.5,1.0,0.022222,1.0,0.076923,0.5,1.0,0.2,0.2,0.5,...,0.333333,1.0,1.0,0.25,1.0,1.0,0.1,1.0,0.333333,1.0


In [18]:
normal_data = (grouped - grouped.mean())/grouped.std()
normal_data.round(3).describe()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
count,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,...,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0,5182.0
mean,-0.000355,-0.000355,-1.9e-05,-0.000159,0.000214,-0.000337,-0.000321,0.000267,-0.000129,0.000251,...,0.000442,0.000229,0.000355,-0.000322,5.9e-05,-0.000215,0.000205,-0.00026,0.00045,-0.000409
std,1.000028,1.000007,1.000004,1.000006,0.999995,1.000016,1.000024,1.0,0.999998,0.999993,...,0.999981,0.999984,0.999979,1.000004,0.999991,1.000012,0.999997,1.000022,0.999991,1.000019
min,-0.079,-0.043,-0.024,-0.019,-0.021,-0.027,-0.065,-0.023,-0.038,-0.049,...,-0.033,-0.077,-0.056,-0.033,-0.081,-0.062,-0.015,-0.074,-0.04,-0.044
25%,-0.079,-0.043,-0.024,-0.019,-0.021,-0.027,-0.065,-0.023,-0.038,-0.049,...,-0.033,-0.077,-0.056,-0.033,-0.081,-0.062,-0.015,-0.074,-0.04,-0.044
50%,-0.079,-0.043,-0.024,-0.019,-0.021,-0.027,-0.065,-0.023,-0.038,-0.049,...,-0.033,-0.077,-0.056,-0.033,-0.081,-0.062,-0.015,-0.074,-0.04,-0.044
75%,-0.079,-0.043,-0.024,-0.019,-0.021,-0.027,-0.065,-0.023,-0.038,-0.049,...,-0.033,-0.077,-0.056,-0.033,-0.081,-0.062,-0.015,-0.074,-0.04,-0.044
max,41.564,45.821,46.304,69.523,65.438,64.043,45.167,62.102,49.363,34.949,...,61.788,27.366,53.925,48.642,47.404,51.619,71.615,25.682,39.564,35.923


In [19]:
print(normal_data.shape)
s = normal_data.isnull().sum()
s[s != 0]

(5182, 595)


Series([], dtype: int64)

In [20]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(normal_data)

# check cluster labels generated for each row in the dataframe
kmeans.labels_.shape

(5182,)

In [21]:
merged = normal_data

# add clustering labels
merged['Cluster Label'] = kmeans.labels_


merged[merged["Cluster Label"] == 0].round(2).describe()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,Cluster Label
count,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,...,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0
mean,0.133527,0.006618,0.046801,-0.014237,0.039102,-0.013113,-0.007172,0.026258,0.042457,-0.01843,...,-0.026161,0.014473,-0.017301,0.096446,0.106602,0.022371,-0.00921,-0.020935,-0.033844,0.0
std,1.655805,0.424442,1.668306,0.15479,1.668758,0.300031,0.668516,1.464644,1.500276,0.393045,...,0.387318,0.54718,0.351714,1.483314,1.659572,1.669322,0.453882,0.396646,0.152543,0.0
min,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,0.0
25%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,0.0
50%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,0.0
75%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,0.0
max,41.56,11.42,46.3,4.95,65.44,8.52,22.55,62.1,49.36,7.32,...,6.78,10.74,13.87,47.4,51.62,71.62,12.8,11.84,5.09,0.0


In [22]:
merged[merged["Cluster Label"] == 1].round(2).describe()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,Cluster Label
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,1.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.601177e-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,1.0
25%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,1.0
50%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,1.0
75%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,1.0
max,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,1.0


In [23]:
merged[merged["Cluster Label"] == 2].round(2).describe()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,Cluster Label
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,2.0
std,,,,,,,,,,,...,,,,,,,,,,
min,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,2.0
25%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,2.0
50%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,2.0
75%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,2.0
max,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,2.0


In [24]:
merged[merged["Cluster Label"] == 3].round(2).describe()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,Cluster Label
count,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,...,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0
mean,-0.08,-0.04,-0.02,-0.02,-0.02,1.070928,-0.06,-0.02,-0.04,-0.05,...,0.132165,-0.06,-0.03,-0.025567,-0.06,-0.02,0.065258,-0.04,-0.04,3.0
std,5.579952e-17,2.789976e-17,1.394988e-17,1.394988e-17,1.394988e-17,7.144269,8.369928e-17,1.394988e-17,2.789976e-17,9.764916000000001e-17,...,1.55098,8.369928e-17,4.184964e-17,0.536103,8.369928e-17,1.394988e-17,0.768483,2.789976e-17,2.789976e-17,0.0
min,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,3.0
25%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,3.0
50%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,3.0
75%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,3.0
max,-0.08,-0.04,-0.02,-0.02,-0.02,64.04,-0.06,-0.02,-0.04,-0.05,...,13.64,-0.06,-0.03,5.2,-0.06,-0.02,5.08,-0.04,-0.04,3.0


In [25]:
merged[merged["Cluster Label"] == 4].round(2).describe()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,Cluster Label
count,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,...,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0
mean,-0.076774,0.001687,-0.02,0.007014,-0.02,-0.03,0.013527,-0.009276,-0.026672,0.011075,...,0.006958,-0.012219,0.01527,-0.053151,-0.056855,-0.02,0.009372,0.014117,0.026619,4.0
std,0.098627,1.227165,1.346355e-15,1.263519,1.346355e-15,8.987264e-16,1.162303,0.608348,0.555114,1.233329,...,1.204839,1.198984,1.240258,0.567478,0.093647,1.346355e-15,1.213524,1.232391,1.262981,0.0
min,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,4.0
25%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,4.0
50%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,4.0
75%,-0.08,-0.04,-0.02,-0.02,-0.02,-0.03,-0.06,-0.02,-0.04,-0.05,...,-0.08,-0.06,-0.03,-0.08,-0.06,-0.02,-0.07,-0.04,-0.04,4.0
max,4.09,45.82,-0.02,69.52,-0.02,-0.03,45.17,34.49,27.41,34.95,...,27.37,53.92,48.64,15.75,3.38,-0.02,25.68,39.56,35.92,4.0


In [26]:
sizemap = neighborhoods.reset_index()
sizemap['Neighborhood'] = sizemap['Name']
sizemap['Neighborhood Latitude'] = sizemap['Location'].apply(lambda x: x[0]).round(5).astype(str)
sizemap['Neighborhood Longitude'] = sizemap['Location'].apply(lambda x: x[1]).round(3).astype(str)
sizemap.set_index(["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude"], inplace=True)
sizemap.loc['33rd St. Industrial']

Unnamed: 0_level_0,Unnamed: 1_level_0,index,Name,Location,Size
Neighborhood Latitude,Neighborhood Longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
28.50493,-81.436,12060,33rd St. Industrial,"(28.504929203699444, -81.43557640697989)",20.376481


In [27]:
names = merged.index.get_level_values('Neighborhood').to_series()
lats = merged.index.get_level_values('Neighborhood Latitude').to_series().round(5).astype(str)
lngs = merged.index.get_level_values('Neighborhood Longitude').to_series().round(3).astype(str)
merged["Size"] = [sizemap.loc[a].Size[0] for a in list(zip(names, lats, lngs))]
merged.head(15)

  after removing the cwd from sys.path.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,Cluster Label,Size
Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
19th Ward,43.143592,-77.648521,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,0,50.318946
29 North,38.06221,-78.48769,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,4,7.205972
33rd St. Industrial,28.504929,-81.435576,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,4,20.376481
39th East,39.044526,-94.375892,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,0,47.937658
65th St. West,34.679176,-92.377483,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,4,79.518112
9th Ward Area,39.76047,-75.536849,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,4,12.177645
ACCENT,41.604207,-93.558783,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,0,17.411877
ASU-College Hills,31.437276,-100.469859,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,0,59.538315
Abbott Loop,61.148353,-149.8164,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,0.563543,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,0,290.767387
Abbott McKinley,42.850643,-78.817091,-0.078638,-0.042641,-0.023981,-0.018841,-0.021214,-0.026662,-0.064675,-0.023267,-0.037871,-0.049252,...,-0.056362,-0.032678,-0.081059,-0.061782,-0.015205,-0.073734,-0.040452,-0.043589,4,16.72688


In [29]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(
    location=[36.64699473120942, -75.99394080442949],
    zoom_start=4.3
)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, hood, cluster, size in zip(
    merged.index.get_level_values('Neighborhood Latitude'), 
    merged.index.get_level_values('Neighborhood Longitude'), 
    merged.index.get_level_values('Neighborhood'), 
    merged['Cluster Label'],
    merged['Size']
):
    label = folium.Popup(str(hood) + ' Cluster ' + str(cluster), parse_html=True)
    folium.Circle(
        [lat, lon],
        radius=250 * math.sqrt(size/math.pi),
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters.save("cluster_map.html")
map_clusters